Merge pull request #139 from mendableai/nsc/refactor-scraping-order
Nsc/refactor scraping order
This commit is contained in:
commit
f94b6053ad
@ -21,7 +21,7 @@ export async function generateRequestParams(
|
||||
};
|
||||
|
||||
try {
|
||||
const urlKey = new URL(url).hostname;
|
||||
const urlKey = new URL(url).hostname.replace(/^www\./, "");
|
||||
if (urlSpecificParams.hasOwnProperty(urlKey)) {
|
||||
return { ...defaultParams, ...urlSpecificParams[urlKey] };
|
||||
} else {
|
||||
@ -57,7 +57,7 @@ export async function scrapWithScrapingBee(
|
||||
wait_browser,
|
||||
timeout
|
||||
);
|
||||
|
||||
|
||||
const response = await client.get(clientParams);
|
||||
|
||||
if (response.status !== 200 && response.status !== 404) {
|
||||
@ -77,12 +77,15 @@ export async function scrapWithScrapingBee(
|
||||
|
||||
export async function scrapWithPlaywright(url: string): Promise<string> {
|
||||
try {
|
||||
const reqParams = await generateRequestParams(url);
|
||||
const wait_playwright = reqParams["params"]?.wait ?? 0;
|
||||
|
||||
const response = await fetch(process.env.PLAYWRIGHT_MICROSERVICE_URL, {
|
||||
method: "POST",
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
body: JSON.stringify({ url: url }),
|
||||
body: JSON.stringify({ url: url, wait: wait_playwright }),
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
@ -103,7 +106,7 @@ export async function scrapWithPlaywright(url: string): Promise<string> {
|
||||
|
||||
export async function scrapSingleUrl(
|
||||
urlToScrap: string,
|
||||
pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false },
|
||||
pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false }
|
||||
): Promise<Document> {
|
||||
urlToScrap = urlToScrap.trim();
|
||||
|
||||
@ -169,56 +172,50 @@ export async function scrapSingleUrl(
|
||||
break;
|
||||
}
|
||||
|
||||
//* TODO: add an optional to return markdown or structured/extracted content
|
||||
//* TODO: add an optional to return markdown or structured/extracted content
|
||||
let cleanedHtml = removeUnwantedElements(text, pageOptions);
|
||||
|
||||
|
||||
return [await parseMarkdown(cleanedHtml), text];
|
||||
};
|
||||
|
||||
try {
|
||||
// TODO: comment this out once we're ready to merge firecrawl-scraper into the mono-repo
|
||||
// let [text, html] = await attemptScraping(urlToScrap, 'firecrawl-scraper');
|
||||
// if (!text || text.length < 100) {
|
||||
// console.log("Falling back to scraping bee load");
|
||||
// [text, html] = await attemptScraping(urlToScrap, 'scrapingBeeLoad');
|
||||
// }
|
||||
|
||||
let [text, html] = await attemptScraping(urlToScrap, "scrapingBee");
|
||||
// Basically means that it is using /search endpoint
|
||||
if (pageOptions.fallback === false) {
|
||||
const soup = cheerio.load(html);
|
||||
const metadata = extractMetadata(soup, urlToScrap);
|
||||
return {
|
||||
url: urlToScrap,
|
||||
content: text,
|
||||
markdown: text,
|
||||
html: pageOptions.includeHtml ? html : undefined,
|
||||
metadata: { ...metadata, sourceURL: urlToScrap },
|
||||
} as Document;
|
||||
let [text, html] = ["", ""];
|
||||
let urlKey = urlToScrap;
|
||||
try {
|
||||
urlKey = new URL(urlToScrap).hostname.replace(/^www\./, "");
|
||||
} catch (error) {
|
||||
console.error(`Invalid URL key, trying: ${urlToScrap}`);
|
||||
}
|
||||
if (!text || text.length < 100) {
|
||||
console.log("Falling back to playwright");
|
||||
[text, html] = await attemptScraping(urlToScrap, "playwright");
|
||||
const defaultScraper = urlSpecificParams[urlKey]?.defaultScraper ?? "";
|
||||
const scrapersInOrder = defaultScraper
|
||||
? [
|
||||
defaultScraper,
|
||||
"scrapingBee",
|
||||
"playwright",
|
||||
"scrapingBeeLoad",
|
||||
"fetch",
|
||||
]
|
||||
: ["scrapingBee", "playwright", "scrapingBeeLoad", "fetch"];
|
||||
|
||||
for (const scraper of scrapersInOrder) {
|
||||
[text, html] = await attemptScraping(urlToScrap, scraper);
|
||||
if (text && text.length >= 100) break;
|
||||
console.log(`Falling back to ${scraper}`);
|
||||
}
|
||||
|
||||
if (!text || text.length < 100) {
|
||||
console.log("Falling back to scraping bee load");
|
||||
[text, html] = await attemptScraping(urlToScrap, "scrapingBeeLoad");
|
||||
}
|
||||
if (!text || text.length < 100) {
|
||||
console.log("Falling back to fetch");
|
||||
[text, html] = await attemptScraping(urlToScrap, "fetch");
|
||||
if (!text) {
|
||||
throw new Error(`All scraping methods failed for URL: ${urlToScrap}`);
|
||||
}
|
||||
|
||||
const soup = cheerio.load(html);
|
||||
const metadata = extractMetadata(soup, urlToScrap);
|
||||
|
||||
return {
|
||||
const document: Document = {
|
||||
content: text,
|
||||
markdown: text,
|
||||
html: pageOptions.includeHtml ? html : undefined,
|
||||
metadata: { ...metadata, sourceURL: urlToScrap },
|
||||
} as Document;
|
||||
};
|
||||
|
||||
return document;
|
||||
} catch (error) {
|
||||
console.error(`Error: ${error} - Failed to fetch URL: ${urlToScrap}`);
|
||||
return {
|
||||
|
@ -38,5 +38,45 @@ export const urlSpecificParams = {
|
||||
accept:
|
||||
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
|
||||
},
|
||||
},
|
||||
"docs.pdw.co":{
|
||||
defaultScraper: "playwright",
|
||||
params: {
|
||||
wait_browser: "networkidle2",
|
||||
block_resources: false,
|
||||
wait: 3000,
|
||||
},
|
||||
headers: {
|
||||
"User-Agent":
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
|
||||
"sec-fetch-site": "same-origin",
|
||||
"sec-fetch-mode": "cors",
|
||||
"sec-fetch-dest": "empty",
|
||||
referer: "https://www.google.com/",
|
||||
"accept-language": "en-US,en;q=0.9",
|
||||
"accept-encoding": "gzip, deflate, br",
|
||||
accept:
|
||||
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
|
||||
},
|
||||
},
|
||||
"ycombinator.com":{
|
||||
defaultScraper: "playwright",
|
||||
params: {
|
||||
wait_browser: "networkidle2",
|
||||
block_resources: false,
|
||||
wait: 3000,
|
||||
},
|
||||
headers: {
|
||||
"User-Agent":
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
|
||||
"sec-fetch-site": "same-origin",
|
||||
"sec-fetch-mode": "cors",
|
||||
"sec-fetch-dest": "empty",
|
||||
referer: "https://www.google.com/",
|
||||
"accept-language": "en-US,en;q=0.9",
|
||||
"accept-encoding": "gzip, deflate, br",
|
||||
accept:
|
||||
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
|
||||
},
|
||||
}
|
||||
};
|
||||
|
Loading…
Reference in New Issue
Block a user