0

Merge pull request #139 from mendableai/nsc/refactor-scraping-order

Nsc/refactor scraping order
This commit is contained in:
Nicolas 2024-05-09 17:57:01 -07:00 committed by GitHub
commit f94b6053ad
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 76 additions and 39 deletions

View File

@ -21,7 +21,7 @@ export async function generateRequestParams(
};
try {
const urlKey = new URL(url).hostname;
const urlKey = new URL(url).hostname.replace(/^www\./, "");
if (urlSpecificParams.hasOwnProperty(urlKey)) {
return { ...defaultParams, ...urlSpecificParams[urlKey] };
} else {
@ -57,7 +57,7 @@ export async function scrapWithScrapingBee(
wait_browser,
timeout
);
const response = await client.get(clientParams);
if (response.status !== 200 && response.status !== 404) {
@ -77,12 +77,15 @@ export async function scrapWithScrapingBee(
export async function scrapWithPlaywright(url: string): Promise<string> {
try {
const reqParams = await generateRequestParams(url);
const wait_playwright = reqParams["params"]?.wait ?? 0;
const response = await fetch(process.env.PLAYWRIGHT_MICROSERVICE_URL, {
method: "POST",
headers: {
"Content-Type": "application/json",
},
body: JSON.stringify({ url: url }),
body: JSON.stringify({ url: url, wait: wait_playwright }),
});
if (!response.ok) {
@ -103,7 +106,7 @@ export async function scrapWithPlaywright(url: string): Promise<string> {
export async function scrapSingleUrl(
urlToScrap: string,
pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false },
pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false }
): Promise<Document> {
urlToScrap = urlToScrap.trim();
@ -169,56 +172,50 @@ export async function scrapSingleUrl(
break;
}
//* TODO: add an option to return markdown or structured/extracted content
//* TODO: add an option to return markdown or structured/extracted content
let cleanedHtml = removeUnwantedElements(text, pageOptions);
return [await parseMarkdown(cleanedHtml), text];
};
try {
// TODO: uncomment this once we're ready to merge firecrawl-scraper into the mono-repo
// let [text, html] = await attemptScraping(urlToScrap, 'firecrawl-scraper');
// if (!text || text.length < 100) {
// console.log("Falling back to scraping bee load");
// [text, html] = await attemptScraping(urlToScrap, 'scrapingBeeLoad');
// }
let [text, html] = await attemptScraping(urlToScrap, "scrapingBee");
// Basically means that it is using /search endpoint
if (pageOptions.fallback === false) {
const soup = cheerio.load(html);
const metadata = extractMetadata(soup, urlToScrap);
return {
url: urlToScrap,
content: text,
markdown: text,
html: pageOptions.includeHtml ? html : undefined,
metadata: { ...metadata, sourceURL: urlToScrap },
} as Document;
let [text, html] = ["", ""];
let urlKey = urlToScrap;
try {
urlKey = new URL(urlToScrap).hostname.replace(/^www\./, "");
} catch (error) {
console.error(`Invalid URL key, trying: ${urlToScrap}`);
}
if (!text || text.length < 100) {
console.log("Falling back to playwright");
[text, html] = await attemptScraping(urlToScrap, "playwright");
const defaultScraper = urlSpecificParams[urlKey]?.defaultScraper ?? "";
const scrapersInOrder = defaultScraper
? [
defaultScraper,
"scrapingBee",
"playwright",
"scrapingBeeLoad",
"fetch",
]
: ["scrapingBee", "playwright", "scrapingBeeLoad", "fetch"];
for (const scraper of scrapersInOrder) {
[text, html] = await attemptScraping(urlToScrap, scraper);
if (text && text.length >= 100) break;
console.log(`Falling back to ${scraper}`);
}
if (!text || text.length < 100) {
console.log("Falling back to scraping bee load");
[text, html] = await attemptScraping(urlToScrap, "scrapingBeeLoad");
}
if (!text || text.length < 100) {
console.log("Falling back to fetch");
[text, html] = await attemptScraping(urlToScrap, "fetch");
if (!text) {
throw new Error(`All scraping methods failed for URL: ${urlToScrap}`);
}
const soup = cheerio.load(html);
const metadata = extractMetadata(soup, urlToScrap);
return {
const document: Document = {
content: text,
markdown: text,
html: pageOptions.includeHtml ? html : undefined,
metadata: { ...metadata, sourceURL: urlToScrap },
} as Document;
};
return document;
} catch (error) {
console.error(`Error: ${error} - Failed to fetch URL: ${urlToScrap}`);
return {

View File

@ -38,5 +38,45 @@ export const urlSpecificParams = {
accept:
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
},
},
"docs.pdw.co":{
defaultScraper: "playwright",
params: {
wait_browser: "networkidle2",
block_resources: false,
wait: 3000,
},
headers: {
"User-Agent":
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
"sec-fetch-site": "same-origin",
"sec-fetch-mode": "cors",
"sec-fetch-dest": "empty",
referer: "https://www.google.com/",
"accept-language": "en-US,en;q=0.9",
"accept-encoding": "gzip, deflate, br",
accept:
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
},
},
"ycombinator.com":{
defaultScraper: "playwright",
params: {
wait_browser: "networkidle2",
block_resources: false,
wait: 3000,
},
headers: {
"User-Agent":
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
"sec-fetch-site": "same-origin",
"sec-fetch-mode": "cors",
"sec-fetch-dest": "empty",
referer: "https://www.google.com/",
"accept-language": "en-US,en;q=0.9",
"accept-encoding": "gzip, deflate, br",
accept:
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
},
}
};