0

Nick: fixes

This commit is contained in:
Nicolas 2024-05-15 11:28:20 -07:00
parent 87570bdfa1
commit d10f81e7fe
2 changed files with 7 additions and 5 deletions

View File

@@ -71,8 +71,8 @@ export class WebScraperDataProvider {
const batchUrls = urls.slice(i, i + this.concurrentRequests); const batchUrls = urls.slice(i, i + this.concurrentRequests);
await Promise.all( await Promise.all(
batchUrls.map(async (url, index) => { batchUrls.map(async (url, index) => {
const existingText = allHtmls ? allHtmls[i + index] : ""; const existingHTML = allHtmls ? allHtmls[i + index] : "";
const result = await scrapSingleUrl(url, this.pageOptions, existingText); const result = await scrapSingleUrl(url, this.pageOptions, existingHTML);
processedUrls++; processedUrls++;
if (inProgress) { if (inProgress) {
inProgress({ inProgress({

View File

@@ -107,7 +107,7 @@ export async function scrapWithPlaywright(url: string): Promise<string> {
export async function scrapSingleUrl( export async function scrapSingleUrl(
urlToScrap: string, urlToScrap: string,
pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false }, pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false },
existingText: string = "" existingHtml: string = ""
): Promise<Document> { ): Promise<Document> {
urlToScrap = urlToScrap.trim(); urlToScrap = urlToScrap.trim();
@@ -199,8 +199,10 @@ export async function scrapSingleUrl(
for (const scraper of scrapersInOrder) { for (const scraper of scrapersInOrder) {
// If exists text coming from crawler, use it // If exists text coming from crawler, use it
if (existingText && existingText.trim().length >= 100) { if (existingHtml && existingHtml.trim().length >= 100) {
text = existingText; let cleanedHtml = removeUnwantedElements(existingHtml, pageOptions);
text = await parseMarkdown(cleanedHtml);
html = existingHtml;
break; break;
} }
[text, html] = await attemptScraping(urlToScrap, scraper); [text, html] = await attemptScraping(urlToScrap, scraper);