
Nick: fixes

Nicolas 2024-05-15 11:28:20 -07:00
parent 87570bdfa1
commit d10f81e7fe
2 changed files with 7 additions and 5 deletions


@@ -71,8 +71,8 @@ export class WebScraperDataProvider {
       const batchUrls = urls.slice(i, i + this.concurrentRequests);
       await Promise.all(
         batchUrls.map(async (url, index) => {
-          const existingText = allHtmls ? allHtmls[i + index] : "";
-          const result = await scrapSingleUrl(url, this.pageOptions, existingText);
+          const existingHTML = allHtmls ? allHtmls[i + index] : "";
+          const result = await scrapSingleUrl(url, this.pageOptions, existingHTML);
           processedUrls++;
           if (inProgress) {
             inProgress({
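
The hunk above only renames the local variable (existingText becomes existingHTML) at the point where the crawler's pre-fetched HTML is handed to scrapSingleUrl. For context, here is a minimal standalone sketch of the batching pattern around it, assuming a generic fetchDocument callback in place of the provider's scrapSingleUrl(url, this.pageOptions, existingHTML) call and an illustrative Document shape:

// Illustrative sketch only; Document shape and fetchDocument are assumptions.
type Document = { url: string; markdown: string; html: string };

async function scrapeInBatches(
  urls: string[],
  allHtmls: string[] | undefined,
  concurrentRequests: number,
  fetchDocument: (url: string, existingHTML: string) => Promise<Document>
): Promise<Document[]> {
  const results: Document[] = [];
  for (let i = 0; i < urls.length; i += concurrentRequests) {
    const batchUrls = urls.slice(i, i + concurrentRequests);
    const batch = await Promise.all(
      batchUrls.map((url, index) => {
        // Reuse HTML the crawler already fetched so the page is not hit twice.
        const existingHTML = allHtmls ? allHtmls[i + index] : "";
        return fetchDocument(url, existingHTML);
      })
    );
    results.push(...batch);
  }
  return results;
}

Slicing urls by concurrentRequests keeps at most that many requests in flight per batch, and a non-empty entry in allHtmls lets a page that was already fetched during crawling skip a second network round trip.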


@@ -107,7 +107,7 @@ export async function scrapWithPlaywright(url: string): Promise<string> {
 export async function scrapSingleUrl(
   urlToScrap: string,
   pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false },
-  existingText: string = ""
+  existingHtml: string = ""
 ): Promise<Document> {
   urlToScrap = urlToScrap.trim();
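
The rename from existingText to existingHtml signals that callers now pass raw HTML rather than pre-extracted text. A hedged call-site sketch, where the URL and HTML strings are placeholders and scrapSingleUrl is assumed to be in scope:

// Placeholder values; the third argument is now raw HTML from the crawler,
// not pre-extracted text. Assumes scrapSingleUrl is imported from this module.
const doc = await scrapSingleUrl(
  "https://example.com/docs",
  { onlyMainContent: true, includeHtml: false },
  "<html><body><main>cached page body</main></body></html>"
);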
@@ -199,8 +199,10 @@ export async function scrapSingleUrl(
   for (const scraper of scrapersInOrder) {
     // If exists text coming from crawler, use it
-    if (existingText && existingText.trim().length >= 100) {
-      text = existingText;
+    if (existingHtml && existingHtml.trim().length >= 100) {
+      let cleanedHtml = removeUnwantedElements(existingHtml, pageOptions);
+      text = await parseMarkdown(cleanedHtml);
+      html = existingHtml;
       break;
     }
     [text, html] = await attemptScraping(urlToScrap, scraper);
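
The behavioral change is in this last hunk: crawler-supplied HTML is no longer copied into text verbatim. It is first cleaned and converted to markdown, and the raw HTML is kept alongside it before breaking out of the scraper loop. Below is a minimal sketch of that early-exit path, with cleanHtml and htmlToMarkdown standing in for removeUnwantedElements and parseMarkdown (their exact signatures here are assumptions):

// Sketch of the reuse-existing-HTML path; helper signatures are assumed.
async function fromExistingHtml(
  existingHtml: string,
  cleanHtml: (html: string) => string,
  htmlToMarkdown: (html: string) => Promise<string>
): Promise<{ text: string; html: string } | null> {
  // Only trust crawler-provided HTML if it is substantial (>= 100 chars).
  if (!existingHtml || existingHtml.trim().length < 100) {
    return null; // caller falls through to attemptScraping(urlToScrap, scraper)
  }
  const cleanedHtml = cleanHtml(existingHtml);     // e.g. strip unwanted elements
  const text = await htmlToMarkdown(cleanedHtml);  // markdown becomes `text`
  return { text, html: existingHtml };             // keep the raw HTML as `html`
}

Returning null mirrors the original control flow, where HTML that is missing or too short simply lets the loop fall through to attemptScraping with the next scraper.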