Nick: fixes
This commit is contained in:
parent
87570bdfa1
commit
d10f81e7fe
@ -71,8 +71,8 @@ export class WebScraperDataProvider {
|
|||||||
const batchUrls = urls.slice(i, i + this.concurrentRequests);
|
const batchUrls = urls.slice(i, i + this.concurrentRequests);
|
||||||
await Promise.all(
|
await Promise.all(
|
||||||
batchUrls.map(async (url, index) => {
|
batchUrls.map(async (url, index) => {
|
||||||
const existingText = allHtmls ? allHtmls[i + index] : "";
|
const existingHTML = allHtmls ? allHtmls[i + index] : "";
|
||||||
const result = await scrapSingleUrl(url, this.pageOptions, existingText);
|
const result = await scrapSingleUrl(url, this.pageOptions, existingHTML);
|
||||||
processedUrls++;
|
processedUrls++;
|
||||||
if (inProgress) {
|
if (inProgress) {
|
||||||
inProgress({
|
inProgress({
|
||||||
|
@ -107,7 +107,7 @@ export async function scrapWithPlaywright(url: string): Promise<string> {
|
|||||||
export async function scrapSingleUrl(
|
export async function scrapSingleUrl(
|
||||||
urlToScrap: string,
|
urlToScrap: string,
|
||||||
pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false },
|
pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false },
|
||||||
existingText: string = ""
|
existingHtml: string = ""
|
||||||
): Promise<Document> {
|
): Promise<Document> {
|
||||||
urlToScrap = urlToScrap.trim();
|
urlToScrap = urlToScrap.trim();
|
||||||
|
|
||||||
@ -199,8 +199,10 @@ export async function scrapSingleUrl(
|
|||||||
|
|
||||||
for (const scraper of scrapersInOrder) {
|
for (const scraper of scrapersInOrder) {
|
||||||
// If exists text coming from crawler, use it
|
// If the crawler already supplied HTML for this page, use it instead of scraping again
|
||||||
if (existingText && existingText.trim().length >= 100) {
|
if (existingHtml && existingHtml.trim().length >= 100) {
|
||||||
text = existingText;
|
let cleanedHtml = removeUnwantedElements(existingHtml, pageOptions);
|
||||||
|
text = await parseMarkdown(cleanedHtml);
|
||||||
|
html = existingHtml;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
[text, html] = await attemptScraping(urlToScrap, scraper);
|
[text, html] = await attemptScraping(urlToScrap, scraper);
|
||||||
|
Loading…
Reference in New Issue
Block a user