From 86b8439844b97c7a35b679500a19e3aa24cf9178 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 13 May 2024 20:51:42 -0700 Subject: [PATCH 1/3] Nick: --- apps/api/src/scraper/WebScraper/crawler.ts | 5 +++-- apps/api/src/scraper/WebScraper/index.ts | 4 ++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 0248df2..ee29069 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -4,7 +4,7 @@ import { URL } from "url"; import { getLinksFromSitemap } from "./sitemap"; import async from "async"; import { Progress } from "../../lib/entities"; -import { scrapWithScrapingBee } from "./single_url"; +import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url"; import robotsParser from "robots-parser"; export class WebCrawler { @@ -196,7 +196,8 @@ export class WebCrawler { let content; // If it is the first link, fetch with scrapingbee if (this.visited.size === 1) { - content = await scrapWithScrapingBee(url, "load"); + const page = await scrapSingleUrl(url, {includeHtml: true}); + content = page.html; } else { const response = await axios.get(url); content = response.data; diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 7ef0a10..80f2d86 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -140,6 +140,7 @@ export class WebScraperDataProvider { generateImgAltText: this.generateImgAltText, }); let links = await crawler.start(inProgress, 5, this.limit, this.maxCrawledDepth); + if (this.returnOnlyUrls) { return this.returnOnlyUrlsResponse(links, inProgress); } @@ -163,6 +164,7 @@ export class WebScraperDataProvider { return this.returnOnlyUrlsResponse(links, inProgress); } + let documents = await this.processLinks(links, inProgress); return this.cacheAndFinalizeDocuments(documents, links); } @@ -237,6 +239,8 @@ export class WebScraperDataProvider { links: string[] ): Promise { await this.setCachedDocuments(documents, links); + documents = this.filterDocsExcludeInclude(documents); + documents = this.filterDepth(documents); documents = this.removeChildLinks(documents); return documents.splice(0, this.limit); } From 8101cbee37f6a54ce9b343f7081717874a323fe1 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 13 May 2024 21:02:47 -0700 Subject: [PATCH 2/3] Update index.ts --- apps/api/src/scraper/WebScraper/index.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 80f2d86..a3ea6b6 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -240,7 +240,6 @@ export class WebScraperDataProvider { ): Promise { await this.setCachedDocuments(documents, links); documents = this.filterDocsExcludeInclude(documents); - documents = this.filterDepth(documents); documents = this.removeChildLinks(documents); return documents.splice(0, this.limit); } From 26a092f780a0a9e59aba21f6c7a5a92291e178ef Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 13 May 2024 21:04:49 -0700 Subject: [PATCH 3/3] Update index.ts --- apps/api/src/scraper/WebScraper/index.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index a3ea6b6..5745fdb 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -239,7 +239,6 @@ export class WebScraperDataProvider { links: string[] ): Promise { await this.setCachedDocuments(documents, links); - documents = this.filterDocsExcludeInclude(documents); documents = this.removeChildLinks(documents); return documents.splice(0, this.limit); }