From 43cfcec326645bda17b04ecd5ec2372d3cb69d8d Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 19 Apr 2024 13:12:08 -0700 Subject: [PATCH] Nick: disabling in crawl and sitemap for now --- apps/api/src/scraper/WebScraper/index.ts | 12 ++++++------ .../api/src/scraper/WebScraper/utils/pdfProcessor.ts | 6 +++++- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 47fa05c..58144ba 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -88,7 +88,7 @@ export class WebScraperDataProvider { })); } - let pdfLinks = links.filter((link) => isUrlAPdf(link)); + let pdfLinks = links.filter((link) => isUrlAPdf({url: link, fastMode: true})); let pdfDocuments: Document[] = []; for (let pdfLink of pdfLinks) { const pdfContent = await fetchAndProcessPdf(pdfLink); @@ -98,7 +98,7 @@ export class WebScraperDataProvider { provider: "web-scraper" }); } - links = links.filter((link) => !isUrlAPdf(link)); + links = links.filter((link) => !isUrlAPdf({url: link, fastMode: true})); let documents = await this.convertUrlsToDocuments(links, inProgress); documents = await this.getSitemapData(this.urls[0], documents); @@ -157,7 +157,7 @@ export class WebScraperDataProvider { } if (this.mode === "single_urls") { - let pdfLinks = this.urls.filter((link) => isUrlAPdf(link)); + let pdfLinks = this.urls.filter((link) => isUrlAPdf({url: link, fastMode: false})); let pdfDocuments: Document[] = []; for (let pdfLink of pdfLinks) { const pdfContent = await fetchAndProcessPdf(pdfLink); @@ -169,7 +169,7 @@ export class WebScraperDataProvider { } let documents = await this.convertUrlsToDocuments( - this.urls.filter((link) => !isUrlAPdf(link)), + this.urls.filter((link) => !isUrlAPdf({url: link, fastMode: true})), inProgress ); @@ -193,7 +193,7 @@ export class WebScraperDataProvider { } if (this.mode === "sitemap") { let links = await getLinksFromSitemap(this.urls[0]); - let pdfLinks = links.filter((link) => isUrlAPdf(link)); + let pdfLinks = links.filter((link) => isUrlAPdf({url: link, fastMode: true})); let pdfDocuments: Document[] = []; for (let pdfLink of pdfLinks) { const pdfContent = await fetchAndProcessPdf(pdfLink); @@ -203,7 +203,7 @@ export class WebScraperDataProvider { provider: "web-scraper" }); } - links = links.filter((link) => !isUrlAPdf(link)); + links = links.filter((link) => !isUrlAPdf({url: link, fastMode: true})); let documents = await this.convertUrlsToDocuments( links.slice(0, this.limit), diff --git a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts index 75f80fb..2d0203f 100644 --- a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts +++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts @@ -112,11 +112,15 @@ async function processPdf(file: string){ // console.log(e); // }) -export async function isUrlAPdf(url: string): Promise { +export async function isUrlAPdf({url, fastMode}: {url: string, fastMode: boolean}): Promise { try { if (url.endsWith('.pdf')) { return true; } + // If fast mode is enabled, we skip the HEAD request and return false + if (fastMode) { + return false; + } const response = await fetch(url, { method: 'HEAD' }); const contentType = response.headers.get('Content-Type'); return contentType !== null && contentType.includes('application/pdf');