From 8eb2e95f19b4f5389f8447ccbd961ce53dc1391a Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Mon, 13 May 2024 16:13:10 -0300 Subject: [PATCH] Cleaned up --- apps/api/src/scraper/WebScraper/index.ts | 26 +------------- apps/api/src/scraper/WebScraper/single_url.ts | 23 +++++++++---- .../scraper/WebScraper/utils/pdfProcessor.ts | 34 +------------------ 3 files changed, 18 insertions(+), 65 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index de941e0..1d9656e 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -10,7 +10,7 @@ import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap"; import { WebCrawler } from "./crawler"; import { getValue, setValue } from "../../services/redis"; import { getImageDescription } from "./utils/imageDescription"; -import { fetchAndProcessPdf, isUrlAPdf } from "./utils/pdfProcessor"; +import { fetchAndProcessPdf } from "./utils/pdfProcessor"; import { replaceImgPathsWithAbsolutePaths, replacePathsWithAbsolutePaths, @@ -144,11 +144,7 @@ export class WebScraperDataProvider { return this.returnOnlyUrlsResponse(links, inProgress); } - // const [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links); - // const pdfDocuments = await this.fetchPdfDocuments(pdfLinks); - let documents = await this.processLinks(links, inProgress); - // documents.push(...pdfDocuments); return this.cacheAndFinalizeDocuments(documents, links); } @@ -156,11 +152,8 @@ export class WebScraperDataProvider { inProgress?: (progress: Progress) => void ): Promise { const links = this.urls; - // const [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links); - // const pdfDocuments = await this.fetchPdfDocuments(pdfLinks); let documents = await this.processLinks(links, inProgress); - // documents.push(...pdfDocuments); return documents; } @@ -172,11 +165,7 @@ export class WebScraperDataProvider { return this.returnOnlyUrlsResponse(links, inProgress); } - // let [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links); - // const pdfDocuments = await this.fetchPdfDocuments(pdfLinks); - let documents = await this.processLinks(links, inProgress); - // documents.push(...pdfDocuments); return this.cacheAndFinalizeDocuments(documents, links); } @@ -233,19 +222,6 @@ export class WebScraperDataProvider { ); } - private async splitPdfLinks(links: string[]): Promise<[string[], string[]]> { - const checks = links.map(async (link) => ({ - link, - isPdf: await isUrlAPdf({ url: link }) - })); - - const results = await Promise.all(checks); - const pdfLinks = results.filter(result => result.isPdf).map(result => result.link); - const notPdfLinks = results.filter(result => !result.isPdf).map(result => result.link); - - return [pdfLinks, notPdfLinks]; - } - private applyPathReplacements(documents: Document[]): Document[] { return this.replaceAllPathsWithAbsolutePaths ? replacePathsWithAbsolutePaths(documents) diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 33d8518..baf465e 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -67,13 +67,11 @@ export async function scrapWithScrapingBee( ); return ""; } - // Check the content type of the response + const contentType = response.headers['content-type']; if (contentType && contentType.includes('application/pdf')) { - // Handle PDF content type return fetchAndProcessPdf(url); } else { - // Assume the content is text and decode it const decoder = new TextDecoder(); const text = decoder.decode(response.data); return text; @@ -104,9 +102,14 @@ export async function scrapWithPlaywright(url: string): Promise { return ""; } - const data = await response.json(); - const html = data.content; - return html ?? ""; + const contentType = response.headers['content-type']; + if (contentType && contentType.includes('application/pdf')) { + return fetchAndProcessPdf(url); + } else { + const data = await response.json(); + const html = data.content; + return html ?? ""; + } } catch (error) { console.error(`Error scraping with Puppeteer: ${error}`); return ""; @@ -173,7 +176,13 @@ export async function scrapSingleUrl( ); return ""; } - text = await response.text(); + + const contentType = response.headers['content-type']; + if (contentType && contentType.includes('application/pdf')) { + return fetchAndProcessPdf(url); + } else { + text = await response.text(); + } } catch (error) { console.error(`Error scraping URL: ${error}`); return ""; diff --git a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts index a72de30..ba92fd4 100644 --- a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts +++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts @@ -105,36 +105,4 @@ async function processPdf(file: string) { const fileContent = fs.readFileSync(file); const data = await pdf(fileContent); return data.text; -} -/** - * Check if a url is a pdf - * @param url The url to check - * @param fastMode If true, the function will return false if the url is does not end with .pdf - * @returns A promise that resolves to true if the url is a pdf, false otherwise - */ -export async function isUrlAPdf({ - url, - fastMode = false, -}: { - url: string; - fastMode?: boolean; -}): Promise { - try { - if (url.endsWith(".pdf")) { - return true; - } - // If fast mode is enabled, we skip the HEAD request and return false - if (fastMode) { - return false; - } - const before = Date.now(); - const response = await axios.head(url); - const after = Date.now(); - console.log(`${after - before}ms - HEAD Request for ${url}`); - const contentType = response.headers['content-type']; - return contentType.includes('application/pdf'); - } catch (error) { - // console.error("Error making HEAD request:", error); - return false; - } -} +} \ No newline at end of file