From 140529c6090ece93ba60cdb3ce360f9a28b8ffb7 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 19 Apr 2024 13:05:21 -0700 Subject: [PATCH 01/12] Nick: fixes pdfs not found --- apps/api/src/scraper/WebScraper/index.ts | 14 +++++------ .../scraper/WebScraper/utils/pdfProcessor.ts | 23 ++++++++++++++++++- 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index c2146be..47fa05c 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -5,7 +5,7 @@ import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap"; import { WebCrawler } from "./crawler"; import { getValue, setValue } from "../../services/redis"; import { getImageDescription } from "./utils/gptVision"; -import { fetchAndProcessPdf } from "./utils/pdfProcessor"; +import { fetchAndProcessPdf, isUrlAPdf } from "./utils/pdfProcessor"; import { replaceImgPathsWithAbsolutePaths, replacePathsWithAbsolutePaths } from "./utils/replacePaths"; @@ -88,7 +88,7 @@ export class WebScraperDataProvider { })); } - let pdfLinks = links.filter((link) => link.endsWith(".pdf")); + let pdfLinks = links.filter((link) => isUrlAPdf(link)); let pdfDocuments: Document[] = []; for (let pdfLink of pdfLinks) { const pdfContent = await fetchAndProcessPdf(pdfLink); @@ -98,7 +98,7 @@ export class WebScraperDataProvider { provider: "web-scraper" }); } - links = links.filter((link) => !link.endsWith(".pdf")); + links = links.filter((link) => !isUrlAPdf(link)); let documents = await this.convertUrlsToDocuments(links, inProgress); documents = await this.getSitemapData(this.urls[0], documents); @@ -157,7 +157,7 @@ export class WebScraperDataProvider { } if (this.mode === "single_urls") { - let pdfLinks = this.urls.filter((link) => link.endsWith(".pdf")); + let pdfLinks = this.urls.filter((link) => isUrlAPdf(link)); let pdfDocuments: Document[] = []; for (let pdfLink of pdfLinks) { const pdfContent = await fetchAndProcessPdf(pdfLink); @@ -169,7 +169,7 @@ export class WebScraperDataProvider { } let documents = await this.convertUrlsToDocuments( - this.urls.filter((link) => !link.endsWith(".pdf")), + this.urls.filter((link) => !isUrlAPdf(link)), inProgress ); @@ -193,7 +193,7 @@ export class WebScraperDataProvider { } if (this.mode === "sitemap") { let links = await getLinksFromSitemap(this.urls[0]); - let pdfLinks = links.filter((link) => link.endsWith(".pdf")); + let pdfLinks = links.filter((link) => isUrlAPdf(link)); let pdfDocuments: Document[] = []; for (let pdfLink of pdfLinks) { const pdfContent = await fetchAndProcessPdf(pdfLink); @@ -203,7 +203,7 @@ export class WebScraperDataProvider { provider: "web-scraper" }); } - links = links.filter((link) => !link.endsWith(".pdf")); + links = links.filter((link) => !isUrlAPdf(link)); let documents = await this.convertUrlsToDocuments( links.slice(0, this.limit), diff --git a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts index fb08d9c..75f80fb 100644 --- a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts +++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts @@ -35,6 +35,7 @@ async function downloadPdf(url: string): Promise { } export async function processPdfToText(filePath: string): Promise { + let content = ""; if (process.env.LLAMAPARSE_API_KEY) { @@ -105,4 +106,24 @@ async function processPdf(file: string){ const fileContent = fs.readFileSync(file); const data = await pdf(fileContent); return data.text; -} \ No newline at end of file +} + +// fetchAndProcessPdf("https://www.fda.gov/media/167973/download?attachment").then((e)=>{ +// console.log(e); +// }) + +export async function isUrlAPdf(url: string): Promise { + try { + if (url.endsWith('.pdf')) { + return true; + } + const response = await fetch(url, { method: 'HEAD' }); + const contentType = response.headers.get('Content-Type'); + return contentType !== null && contentType.includes('application/pdf'); + } catch (error) { + console.error('Error making HEAD request:', error); + return false; + } +} + + From 43cfcec326645bda17b04ecd5ec2372d3cb69d8d Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 19 Apr 2024 13:12:08 -0700 Subject: [PATCH 02/12] Nick: disabling in crawl and sitemap for now --- apps/api/src/scraper/WebScraper/index.ts | 12 ++++++------ .../api/src/scraper/WebScraper/utils/pdfProcessor.ts | 6 +++++- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 47fa05c..58144ba 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -88,7 +88,7 @@ export class WebScraperDataProvider { })); } - let pdfLinks = links.filter((link) => isUrlAPdf(link)); + let pdfLinks = links.filter((link) => isUrlAPdf({url: link, fastMode: true})); let pdfDocuments: Document[] = []; for (let pdfLink of pdfLinks) { const pdfContent = await fetchAndProcessPdf(pdfLink); @@ -98,7 +98,7 @@ export class WebScraperDataProvider { provider: "web-scraper" }); } - links = links.filter((link) => !isUrlAPdf(link)); + links = links.filter((link) => !isUrlAPdf({url: link, fastMode: true})); let documents = await this.convertUrlsToDocuments(links, inProgress); documents = await this.getSitemapData(this.urls[0], documents); @@ -157,7 +157,7 @@ export class WebScraperDataProvider { } if (this.mode === "single_urls") { - let pdfLinks = this.urls.filter((link) => isUrlAPdf(link)); + let pdfLinks = this.urls.filter((link) => isUrlAPdf({url: link, fastMode: false})); let pdfDocuments: Document[] = []; for (let pdfLink of pdfLinks) { const pdfContent = await fetchAndProcessPdf(pdfLink); @@ -169,7 +169,7 @@ export class WebScraperDataProvider { } let documents = await this.convertUrlsToDocuments( - this.urls.filter((link) => !isUrlAPdf(link)), + this.urls.filter((link) => !isUrlAPdf({url: link, fastMode: true})), inProgress ); @@ -193,7 +193,7 @@ export class WebScraperDataProvider { } if (this.mode === "sitemap") { let links = await getLinksFromSitemap(this.urls[0]); - let pdfLinks = links.filter((link) => isUrlAPdf(link)); + let pdfLinks = links.filter((link) => isUrlAPdf({url: link, fastMode: true})); let pdfDocuments: Document[] = []; for (let pdfLink of pdfLinks) { const pdfContent = await fetchAndProcessPdf(pdfLink); @@ -203,7 +203,7 @@ export class WebScraperDataProvider { provider: "web-scraper" }); } - links = links.filter((link) => !isUrlAPdf(link)); + links = links.filter((link) => !isUrlAPdf({url: link, fastMode: true})); let documents = await this.convertUrlsToDocuments( links.slice(0, this.limit), diff --git a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts index 75f80fb..2d0203f 100644 --- a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts +++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts @@ -112,11 +112,15 @@ async function processPdf(file: string){ // console.log(e); // }) -export async function isUrlAPdf(url: string): Promise { +export async function isUrlAPdf({url, fastMode}: {url: string, fastMode: boolean}): Promise { try { if (url.endsWith('.pdf')) { return true; } + // If fast mode is enabled, we skip the HEAD request and return false + if (fastMode) { + return false; + } const response = await fetch(url, { method: 'HEAD' }); const contentType = response.headers.get('Content-Type'); return contentType !== null && contentType.includes('application/pdf'); From c5cb268b61cd9b1fe7035b8d6a72bc80cfe3d4e2 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 19 Apr 2024 13:13:42 -0700 Subject: [PATCH 03/12] Update pdfProcessor.ts --- .../scraper/WebScraper/utils/pdfProcessor.ts | 42 ++++++++++--------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts index 2d0203f..80476e9 100644 --- a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts +++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts @@ -19,8 +19,8 @@ export async function fetchAndProcessPdf(url: string): Promise { async function downloadPdf(url: string): Promise { const response = await axios({ url, - method: 'GET', - responseType: 'stream', + method: "GET", + responseType: "stream", }); const tempFilePath = path.join(os.tmpdir(), `tempPdf-${Date.now()}.pdf`); @@ -29,13 +29,12 @@ async function downloadPdf(url: string): Promise { response.data.pipe(writer); return new Promise((resolve, reject) => { - writer.on('finish', () => resolve(tempFilePath)); - writer.on('error', reject); + writer.on("finish", () => resolve(tempFilePath)); + writer.on("error", reject); }); } export async function processPdfToText(filePath: string): Promise { - let content = ""; if (process.env.LLAMAPARSE_API_KEY) { @@ -102,32 +101,37 @@ export async function processPdfToText(filePath: string): Promise { return content; } -async function processPdf(file: string){ +async function processPdf(file: string) { const fileContent = fs.readFileSync(file); const data = await pdf(fileContent); return data.text; } - -// fetchAndProcessPdf("https://www.fda.gov/media/167973/download?attachment").then((e)=>{ -// console.log(e); -// }) - -export async function isUrlAPdf({url, fastMode}: {url: string, fastMode: boolean}): Promise { +/** + * Check if a url is a pdf + * @param url The url to check + * @param fastMode If true, the function will return false if the url is does not end with .pdf + * @returns A promise that resolves to true if the url is a pdf, false otherwise + */ +export async function isUrlAPdf({ + url, + fastMode, +}: { + url: string; + fastMode: boolean; +}): Promise { try { - if (url.endsWith('.pdf')) { + if (url.endsWith(".pdf")) { return true; } // If fast mode is enabled, we skip the HEAD request and return false if (fastMode) { return false; } - const response = await fetch(url, { method: 'HEAD' }); - const contentType = response.headers.get('Content-Type'); - return contentType !== null && contentType.includes('application/pdf'); + const response = await fetch(url, { method: "HEAD" }); + const contentType = response.headers.get("Content-Type"); + return contentType !== null && contentType.includes("application/pdf"); } catch (error) { - console.error('Error making HEAD request:', error); + console.error("Error making HEAD request:", error); return false; } } - - From 5b937991491d24ebd0411e08ecd9601dc309a87e Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 19 Apr 2024 15:13:17 -0700 Subject: [PATCH 04/12] Nick: a bit faster --- apps/api/src/scraper/WebScraper/index.ts | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 58144ba..0dc68b0 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -157,19 +157,23 @@ export class WebScraperDataProvider { } if (this.mode === "single_urls") { - let pdfLinks = this.urls.filter((link) => isUrlAPdf({url: link, fastMode: false})); let pdfDocuments: Document[] = []; - for (let pdfLink of pdfLinks) { - const pdfContent = await fetchAndProcessPdf(pdfLink); - pdfDocuments.push({ - content: pdfContent, - metadata: { sourceURL: pdfLink }, - provider: "web-scraper" - }); + let nonPdfUrls: string[] = []; + for (let url of this.urls) { + if (isUrlAPdf({url: url, fastMode: false})) { + const pdfContent = await fetchAndProcessPdf(url); + pdfDocuments.push({ + content: pdfContent, + metadata: { sourceURL: url }, + provider: "web-scraper" + }); + } else { + nonPdfUrls.push(url); + } } let documents = await this.convertUrlsToDocuments( - this.urls.filter((link) => !isUrlAPdf({url: link, fastMode: true})), + nonPdfUrls, inProgress ); From 84cebf618bb316f7494b2c0c350a22d66528f698 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 19 Apr 2024 15:36:00 -0700 Subject: [PATCH 05/12] Nick: --- apps/api/src/__tests__/e2e/index.test.ts | 3 +++ apps/api/src/scraper/WebScraper/index.ts | 13 ++++++++----- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/apps/api/src/__tests__/e2e/index.test.ts b/apps/api/src/__tests__/e2e/index.test.ts index 554453b..9e7a75f 100644 --- a/apps/api/src/__tests__/e2e/index.test.ts +++ b/apps/api/src/__tests__/e2e/index.test.ts @@ -42,6 +42,7 @@ describe('E2E Tests for API Routes', () => { .set('Authorization', `Bearer this_is_just_a_preview_token`) .set('Content-Type', 'application/json') .send({ url: 'https://firecrawl.dev' }); + expect(response.statusCode).toBe(200); }, 10000); // 10 seconds timeout @@ -51,6 +52,8 @@ describe('E2E Tests for API Routes', () => { .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) .set('Content-Type', 'application/json') .send({ url: 'https://firecrawl.dev' }); + await new Promise((r) => setTimeout(r, 2000)); + expect(response.statusCode).toBe(200); expect(response.body).toHaveProperty('data'); expect(response.body.data).toHaveProperty('content'); diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 0dc68b0..9d9a236 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -88,7 +88,7 @@ export class WebScraperDataProvider { })); } - let pdfLinks = links.filter((link) => isUrlAPdf({url: link, fastMode: true})); + let pdfLinks = links.filter(async (link) => await isUrlAPdf({url: link, fastMode: true})); let pdfDocuments: Document[] = []; for (let pdfLink of pdfLinks) { const pdfContent = await fetchAndProcessPdf(pdfLink); @@ -98,7 +98,7 @@ export class WebScraperDataProvider { provider: "web-scraper" }); } - links = links.filter((link) => !isUrlAPdf({url: link, fastMode: true})); + links = links.filter(async (link) => !await isUrlAPdf({url: link, fastMode: true})); let documents = await this.convertUrlsToDocuments(links, inProgress); documents = await this.getSitemapData(this.urls[0], documents); @@ -157,10 +157,12 @@ export class WebScraperDataProvider { } if (this.mode === "single_urls") { + console.log("Single urls mode"); let pdfDocuments: Document[] = []; let nonPdfUrls: string[] = []; for (let url of this.urls) { - if (isUrlAPdf({url: url, fastMode: false})) { + console.log("Checking if url is a pdf", url); + if (await isUrlAPdf({url: url, fastMode: false})) { const pdfContent = await fetchAndProcessPdf(url); pdfDocuments.push({ content: pdfContent, @@ -169,6 +171,7 @@ export class WebScraperDataProvider { }); } else { nonPdfUrls.push(url); + console.log("Fetching and processing url", url); } } @@ -197,7 +200,7 @@ export class WebScraperDataProvider { } if (this.mode === "sitemap") { let links = await getLinksFromSitemap(this.urls[0]); - let pdfLinks = links.filter((link) => isUrlAPdf({url: link, fastMode: true})); + let pdfLinks = links.filter(async (link) => await isUrlAPdf({url: link, fastMode: true})); let pdfDocuments: Document[] = []; for (let pdfLink of pdfLinks) { const pdfContent = await fetchAndProcessPdf(pdfLink); @@ -207,7 +210,7 @@ export class WebScraperDataProvider { provider: "web-scraper" }); } - links = links.filter((link) => !isUrlAPdf({url: link, fastMode: true})); + links = links.filter(async (link) => !await isUrlAPdf({url: link, fastMode: true})); let documents = await this.convertUrlsToDocuments( links.slice(0, this.limit), From f1dd97af0f0c98dd46b3355ccd488c420acdb97e Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 19 Apr 2024 15:37:27 -0700 Subject: [PATCH 06/12] Update index.ts --- apps/api/src/scraper/WebScraper/index.ts | 40 ++++++++++++++---------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 9d9a236..fe291fb 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -6,8 +6,10 @@ import { WebCrawler } from "./crawler"; import { getValue, setValue } from "../../services/redis"; import { getImageDescription } from "./utils/gptVision"; import { fetchAndProcessPdf, isUrlAPdf } from "./utils/pdfProcessor"; -import { replaceImgPathsWithAbsolutePaths, replacePathsWithAbsolutePaths } from "./utils/replacePaths"; - +import { + replaceImgPathsWithAbsolutePaths, + replacePathsWithAbsolutePaths, +} from "./utils/replacePaths"; export class WebScraperDataProvider { private urls: string[] = [""]; @@ -36,8 +38,6 @@ export class WebScraperDataProvider { ): Promise { const totalUrls = urls.length; let processedUrls = 0; - console.log("Converting urls to documents"); - console.log("Total urls", urls); const results: (Document | null)[] = new Array(urls.length).fill(null); for (let i = 0; i < urls.length; i += this.concurrentRequests) { const batchUrls = urls.slice(i, i + this.concurrentRequests); @@ -88,17 +88,21 @@ export class WebScraperDataProvider { })); } - let pdfLinks = links.filter(async (link) => await isUrlAPdf({url: link, fastMode: true})); + let pdfLinks = links.filter( + async (link) => await isUrlAPdf({ url: link, fastMode: true }) + ); let pdfDocuments: Document[] = []; for (let pdfLink of pdfLinks) { const pdfContent = await fetchAndProcessPdf(pdfLink); pdfDocuments.push({ content: pdfContent, metadata: { sourceURL: pdfLink }, - provider: "web-scraper" + provider: "web-scraper", }); } - links = links.filter(async (link) => !await isUrlAPdf({url: link, fastMode: true})); + links = links.filter( + async (link) => !(await isUrlAPdf({ url: link, fastMode: true })) + ); let documents = await this.convertUrlsToDocuments(links, inProgress); documents = await this.getSitemapData(this.urls[0], documents); @@ -157,21 +161,18 @@ export class WebScraperDataProvider { } if (this.mode === "single_urls") { - console.log("Single urls mode"); let pdfDocuments: Document[] = []; let nonPdfUrls: string[] = []; for (let url of this.urls) { - console.log("Checking if url is a pdf", url); - if (await isUrlAPdf({url: url, fastMode: false})) { + if (await isUrlAPdf({ url: url, fastMode: false })) { const pdfContent = await fetchAndProcessPdf(url); pdfDocuments.push({ content: pdfContent, metadata: { sourceURL: url }, - provider: "web-scraper" + provider: "web-scraper", }); } else { nonPdfUrls.push(url); - console.log("Fetching and processing url", url); } } @@ -200,17 +201,21 @@ export class WebScraperDataProvider { } if (this.mode === "sitemap") { let links = await getLinksFromSitemap(this.urls[0]); - let pdfLinks = links.filter(async (link) => await isUrlAPdf({url: link, fastMode: true})); + let pdfLinks = links.filter( + async (link) => await isUrlAPdf({ url: link, fastMode: true }) + ); let pdfDocuments: Document[] = []; for (let pdfLink of pdfLinks) { const pdfContent = await fetchAndProcessPdf(pdfLink); pdfDocuments.push({ content: pdfContent, metadata: { sourceURL: pdfLink }, - provider: "web-scraper" + provider: "web-scraper", }); } - links = links.filter(async (link) => !await isUrlAPdf({url: link, fastMode: true})); + links = links.filter( + async (link) => !(await isUrlAPdf({ url: link, fastMode: true })) + ); let documents = await this.convertUrlsToDocuments( links.slice(0, this.limit), @@ -377,8 +382,9 @@ export class WebScraperDataProvider { this.limit = options.crawlerOptions?.limit ?? 10000; this.generateImgAltText = options.crawlerOptions?.generateImgAltText ?? false; - this.pageOptions = options.pageOptions ?? {onlyMainContent: false}; - this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false; + this.pageOptions = options.pageOptions ?? { onlyMainContent: false }; + this.replaceAllPathsWithAbsolutePaths = + options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false; //! @nicolas, for some reason this was being injected and breakign everything. Don't have time to find source of the issue so adding this check this.excludes = this.excludes.filter((item) => item !== ""); From f8b207793f6f48d3b974b43e27425477407b28a3 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Mon, 29 Apr 2024 15:15:32 -0300 Subject: [PATCH 07/12] changed the request to do a HEAD to check for a PDF instead --- apps/api/src/__tests__/e2e/index.test.ts | 39 ++++++++++++++++++ apps/api/src/scraper/WebScraper/index.ts | 40 ++++++++++++------- .../scraper/WebScraper/utils/pdfProcessor.ts | 12 +++--- 3 files changed, 70 insertions(+), 21 deletions(-) diff --git a/apps/api/src/__tests__/e2e/index.test.ts b/apps/api/src/__tests__/e2e/index.test.ts index 9e7a75f..a652619 100644 --- a/apps/api/src/__tests__/e2e/index.test.ts +++ b/apps/api/src/__tests__/e2e/index.test.ts @@ -170,6 +170,45 @@ describe('E2E Tests for API Routes', () => { expect(completedResponse.body.data[0]).toHaveProperty('metadata'); expect(completedResponse.body.data[0].content).toContain('🔥 FireCrawl'); }, 60000); // 60 seconds + + // it('should return a successful response for a valid crawl job with PDF content', async () => { + + // }); + + it('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension', async () => { + const crawlResponse = await request(TEST_URL) + .post('/v0/crawl') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://arxiv.org/abs/astro-ph/9301001', crawlerOptions: { limit: 5 }}); + expect(crawlResponse.statusCode).toBe(200); + + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('status'); + expect(response.body.status).toBe('active'); + + // wait for 30 seconds + await new Promise((r) => setTimeout(r, 60000)); + + const completedResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`); + console.log(completedResponse.body.data) + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty('status'); + expect(completedResponse.body.status).toBe('completed'); + expect(completedResponse.body).toHaveProperty('data'); + expect(completedResponse.body.data.length).toBeGreaterThan(1); + expect(completedResponse.body.data[0]).toHaveProperty('content'); + expect(completedResponse.body.data[0]).toHaveProperty('markdown'); + expect(completedResponse.body.data[0]).toHaveProperty('metadata'); + expect(completedResponse.body.data[0].content).toContain('The Peculiar Balmer Line Profiles of OQ 208'); + }, 90000); // 60 seconds + + }); describe('GET /is-production', () => { diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index fe291fb..2b02076 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -88,9 +88,17 @@ export class WebScraperDataProvider { })); } - let pdfLinks = links.filter( - async (link) => await isUrlAPdf({ url: link, fastMode: true }) - ); + let pdfLinks = []; + let notPdfLinks = []; + for (let link of links) { + if (await isUrlAPdf({ url: link })) { + pdfLinks.push(link); + } else { + notPdfLinks.push(link); + } + } + + console.log("crawl", {pdfLinks}) let pdfDocuments: Document[] = []; for (let pdfLink of pdfLinks) { const pdfContent = await fetchAndProcessPdf(pdfLink); @@ -100,11 +108,8 @@ export class WebScraperDataProvider { provider: "web-scraper", }); } - links = links.filter( - async (link) => !(await isUrlAPdf({ url: link, fastMode: true })) - ); - let documents = await this.convertUrlsToDocuments(links, inProgress); + let documents = await this.convertUrlsToDocuments(notPdfLinks, inProgress); documents = await this.getSitemapData(this.urls[0], documents); if (this.replaceAllPathsWithAbsolutePaths) { @@ -164,7 +169,7 @@ export class WebScraperDataProvider { let pdfDocuments: Document[] = []; let nonPdfUrls: string[] = []; for (let url of this.urls) { - if (await isUrlAPdf({ url: url, fastMode: false })) { + if (await isUrlAPdf({ url: url })) { const pdfContent = await fetchAndProcessPdf(url); pdfDocuments.push({ content: pdfContent, @@ -201,9 +206,17 @@ export class WebScraperDataProvider { } if (this.mode === "sitemap") { let links = await getLinksFromSitemap(this.urls[0]); - let pdfLinks = links.filter( - async (link) => await isUrlAPdf({ url: link, fastMode: true }) - ); + + let pdfLinks = []; + let nonPdfLinks = []; + for (let link of links) { + if (await isUrlAPdf({ url: link })) { + pdfLinks.push(link); + } else { + nonPdfLinks.push(link); + } + } + let pdfDocuments: Document[] = []; for (let pdfLink of pdfLinks) { const pdfContent = await fetchAndProcessPdf(pdfLink); @@ -213,12 +226,9 @@ export class WebScraperDataProvider { provider: "web-scraper", }); } - links = links.filter( - async (link) => !(await isUrlAPdf({ url: link, fastMode: true })) - ); let documents = await this.convertUrlsToDocuments( - links.slice(0, this.limit), + nonPdfLinks.slice(0, this.limit), inProgress ); diff --git a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts index 80476e9..67fb134 100644 --- a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts +++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts @@ -114,10 +114,10 @@ async function processPdf(file: string) { */ export async function isUrlAPdf({ url, - fastMode, + fastMode = false, }: { url: string; - fastMode: boolean; + fastMode?: boolean; }): Promise { try { if (url.endsWith(".pdf")) { @@ -127,11 +127,11 @@ export async function isUrlAPdf({ if (fastMode) { return false; } - const response = await fetch(url, { method: "HEAD" }); - const contentType = response.headers.get("Content-Type"); - return contentType !== null && contentType.includes("application/pdf"); + const response = await axios.head(url); + const contentType = response.headers['content-type']; + return contentType.includes('application/pdf'); } catch (error) { - console.error("Error making HEAD request:", error); + // console.error("Error making HEAD request:", error); return false; } } From 35480bd2ad4554c64577d01e690922d7d72d974f Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 30 Apr 2024 10:40:32 -0300 Subject: [PATCH 08/12] Update index.test.ts --- apps/api/src/__tests__/e2e/index.test.ts | 49 ++++++++++++++++++++---- 1 file changed, 41 insertions(+), 8 deletions(-) diff --git a/apps/api/src/__tests__/e2e/index.test.ts b/apps/api/src/__tests__/e2e/index.test.ts index a652619..0ceca19 100644 --- a/apps/api/src/__tests__/e2e/index.test.ts +++ b/apps/api/src/__tests__/e2e/index.test.ts @@ -61,6 +61,36 @@ describe('E2E Tests for API Routes', () => { expect(response.body.data).toHaveProperty('metadata'); expect(response.body.data.content).toContain('🔥 FireCrawl'); }, 30000); // 30 seconds timeout + + it('should return a successful response for a valid scrape with PDF file', async () => { + const response = await request(TEST_URL) + .post('/v0/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf' }); + await new Promise((r) => setTimeout(r, 6000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + expect(response.body.data).toHaveProperty('content'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy'); + }, 30000); // 30 seconds + + it('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => { + const response = await request(TEST_URL) + .post('/v0/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001' }); + await new Promise((r) => setTimeout(r, 6000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + expect(response.body.data).toHaveProperty('content'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy'); + }, 30000); // 30 seconds }); describe('POST /v0/crawl', () => { @@ -180,7 +210,7 @@ describe('E2E Tests for API Routes', () => { .post('/v0/crawl') .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) .set('Content-Type', 'application/json') - .send({ url: 'https://arxiv.org/abs/astro-ph/9301001', crawlerOptions: { limit: 5 }}); + .send({ url: 'https://arxiv.org/abs/astro-ph/9301001', crawlerOptions: { limit: 10, excludes: [ 'list/*', 'login', 'abs/*', 'static/*', 'about/*', 'archive/*' ] }}); expect(crawlResponse.statusCode).toBe(200); const response = await request(TEST_URL) @@ -191,22 +221,25 @@ describe('E2E Tests for API Routes', () => { expect(response.body.status).toBe('active'); // wait for 30 seconds - await new Promise((r) => setTimeout(r, 60000)); + await new Promise((r) => setTimeout(r, 30000)); const completedResponse = await request(TEST_URL) .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`); - console.log(completedResponse.body.data) + expect(completedResponse.statusCode).toBe(200); expect(completedResponse.body).toHaveProperty('status'); expect(completedResponse.body.status).toBe('completed'); expect(completedResponse.body).toHaveProperty('data'); expect(completedResponse.body.data.length).toBeGreaterThan(1); - expect(completedResponse.body.data[0]).toHaveProperty('content'); - expect(completedResponse.body.data[0]).toHaveProperty('markdown'); - expect(completedResponse.body.data[0]).toHaveProperty('metadata'); - expect(completedResponse.body.data[0].content).toContain('The Peculiar Balmer Line Profiles of OQ 208'); - }, 90000); // 60 seconds + expect(completedResponse.body.data).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + content: expect.stringContaining('asymmetries might represent, for instance, preferred source orientations to our line of sight.') + }) + ]) + ); + }, 60000); // 60 seconds }); From f4348024c61e9ce15feeb0928d4d87a91a3f352e Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Mon, 13 May 2024 09:13:42 -0300 Subject: [PATCH 09/12] Added check during scraping to deal with pdfs Checks if the URL is a PDF during the scraping process (single_url.ts). TODO: Run integration tests - Does this strat affect the running time? ps. Some comments need to be removed if we decide to proceed with this strategy. --- .../src/__tests__/e2e_withAuth/index.test.ts | 12 +++----- apps/api/src/scraper/WebScraper/index.ts | 28 ++++++++++++++++++- apps/api/src/scraper/WebScraper/single_url.ts | 15 ++++++++-- .../scraper/WebScraper/utils/pdfProcessor.ts | 9 ++++-- 4 files changed, 49 insertions(+), 15 deletions(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index a49b169..d69a70b 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -117,7 +117,7 @@ describe("E2E Tests for API Routes", () => { expect(response.body.data).toHaveProperty('content'); expect(response.body.data).toHaveProperty('metadata'); expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy'); - }, 30000); // 30 seconds + }, 60000); // 60 seconds it('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => { const response = await request(TEST_URL) @@ -132,7 +132,7 @@ describe("E2E Tests for API Routes", () => { expect(response.body.data).toHaveProperty('content'); expect(response.body.data).toHaveProperty('metadata'); expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy'); - }, 30000); // 30 seconds + }, 60000); // 60 seconds }); describe("POST /v0/crawl", () => { @@ -427,10 +427,8 @@ describe("E2E Tests for API Routes", () => { .send({ url: "https://jestjs.io" }); expect(crawlResponse.statusCode).toBe(200); - - // wait for 30 seconds - await new Promise((r) => setTimeout(r, 10000)); + await new Promise((r) => setTimeout(r, 20000)); const response = await request(TEST_URL) .delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`) @@ -439,7 +437,7 @@ describe("E2E Tests for API Routes", () => { expect(response.body).toHaveProperty("status"); expect(response.body.status).toBe("cancelled"); - await new Promise((r) => setTimeout(r, 20000)); + await new Promise((r) => setTimeout(r, 10000)); const completedResponse = await request(TEST_URL) .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) @@ -456,8 +454,6 @@ describe("E2E Tests for API Routes", () => { }, 60000); // 60 seconds - - describe("POST /v0/scrape with LLM Extraction", () => { it("should extract data using LLM extraction mode", async () => { const response = await request(TEST_URL) diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 73eda44..de941e0 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -144,14 +144,23 @@ export class WebScraperDataProvider { return this.returnOnlyUrlsResponse(links, inProgress); } + // const [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links); + // const pdfDocuments = await this.fetchPdfDocuments(pdfLinks); + let documents = await this.processLinks(links, inProgress); + // documents.push(...pdfDocuments); return this.cacheAndFinalizeDocuments(documents, links); } private async handleSingleUrlsMode( inProgress?: (progress: Progress) => void ): Promise { - let documents = await this.processLinks(this.urls, inProgress); + const links = this.urls; + // const [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links); + // const pdfDocuments = await this.fetchPdfDocuments(pdfLinks); + + let documents = await this.processLinks(links, inProgress); + // documents.push(...pdfDocuments); return documents; } @@ -163,7 +172,11 @@ export class WebScraperDataProvider { return this.returnOnlyUrlsResponse(links, inProgress); } + // let [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links); + // const pdfDocuments = await this.fetchPdfDocuments(pdfLinks); + let documents = await this.processLinks(links, inProgress); + // documents.push(...pdfDocuments); return this.cacheAndFinalizeDocuments(documents, links); } @@ -220,6 +233,19 @@ export class WebScraperDataProvider { ); } + private async splitPdfLinks(links: string[]): Promise<[string[], string[]]> { + const checks = links.map(async (link) => ({ + link, + isPdf: await isUrlAPdf({ url: link }) + })); + + const results = await Promise.all(checks); + const pdfLinks = results.filter(result => result.isPdf).map(result => result.link); + const notPdfLinks = results.filter(result => !result.isPdf).map(result => result.link); + + return [pdfLinks, notPdfLinks]; + } + private applyPathReplacements(documents: Document[]): Document[] { return this.replaceAllPathsWithAbsolutePaths ? replacePathsWithAbsolutePaths(documents) diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index c43ea40..33d8518 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -6,6 +6,7 @@ import { Document, PageOptions } from "../../lib/entities"; import { parseMarkdown } from "../../lib/html-to-markdown"; import { excludeNonMainTags } from "./utils/excludeTags"; import { urlSpecificParams } from "./utils/custom/website_params"; +import { fetchAndProcessPdf } from "./utils/pdfProcessor"; dotenv.config(); @@ -66,9 +67,17 @@ export async function scrapWithScrapingBee( ); return ""; } - const decoder = new TextDecoder(); - const text = decoder.decode(response.data); - return text; + // Check the content type of the response + const contentType = response.headers['content-type']; + if (contentType && contentType.includes('application/pdf')) { + // Handle PDF content type + return fetchAndProcessPdf(url); + } else { + // Assume the content is text and decode it + const decoder = new TextDecoder(); + const text = decoder.decode(response.data); + return text; + } } catch (error) { console.error(`Error scraping with Scraping Bee: ${error}`); return ""; diff --git a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts index 67fb134..a72de30 100644 --- a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts +++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts @@ -77,12 +77,12 @@ export async function processPdfToText(filePath: string): Promise { } else { // If the status code is not 200, increment the attempt counter and wait attempt++; - await new Promise((resolve) => setTimeout(resolve, 250)); // Wait for 2 seconds + await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds } } catch (error) { - console.error("Error fetching result:", error); + console.error("Error fetching result:", error.data.detail || ''); attempt++; - await new Promise((resolve) => setTimeout(resolve, 250)); // Wait for 2 seconds before retrying + await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds before retrying // You may want to handle specific errors differently } } @@ -127,7 +127,10 @@ export async function isUrlAPdf({ if (fastMode) { return false; } + const before = Date.now(); const response = await axios.head(url); + const after = Date.now(); + console.log(`${after - before}ms - HEAD Request for ${url}`); const contentType = response.headers['content-type']; return contentType.includes('application/pdf'); } catch (error) { From 8eb2e95f19b4f5389f8447ccbd961ce53dc1391a Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Mon, 13 May 2024 16:13:10 -0300 Subject: [PATCH 10/12] Cleaned up --- apps/api/src/scraper/WebScraper/index.ts | 26 +------------- apps/api/src/scraper/WebScraper/single_url.ts | 23 +++++++++---- .../scraper/WebScraper/utils/pdfProcessor.ts | 34 +------------------ 3 files changed, 18 insertions(+), 65 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index de941e0..1d9656e 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -10,7 +10,7 @@ import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap"; import { WebCrawler } from "./crawler"; import { getValue, setValue } from "../../services/redis"; import { getImageDescription } from "./utils/imageDescription"; -import { fetchAndProcessPdf, isUrlAPdf } from "./utils/pdfProcessor"; +import { fetchAndProcessPdf } from "./utils/pdfProcessor"; import { replaceImgPathsWithAbsolutePaths, replacePathsWithAbsolutePaths, @@ -144,11 +144,7 @@ export class WebScraperDataProvider { return this.returnOnlyUrlsResponse(links, inProgress); } - // const [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links); - // const pdfDocuments = await this.fetchPdfDocuments(pdfLinks); - let documents = await this.processLinks(links, inProgress); - // documents.push(...pdfDocuments); return this.cacheAndFinalizeDocuments(documents, links); } @@ -156,11 +152,8 @@ export class WebScraperDataProvider { inProgress?: (progress: Progress) => void ): Promise { const links = this.urls; - // const [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links); - // const pdfDocuments = await this.fetchPdfDocuments(pdfLinks); let documents = await this.processLinks(links, inProgress); - // documents.push(...pdfDocuments); return documents; } @@ -172,11 +165,7 @@ export class WebScraperDataProvider { return this.returnOnlyUrlsResponse(links, inProgress); } - // let [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links); - // const pdfDocuments = await this.fetchPdfDocuments(pdfLinks); - let documents = await this.processLinks(links, inProgress); - // documents.push(...pdfDocuments); return this.cacheAndFinalizeDocuments(documents, links); } @@ -233,19 +222,6 @@ export class WebScraperDataProvider { ); } - private async splitPdfLinks(links: string[]): Promise<[string[], string[]]> { - const checks = links.map(async (link) => ({ - link, - isPdf: await isUrlAPdf({ url: link }) - })); - - const results = await Promise.all(checks); - const pdfLinks = results.filter(result => result.isPdf).map(result => result.link); - const notPdfLinks = results.filter(result => !result.isPdf).map(result => result.link); - - return [pdfLinks, notPdfLinks]; - } - private applyPathReplacements(documents: Document[]): Document[] { return this.replaceAllPathsWithAbsolutePaths ? replacePathsWithAbsolutePaths(documents) diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 33d8518..baf465e 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -67,13 +67,11 @@ export async function scrapWithScrapingBee( ); return ""; } - // Check the content type of the response + const contentType = response.headers['content-type']; if (contentType && contentType.includes('application/pdf')) { - // Handle PDF content type return fetchAndProcessPdf(url); } else { - // Assume the content is text and decode it const decoder = new TextDecoder(); const text = decoder.decode(response.data); return text; @@ -104,9 +102,14 @@ export async function scrapWithPlaywright(url: string): Promise { return ""; } - const data = await response.json(); - const html = data.content; - return html ?? ""; + const contentType = response.headers['content-type']; + if (contentType && contentType.includes('application/pdf')) { + return fetchAndProcessPdf(url); + } else { + const data = await response.json(); + const html = data.content; + return html ?? ""; + } } catch (error) { console.error(`Error scraping with Puppeteer: ${error}`); return ""; @@ -173,7 +176,13 @@ export async function scrapSingleUrl( ); return ""; } - text = await response.text(); + + const contentType = response.headers['content-type']; + if (contentType && contentType.includes('application/pdf')) { + return fetchAndProcessPdf(url); + } else { + text = await response.text(); + } } catch (error) { console.error(`Error scraping URL: ${error}`); return ""; diff --git a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts index a72de30..ba92fd4 100644 --- a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts +++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts @@ -105,36 +105,4 @@ async function processPdf(file: string) { const fileContent = fs.readFileSync(file); const data = await pdf(fileContent); return data.text; -} -/** - * Check if a url is a pdf - * @param url The url to check - * @param fastMode If true, the function will return false if the url is does not end with .pdf - * @returns A promise that resolves to true if the url is a pdf, false otherwise - */ -export async function isUrlAPdf({ - url, - fastMode = false, -}: { - url: string; - fastMode?: boolean; -}): Promise { - try { - if (url.endsWith(".pdf")) { - return true; - } - // If fast mode is enabled, we skip the HEAD request and return false - if (fastMode) { - return false; - } - const before = Date.now(); - const response = await axios.head(url); - const after = Date.now(); - console.log(`${after - before}ms - HEAD Request for ${url}`); - const contentType = response.headers['content-type']; - return contentType.includes('application/pdf'); - } catch (error) { - // console.error("Error making HEAD request:", error); - return false; - } -} +} \ No newline at end of file From eb88447e8b9b1b16739f3b1357622fcfb90c5d53 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 17 May 2024 10:00:05 -0700 Subject: [PATCH 11/12] Update index.test.ts --- .../src/__tests__/e2e_withAuth/index.test.ts | 52 ++++++++++--------- 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index e6c4c48..3fe1022 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -587,22 +587,23 @@ describe("E2E Tests for API Routes", () => { .send({ url: 'https://arxiv.org/abs/astro-ph/9301001', crawlerOptions: { limit: 10, excludes: [ 'list/*', 'login', 'abs/*', 'static/*', 'about/*', 'archive/*' ] }}); expect(crawlResponse.statusCode).toBe(200); - const response = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`); + let isCompleted = false; + let completedResponse; + + while (!isCompleted) { + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`); expect(response.statusCode).toBe(200); expect(response.body).toHaveProperty('status'); - expect(response.body.status).toBe('active'); - // wait for 30 seconds - await new Promise((r) => setTimeout(r, 30000)); - - const completedResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`); - - expect(completedResponse.statusCode).toBe(200); - expect(completedResponse.body).toHaveProperty('status'); + if (response.body.status === 'completed') { + isCompleted = true; + completedResponse = response; + } else { + await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again + } + } expect(completedResponse.body.status).toBe('completed'); expect(completedResponse.body).toHaveProperty('data'); expect(completedResponse.body.data.length).toBeGreaterThan(1); @@ -626,18 +627,21 @@ describe("E2E Tests for API Routes", () => { }); expect(crawlResponse.statusCode).toBe(200); - const response = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - expect(response.body.status).toBe("active"); - // wait for 60 seconds - await new Promise((r) => setTimeout(r, 60000)); - const completedResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + let isCompleted = false; + let completedResponse; + while (!isCompleted) { + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + + if (response.body.status === "completed") { + isCompleted = true; + completedResponse = response; + } + } expect(completedResponse.statusCode).toBe(200); expect(completedResponse.body).toHaveProperty("status"); expect(completedResponse.body.status).toBe("completed"); From 5be208f5950dec85823bf9c82ab4e9084249aec2 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 17 May 2024 10:40:44 -0700 Subject: [PATCH 12/12] Nick: fixed --- apps/api/src/__tests__/e2e_withAuth/index.test.ts | 4 ++-- apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 3fe1022..abe5c58 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -584,7 +584,7 @@ describe("E2E Tests for API Routes", () => { .post('/v0/crawl') .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) .set('Content-Type', 'application/json') - .send({ url: 'https://arxiv.org/abs/astro-ph/9301001', crawlerOptions: { limit: 10, excludes: [ 'list/*', 'login', 'abs/*', 'static/*', 'about/*', 'archive/*' ] }}); + .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001', crawlerOptions: { limit: 10, excludes: [ 'list/*', 'login', 'abs/*', 'static/*', 'about/*', 'archive/*' ] }}); expect(crawlResponse.statusCode).toBe(200); let isCompleted = false; @@ -606,7 +606,7 @@ describe("E2E Tests for API Routes", () => { } expect(completedResponse.body.status).toBe('completed'); expect(completedResponse.body).toHaveProperty('data'); - expect(completedResponse.body.data.length).toBeGreaterThan(1); + expect(completedResponse.body.data.length).toEqual(1); expect(completedResponse.body.data).toEqual( expect.arrayContaining([ expect.objectContaining({ diff --git a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts index ba92fd4..7c57007 100644 --- a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts +++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts @@ -80,7 +80,7 @@ export async function processPdfToText(filePath: string): Promise { await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds } } catch (error) { - console.error("Error fetching result:", error.data.detail || ''); + console.error("Error fetching result:", error || ''); attempt++; await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds before retrying // You may want to handle specific errors differently