From b5045d1661741eda6d137bdb172b185ce748fd62 Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Tue, 4 Jun 2024 17:47:28 -0300
Subject: [PATCH 1/2] [feat] improved the scrape for gdrive pdfs

---
 .../WebScraper/custom/handleCustomScraping.ts | 17 +++++++++++------
 apps/api/src/scraper/WebScraper/single_url.ts |  9 +++++++--
 2 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts
index 5f6c34f..1301757 100644
--- a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts
+++ b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts
@@ -1,7 +1,9 @@
+import { fetchAndProcessPdf } from "../utils/pdfProcessor";
+
 export async function handleCustomScraping(
   text: string,
   url: string
-): Promise<{ scraper: string; url: string; wait_after_load: number } | null> {
+): Promise<{ scraper: string; url: string; wait_after_load?: number } | null> {
   // Check for Readme Docs special case
   if (text.includes('<meta name="readme-deploy"')) {

Date: Wed, 5 Jun 2024 10:13:52 -0700
Subject: [PATCH 2/2] Nick:

---
 apps/api/src/__tests__/e2e_withAuth/index.test.ts         | 6 +++---
 .../src/scraper/WebScraper/custom/handleCustomScraping.ts | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
index 2042abf..f015acd 100644
--- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
@@ -185,7 +185,7 @@ describe("E2E Tests for API Routes", () => {
       );
     });
