From b5045d1661741eda6d137bdb172b185ce748fd62 Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Tue, 4 Jun 2024 17:47:28 -0300
Subject: [PATCH 1/2] [feat] improved the scrape for gdrive pdfs

---
 .../WebScraper/custom/handleCustomScraping.ts | 17 +++++++++++------
 apps/api/src/scraper/WebScraper/single_url.ts |  9 +++++++--
 2 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts
index 5f6c34f..1301757 100644
--- a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts
+++ b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts
@@ -1,7 +1,9 @@
+import { fetchAndProcessPdf } from "../utils/pdfProcessor";
+
 export async function handleCustomScraping(
   text: string,
   url: string
-): Promise<{ scraper: string; url: string; wait_after_load: number } | null> {
+): Promise<{ scraper: string; url: string; wait_after_load?: number } | null> {
   // Check for Readme Docs special case
   if (text.includes('<meta name="readme-deploy"')) {

Date: Wed, 5 Jun 2024 10:13:52 -0700
Subject: [PATCH 2/2] Nick:

---
 apps/api/src/__tests__/e2e_withAuth/index.test.ts         | 6 +++---
 .../src/scraper/WebScraper/custom/handleCustomScraping.ts | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
index 2042abf..f015acd 100644
--- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
@@ -185,7 +185,7 @@ describe("E2E Tests for API Routes", () => {
       );
     });
