Merge pull request #238 from mendableai/feat/better-gdrive-pdf-fetch

[Feat] Improved the scrape for gdrive pdfs
2024-06-05 10:20:46 -07:00 · 2024-06-05 10:20:46 -07:00 · 9640bf087e
commit 9640bf087e
parent ff53db8c6d 7cb14edec8
3 changed files with 21 additions and 11 deletions
--- a/apps/api/src/tests/e2e_withAuth/index.test.ts
+++ b/apps/api/src/tests/e2e_withAuth/index.test.ts
@@ -185,7 +185,7 @@ describe("E2E Tests for API Routes", () => {
      );
    });

-    it("should return a successful response with a valid API key", async () => {
+    it("should return a successful response with a valid API key for crawl", async () => {
      const response = await request(TEST_URL)
        .post("/v0/crawl")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@@ -529,7 +529,7 @@ describe("E2E Tests for API Routes", () => {
      expect(response.statusCode).toBe(408);
    }, 3000); 

-    it("should return a successful response with a valid API key", async () => {
+    it("should return a successful response with a valid API key for crawlWebsitePreview", async () => {
      const response = await request(TEST_URL)
        .post("/v0/crawlWebsitePreview")
        .set("Authorization", `Bearer this_is_just_a_preview_token`)
@@ -558,7 +558,7 @@ describe("E2E Tests for API Routes", () => {
      expect(response.statusCode).toBe(401);
    });

-    it("should return a successful response with a valid API key", async () => {
+    it("should return a successful response with a valid API key for search", async () => {
      const response = await request(TEST_URL)
        .post("/v0/search")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
--- a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts
+++ b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts
@@ -1,7 +1,9 @@
+import { fetchAndProcessPdf } from "../utils/pdfProcessor";
+
 export async function handleCustomScraping(
  text: string,
  url: string
-): Promise<{ scraper: string; url: string; waitAfterLoad: number, pageOptions?: { scrollXPaths?: string[] } } | null> {
+): Promise<{ scraper: string; url: string; waitAfterLoad?: number, pageOptions?: { scrollXPaths?: string[] } } | null> {
  // Check for Readme Docs special case
  if (text.includes('<meta name="readme-deploy"')) {
    console.log(
@@ -31,16 +33,19 @@ export async function handleCustomScraping(

  // Check for Google Drive PDF links in the raw HTML
  const googleDrivePdfPattern =
-    /https:\/\/drive\.google\.com\/file\/d\/[^\/]+\/view/;
+    /https:\/\/drive\.google\.com\/file\/d\/([^\/]+)\/view/;
  const googleDrivePdfLink = text.match(googleDrivePdfPattern);
  if (googleDrivePdfLink) {
    console.log(
      `Google Drive PDF link detected for ${url}: ${googleDrivePdfLink[0]}`
    );
+
+    const fileId = googleDrivePdfLink[1];
+    const pdfUrl = `https://drive.google.com/uc?export=download&id=${fileId}`;
+
    return {
-      scraper: "fire-engine",
-      url: url,
-      waitAfterLoad: 1000,
+      scraper: "pdf",
+      url: pdfUrl
    };
  }
  
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -334,7 +334,12 @@ export async function scrapSingleUrl(
    const customScraperResult = await handleCustomScraping(text, url);

    if (customScraperResult){
+      switch (customScraperResult.scraper) {
+        case "fire-engine":
          customScrapedContent  = await scrapWithFireEngine(customScraperResult.url, customScraperResult.waitAfterLoad, false, customScraperResult.pageOptions)
+        case "pdf":
+          customScrapedContent  = { html: await fetchAndProcessPdf(customScraperResult.url), screenshot }
+      }
    }

    if (customScrapedContent) {