[feat] improved the scrape for gdrive pdfs

2024-06-04 17:47:28 -03:00 · 2024-06-04 17:47:28 -03:00 · b5045d1661
commit b5045d1661
parent a547f9a78e
2 changed files with 18 additions and 8 deletions
--- a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts
+++ b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts
@@ -1,7 +1,9 @@
 import { fetchAndProcessPdf } from "../utils/pdfProcessor";
 export async function handleCustomScraping(
  text: string,
  url: string
-): Promise<{ scraper: string; url: string; wait_after_load: number } | null> {
+): Promise<{ scraper: string; url: string; wait_after_load?: number } | null> {
  // Check for Readme Docs special case
  if (text.includes('<meta name="readme-deploy"')) {
    console.log(
@@ -28,16 +30,19 @@ export async function handleCustomScraping(
  // Check for Google Drive PDF links in the raw HTML
  const googleDrivePdfPattern =
-    /https:\/\/drive\.google\.com\/file\/d\/[^\/]+\/view/;
+    /https:\/\/drive\.google\.com\/file\/d\/([^\/]+)\/view/;
  const googleDrivePdfLink = text.match(googleDrivePdfPattern);
  if (googleDrivePdfLink) {
    console.log(
      `Google Drive PDF link detected for ${url}: ${googleDrivePdfLink[0]}`
    );
+    // Capture group 1 of the pattern is the Drive file id (the path segment between /d/ and /view).
    const fileId = googleDrivePdfLink[1];
+    // Build the uc?export=download URL for that id; this URL is what gets returned alongside the "pdf" scraper.
    const pdfUrl = `https://drive.google.com/uc?export=download&id=${fileId}`;
    return {
-      scraper: "fire-engine",
+      scraper: "pdf",
-      url: url,
+      url: pdfUrl,
      wait_after_load: 1000,
    };
  }
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -332,7 +332,12 @@ export async function scrapSingleUrl(
    // handleCustomScraping resolves to null or to a { scraper, url, wait_after_load? } descriptor;
    // when a descriptor comes back, dispatch on the scraper it names.
    const customScraperResult = await handleCustomScraping(text, url);
    if (customScraperResult){
      switch (customScraperResult.scraper) {
        case "fire-engine":
          customScrapedContent  = await scrapWithFireEngine(customScraperResult.url, customScraperResult.wait_after_load)
          // Without this break, control falls through into the "pdf" case and the
          // fire-engine result above is overwritten by fetchAndProcessPdf.
          break;
        case "pdf":
          customScrapedContent  = { html: await fetchAndProcessPdf(customScraperResult.url), screenshot }
          break;
      }
    }
    if (customScrapedContent) {