From b5045d1661741eda6d137bdb172b185ce748fd62 Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Tue, 4 Jun 2024 17:47:28 -0300
Subject: [PATCH 1/2] [feat] improve scraping of Google Drive PDFs
---
.../WebScraper/custom/handleCustomScraping.ts | 17 +++++++++++------
apps/api/src/scraper/WebScraper/single_url.ts | 9 +++++++--
2 files changed, 18 insertions(+), 8 deletions(-)
diff --git a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts
index 5f6c34f..1301757 100644
--- a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts
+++ b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts
@@ -1,7 +1,9 @@
+import { fetchAndProcessPdf } from "../utils/pdfProcessor";
+
export async function handleCustomScraping(
text: string,
url: string
-): Promise<{ scraper: string; url: string; wait_after_load: number } | null> {
+): Promise<{ scraper: string; url: string; wait_after_load?: number } | null> {
// Check for Readme Docs special case
if (text.includes('
Date: Wed, 5 Jun 2024 10:13:52 -0700
Subject: [PATCH 2/2] Nick: disambiguate e2e test titles and make waitAfterLoad optional
---
apps/api/src/__tests__/e2e_withAuth/index.test.ts | 6 +++---
.../src/scraper/WebScraper/custom/handleCustomScraping.ts | 2 +-
2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
index 2042abf..f015acd 100644
--- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
@@ -185,7 +185,7 @@ describe("E2E Tests for API Routes", () => {
);
});
- it("should return a successful response with a valid API key", async () => {
+ it("should return a successful response with a valid API key for crawl", async () => {
const response = await request(TEST_URL)
.post("/v0/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@@ -529,7 +529,7 @@ describe("E2E Tests for API Routes", () => {
expect(response.statusCode).toBe(408);
}, 3000);
- it("should return a successful response with a valid API key", async () => {
+ it("should return a successful response with a valid API key for crawlWebsitePreview", async () => {
const response = await request(TEST_URL)
.post("/v0/crawlWebsitePreview")
.set("Authorization", `Bearer this_is_just_a_preview_token`)
@@ -558,7 +558,7 @@ describe("E2E Tests for API Routes", () => {
expect(response.statusCode).toBe(401);
});
- it("should return a successful response with a valid API key", async () => {
+ it("should return a successful response with a valid API key for search", async () => {
const response = await request(TEST_URL)
.post("/v0/search")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
diff --git a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts
index 33e0783..8108a9e 100644
--- a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts
+++ b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts
@@ -3,7 +3,7 @@ import { fetchAndProcessPdf } from "../utils/pdfProcessor";
export async function handleCustomScraping(
text: string,
url: string
-): Promise<{ scraper: string; url: string; waitAfterLoad: number, pageOptions?: { scrollXPaths?: string[] } } | null> {
+): Promise<{ scraper: string; url: string; waitAfterLoad?: number, pageOptions?: { scrollXPaths?: string[] } } | null> {
// Check for Readme Docs special case
if (text.includes('