0

Merge pull request #238 from mendableai/feat/better-gdrive-pdf-fetch

[Feat] Improved the scrape for gdrive pdfs
This commit is contained in:
Nicolas 2024-06-05 10:20:46 -07:00 committed by GitHub
commit 9640bf087e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 21 additions and 11 deletions

View File

@ -185,7 +185,7 @@ describe("E2E Tests for API Routes", () => {
); );
}); });
it("should return a successful response with a valid API key", async () => { it("should return a successful response with a valid API key for crawl", async () => {
const response = await request(TEST_URL) const response = await request(TEST_URL)
.post("/v0/crawl") .post("/v0/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@ -529,7 +529,7 @@ describe("E2E Tests for API Routes", () => {
expect(response.statusCode).toBe(408); expect(response.statusCode).toBe(408);
}, 3000); }, 3000);
it("should return a successful response with a valid API key", async () => { it("should return a successful response with a valid API key for crawlWebsitePreview", async () => {
const response = await request(TEST_URL) const response = await request(TEST_URL)
.post("/v0/crawlWebsitePreview") .post("/v0/crawlWebsitePreview")
.set("Authorization", `Bearer this_is_just_a_preview_token`) .set("Authorization", `Bearer this_is_just_a_preview_token`)
@ -558,7 +558,7 @@ describe("E2E Tests for API Routes", () => {
expect(response.statusCode).toBe(401); expect(response.statusCode).toBe(401);
}); });
it("should return a successful response with a valid API key", async () => { it("should return a successful response with a valid API key for search", async () => {
const response = await request(TEST_URL) const response = await request(TEST_URL)
.post("/v0/search") .post("/v0/search")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)

View File

@ -1,7 +1,9 @@
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
export async function handleCustomScraping( export async function handleCustomScraping(
text: string, text: string,
url: string url: string
): Promise<{ scraper: string; url: string; waitAfterLoad: number, pageOptions?: { scrollXPaths?: string[] } } | null> { ): Promise<{ scraper: string; url: string; waitAfterLoad?: number, pageOptions?: { scrollXPaths?: string[] } } | null> {
// Check for Readme Docs special case // Check for Readme Docs special case
if (text.includes('<meta name="readme-deploy"')) { if (text.includes('<meta name="readme-deploy"')) {
console.log( console.log(
@ -31,16 +33,19 @@ export async function handleCustomScraping(
// Check for Google Drive PDF links in the raw HTML // Check for Google Drive PDF links in the raw HTML
const googleDrivePdfPattern = const googleDrivePdfPattern =
/https:\/\/drive\.google\.com\/file\/d\/[^\/]+\/view/; /https:\/\/drive\.google\.com\/file\/d\/([^\/]+)\/view/;
const googleDrivePdfLink = text.match(googleDrivePdfPattern); const googleDrivePdfLink = text.match(googleDrivePdfPattern);
if (googleDrivePdfLink) { if (googleDrivePdfLink) {
console.log( console.log(
`Google Drive PDF link detected for ${url}: ${googleDrivePdfLink[0]}` `Google Drive PDF link detected for ${url}: ${googleDrivePdfLink[0]}`
); );
const fileId = googleDrivePdfLink[1];
const pdfUrl = `https://drive.google.com/uc?export=download&id=${fileId}`;
return { return {
scraper: "fire-engine", scraper: "pdf",
url: url, url: pdfUrl
waitAfterLoad: 1000,
}; };
} }

View File

@ -334,7 +334,12 @@ export async function scrapSingleUrl(
const customScraperResult = await handleCustomScraping(text, url); const customScraperResult = await handleCustomScraping(text, url);
if (customScraperResult){ if (customScraperResult){
// Dispatch on the scraper selected by handleCustomScraping.
// Each case must end with `break`: without it, a "fire-engine" result falls
// through into the "pdf" case and is overwritten by a PDF fetch of a URL that
// is not a PDF. An unrecognised scraper leaves customScrapedContent untouched.
switch (customScraperResult.scraper) {
  case "fire-engine":
    customScrapedContent = await scrapWithFireEngine(
      customScraperResult.url,
      customScraperResult.waitAfterLoad,
      false,
      customScraperResult.pageOptions
    );
    break;
  case "pdf":
    // handleCustomScraping rewrites Google Drive viewer links to a direct
    // download URL, so the PDF processor can fetch the file itself.
    customScrapedContent = {
      html: await fetchAndProcessPdf(customScraperResult.url),
      screenshot,
    };
    break;
}
} }
if (customScrapedContent) { if (customScrapedContent) {