Merge pull request #238 from mendableai/feat/better-gdrive-pdf-fetch
[Feat] Improved scraping of Google Drive PDFs
This commit is contained in:
commit
9640bf087e
@ -185,7 +185,7 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
it("should return a successful response with a valid API key", async () => {
|
it("should return a successful response with a valid API key for crawl", async () => {
|
||||||
const response = await request(TEST_URL)
|
const response = await request(TEST_URL)
|
||||||
.post("/v0/crawl")
|
.post("/v0/crawl")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
@ -529,7 +529,7 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
expect(response.statusCode).toBe(408);
|
expect(response.statusCode).toBe(408);
|
||||||
}, 3000);
|
}, 3000);
|
||||||
|
|
||||||
it("should return a successful response with a valid API key", async () => {
|
it("should return a successful response with a valid API key for crawlWebsitePreview", async () => {
|
||||||
const response = await request(TEST_URL)
|
const response = await request(TEST_URL)
|
||||||
.post("/v0/crawlWebsitePreview")
|
.post("/v0/crawlWebsitePreview")
|
||||||
.set("Authorization", `Bearer this_is_just_a_preview_token`)
|
.set("Authorization", `Bearer this_is_just_a_preview_token`)
|
||||||
@ -558,7 +558,7 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
expect(response.statusCode).toBe(401);
|
expect(response.statusCode).toBe(401);
|
||||||
});
|
});
|
||||||
|
|
||||||
it("should return a successful response with a valid API key", async () => {
|
it("should return a successful response with a valid API key for search", async () => {
|
||||||
const response = await request(TEST_URL)
|
const response = await request(TEST_URL)
|
||||||
.post("/v0/search")
|
.post("/v0/search")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
|
@ -1,7 +1,9 @@
|
|||||||
|
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
|
||||||
|
|
||||||
export async function handleCustomScraping(
|
export async function handleCustomScraping(
|
||||||
text: string,
|
text: string,
|
||||||
url: string
|
url: string
|
||||||
): Promise<{ scraper: string; url: string; waitAfterLoad: number, pageOptions?: { scrollXPaths?: string[] } } | null> {
|
): Promise<{ scraper: string; url: string; waitAfterLoad?: number, pageOptions?: { scrollXPaths?: string[] } } | null> {
|
||||||
// Check for Readme Docs special case
|
// Check for Readme Docs special case
|
||||||
if (text.includes('<meta name="readme-deploy"')) {
|
if (text.includes('<meta name="readme-deploy"')) {
|
||||||
console.log(
|
console.log(
|
||||||
@ -31,18 +33,21 @@ export async function handleCustomScraping(
|
|||||||
|
|
||||||
// Check for Google Drive PDF links in the raw HTML
|
// Check for Google Drive PDF links in the raw HTML
|
||||||
const googleDrivePdfPattern =
|
const googleDrivePdfPattern =
|
||||||
/https:\/\/drive\.google\.com\/file\/d\/[^\/]+\/view/;
|
/https:\/\/drive\.google\.com\/file\/d\/([^\/]+)\/view/;
|
||||||
const googleDrivePdfLink = text.match(googleDrivePdfPattern);
|
const googleDrivePdfLink = text.match(googleDrivePdfPattern);
|
||||||
if (googleDrivePdfLink) {
|
if (googleDrivePdfLink) {
|
||||||
console.log(
|
console.log(
|
||||||
`Google Drive PDF link detected for ${url}: ${googleDrivePdfLink[0]}`
|
`Google Drive PDF link detected for ${url}: ${googleDrivePdfLink[0]}`
|
||||||
);
|
);
|
||||||
|
|
||||||
|
const fileId = googleDrivePdfLink[1];
|
||||||
|
const pdfUrl = `https://drive.google.com/uc?export=download&id=${fileId}`;
|
||||||
|
|
||||||
return {
|
return {
|
||||||
scraper: "fire-engine",
|
scraper: "pdf",
|
||||||
url: url,
|
url: pdfUrl
|
||||||
waitAfterLoad: 1000,
|
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
@ -333,8 +333,13 @@ export async function scrapSingleUrl(
|
|||||||
// Check for custom scraping conditions
|
// Check for custom scraping conditions
|
||||||
const customScraperResult = await handleCustomScraping(text, url);
|
const customScraperResult = await handleCustomScraping(text, url);
|
||||||
|
|
||||||
if(customScraperResult){
|
if (customScraperResult){
|
||||||
customScrapedContent = await scrapWithFireEngine(customScraperResult.url, customScraperResult.waitAfterLoad, false, customScraperResult.pageOptions)
|
switch (customScraperResult.scraper) {
|
||||||
|
case "fire-engine":
|
||||||
|
customScrapedContent = await scrapWithFireEngine(customScraperResult.url, customScraperResult.waitAfterLoad, false, customScraperResult.pageOptions)
|
||||||
|
case "pdf":
|
||||||
|
customScrapedContent = { html: await fetchAndProcessPdf(customScraperResult.url), screenshot }
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (customScrapedContent) {
|
if (customScrapedContent) {
|
||||||
|
Loading…
Reference in New Issue
Block a user