[feat] Improve scraping of Google Drive PDFs
This commit is contained in:
parent
a547f9a78e
commit
b5045d1661
@ -1,7 +1,9 @@
|
||||
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
|
||||
|
||||
export async function handleCustomScraping(
|
||||
text: string,
|
||||
url: string
|
||||
): Promise<{ scraper: string; url: string; wait_after_load: number } | null> {
|
||||
): Promise<{ scraper: string; url: string; wait_after_load?: number } | null> {
|
||||
// Check for Readme Docs special case
|
||||
if (text.includes('<meta name="readme-deploy"')) {
|
||||
console.log(
|
||||
@ -28,16 +30,19 @@ export async function handleCustomScraping(
|
||||
|
||||
// Check for Google Drive PDF links in the raw HTML
|
||||
const googleDrivePdfPattern =
|
||||
/https:\/\/drive\.google\.com\/file\/d\/[^\/]+\/view/;
|
||||
/https:\/\/drive\.google\.com\/file\/d\/([^\/]+)\/view/;
|
||||
const googleDrivePdfLink = text.match(googleDrivePdfPattern);
|
||||
if (googleDrivePdfLink) {
|
||||
console.log(
|
||||
`Google Drive PDF link detected for ${url}: ${googleDrivePdfLink[0]}`
|
||||
);
|
||||
|
||||
const fileId = googleDrivePdfLink[1];
|
||||
const pdfUrl = `https://drive.google.com/uc?export=download&id=${fileId}`;
|
||||
|
||||
return {
|
||||
scraper: "fire-engine",
|
||||
url: url,
|
||||
wait_after_load: 1000,
|
||||
scraper: "pdf",
|
||||
url: pdfUrl
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -331,8 +331,13 @@ export async function scrapSingleUrl(
|
||||
// Check for custom scraping conditions
|
||||
const customScraperResult = await handleCustomScraping(text, url);
|
||||
|
||||
if(customScraperResult){
|
||||
if (customScraperResult){
|
||||
// Dispatch on the scraper chosen by handleCustomScraping.
// Each case must end with `break`: without it, "fire-engine" fell through
// into "pdf" and its result was overwritten by a PDF fetch of the same URL.
switch (customScraperResult.scraper) {
  case "fire-engine":
    customScrapedContent = await scrapWithFireEngine(customScraperResult.url, customScraperResult.wait_after_load);
    break;
  case "pdf":
    customScrapedContent = { html: await fetchAndProcessPdf(customScraperResult.url), screenshot };
    break;
}
|
||||
}
|
||||
|
||||
if (customScrapedContent) {
|
||||
|
Loading…
Reference in New Issue
Block a user