0

[feat] improved the scrape for gdrive pdfs

This commit is contained in:
rafaelsideguide 2024-06-04 17:47:28 -03:00
parent a547f9a78e
commit b5045d1661
2 changed files with 18 additions and 8 deletions

View File

@@ -1,7 +1,9 @@
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
export async function handleCustomScraping( export async function handleCustomScraping(
text: string, text: string,
url: string url: string
): Promise<{ scraper: string; url: string; wait_after_load: number } | null> { ): Promise<{ scraper: string; url: string; wait_after_load?: number } | null> {
// Check for Readme Docs special case // Check for Readme Docs special case
if (text.includes('<meta name="readme-deploy"')) { if (text.includes('<meta name="readme-deploy"')) {
console.log( console.log(
@@ -28,16 +30,19 @@ export async function handleCustomScraping(
// Check for Google Drive PDF links in the raw HTML // Check for Google Drive PDF links in the raw HTML
const googleDrivePdfPattern = const googleDrivePdfPattern =
/https:\/\/drive\.google\.com\/file\/d\/[^\/]+\/view/; /https:\/\/drive\.google\.com\/file\/d\/([^\/]+)\/view/;
const googleDrivePdfLink = text.match(googleDrivePdfPattern); const googleDrivePdfLink = text.match(googleDrivePdfPattern);
if (googleDrivePdfLink) { if (googleDrivePdfLink) {
console.log( console.log(
`Google Drive PDF link detected for ${url}: ${googleDrivePdfLink[0]}` `Google Drive PDF link detected for ${url}: ${googleDrivePdfLink[0]}`
); );
const fileId = googleDrivePdfLink[1];
const pdfUrl = `https://drive.google.com/uc?export=download&id=${fileId}`;
return { return {
scraper: "fire-engine", scraper: "pdf",
url: url, url: pdfUrl
wait_after_load: 1000,
}; };
} }

View File

@@ -331,8 +331,13 @@ export async function scrapSingleUrl(
// Check for custom scraping conditions // Check for custom scraping conditions
const customScraperResult = await handleCustomScraping(text, url); const customScraperResult = await handleCustomScraping(text, url);
if(customScraperResult){ if (customScraperResult){
switch (customScraperResult.scraper) {
case "fire-engine":
customScrapedContent = await scrapWithFireEngine(customScraperResult.url, customScraperResult.wait_after_load) customScrapedContent = await scrapWithFireEngine(customScraperResult.url, customScraperResult.wait_after_load)
case "pdf":
customScrapedContent = { html: await fetchAndProcessPdf(customScraperResult.url), screenshot }
}
} }
if (customScrapedContent) { if (customScrapedContent) {