0

Merge pull request #235 from mendableai/feat/gdrive-pdfs

[Feat] Added custom scraping for google-drive pdf usecase
This commit is contained in:
Nicolas 2024-06-04 11:31:53 -07:00 committed by GitHub
commit fc04d5b033
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

/**
 * Decides whether a page needs to be re-scraped through Fire Engine based on
 * markers found in its raw HTML.
 *
 * Two cases are recognised, checked in this order:
 *  1. Readme Docs pages, identified by a `<meta name="readme-deploy"` tag.
 *  2. Pages whose HTML contains a Google Drive file "view" link.
 *
 * In both cases the original `url` is passed to `scrapWithFireEngine` together
 * with the value 1000 (presumably a wait time in milliseconds, judging by the
 * log message — confirm against `scrapWithFireEngine`).
 *
 * @param text Raw HTML of the page that was already fetched.
 * @param url  URL of the page; this is what gets re-scraped, not the Drive link.
 * @returns The result of `scrapWithFireEngine` when a special case matches,
 *          otherwise `null`.
 */
async function handleCustomScraping(
  text: string,
  url: string
): Promise<FireEngineResponse | null> {
  // Readme Docs special case: detected via its deploy meta tag.
  const isReadmeDocs = text.includes('<meta name="readme-deploy"');
  if (isReadmeDocs) {
    console.log(
      `Special use case detected for ${url}, using Fire Engine with wait time 1000ms`
    );
    return await scrapWithFireEngine(url, 1000);
  }

  // Google Drive PDF special case: look for a Drive file "view" link anywhere
  // in the raw HTML. Only the first occurrence is needed (for the log line).
  const driveLink = /https:\/\/drive\.google\.com\/file\/d\/[^\/]+\/view/.exec(text);
  if (driveLink === null) {
    // Neither special case applies; no custom scraping is performed here.
    return null;
  }

  console.log(`Google Drive PDF link detected for ${url}: ${driveLink[0]}`);
  return await scrapWithFireEngine(url, 1000);
}