Nick:
This commit is contained in:
parent
fc04d5b033
commit
674500affa
@ -1,4 +1,3 @@
|
|||||||
import Turndown from "turndown";
|
|
||||||
import OpenAI from "openai";
|
import OpenAI from "openai";
|
||||||
import Ajv from "ajv";
|
import Ajv from "ajv";
|
||||||
const ajv = new Ajv(); // Initialize AJV for JSON schema validation
|
const ajv = new Ajv(); // Initialize AJV for JSON schema validation
|
||||||
|
@ -0,0 +1,44 @@
|
|||||||
|
/**
 * Looks for known markers in a page's raw HTML and, when one is present,
 * returns the scraper settings that page should be re-fetched with.
 *
 * Checks run in a fixed order and the first hit wins:
 *   1. a Readme Docs `readme-deploy` meta tag,
 *   2. a stylesheet link pointing at `static.vanta.com`,
 *   3. a Google Drive `/file/d/<id>/view` link anywhere in the markup.
 *
 * Each hit is reported through `console.log` before returning.
 *
 * @param text Raw HTML of the page that was already fetched.
 * @param url  URL the HTML came from; echoed back unchanged in the result.
 * @returns The scraper name, URL and wait time (1000 or 3000) to use, or
 *          `null` when none of the markers is found.
 */
export async function handleCustomScraping(
  text: string,
  url: string
): Promise<{ scraper: string; url: string; wait_after_load: number } | null> {
  // Every match yields the same shape; only the wait time differs.
  const viaFireEngine = (waitAfterLoad: number) => ({
    scraper: "fire-engine",
    url,
    wait_after_load: waitAfterLoad,
  });

  // Readme Docs special case
  const hasReadmeMarker = text.includes('<meta name="readme-deploy"');
  if (hasReadmeMarker) {
    console.log(
      `Special use case detected for ${url}, using Fire Engine with wait time 1000ms`
    );
    return viaFireEngine(1000);
  }

  // Vanta-hosted assets
  const hasVantaMarker = text.includes('<link href="https://static.vanta.com');
  if (hasVantaMarker) {
    console.log(
      `Vanta link detected for ${url}, using Fire Engine with wait time 3000ms`
    );
    return viaFireEngine(3000);
  }

  // Google Drive PDF viewer links embedded in the raw HTML
  const driveMatch = /https:\/\/drive\.google\.com\/file\/d\/[^\/]+\/view/.exec(
    text
  );
  if (driveMatch !== null) {
    console.log(
      `Google Drive PDF link detected for ${url}: ${driveMatch[0]}`
    );
    return viaFireEngine(1000);
  }

  return null;
}
|
@ -7,6 +7,7 @@ import { parseMarkdown } from "../../lib/html-to-markdown";
|
|||||||
import { excludeNonMainTags } from "./utils/excludeTags";
|
import { excludeNonMainTags } from "./utils/excludeTags";
|
||||||
import { urlSpecificParams } from "./utils/custom/website_params";
|
import { urlSpecificParams } from "./utils/custom/website_params";
|
||||||
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
|
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
|
||||||
|
import { handleCustomScraping } from "./custom/handleCustomScraping";
|
||||||
|
|
||||||
dotenv.config();
|
dotenv.config();
|
||||||
|
|
||||||
@ -253,28 +254,8 @@ function getScrapingFallbackOrder(
|
|||||||
return scrapersInOrder as (typeof baseScrapers)[number][];
|
return scrapersInOrder as (typeof baseScrapers)[number][];
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Checks a page's raw HTML for special-case markers and, when one is found,
 * re-fetches the page through `scrapWithFireEngine`.
 *
 * Two markers are recognised, in this order: a Readme Docs `readme-deploy`
 * meta tag, then a Google Drive `/file/d/<id>/view` link. Either one logs a
 * message and calls `scrapWithFireEngine(url, 1000)`.
 *
 * @param text Raw HTML of the page that was already fetched.
 * @param url  URL the HTML came from; forwarded to `scrapWithFireEngine`.
 * @returns Whatever `scrapWithFireEngine` resolves to, or `null` when neither
 *          marker is present.
 */
async function handleCustomScraping(
  text: string,
  url: string
): Promise<FireEngineResponse | null> {
  // Readme Docs special case
  const hasReadmeMarker = text.includes('<meta name="readme-deploy"');
  if (hasReadmeMarker) {
    console.log(
      `Special use case detected for ${url}, using Fire Engine with wait time 1000ms`
    );
    return await scrapWithFireEngine(url, 1000);
  }

  // Google Drive PDF viewer links embedded in the raw HTML
  const driveMatch = /https:\/\/drive\.google\.com\/file\/d\/[^\/]+\/view/.exec(
    text
  );
  if (driveMatch === null) {
    return null;
  }

  console.log(`Google Drive PDF link detected for ${url}: ${driveMatch[0]}`);
  return await scrapWithFireEngine(url, 1000);
}
|
|
||||||
|
|
||||||
export async function scrapSingleUrl(
|
export async function scrapSingleUrl(
|
||||||
urlToScrap: string,
|
urlToScrap: string,
|
||||||
@ -345,8 +326,15 @@ export async function scrapSingleUrl(
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let customScrapedContent : FireEngineResponse | null = null;
|
||||||
|
|
||||||
// Check for custom scraping conditions
|
// Check for custom scraping conditions
|
||||||
const customScrapedContent = await handleCustomScraping(text, url);
|
const customScraperResult = await handleCustomScraping(text, url);
|
||||||
|
|
||||||
|
if(customScraperResult){
|
||||||
|
customScrapedContent = await scrapWithFireEngine(customScraperResult.url, customScraperResult.wait_after_load)
|
||||||
|
}
|
||||||
|
|
||||||
if (customScrapedContent) {
|
if (customScrapedContent) {
|
||||||
text = customScrapedContent.html;
|
text = customScrapedContent.html;
|
||||||
screenshot = customScrapedContent.screenshot;
|
screenshot = customScrapedContent.screenshot;
|
||||||
|
Loading…
Reference in New Issue
Block a user