0

Merge pull request #200 from mendableai/nsc/wait-for-param

Allow users to manually set the waitFor param on /scrape
This commit is contained in:
Nicolas 2024-05-28 16:59:26 -07:00 committed by GitHub
commit 7187eaef87
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 50 additions and 12 deletions

View File

@ -50,6 +50,11 @@
"type": "boolean", "type": "boolean",
"description": "Include the raw HTML content of the page. Will output a html key in the response.", "description": "Include the raw HTML content of the page. Will output a html key in the response.",
"default": false "default": false
},
"waitFor": {
"type": "integer",
"description": "Wait the specified number of milliseconds for the page to load before fetching content",
"default": 0
} }
} }
}, },

View File

@ -134,6 +134,26 @@ describe("E2E Tests for API Routes", () => {
expect(response.body.data).toHaveProperty('metadata'); expect(response.body.data).toHaveProperty('metadata');
expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy'); expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
}, 60000); // 60 seconds }, 60000); // 60 seconds
it("should return a successful response with a valid API key and waitFor option", async () => {
  // Wait (in milliseconds) requested through pageOptions.waitFor; the same
  // value is the lower bound asserted on the measured round-trip time below.
  const waitForMs = 7000;

  const requestedAt = Date.now();
  const response = await request(TEST_URL)
    .post("/v0/scrape")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send({ url: "https://firecrawl.dev", pageOptions: { waitFor: waitForMs } });
  const elapsedMs = Date.now() - requestedAt;

  expect(response.statusCode).toBe(200);
  expect(response.body).toHaveProperty("data");

  // Only read the payload after the presence check above has passed.
  const scraped = response.body.data;
  for (const key of ["content", "markdown", "metadata"]) {
    expect(scraped).toHaveProperty(key);
  }
  // This request does not set includeHtml, and the html key must be absent.
  expect(scraped).not.toHaveProperty("html");
  expect(scraped.content).toContain("🔥 Firecrawl");

  // The whole request must have taken at least as long as the requested wait.
  expect(elapsedMs).toBeGreaterThanOrEqual(waitForMs);
}, 12000); // 12 seconds timeout
}); });
describe("POST /v0/crawl", () => { describe("POST /v0/crawl", () => {

View File

@ -102,7 +102,7 @@ export async function scrapeController(req: Request, res: Response) {
return res.status(status).json({ error }); return res.status(status).json({ error });
} }
const crawlerOptions = req.body.crawlerOptions ?? {}; const crawlerOptions = req.body.crawlerOptions ?? {};
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false }; const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, waitFor: 0 };
const extractorOptions = req.body.extractorOptions ?? { const extractorOptions = req.body.extractorOptions ?? {
mode: "markdown" mode: "markdown"
} }

View File

@ -15,6 +15,7 @@ export type PageOptions = {
includeHtml?: boolean; includeHtml?: boolean;
fallback?: boolean; fallback?: boolean;
fetchPageContent?: boolean; fetchPageContent?: boolean;
waitFor?: number;
}; };
export type ExtractorOptions = { export type ExtractorOptions = {

View File

@ -44,18 +44,21 @@ export async function generateRequestParams(
} }
export async function scrapWithFireEngine( export async function scrapWithFireEngine(
url: string, url: string,
waitFor: number = 0,
options?: any options?: any
): Promise<string> { ): Promise<string> {
try { try {
const reqParams = await generateRequestParams(url); const reqParams = await generateRequestParams(url);
const wait_playwright = reqParams["params"]?.wait ?? 0; // If the user has passed a wait parameter in the request, use that
const waitParam = reqParams["params"]?.wait ?? waitFor;
console.log(`[Fire-Engine] Scraping ${url} with wait: ${waitParam}`);
const response = await fetch(process.env.FIRE_ENGINE_BETA_URL+ "/scrape", { const response = await fetch(process.env.FIRE_ENGINE_BETA_URL+ "/scrape", {
method: "POST", method: "POST",
headers: { headers: {
"Content-Type": "application/json", "Content-Type": "application/json",
}, },
body: JSON.stringify({ url: url, wait: wait_playwright }), body: JSON.stringify({ url: url, wait: waitParam }),
}); });
if (!response.ok) { if (!response.ok) {
@ -115,17 +118,18 @@ export async function scrapWithScrapingBee(
} }
} }
export async function scrapWithPlaywright(url: string): Promise<string> { export async function scrapWithPlaywright(url: string, waitFor: number = 0): Promise<string> {
try { try {
const reqParams = await generateRequestParams(url); const reqParams = await generateRequestParams(url);
const wait_playwright = reqParams["params"]?.wait ?? 0; // If the user has passed a wait parameter in the request, use that
const waitParam = reqParams["params"]?.wait ?? waitFor;
const response = await fetch(process.env.PLAYWRIGHT_MICROSERVICE_URL, { const response = await fetch(process.env.PLAYWRIGHT_MICROSERVICE_URL, {
method: "POST", method: "POST",
headers: { headers: {
"Content-Type": "application/json", "Content-Type": "application/json",
}, },
body: JSON.stringify({ url: url, wait: wait_playwright }), body: JSON.stringify({ url: url, wait: waitParam }),
}); });
if (!response.ok) { if (!response.ok) {
@ -178,7 +182,7 @@ export async function scrapWithFetch(url: string): Promise<string> {
* @param defaultScraper The default scraper to use if the URL does not have a specific scraper order defined * @param defaultScraper The default scraper to use if the URL does not have a specific scraper order defined
* @returns The order of scrapers to be used for scraping a URL * @returns The order of scrapers to be used for scraping a URL
*/ */
function getScrapingFallbackOrder(defaultScraper?: string) { function getScrapingFallbackOrder(defaultScraper?: string, isWaitPresent: boolean = false) {
const availableScrapers = baseScrapers.filter(scraper => { const availableScrapers = baseScrapers.filter(scraper => {
switch (scraper) { switch (scraper) {
case "scrapingBee": case "scrapingBee":
@ -193,16 +197,22 @@ function getScrapingFallbackOrder(defaultScraper?: string) {
} }
}); });
const defaultOrder = ["scrapingBee", "fire-engine", "playwright", "scrapingBeeLoad", "fetch"]; let defaultOrder = ["scrapingBee", "fire-engine", "playwright", "scrapingBeeLoad", "fetch"];
if (isWaitPresent) {
defaultOrder = ["fire-engine", "playwright", ...defaultOrder.filter(scraper => scraper !== "fire-engine" && scraper !== "playwright")];
}
const filteredDefaultOrder = defaultOrder.filter((scraper: typeof baseScrapers[number]) => availableScrapers.includes(scraper)); const filteredDefaultOrder = defaultOrder.filter((scraper: typeof baseScrapers[number]) => availableScrapers.includes(scraper));
const uniqueScrapers = new Set(defaultScraper ? [defaultScraper, ...filteredDefaultOrder, ...availableScrapers] : [...filteredDefaultOrder, ...availableScrapers]); const uniqueScrapers = new Set(defaultScraper ? [defaultScraper, ...filteredDefaultOrder, ...availableScrapers] : [...filteredDefaultOrder, ...availableScrapers]);
const scrapersInOrder = Array.from(uniqueScrapers); const scrapersInOrder = Array.from(uniqueScrapers);
console.log(`Scrapers in order: ${scrapersInOrder}`);
return scrapersInOrder as typeof baseScrapers[number][]; return scrapersInOrder as typeof baseScrapers[number][];
} }
export async function scrapSingleUrl( export async function scrapSingleUrl(
urlToScrap: string, urlToScrap: string,
pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false }, pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false, waitFor: 0},
existingHtml: string = "" existingHtml: string = ""
): Promise<Document> { ): Promise<Document> {
urlToScrap = urlToScrap.trim(); urlToScrap = urlToScrap.trim();
@ -227,7 +237,9 @@ export async function scrapSingleUrl(
switch (method) { switch (method) {
case "fire-engine": case "fire-engine":
if (process.env.FIRE_ENGINE_BETA_URL) { if (process.env.FIRE_ENGINE_BETA_URL) {
text = await scrapWithFireEngine(url); console.log(`Scraping ${url} with Fire Engine`);
text = await scrapWithFireEngine(url, pageOptions.waitFor);
} }
break; break;
case "scrapingBee": case "scrapingBee":
@ -241,7 +253,7 @@ export async function scrapSingleUrl(
break; break;
case "playwright": case "playwright":
if (process.env.PLAYWRIGHT_MICROSERVICE_URL) { if (process.env.PLAYWRIGHT_MICROSERVICE_URL) {
text = await scrapWithPlaywright(url); text = await scrapWithPlaywright(url, pageOptions.waitFor);
} }
break; break;
case "scrapingBeeLoad": case "scrapingBeeLoad":
@ -268,7 +280,7 @@ export async function scrapSingleUrl(
console.error(`Invalid URL key, trying: ${urlToScrap}`); console.error(`Invalid URL key, trying: ${urlToScrap}`);
} }
const defaultScraper = urlSpecificParams[urlKey]?.defaultScraper ?? ""; const defaultScraper = urlSpecificParams[urlKey]?.defaultScraper ?? "";
const scrapersInOrder = getScrapingFallbackOrder(defaultScraper) const scrapersInOrder = getScrapingFallbackOrder(defaultScraper, pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0)
for (const scraper of scrapersInOrder) { for (const scraper of scrapersInOrder) {
// If exists text coming from crawler, use it // If exists text coming from crawler, use it