Merge pull request #200 from mendableai/nsc/wait-for-param

Allow users to manually set the waitFor param on /scrape

Commit 7187eaef87
@@ -50,6 +50,11 @@
           "type": "boolean",
           "description": "Include the raw HTML content of the page. Will output a html key in the response.",
           "default": false
+        },
+        "waitFor": {
+          "type": "integer",
+          "description": "Wait x amount of milliseconds for the page to load to fetch content",
+          "default": 0
         }
       }
     },
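For illustration, a client request exercising the new parameter could look like the TypeScript sketch below. The endpoint path and pageOptions shape come from the spec and tests in this PR; the function name, BASE_URL, and YOUR_API_KEY are placeholders, not values defined here.

// Minimal sketch: POST /v0/scrape with the new waitFor option.
// BASE_URL and YOUR_API_KEY are placeholders, not values from this PR.
async function scrapeWithWait() {
  const res = await fetch("BASE_URL/v0/scrape", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: "Bearer YOUR_API_KEY",
    },
    body: JSON.stringify({
      url: "https://firecrawl.dev",
      pageOptions: { waitFor: 7000 }, // wait 7000 ms for the page to load before content is fetched
    }),
  });
  const { data } = await res.json();
  console.log(data.content);
}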
@@ -134,6 +134,26 @@ describe("E2E Tests for API Routes", () => {
       expect(response.body.data).toHaveProperty('metadata');
       expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
     }, 60000); // 60 seconds
+
+    it("should return a successful response with a valid API key and waitFor option", async () => {
+      const startTime = Date.now();
+      const response = await request(TEST_URL)
+        .post("/v0/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({ url: "https://firecrawl.dev", pageOptions: { waitFor: 7000 } });
+      const endTime = Date.now();
+      const duration = endTime - startTime;
+
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("data");
+      expect(response.body.data).toHaveProperty("content");
+      expect(response.body.data).toHaveProperty("markdown");
+      expect(response.body.data).toHaveProperty("metadata");
+      expect(response.body.data).not.toHaveProperty("html");
+      expect(response.body.data.content).toContain("🔥 Firecrawl");
+      expect(duration).toBeGreaterThanOrEqual(7000);
+    }, 12000); // 12 seconds timeout
   });
 
   describe("POST /v0/crawl", () => {
@@ -102,7 +102,7 @@ export async function scrapeController(req: Request, res: Response) {
       return res.status(status).json({ error });
     }
     const crawlerOptions = req.body.crawlerOptions ?? {};
-    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false };
+    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, waitFor: 0 };
     const extractorOptions = req.body.extractorOptions ?? {
       mode: "markdown"
     }
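A subtlety worth noting, sketched below (not part of the diff): the ?? fallback only applies when req.body.pageOptions is absent altogether. A request that sends pageOptions without waitFor leaves waitFor undefined here; the waitFor: number = 0 defaults added to the scraper functions further down cover that case.

// Sketch of the ?? defaulting behavior used above.
const defaults = { onlyMainContent: false, includeHtml: false, waitFor: 0 };

const bodyA: any = {};
const optsA = bodyA.pageOptions ?? defaults;
console.log(optsA.waitFor); // 0 — the whole default object is used

const bodyB: any = { pageOptions: { onlyMainContent: true } };
const optsB = bodyB.pageOptions ?? defaults;
console.log(optsB.waitFor); // undefined — ?? does not merge field by field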
@@ -15,6 +15,7 @@ export type PageOptions = {
   includeHtml?: boolean;
   fallback?: boolean;
   fetchPageContent?: boolean;
+  waitFor?: number;
 };
 
 export type ExtractorOptions = {
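With the new field in place, an options value might look like this illustrative sketch (the comments paraphrase the descriptions in the OpenAPI hunk above):

const pageOptions: PageOptions = {
  includeHtml: false, // do not include the raw HTML content in the response
  waitFor: 5000,      // wait 5000 ms for the page to load before fetching content
};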
@@ -44,18 +44,21 @@ export async function generateRequestParams(
 }
 export async function scrapWithFireEngine(
   url: string,
+  waitFor: number = 0,
   options?: any
 ): Promise<string> {
   try {
     const reqParams = await generateRequestParams(url);
-    const wait_playwright = reqParams["params"]?.wait ?? 0;
+    // If the user has passed a wait parameter in the request, use that
+    const waitParam = reqParams["params"]?.wait ?? waitFor;
+    console.log(`[Fire-Engine] Scraping ${url} with wait: ${waitParam}`);
 
     const response = await fetch(process.env.FIRE_ENGINE_BETA_URL + "/scrape", {
       method: "POST",
       headers: {
         "Content-Type": "application/json",
       },
-      body: JSON.stringify({ url: url, wait: wait_playwright }),
+      body: JSON.stringify({ url: url, wait: waitParam }),
     });
 
     if (!response.ok) {
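Note the precedence in reqParams["params"]?.wait ?? waitFor: a per-URL wait configured via generateRequestParams still wins over the caller's waitFor, which only applies when no such value exists. A standalone sketch:

// Sketch: how the wait value is resolved, mirroring the ?? chain above.
function resolveWait(reqParams: any, waitFor: number = 0): number {
  return reqParams["params"]?.wait ?? waitFor;
}

console.log(resolveWait({ params: { wait: 3000 } }, 7000)); // 3000 — per-URL value wins
console.log(resolveWait({}, 7000));                         // 7000 — user's waitFor applies
console.log(resolveWait({}));                               // 0 — default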
@@ -115,17 +118,18 @@ export async function scrapWithScrapingBee(
   }
 }
 
-export async function scrapWithPlaywright(url: string): Promise<string> {
+export async function scrapWithPlaywright(url: string, waitFor: number = 0): Promise<string> {
   try {
     const reqParams = await generateRequestParams(url);
-    const wait_playwright = reqParams["params"]?.wait ?? 0;
+    // If the user has passed a wait parameter in the request, use that
+    const waitParam = reqParams["params"]?.wait ?? waitFor;
 
     const response = await fetch(process.env.PLAYWRIGHT_MICROSERVICE_URL, {
       method: "POST",
       headers: {
         "Content-Type": "application/json",
       },
-      body: JSON.stringify({ url: url, wait: wait_playwright }),
+      body: JSON.stringify({ url: url, wait: waitParam }),
     });
 
     if (!response.ok) {
@@ -178,7 +182,7 @@ export async function scrapWithFetch(url: string): Promise<string> {
  * @param defaultScraper The default scraper to use if the URL does not have a specific scraper order defined
  * @returns The order of scrapers to be used for scraping a URL
  */
-function getScrapingFallbackOrder(defaultScraper?: string) {
+function getScrapingFallbackOrder(defaultScraper?: string, isWaitPresent: boolean = false) {
   const availableScrapers = baseScrapers.filter(scraper => {
     switch (scraper) {
       case "scrapingBee":
@@ -193,16 +197,22 @@ function getScrapingFallbackOrder(defaultScraper?: string) {
     }
   });
 
-  const defaultOrder = ["scrapingBee", "fire-engine", "playwright", "scrapingBeeLoad", "fetch"];
+  let defaultOrder = ["scrapingBee", "fire-engine", "playwright", "scrapingBeeLoad", "fetch"];
+
+  if (isWaitPresent) {
+    defaultOrder = ["fire-engine", "playwright", ...defaultOrder.filter(scraper => scraper !== "fire-engine" && scraper !== "playwright")];
+  }
+
   const filteredDefaultOrder = defaultOrder.filter((scraper: typeof baseScrapers[number]) => availableScrapers.includes(scraper));
   const uniqueScrapers = new Set(defaultScraper ? [defaultScraper, ...filteredDefaultOrder, ...availableScrapers] : [...filteredDefaultOrder, ...availableScrapers]);
   const scrapersInOrder = Array.from(uniqueScrapers);
+  console.log(`Scrapers in order: ${scrapersInOrder}`);
   return scrapersInOrder as typeof baseScrapers[number][];
 }
 
 export async function scrapSingleUrl(
   urlToScrap: string,
-  pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false },
+  pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false, waitFor: 0 },
   existingHtml: string = ""
 ): Promise<Document> {
   urlToScrap = urlToScrap.trim();
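The effect of isWaitPresent is to promote the two scrapers that actually honor a wait (fire-engine and playwright, per the changes above) to the front of the order. Sketching just the reordering step in isolation:

// Sketch of the reordering above, separate from the availability filtering.
let defaultOrder = ["scrapingBee", "fire-engine", "playwright", "scrapingBeeLoad", "fetch"];
const isWaitPresent = true;

if (isWaitPresent) {
  defaultOrder = [
    "fire-engine",
    "playwright",
    ...defaultOrder.filter((s) => s !== "fire-engine" && s !== "playwright"),
  ];
}

console.log(defaultOrder);
// ["fire-engine", "playwright", "scrapingBee", "scrapingBeeLoad", "fetch"]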
@@ -227,7 +237,9 @@ export async function scrapSingleUrl(
     switch (method) {
       case "fire-engine":
         if (process.env.FIRE_ENGINE_BETA_URL) {
-          text = await scrapWithFireEngine(url);
+          console.log(`Scraping ${url} with Fire Engine`);
+
+          text = await scrapWithFireEngine(url, pageOptions.waitFor);
         }
         break;
       case "scrapingBee":
@@ -241,7 +253,7 @@ export async function scrapSingleUrl(
         break;
       case "playwright":
         if (process.env.PLAYWRIGHT_MICROSERVICE_URL) {
-          text = await scrapWithPlaywright(url);
+          text = await scrapWithPlaywright(url, pageOptions.waitFor);
         }
         break;
       case "scrapingBeeLoad":
@@ -268,7 +280,7 @@ export async function scrapSingleUrl(
     console.error(`Invalid URL key, trying: ${urlToScrap}`);
   }
   const defaultScraper = urlSpecificParams[urlKey]?.defaultScraper ?? "";
-  const scrapersInOrder = getScrapingFallbackOrder(defaultScraper)
+  const scrapersInOrder = getScrapingFallbackOrder(defaultScraper, pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0)
 
   for (const scraper of scrapersInOrder) {
     // If exists text coming from crawler, use it
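Note that the second argument here is a truthy/falsy expression rather than a strict boolean: with no pageOptions it evaluates to undefined, so the isWaitPresent parameter default of false kicks in, and with waitFor: 0 it evaluates to 0, which the if (isWaitPresent) check treats as false as well. A sketch:

// Sketch: what the isWaitPresent argument actually receives.
const compute = (pageOptions?: { waitFor?: number }) =>
  pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0;

console.log(compute(undefined));         // undefined — parameter default (false) applies
console.log(compute({ waitFor: 0 }));    // 0 — falsy, treated as "no wait"
console.log(compute({ waitFor: 7000 })); // true — wait present, order is reshuffled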