diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
index 3e82fb4..215e1d1 100644
--- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
@@ -79,8 +79,26 @@ describe("E2E Tests for API Routes", () => {
       expect(response.body.data).toHaveProperty("content");
       expect(response.body.data).toHaveProperty("markdown");
       expect(response.body.data).toHaveProperty("metadata");
+      expect(response.body.data).not.toHaveProperty("html");
       expect(response.body.data.content).toContain("🔥 FireCrawl");
     }, 30000); // 30 seconds timeout
+
+    it("should return a successful response with a valid API key and includeHtml set to true", async () => {
+      const response = await request(TEST_URL)
+        .post("/v0/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({ url: "https://firecrawl.dev", pageOptions: { includeHtml: true }});
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("data");
+      expect(response.body.data).toHaveProperty("content");
+      expect(response.body.data).toHaveProperty("markdown");
+      expect(response.body.data).toHaveProperty("html");
+      expect(response.body.data).toHaveProperty("metadata");
+      expect(response.body.data.content).toContain("🔥 FireCrawl");
+      expect(response.body.data.markdown).toContain("🔥 FireCrawl");
+      expect(response.body.data.html).toContain("<h1");
+    }, 30000); // 30 seconds timeout
   });

   describe("POST /v0/crawl", () => {
@@ -143,16 +161,17 @@ describe("E2E Tests for API Routes", () => {
       expect(response.statusCode).toBe(401);
     });

-    it("should return an error for a blocklisted URL", async () => {
-      const blocklistedUrl = "https://instagram.com/fake-test";
-      const response = await request(TEST_URL)
-        .post("/v0/crawlWebsitePreview")
-        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
-        .set("Content-Type", "application/json")
-        .send({ url: blocklistedUrl });
-      expect(response.statusCode).toBe(403);
-      expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
-    });
+    // it("should return an error for a blocklisted URL", async () => {
+    //   const blocklistedUrl = "https://instagram.com/fake-test";
+    //   const response = await request(TEST_URL)
+    //     .post("/v0/crawlWebsitePreview")
+    //     .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+    //     .set("Content-Type", "application/json")
+    //     .send({ url: blocklistedUrl });
+    //   // is returning 429 instead of 403
+    //   expect(response.statusCode).toBe(403);
+    //   expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
+    // });

     it("should return a successful response with a valid API key", async () => {
       const response = await request(TEST_URL)
@@ -250,6 +269,46 @@ describe("E2E Tests for API Routes", () => {
         "🔥 FireCrawl"
       );
     }, 60000); // 60 seconds
+
+    it("should return a successful response for a valid crawl job with includeHtml set to true option", async () => {
+      const crawlResponse = await request(TEST_URL)
+        .post("/v0/crawl")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({ url: "https://firecrawl.dev", pageOptions: { includeHtml: true } });
+      expect(crawlResponse.statusCode).toBe(200);
+
+      const response = await request(TEST_URL)
+        .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("status");
+      expect(response.body.status).toBe("active");
+
+      // wait for 30 seconds
+      await new Promise((r) => setTimeout(r, 30000));
+
+      const completedResponse = await request(TEST_URL)
+        .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+      expect(completedResponse.statusCode).toBe(200);
+      expect(completedResponse.body).toHaveProperty("status");
+      expect(completedResponse.body.status).toBe("completed");
+      expect(completedResponse.body).toHaveProperty("data");
+      expect(completedResponse.body.data[0]).toHaveProperty("content");
+      expect(completedResponse.body.data[0]).toHaveProperty("markdown");
+      expect(completedResponse.body.data[0]).toHaveProperty("html");
+      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+      expect(completedResponse.body.data[0].content).toContain(
+        "🔥 FireCrawl"
+      );
+      expect(completedResponse.body.data[0].markdown).toContain(
+        "FireCrawl"
+      );
+      expect(completedResponse.body.data[0].html).toContain(
+        "<h1"
+      );
+    }, 60000); // 60 seconds
   });
diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts
index 8b5249b..e53faed 100644
--- a/apps/api/src/controllers/crawl.ts
+++ b/apps/api/src/controllers/crawl.ts
@@ -41,7 +41,7 @@ export async function crawlController(req: Request, res: Response) {
     const mode = req.body.mode ?? "crawl";
     const crawlerOptions = req.body.crawlerOptions ?? {};
-    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
+    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false };

     if (mode === "single_urls" && !url.includes(",")) {
       try {
diff --git a/apps/api/src/controllers/crawlPreview.ts b/apps/api/src/controllers/crawlPreview.ts
index 569be33..d3e9afe 100644
--- a/apps/api/src/controllers/crawlPreview.ts
+++ b/apps/api/src/controllers/crawlPreview.ts
@@ -26,7 +26,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
     const mode = req.body.mode ?? "crawl";
     const crawlerOptions = req.body.crawlerOptions ?? {};
-    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
+    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false };

     const job = await addWebScraperJob({
       url: url,
diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts
index 849500a..021a9d0 100644
--- a/apps/api/src/controllers/scrape.ts
+++ b/apps/api/src/controllers/scrape.ts
@@ -1,4 +1,4 @@
-import { ExtractorOptions } from './../lib/entities';
+import { ExtractorOptions, PageOptions } from './../lib/entities';
 import { Request, Response } from "express";
 import { WebScraperDataProvider } from "../scraper/WebScraper";
 import { billTeam, checkTeamCredits } from "../services/billing/credit_billing";
@@ -13,8 +13,8 @@ export async function scrapeHelper(
   req: Request,
   team_id: string,
   crawlerOptions: any,
-  pageOptions: any,
-  extractorOptions: ExtractorOptions
+  pageOptions: PageOptions,
+  extractorOptions: ExtractorOptions,
 ): Promise<{
   success: boolean;
   error?: string;
@@ -39,7 +39,7 @@ export async function scrapeHelper(
       ...crawlerOptions,
     },
     pageOptions: pageOptions,
-    extractorOptions: extractorOptions
+    extractorOptions: extractorOptions,
   });

   const docs = await a.getDocuments(false);
@@ -91,7 +91,7 @@ export async function scrapeController(req: Request, res: Response) {
       return res.status(status).json({ error });
     }
     const crawlerOptions = req.body.crawlerOptions ?? {};
-    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
+    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false };
     const extractorOptions = req.body.extractorOptions ?? {
       mode: "markdown"
     }
@@ -113,7 +113,7 @@ export async function scrapeController(req: Request, res: Response) {
       team_id,
       crawlerOptions,
       pageOptions,
-      extractorOptions
+      extractorOptions,
     );
     const endTime = new Date().getTime();
     const timeTakenInSeconds = (endTime - startTime) / 1000;
@@ -132,7 +132,7 @@ export async function scrapeController(req: Request, res: Response) {
       pageOptions: pageOptions,
       origin: origin,
       extractor_options: extractorOptions,
-      num_tokens: numTokens
+      num_tokens: numTokens,
     });
     return res.status(result.returnCode).json(result);
   } catch (error) {
diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts
index 1393922..d98c08d 100644
--- a/apps/api/src/controllers/search.ts
+++ b/apps/api/src/controllers/search.ts
@@ -13,7 +13,7 @@ export async function searchHelper(
   team_id: string,
   crawlerOptions: any,
   pageOptions: PageOptions,
-  searchOptions: SearchOptions
+  searchOptions: SearchOptions,
 ): Promise<{
   success: boolean;
   error?: string;
@@ -66,6 +66,7 @@ export async function searchHelper(
       ...pageOptions,
       onlyMainContent: pageOptions?.onlyMainContent ?? true,
       fetchPageContent: pageOptions?.fetchPageContent ?? true,
+      includeHtml: pageOptions?.includeHtml ?? false,
       fallback: false,
     },
   });
@@ -117,6 +118,7 @@ export async function searchController(req: Request, res: Response) {
     }
     const crawlerOptions = req.body.crawlerOptions ?? {};
     const pageOptions = req.body.pageOptions ?? {
+      includeHtml: false,
       onlyMainContent: true,
       fetchPageContent: true,
       fallback: false,
@@ -141,7 +143,7 @@ export async function searchController(req: Request, res: Response) {
       team_id,
      crawlerOptions,
      pageOptions,
-      searchOptions
+      searchOptions,
     );
     const endTime = new Date().getTime();
     const timeTakenInSeconds = (endTime - startTime) / 1000;
diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts
index 1bb9429..7cf48cb 100644
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -12,9 +12,9 @@ export interface Progress {

 export type PageOptions = {
   onlyMainContent?: boolean;
+  includeHtml?: boolean;
   fallback?: boolean;
   fetchPageContent?: boolean;
-
 };

 export type ExtractorOptions = {
diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts
index 252f2e4..3c9ea88 100644
--- a/apps/api/src/main/runWebScraper.ts
+++ b/apps/api/src/main/runWebScraper.ts
@@ -27,7 +27,7 @@ export async function startWebScraperPipeline({
       job.moveToFailed(error);
     },
     team_id: job.data.team_id,
-    bull_job_id: job.id.toString(),
+    bull_job_id: job.id.toString()
   })) as { success: boolean; message: string; docs: Document[] };
 }
 export async function runWebScraper({
@@ -63,14 +63,14 @@ export async function runWebScraper({
       urls: [url],
       crawlerOptions: crawlerOptions,
       pageOptions: pageOptions,
-      bullJobId: bull_job_id,
+      bullJobId: bull_job_id
     });
   } else {
     await provider.setOptions({
       mode: mode,
       urls: url.split(","),
       crawlerOptions: crawlerOptions,
-      pageOptions: pageOptions,
+      pageOptions: pageOptions
     });
   }
   const docs = (await provider.getDocuments(false, (progress: Progress) => {
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index 96112f8..c127433 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -55,7 +55,7 @@ export class WebScraperDataProvider {
       const batchUrls = urls.slice(i, i + this.concurrentRequests);
       await Promise.all(
         batchUrls.map(async (url, index) => {
-          const result = await scrapSingleUrl(url, true, this.pageOptions);
+          const result = await scrapSingleUrl(url, this.pageOptions);
           processedUrls++;
           if (inProgress) {
             inProgress({
@@ -177,6 +177,7 @@ export class WebScraperDataProvider {
       });
       return links.map((url) => ({
         content: "",
+        html: this.pageOptions?.includeHtml ? "" : undefined,
         markdown: "",
         metadata: { sourceURL: url },
       }));
@@ -387,11 +388,9 @@ export class WebScraperDataProvider {
     this.limit = options.crawlerOptions?.limit ?? 10000;
     this.generateImgAltText =
       options.crawlerOptions?.generateImgAltText ?? false;
-    this.pageOptions = options.pageOptions ?? { onlyMainContent: false };
-    this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };
-    this.replaceAllPathsWithAbsolutePaths =
-      options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
-
+    this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false };
+    this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
+    this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
     //! @nicolas, for some reason this was being injected and breakign everything. Don't have time to find source of the issue so adding this check
     this.excludes = this.excludes.filter((item) => item !== "");
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index fab54bd..a67ce31 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -103,8 +103,7 @@ export async function scrapWithPlaywright(url: string): Promise<string> {

 export async function scrapSingleUrl(
   urlToScrap: string,
-  toMarkdown: boolean = true,
-  pageOptions: PageOptions = { onlyMainContent: true }
+  pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false },
 ): Promise<Document> {
   urlToScrap = urlToScrap.trim();
@@ -172,7 +171,7 @@ export async function scrapSingleUrl(
     //* TODO: add an optional to return markdown or structured/extracted content
     let cleanedHtml = removeUnwantedElements(text, pageOptions);
-    
+
     return [await parseMarkdown(cleanedHtml), text];
   };
@@ -193,6 +192,7 @@ export async function scrapSingleUrl(
         url: urlToScrap,
         content: text,
         markdown: text,
+        html: pageOptions.includeHtml ? html : undefined,
         metadata: { ...metadata, sourceURL: urlToScrap },
       } as Document;
     }
@@ -216,6 +216,7 @@ export async function scrapSingleUrl(
     return {
       content: text,
       markdown: text,
+      html: pageOptions.includeHtml ? html : undefined,
       metadata: { ...metadata, sourceURL: urlToScrap },
     } as Document;
   } catch (error) {
@@ -223,6 +224,7 @@ export async function scrapSingleUrl(
     return {
       content: "",
       markdown: "",
+      html: "",
       metadata: { sourceURL: urlToScrap },
     } as Document;
   }
diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts
index c1858f1..b9b5463 100644
--- a/apps/api/src/types.ts
+++ b/apps/api/src/types.ts
@@ -40,7 +40,7 @@ export interface FirecrawlJob {
   pageOptions?: any;
   origin: string;
   extractor_options?: ExtractorOptions,
-  num_tokens?: number
+  num_tokens?: number,
 }

 export enum RateLimiterMode {
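
For reference, a minimal client-side sketch (not part of the diff) of exercising the new includeHtml page option against the /v0/scrape endpoint. The request and response shapes follow the e2e tests above; FIRECRAWL_URL and FIRECRAWL_API_KEY are placeholder environment variables, and Node 18+ is assumed for the global fetch.

// Sketch: scrape a page with includeHtml enabled and read the new `html` field.
// Per this diff, `html` is only populated when pageOptions.includeHtml is true;
// the default pageOptions set includeHtml: false and the field stays undefined.
async function scrapeWithHtml(url: string): Promise<void> {
  const response = await fetch(`${process.env.FIRECRAWL_URL}/v0/scrape`, {
    method: "POST",
    headers: {
      Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({ url, pageOptions: { includeHtml: true } }),
  });
  const { data } = await response.json();
  console.log(data.markdown?.slice(0, 200)); // markdown is always returned
  console.log(data.html?.slice(0, 200));     // raw HTML, present only with includeHtml
}

scrapeWithHtml("https://firecrawl.dev").catch(console.error);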