diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index c6c59bc..2e26230 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -81,6 +81,21 @@ describe("E2E Tests for API Routes", () => { expect(response.body.data).toHaveProperty("metadata"); expect(response.body.data.content).toContain("🔥 FireCrawl"); }, 30000); // 30 seconds timeout + + it("should return a successful response with a valid API key and toMarkdown set to false", async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev", pageOptions: { toMarkdown: false } }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).not.toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.content).toContain("FireCrawl"); + expect(response.body.data.content).toContain(" { @@ -250,6 +265,42 @@ describe("E2E Tests for API Routes", () => { "🔥 FireCrawl" ); }, 60000); // 60 seconds + + it("should return a successful response for a valid crawl job with toMarkdown set to false option", async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev", pageOptions: { toMarkdown: false } }); + expect(crawlResponse.statusCode).toBe(200); + + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + expect(response.body.status).toBe("active"); + + // wait for 30 seconds + await new Promise((r) => setTimeout(r, 30000)); + + const completedResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).not.toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].content).toContain( + "FireCrawl" + ); + expect(completedResponse.body.data[0].content).toContain( + " { diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts index 3d64f7f..d5877ab 100644 --- a/apps/api/src/controllers/crawl.ts +++ b/apps/api/src/controllers/crawl.ts @@ -35,7 +35,7 @@ export async function crawlController(req: Request, res: Response) { const mode = req.body.mode ?? "crawl"; const crawlerOptions = req.body.crawlerOptions ?? {}; - const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; + const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, toMarkdown: true }; if (mode === "single_urls" && !url.includes(",")) { try { diff --git a/apps/api/src/controllers/crawlPreview.ts b/apps/api/src/controllers/crawlPreview.ts index 569be33..0b4a08c 100644 --- a/apps/api/src/controllers/crawlPreview.ts +++ b/apps/api/src/controllers/crawlPreview.ts @@ -26,7 +26,7 @@ export async function crawlPreviewController(req: Request, res: Response) { const mode = req.body.mode ?? "crawl"; const crawlerOptions = req.body.crawlerOptions ?? {}; - const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; + const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, toMarkdown: true}; const job = await addWebScraperJob({ url: url, diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index 849500a..e03c013 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -1,4 +1,4 @@ -import { ExtractorOptions } from './../lib/entities'; +import { ExtractorOptions, PageOptions } from './../lib/entities'; import { Request, Response } from "express"; import { WebScraperDataProvider } from "../scraper/WebScraper"; import { billTeam, checkTeamCredits } from "../services/billing/credit_billing"; @@ -13,7 +13,7 @@ export async function scrapeHelper( req: Request, team_id: string, crawlerOptions: any, - pageOptions: any, + pageOptions: PageOptions, extractorOptions: ExtractorOptions ): Promise<{ success: boolean; @@ -91,7 +91,7 @@ export async function scrapeController(req: Request, res: Response) { return res.status(status).json({ error }); } const crawlerOptions = req.body.crawlerOptions ?? {}; - const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; + const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, toMarkdown: true }; const extractorOptions = req.body.extractorOptions ?? { mode: "markdown" } diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts index 1393922..6529edc 100644 --- a/apps/api/src/controllers/search.ts +++ b/apps/api/src/controllers/search.ts @@ -66,6 +66,7 @@ export async function searchHelper( ...pageOptions, onlyMainContent: pageOptions?.onlyMainContent ?? true, fetchPageContent: pageOptions?.fetchPageContent ?? true, + toMarkdown: pageOptions?.toMarkdown ?? true, fallback: false, }, }); diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 5b663f2..6150cdd 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -12,9 +12,9 @@ export interface Progress { export type PageOptions = { onlyMainContent?: boolean; + toMarkdown?: boolean; fallback?: boolean; - fetchPageContent?: boolean; - + fetchPageContent?: boolean; }; export type ExtractorOptions = { diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 1e28552..2cfa84e 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -45,7 +45,7 @@ export class WebScraperDataProvider { const batchUrls = urls.slice(i, i + this.concurrentRequests); await Promise.all( batchUrls.map(async (url, index) => { - const result = await scrapSingleUrl(url, true, this.pageOptions); + const result = await scrapSingleUrl(url, this.pageOptions?.toMarkdown ?? true, this.pageOptions); processedUrls++; if (inProgress) { inProgress({ @@ -323,7 +323,7 @@ export class WebScraperDataProvider { this.limit = options.crawlerOptions?.limit ?? 10000; this.generateImgAltText = options.crawlerOptions?.generateImgAltText ?? false; - this.pageOptions = options.pageOptions ?? {onlyMainContent: false}; + this.pageOptions = options.pageOptions ?? {onlyMainContent: false, toMarkdown: true}; this.extractorOptions = options.extractorOptions ?? {mode: "markdown"} this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false; diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index fab54bd..b7fa07a 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -172,7 +172,9 @@ export async function scrapSingleUrl( //* TODO: add an optional to return markdown or structured/extracted content let cleanedHtml = removeUnwantedElements(text, pageOptions); - + if (toMarkdown === false) { + return [cleanedHtml, text]; + } return [await parseMarkdown(cleanedHtml), text]; }; @@ -192,7 +194,7 @@ export async function scrapSingleUrl( return { url: urlToScrap, content: text, - markdown: text, + markdown: pageOptions.toMarkdown === false ? undefined : text, metadata: { ...metadata, sourceURL: urlToScrap }, } as Document; } @@ -215,14 +217,14 @@ export async function scrapSingleUrl( return { content: text, - markdown: text, + markdown: pageOptions.toMarkdown === false ? undefined : text, metadata: { ...metadata, sourceURL: urlToScrap }, } as Document; } catch (error) { console.error(`Error: ${error} - Failed to fetch URL: ${urlToScrap}`); return { content: "", - markdown: "", + markdown: pageOptions.toMarkdown === false ? undefined : "", metadata: { sourceURL: urlToScrap }, } as Document; }