diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
index 169c75b..cd2d17a 100644
--- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
@@ -7,7 +7,6 @@ dotenv.config();
 
 // const TEST_URL = 'http://localhost:3002'
 const TEST_URL = "http://127.0.0.1:3002";
-
 describe("E2E Tests for API Routes", () => {
   beforeAll(() => {
     process.env.USE_DB_AUTHENTICATION = "true";
@@ -56,7 +55,9 @@ describe("E2E Tests for API Routes", () => {
         .set("Content-Type", "application/json")
         .send({ url: blocklistedUrl });
       expect(response.statusCode).toBe(403);
-      expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
+      expect(response.body.error).toContain(
+        "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
+      );
     });
 
     it("should return a successful response with a valid preview token", async () => {
@@ -79,8 +80,29 @@ describe("E2E Tests for API Routes", () => {
       expect(response.body.data).toHaveProperty("content");
       expect(response.body.data).toHaveProperty("markdown");
       expect(response.body.data).toHaveProperty("metadata");
+      expect(response.body.data).not.toHaveProperty("html");
       expect(response.body.data.content).toContain("🔥 FireCrawl");
     }, 30000); // 30 seconds timeout
+
+    it("should return a successful response with a valid API key and includeHtml set to true", async () => {
+      const response = await request(TEST_URL)
+        .post("/v0/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({
+          url: "https://firecrawl.dev",
+          pageOptions: { includeHtml: true },
+        });
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("data");
+      expect(response.body.data).toHaveProperty("content");
+      expect(response.body.data).toHaveProperty("markdown");
+      expect(response.body.data).toHaveProperty("html");
+      expect(response.body.data).toHaveProperty("metadata");
+      expect(response.body.data.content).toContain("🔥 FireCrawl");
+      expect(response.body.data.markdown).toContain("🔥 FireCrawl");
+      expect(response.body.data.html).toContain("<h1");
+    }, 30000); // 30 seconds timeout
   });
 
   describe("POST /v0/crawl", () => {
@@ -106,7 +128,9 @@ describe("E2E Tests for API Routes", () => {
         .set("Content-Type", "application/json")
         .send({ url: blocklistedUrl });
       expect(response.statusCode).toBe(403);
-      expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
+      expect(response.body.error).toContain(
+        "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
+      );
     });
 
     it("should return a successful response with a valid API key", async () => {
@@ -122,15 +146,12 @@ describe("E2E Tests for API Routes", () => {
       );
     });
 
-    // Additional tests for insufficient credits?
   });
 
   describe("POST /v0/crawlWebsitePreview", () => {
     it("should require authorization", async () => {
-      const response = await request(TEST_URL).post(
-        "/v0/crawlWebsitePreview"
-      );
+      const response = await request(TEST_URL).post("/v0/crawlWebsitePreview");
       expect(response.statusCode).toBe(401);
     });
 
@@ -143,16 +164,17 @@ describe("E2E Tests for API Routes", () => {
       expect(response.statusCode).toBe(401);
     });
 
-    it("should return an error for a blocklisted URL", async () => {
-      const blocklistedUrl = "https://instagram.com/fake-test";
-      const response = await request(TEST_URL)
-        .post("/v0/crawlWebsitePreview")
-        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
-        .set("Content-Type", "application/json")
-        .send({ url: blocklistedUrl });
-      expect(response.statusCode).toBe(403);
-      expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
-    });
+    // it("should return an error for a blocklisted URL", async () => {
+    //   const blocklistedUrl = "https://instagram.com/fake-test";
+    //   const response = await request(TEST_URL)
+    //     .post("/v0/crawlWebsitePreview")
+    //     .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+    //     .set("Content-Type", "application/json")
+    //     .send({ url: blocklistedUrl });
+    //   // is returning 429 instead of 403
+    //   expect(response.statusCode).toBe(403);
+    //   expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
+    // });
 
     it("should return a successful response with a valid API key", async () => {
       const response = await request(TEST_URL)
@@ -183,8 +205,6 @@ describe("E2E Tests for API Routes", () => {
       expect(response.statusCode).toBe(401);
     });
 
-
-
     it("should return a successful response with a valid API key", async () => {
       const response = await request(TEST_URL)
         .post("/v0/search")
@@ -246,9 +266,7 @@ describe("E2E Tests for API Routes", () => {
       expect(completedResponse.body.data[0]).toHaveProperty("content");
      expect(completedResponse.body.data[0]).toHaveProperty("markdown");
       expect(completedResponse.body.data[0]).toHaveProperty("metadata");
-      expect(completedResponse.body.data[0].content).toContain(
-        "🔥 FireCrawl"
-      );
+      expect(completedResponse.body.data[0].content).toContain("🔥 FireCrawl");
     }, 60000); // 60 seconds
 
     it("should return a successful response with max depth option for a valid crawl job", async () => {
@@ -256,7 +274,52 @@ describe("E2E Tests for API Routes", () => {
         .post("/v0/crawl")
         .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
         .set("Content-Type", "application/json")
-        .send({ url: "https://www.scrapethissite.com", crawlerOptions: { maxDepth: 2 }});
+        .send({
+          url: "https://www.scrapethissite.com",
+          crawlerOptions: { maxDepth: 2 },
+        });
       expect(crawlResponse.statusCode).toBe(200);
+
+      const response = await request(TEST_URL)
+        .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("status");
+      expect(response.body.status).toBe("active");
+      // wait for 60 seconds
+      await new Promise((r) => setTimeout(r, 60000));
+      const completedResponse = await request(TEST_URL)
+        .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+
+      expect(completedResponse.statusCode).toBe(200);
+      expect(completedResponse.body).toHaveProperty("status");
+      expect(completedResponse.body.status).toBe("completed");
+      expect(completedResponse.body).toHaveProperty("data");
+      expect(completedResponse.body.data[0]).toHaveProperty("content");
+      expect(completedResponse.body.data[0]).toHaveProperty("markdown");
+      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+      const urls = completedResponse.body.data.map(
+        (item: any) => item.metadata?.sourceURL
+      );
+      expect(urls.length).toBeGreaterThan(1);
+
+      // Check if all URLs have a maximum depth of 1
+      urls.forEach((url) => {
+        const depth = new URL(url).pathname.split("/").filter(Boolean).length;
+        expect(depth).toBeLessThanOrEqual(1);
+      });
+    }, 120000);
+
+    it("should return a successful response for a valid crawl job with includeHtml set to true option", async () => {
+      const crawlResponse = await request(TEST_URL)
+        .post("/v0/crawl")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({
+          url: "https://firecrawl.dev",
+          pageOptions: { includeHtml: true },
+        });
       expect(crawlResponse.statusCode).toBe(200);
 
       const response = await request(TEST_URL)
@@ -266,13 +329,13 @@ describe("E2E Tests for API Routes", () => {
       expect(response.body).toHaveProperty("status");
       expect(response.body.status).toBe("active");
 
-      // wait for 60 seconds
-      await new Promise((r) => setTimeout(r, 60000));
+      // wait for 30 seconds
+      await new Promise((r) => setTimeout(r, 30000));
       const completedResponse = await request(TEST_URL)
         .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
         .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
-      
+
       expect(completedResponse.statusCode).toBe(200);
       expect(completedResponse.body).toHaveProperty("status");
       expect(completedResponse.body.status).toBe("completed");
@@ -281,17 +344,14 @@ describe("E2E Tests for API Routes", () => {
       expect(completedResponse.body.data[0]).toHaveProperty("markdown");
       expect(completedResponse.body.data[0]).toHaveProperty("metadata");
 
-      const urls = completedResponse.body.data.map((item: any) => item.metadata?.sourceURL);
-      expect(urls.length).toBeGreaterThan(1);
-
-      // Check if all URLs have a maximum depth of 1
-      urls.forEach((url) => {
-        const depth = new URL(url).pathname.split('/').filter(Boolean).length;
-        expect(depth).toBeLessThanOrEqual(1);
-      });
-
-    }, 120000); // 120 seconds
-  });
+      // 120 seconds
+      expect(completedResponse.body.data[0]).toHaveProperty("html");
+      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+      expect(completedResponse.body.data[0].content).toContain("🔥 FireCrawl");
+      expect(completedResponse.body.data[0].markdown).toContain("FireCrawl");
+      expect(completedResponse.body.data[0].html).toContain("<h1");
+    }, 60000);
+  }); // 60 seconds
 
   describe("POST /v0/scrape with LLM Extraction", () => {
     it("should extract data using LLM extraction mode", async () => {
@@ -302,35 +362,33 @@ describe("E2E Tests for API Routes", () => {
         .send({
           url: "https://mendable.ai",
           pageOptions: {
-            onlyMainContent: true
+            onlyMainContent: true,
           },
           extractorOptions: {
             mode: "llm-extraction",
-            extractionPrompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
+            extractionPrompt:
+              "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
             extractionSchema: {
               type: "object",
               properties: {
                 company_mission: {
-                  type: "string"
+                  type: "string",
                 },
                 supports_sso: {
-                  type: "boolean"
+                  type: "boolean",
                 },
                 is_open_source: {
-                  type: "boolean"
-                }
type: "boolean", + }, }, - required: ["company_mission", "supports_sso", "is_open_source"] - } - } + required: ["company_mission", "supports_sso", "is_open_source"], + }, + }, }); - // Ensure that the job was successfully created before proceeding with LLM extraction expect(response.statusCode).toBe(200); - - // Assuming the LLM extraction object is available in the response body under `data.llm_extraction` let llmExtraction = response.body.data.llm_extraction; @@ -383,7 +441,6 @@ describe("E2E Tests for API Routes", () => { // } // }); - // // Print the response body to the console for debugging purposes // console.log("Response companies:", response.body.data.llm_extraction.companies); @@ -405,9 +462,6 @@ describe("E2E Tests for API Routes", () => { // }, 120000); // 120 secs // }); - - - describe("GET /is-production", () => { it("should return the production status", async () => { const response = await request(TEST_URL).get("/is-production"); diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts index 3d64f7f..3ba9213 100644 --- a/apps/api/src/controllers/crawl.ts +++ b/apps/api/src/controllers/crawl.ts @@ -35,7 +35,7 @@ export async function crawlController(req: Request, res: Response) { const mode = req.body.mode ?? "crawl"; const crawlerOptions = req.body.crawlerOptions ?? {}; - const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; + const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false }; if (mode === "single_urls" && !url.includes(",")) { try { diff --git a/apps/api/src/controllers/crawlPreview.ts b/apps/api/src/controllers/crawlPreview.ts index 569be33..d3e9afe 100644 --- a/apps/api/src/controllers/crawlPreview.ts +++ b/apps/api/src/controllers/crawlPreview.ts @@ -26,7 +26,7 @@ export async function crawlPreviewController(req: Request, res: Response) { const mode = req.body.mode ?? "crawl"; const crawlerOptions = req.body.crawlerOptions ?? {}; - const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; + const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false }; const job = await addWebScraperJob({ url: url, diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index 849500a..021a9d0 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -1,4 +1,4 @@ -import { ExtractorOptions } from './../lib/entities'; +import { ExtractorOptions, PageOptions } from './../lib/entities'; import { Request, Response } from "express"; import { WebScraperDataProvider } from "../scraper/WebScraper"; import { billTeam, checkTeamCredits } from "../services/billing/credit_billing"; @@ -13,8 +13,8 @@ export async function scrapeHelper( req: Request, team_id: string, crawlerOptions: any, - pageOptions: any, - extractorOptions: ExtractorOptions + pageOptions: PageOptions, + extractorOptions: ExtractorOptions, ): Promise<{ success: boolean; error?: string; @@ -39,7 +39,7 @@ export async function scrapeHelper( ...crawlerOptions, }, pageOptions: pageOptions, - extractorOptions: extractorOptions + extractorOptions: extractorOptions, }); const docs = await a.getDocuments(false); @@ -91,7 +91,7 @@ export async function scrapeController(req: Request, res: Response) { return res.status(status).json({ error }); } const crawlerOptions = req.body.crawlerOptions ?? {}; - const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; + const pageOptions = req.body.pageOptions ?? 
+  const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false };
   const extractorOptions = req.body.extractorOptions ?? { mode: "markdown" }
@@ -113,7 +113,7 @@ export async function scrapeController(req: Request, res: Response) {
       team_id,
       crawlerOptions,
       pageOptions,
-      extractorOptions
+      extractorOptions,
     );
     const endTime = new Date().getTime();
     const timeTakenInSeconds = (endTime - startTime) / 1000;
@@ -132,7 +132,7 @@ export async function scrapeController(req: Request, res: Response) {
     pageOptions: pageOptions,
     origin: origin,
     extractor_options: extractorOptions,
-    num_tokens: numTokens
+    num_tokens: numTokens,
   });
   return res.status(result.returnCode).json(result);
 } catch (error) {
diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts
index 1393922..d98c08d 100644
--- a/apps/api/src/controllers/search.ts
+++ b/apps/api/src/controllers/search.ts
@@ -13,7 +13,7 @@ export async function searchHelper(
   team_id: string,
   crawlerOptions: any,
   pageOptions: PageOptions,
-  searchOptions: SearchOptions
+  searchOptions: SearchOptions,
 ): Promise<{
   success: boolean;
   error?: string;
@@ -66,6 +66,7 @@ export async function searchHelper(
       ...pageOptions,
       onlyMainContent: pageOptions?.onlyMainContent ?? true,
       fetchPageContent: pageOptions?.fetchPageContent ?? true,
+      includeHtml: pageOptions?.includeHtml ?? false,
       fallback: false,
     },
   });
@@ -117,6 +118,7 @@ export async function searchController(req: Request, res: Response) {
   }
   const crawlerOptions = req.body.crawlerOptions ?? {};
   const pageOptions = req.body.pageOptions ?? {
+    includeHtml: false,
     onlyMainContent: true,
     fetchPageContent: true,
     fallback: false,
@@ -141,7 +143,7 @@ export async function searchController(req: Request, res: Response) {
       team_id,
       crawlerOptions,
       pageOptions,
-      searchOptions
+      searchOptions,
     );
     const endTime = new Date().getTime();
     const timeTakenInSeconds = (endTime - startTime) / 1000;
diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts
index 6a58de5..30bdfae 100644
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -12,9 +12,9 @@ export interface Progress {
 
 export type PageOptions = {
   onlyMainContent?: boolean;
+  includeHtml?: boolean;
   fallback?: boolean;
   fetchPageContent?: boolean;
-
 };
 
 export type ExtractorOptions = {
diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts
index 827eec5..189d500 100644
--- a/apps/api/src/main/runWebScraper.ts
+++ b/apps/api/src/main/runWebScraper.ts
@@ -26,7 +26,7 @@ export async function startWebScraperPipeline({
     onError: (error) => {
       job.moveToFailed(error);
     },
-    team_id: job.data.team_id,
+    team_id: job.data.team_id
   })) as { success: boolean; message: string; docs: Document[] };
 }
 export async function runWebScraper({
@@ -59,14 +59,14 @@ export async function runWebScraper({
       mode: mode,
       urls: [url],
       crawlerOptions: crawlerOptions,
-      pageOptions: pageOptions,
+      pageOptions: pageOptions
     });
   } else {
     await provider.setOptions({
       mode: mode,
       urls: url.split(","),
       crawlerOptions: crawlerOptions,
-      pageOptions: pageOptions,
+      pageOptions: pageOptions
     });
   }
   const docs = (await provider.getDocuments(false, (progress: Progress) => {
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index 38ff47b..d42d27b 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -46,7 +46,7 @@ export class WebScraperDataProvider {
       const batchUrls = urls.slice(i, i + this.concurrentRequests);
       await Promise.all(
         batchUrls.map(async (url, index) => {
-          const result = await scrapSingleUrl(url, true, this.pageOptions);
+          const result = await scrapSingleUrl(url, this.pageOptions);
           processedUrls++;
           if (inProgress) {
             inProgress({
@@ -144,6 +144,7 @@ export class WebScraperDataProvider {
       });
       return links.map(url => ({
         content: "",
+        html: this.pageOptions?.includeHtml ? "" : undefined,
         markdown: "",
         metadata: { sourceURL: url },
       }));
@@ -327,12 +328,9 @@ export class WebScraperDataProvider {
     this.limit = options.crawlerOptions?.limit ?? 10000;
     this.generateImgAltText = options.crawlerOptions?.generateImgAltText ?? false;
-    this.pageOptions = options.pageOptions ?? {onlyMainContent: false};
+    this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false };
     this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
     this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
-
-    console.log("maxDepth:", this.maxCrawledDepth, options.crawlerOptions?.maxDepth);
-
     //! @nicolas, for some reason this was being injected and breakign everything. Don't have time to find source of the issue so adding this check
     this.excludes = this.excludes.filter((item) => item !== "");
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index fab54bd..a67ce31 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -103,8 +103,7 @@ export async function scrapWithPlaywright(url: string): Promise<string> {
 
 export async function scrapSingleUrl(
   urlToScrap: string,
-  toMarkdown: boolean = true,
-  pageOptions: PageOptions = { onlyMainContent: true }
+  pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false },
 ): Promise<Document> {
   urlToScrap = urlToScrap.trim();
 
@@ -172,7 +171,7 @@ export async function scrapSingleUrl(
     //* TODO: add an optional to return markdown or structured/extracted content
     let cleanedHtml = removeUnwantedElements(text, pageOptions);
-    
+
     return [await parseMarkdown(cleanedHtml), text];
   };
 
@@ -193,6 +192,7 @@ export async function scrapSingleUrl(
         url: urlToScrap,
         content: text,
         markdown: text,
+        html: pageOptions.includeHtml ? html : undefined,
         metadata: { ...metadata, sourceURL: urlToScrap },
       } as Document;
     }
@@ -216,6 +216,7 @@ export async function scrapSingleUrl(
     return {
       content: text,
      markdown: text,
+      html: pageOptions.includeHtml ? html : undefined,
       metadata: { ...metadata, sourceURL: urlToScrap },
     } as Document;
   } catch (error) {
@@ -223,6 +224,7 @@ export async function scrapSingleUrl(
     return {
       content: "",
       markdown: "",
+      html: "",
       metadata: { sourceURL: urlToScrap },
     } as Document;
   }
diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts
index c1858f1..b9b5463 100644
--- a/apps/api/src/types.ts
+++ b/apps/api/src/types.ts
@@ -40,7 +40,7 @@ export interface FirecrawlJob {
   pageOptions?: any;
   origin: string;
   extractor_options?: ExtractorOptions,
-  num_tokens?: number
+  num_tokens?: number,
 }
 
 export enum RateLimiterMode {
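
For reference, a minimal usage sketch (not part of the patch itself) of how a client might exercise the new `includeHtml` page option once this diff is applied. The route, bearer-token auth, request body shape, and response fields all mirror the e2e tests above; the base URL and `TEST_API_KEY` are the test-environment placeholders, not production values:

```ts
// Hypothetical client call against a locally running API instance,
// mirroring the e2e test setup above.
const res = await fetch("http://127.0.0.1:3002/v0/scrape", {
  method: "POST",
  headers: {
    Authorization: `Bearer ${process.env.TEST_API_KEY}`,
    "Content-Type": "application/json",
  },
  body: JSON.stringify({
    url: "https://firecrawl.dev",
    pageOptions: { includeHtml: true }, // defaults to false throughout this patch
  }),
});
const { data } = await res.json();
console.log(data.markdown); // always present
console.log(data.html); // present only when includeHtml is true
```

Note the design choice in `single_url.ts`: on the success paths, when `includeHtml` is false the `html` field is set to `undefined` rather than `""`, so the key is dropped from the serialized JSON response entirely — which is exactly what the `not.toHaveProperty("html")` assertion in the scrape test verifies.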