From 509250c4ef6fe41d60f6d5ad8ed2a8a6495c6bf2 Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Mon, 6 May 2024 19:45:56 -0300
Subject: [PATCH] changed to `includeHtml`

---
 .../src/__tests__/e2e_withAuth/index.test.ts  | 44 +++++++++++--------
 apps/api/src/controllers/crawl.ts             |  5 ++-
 apps/api/src/controllers/crawlPreview.ts      |  4 +-
 apps/api/src/controllers/scrape.ts            | 15 ++++---
 apps/api/src/controllers/search.ts            | 10 +++--
 apps/api/src/lib/entities.ts                  |  2 +-
 apps/api/src/main/runWebScraper.ts            |  5 +++
 apps/api/src/scraper/WebScraper/crawler.ts    |  4 ++
 apps/api/src/scraper/WebScraper/index.ts      |  9 ++--
 apps/api/src/scraper/WebScraper/single_url.ts | 17 +++----
 apps/api/src/types.ts                         |  4 +-
 11 files changed, 78 insertions(+), 41 deletions(-)

diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
index 2e26230..e0f725e 100644
--- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
@@ -79,22 +79,25 @@ describe("E2E Tests for API Routes", () => {
     expect(response.body.data).toHaveProperty("content");
     expect(response.body.data).toHaveProperty("markdown");
     expect(response.body.data).toHaveProperty("metadata");
+    expect(response.body.data).not.toHaveProperty("html");
     expect(response.body.data.content).toContain("🔥 FireCrawl");
   }, 30000); // 30 seconds timeout

-  it("should return a successful response with a valid API key and toMarkdown set to false", async () => {
+  it("should return a successful response with a valid API key and includeHtml set to true", async () => {
     const response = await request(TEST_URL)
       .post("/v0/scrape")
       .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
       .set("Content-Type", "application/json")
-      .send({ url: "https://firecrawl.dev", pageOptions: { toMarkdown: false } });
+      .send({ url: "https://firecrawl.dev", includeHtml: true });
     expect(response.statusCode).toBe(200);
     expect(response.body).toHaveProperty("data");
     expect(response.body.data).toHaveProperty("content");
-    expect(response.body.data).not.toHaveProperty("markdown");
+    expect(response.body.data).toHaveProperty("markdown");
+    expect(response.body.data).toHaveProperty("html");
     expect(response.body.data).toHaveProperty("metadata");
-    expect(response.body.data.content).toContain("FireCrawl");
-    expect(response.body.data.content).toContain("<h1");
+    expect(response.body.data.content).toContain("🔥 FireCrawl");
+    expect(response.body.data.markdown).toContain("🔥 FireCrawl");
+    expect(response.body.data.html).toContain("<h1");
   }, 30000); // 30 seconds timeout
@@ -240,16 +243,17 @@ describe("E2E Tests for API Routes", () => {
     expect(response.statusCode).toBe(401);
   });

-  it("should return an error for a blocklisted URL", async () => {
-    const blocklistedUrl = "https://instagram.com/fake-test";
-    const response = await request(TEST_URL)
-      .post("/v0/crawlWebsitePreview")
-      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
-      .set("Content-Type", "application/json")
-      .send({ url: blocklistedUrl });
-    expect(response.statusCode).toBe(403);
-    expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
-  });
+  // it("should return an error for a blocklisted URL", async () => {
+  //   const blocklistedUrl = "https://instagram.com/fake-test";
+  //   const response = await request(TEST_URL)
+  //     .post("/v0/crawlWebsitePreview")
+  //     .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+  //     .set("Content-Type", "application/json")
+  //     .send({ url: blocklistedUrl });
+  //   // is returning 429 instead of 403
+  //   expect(response.statusCode).toBe(403);
+  //   expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
+  // });

   it("should return a successful response with a valid API key", async () => {
     const response = await request(TEST_URL)
@@ -271,7 +275,7 @@ describe("E2E Tests for API Routes", () => {
       .post("/v0/crawl")
       .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
       .set("Content-Type", "application/json")
-      .send({ url: "https://firecrawl.dev", pageOptions: { toMarkdown: false } });
+      .send({ url: "https://firecrawl.dev", includeHtml: true });
     expect(crawlResponse.statusCode).toBe(200);

     const response = await request(TEST_URL)
@@ -292,12 +296,16 @@ describe("E2E Tests for API Routes", () => {
     expect(completedResponse.body.status).toBe("completed");
     expect(completedResponse.body).toHaveProperty("data");
     expect(completedResponse.body.data[0]).toHaveProperty("content");
-    expect(completedResponse.body.data[0]).not.toHaveProperty("markdown");
+    expect(completedResponse.body.data[0]).toHaveProperty("markdown");
+    expect(completedResponse.body.data[0]).toHaveProperty("html");
     expect(completedResponse.body.data[0]).toHaveProperty("metadata");
     expect(completedResponse.body.data[0].content).toContain(
+      "🔥 FireCrawl"
+    );
+    expect(completedResponse.body.data[0].markdown).toContain(
       "FireCrawl"
     );
-    expect(completedResponse.body.data[0].content).toContain(
+    expect(completedResponse.body.data[0].html).toContain(
       "<h1"
     );
   }, 60000); // 60 seconds
diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts
--- a/apps/api/src/controllers/crawl.ts
+++ b/apps/api/src/controllers/crawl.ts
@@ -61,3 +61,5 @@ export async function crawlController(req: Request, res: Response) {
     const mode = req.body.mode ?? "crawl";
     const crawlerOptions = req.body.crawlerOptions ?? {};
-    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, toMarkdown: true };
+    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
+    const includeHtml = req.body.includeHtml ?? false;
+
@@ -73,6 +75,7 @@ export async function crawlController(req: Request, res: Response) {
       team_id: team_id,
       pageOptions: pageOptions,
       origin: req.body.origin ?? "api",
+      includeHtml: includeHtml,
     });

     res.json({ jobId: job.id });
diff --git a/apps/api/src/controllers/crawlPreview.ts b/apps/api/src/controllers/crawlPreview.ts
index 0b4a08c..2b1b676 100644
--- a/apps/api/src/controllers/crawlPreview.ts
+++ b/apps/api/src/controllers/crawlPreview.ts
@@ -26,7 +26,8 @@ export async function crawlPreviewController(req: Request, res: Response) {
     const mode = req.body.mode ?? "crawl";
     const crawlerOptions = req.body.crawlerOptions ?? {};
-    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, toMarkdown: true};
+    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
+    const includeHtml = req.body.includeHtml ?? false;

     const job = await addWebScraperJob({
       url: url,
@@ -35,6 +36,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
       team_id: "preview",
       pageOptions: pageOptions,
       origin: "website-preview",
+      includeHtml: includeHtml,
     });

     res.json({ jobId: job.id });
diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts
index e03c013..5bd61a5 100644
--- a/apps/api/src/controllers/scrape.ts
+++ b/apps/api/src/controllers/scrape.ts
@@ -14,7 +14,8 @@ export async function scrapeHelper(
   team_id: string,
   crawlerOptions: any,
   pageOptions: PageOptions,
-  extractorOptions: ExtractorOptions
+  extractorOptions: ExtractorOptions,
+  includeHtml: boolean = false
 ): Promise<{
   success: boolean;
   error?: string;
@@ -39,7 +40,8 @@ export async function scrapeHelper(
       ...crawlerOptions,
     },
     pageOptions: pageOptions,
-    extractorOptions: extractorOptions
+    extractorOptions: extractorOptions,
+    includeHtml: includeHtml
   });

   const docs = await a.getDocuments(false);
@@ -91,11 +93,12 @@ export async function scrapeController(req: Request, res: Response) {
     return res.status(status).json({ error });
   }
   const crawlerOptions = req.body.crawlerOptions ?? {};
-  const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, toMarkdown: true };
+  const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
   const extractorOptions = req.body.extractorOptions ?? {
     mode: "markdown"
   }
   const origin = req.body.origin ?? "api";
+  const includeHtml = req.body.includeHtml ?? false;

   try {
     const { success: creditsCheckSuccess, message: creditsCheckMessage } =
@@ -113,7 +116,8 @@ export async function scrapeController(req: Request, res: Response) {
       team_id,
       crawlerOptions,
       pageOptions,
-      extractorOptions
+      extractorOptions,
+      includeHtml
     );
     const endTime = new Date().getTime();
     const timeTakenInSeconds = (endTime - startTime) / 1000;
@@ -132,7 +136,8 @@ export async function scrapeController(req: Request, res: Response) {
       pageOptions: pageOptions,
       origin: origin,
       extractor_options: extractorOptions,
-      num_tokens: numTokens
+      num_tokens: numTokens,
+      includeHtml: includeHtml
     });
     return res.status(result.returnCode).json(result);
   } catch (error) {
diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts
index 6529edc..314e475 100644
--- a/apps/api/src/controllers/search.ts
+++ b/apps/api/src/controllers/search.ts
@@ -13,7 +13,8 @@ export async function searchHelper(
   team_id: string,
   crawlerOptions: any,
   pageOptions: PageOptions,
-  searchOptions: SearchOptions
+  searchOptions: SearchOptions,
+  includeHtml: boolean = false
 ): Promise<{
   success: boolean;
   error?: string;
@@ -59,6 +60,7 @@ export async function searchHelper(
   await a.setOptions({
     mode: "single_urls",
     urls: res.map((r) => r.url).slice(0, searchOptions.limit ?? 7),
+    includeHtml,
     crawlerOptions: {
       ...crawlerOptions,
     },
@@ -66,7 +68,6 @@ export async function searchHelper(
       ...pageOptions,
       onlyMainContent: pageOptions?.onlyMainContent ?? true,
       fetchPageContent: pageOptions?.fetchPageContent ?? true,
-      toMarkdown: pageOptions?.toMarkdown ?? true,
       fallback: false,
     },
   });
@@ -125,6 +126,7 @@ export async function searchController(req: Request, res: Response) {
   const origin = req.body.origin ?? "api";

   const searchOptions = req.body.searchOptions ?? { limit: 7 };
+  const includeHtml = req.body.includeHtml ?? false;

   try {
     const { success: creditsCheckSuccess, message: creditsCheckMessage } =
@@ -142,7 +144,8 @@ export async function searchController(req: Request, res: Response) {
       team_id,
       crawlerOptions,
       pageOptions,
-      searchOptions
+      searchOptions,
+      includeHtml
     );
     const endTime = new Date().getTime();
     const timeTakenInSeconds = (endTime - startTime) / 1000;
@@ -158,6 +161,7 @@ export async function searchController(req: Request, res: Response) {
       crawlerOptions: crawlerOptions,
       pageOptions: pageOptions,
       origin: origin,
+      includeHtml,
     });
     return res.status(result.returnCode).json(result);
   } catch (error) {
diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts
index 6150cdd..b6340d8 100644
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -12,7 +12,6 @@ export interface Progress {

 export type PageOptions = {
   onlyMainContent?: boolean;
-  toMarkdown?: boolean;
   fallback?: boolean;
   fetchPageContent?: boolean;
 };
@@ -47,6 +46,7 @@ export type WebScraperOptions = {
   pageOptions?: PageOptions;
   extractorOptions?: ExtractorOptions;
   concurrentRequests?: number;
+  includeHtml?: boolean;
 };

 export interface DocumentUrl {
diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts
index 827eec5..798bb65 100644
--- a/apps/api/src/main/runWebScraper.ts
+++ b/apps/api/src/main/runWebScraper.ts
@@ -27,6 +27,7 @@ export async function startWebScraperPipeline({
       job.moveToFailed(error);
     },
     team_id: job.data.team_id,
+    includeHtml: job.data.includeHtml,
   })) as { success: boolean; message: string; docs: Document[] };
 }
 export async function runWebScraper({
@@ -38,6 +39,7 @@ export async function runWebScraper({
   onSuccess,
   onError,
   team_id,
+  includeHtml = false,
 }: {
   url: string;
   mode: "crawl" | "single_urls" | "sitemap";
@@ -47,6 +49,7 @@ export async function runWebScraper({
   onSuccess: (result: any) => void;
   onError: (error: any) => void;
   team_id: string;
+  includeHtml?: boolean;
 }): Promise<{
   success: boolean;
   message: string;
@@ -60,6 +63,7 @@ export async function runWebScraper({
       urls: [url],
       crawlerOptions: crawlerOptions,
       pageOptions: pageOptions,
+      includeHtml: includeHtml,
     });
   } else {
     await provider.setOptions({
@@ -67,6 +71,7 @@ export async function runWebScraper({
       urls: url.split(","),
       crawlerOptions: crawlerOptions,
       pageOptions: pageOptions,
+      includeHtml: includeHtml,
     });
   }
   const docs = (await provider.getDocuments(false, (progress: Progress) => {
diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index 23cb629..d3877b3 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -19,6 +19,7 @@ export class WebCrawler {
   private robotsTxtUrl: string;
   private robots: any;
   private generateImgAltText: boolean;
+  private includeHtml: boolean;

   constructor({
     initialUrl,
@@ -27,6 +28,7 @@ export class WebCrawler {
     maxCrawledLinks,
     limit = 10000,
     generateImgAltText = false,
+    includeHtml = false,
   }: {
     initialUrl: string;
     includes?: string[];
@@ -34,6 +36,7 @@ export class WebCrawler {
     maxCrawledLinks?: number;
     limit?: number;
     generateImgAltText?: boolean;
+    includeHtml?: boolean;
   }) {
     this.initialUrl = initialUrl;
     this.baseUrl = new URL(initialUrl).origin;
@@ -45,6 +48,7 @@ export class WebCrawler {
     // Deprecated, use limit instead
     this.maxCrawledLinks = maxCrawledLinks ?? limit;
     this.generateImgAltText = generateImgAltText ?? false;
+    this.includeHtml = includeHtml ?? false;
   }
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index 2cfa84e..2a3916b 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -24,6 +24,7 @@ export class WebScraperDataProvider {
   private extractorOptions?: ExtractorOptions;
   private replaceAllPathsWithAbsolutePaths?: boolean = false;
   private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" = "gpt-4-turbo";
+  private includeHtml: boolean = false;

   authorize(): void {
     throw new Error("Method not implemented.");
@@ -45,7 +46,7 @@ export class WebScraperDataProvider {
       const batchUrls = urls.slice(i, i + this.concurrentRequests);
       await Promise.all(
         batchUrls.map(async (url, index) => {
-          const result = await scrapSingleUrl(url, this.pageOptions?.toMarkdown ?? true, this.pageOptions);
+          const result = await scrapSingleUrl(url, this.pageOptions, this.includeHtml);
           processedUrls++;
           if (inProgress) {
             inProgress({
@@ -108,6 +109,7 @@ export class WebScraperDataProvider {
       maxCrawledLinks: this.maxCrawledLinks,
       limit: this.limit,
       generateImgAltText: this.generateImgAltText,
+      includeHtml: this.includeHtml,
     });
     let links = await crawler.start(inProgress, 5, this.limit);
     if (this.returnOnlyUrls) {
@@ -142,6 +144,7 @@ export class WebScraperDataProvider {
     });
     return links.map(url => ({
       content: "",
+      html: this.includeHtml ? "" : undefined,
       markdown: "",
       metadata: { sourceURL: url },
     }));
@@ -323,10 +326,10 @@ export class WebScraperDataProvider {
     this.limit = options.crawlerOptions?.limit ?? 10000;
     this.generateImgAltText =
       options.crawlerOptions?.generateImgAltText ?? false;
-    this.pageOptions = options.pageOptions ?? {onlyMainContent: false, toMarkdown: true};
+    this.pageOptions = options.pageOptions ?? {onlyMainContent: false };
     this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
     this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
-
+    this.includeHtml = options?.includeHtml ?? false;
     //! @nicolas, for some reason this was being injected and breakign everything. Don't have time to find source of the issue so adding this check
     this.excludes = this.excludes.filter((item) => item !== "");
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index b7fa07a..4d071db 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -103,8 +103,8 @@ export async function scrapWithPlaywright(url: string): Promise<string> {

 export async function scrapSingleUrl(
   urlToScrap: string,
-  toMarkdown: boolean = true,
-  pageOptions: PageOptions = { onlyMainContent: true }
+  pageOptions: PageOptions = { onlyMainContent: true },
+  includeHtml: boolean = false
 ): Promise<Document> {
   urlToScrap = urlToScrap.trim();

@@ -172,9 +172,7 @@ export async function scrapSingleUrl(
     //* TODO: add an optional to return markdown or structured/extracted content
     let cleanedHtml = removeUnwantedElements(text, pageOptions);

-    if (toMarkdown === false) {
-      return [cleanedHtml, text];
-    }
+
     return [await parseMarkdown(cleanedHtml), text];
   };

@@ -194,7 +192,8 @@ export async function scrapSingleUrl(
       return {
         url: urlToScrap,
         content: text,
-        markdown: pageOptions.toMarkdown === false ? undefined : text,
+        markdown: text,
+        html: includeHtml ? html : undefined,
         metadata: { ...metadata, sourceURL: urlToScrap },
       } as Document;
     }
@@ -217,14 +216,16 @@ export async function scrapSingleUrl(

     return {
       content: text,
-      markdown: pageOptions.toMarkdown === false ? undefined : text,
+      markdown: text,
+      html: includeHtml ? html : undefined,
       metadata: { ...metadata, sourceURL: urlToScrap },
     } as Document;
   } catch (error) {
     console.error(`Error: ${error} - Failed to fetch URL: ${urlToScrap}`);
     return {
       content: "",
-      markdown: pageOptions.toMarkdown === false ? undefined : "",
+      markdown: "",
+      html: "",
       metadata: { sourceURL: urlToScrap },
     } as Document;
   }
diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts
index c1858f1..3fbdcdd 100644
--- a/apps/api/src/types.ts
+++ b/apps/api/src/types.ts
@@ -25,6 +25,7 @@ export interface WebScraperOptions {
   pageOptions: any;
   team_id: string;
   origin?: string;
+  includeHtml?: boolean;
 }

 export interface FirecrawlJob {
@@ -40,7 +41,8 @@ export interface FirecrawlJob {
   pageOptions?: any;
   origin: string;
   extractor_options?: ExtractorOptions,
-  num_tokens?: number
+  num_tokens?: number,
+  includeHtml?: boolean;
 }

 export enum RateLimiterMode {
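
Usage note: with this patch, `includeHtml` moves out of `pageOptions` and becomes a top-level request field; `markdown` is now always populated (the old `toMarkdown: false` path is gone) while `html` stays opt-in. A minimal TypeScript sketch of a client call follows — the base URL and the `scrapeWithHtml` helper name are illustrative assumptions, not part of this patch:

// sketch.ts — assumes Node 18+ (global fetch) and a Firecrawl API
// instance reachable at BASE_URL; adjust both for your deployment.
const BASE_URL = "http://localhost:3002"; // assumed local instance

async function scrapeWithHtml(url: string) {
  const res = await fetch(`${BASE_URL}/v0/scrape`, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${process.env.TEST_API_KEY}`,
    },
    // After this patch, includeHtml is a top-level field, not a pageOptions key.
    body: JSON.stringify({ url, includeHtml: true }),
  });
  const { data } = await res.json();
  // With includeHtml: true the returned document carries content, markdown,
  // and html; without the flag, html is undefined (the tests assert its absence).
  return { markdown: data.markdown, html: data.html };
}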