From e1f52c538fd8852fe977303fc929077b77faf77b Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Tue, 7 May 2024 13:40:24 -0300
Subject: [PATCH] nested includeHtml inside pageOptions

---
 apps/api/src/__tests__/e2e_withAuth/index.test.ts |  6 +++---
 apps/api/src/controllers/crawl.ts                 |  5 +----
 apps/api/src/controllers/crawlPreview.ts          |  4 +---
 apps/api/src/controllers/scrape.ts                |  7 +------
 apps/api/src/controllers/search.ts                |  7 ++-----
 apps/api/src/lib/entities.ts                      |  4 ++--
 apps/api/src/main/runWebScraper.ts                | 11 +++-------
 apps/api/src/scraper/WebScraper/crawler.ts        |  4 ----
 apps/api/src/scraper/WebScraper/index.ts          |  9 +++------
 apps/api/src/scraper/WebScraper/single_url.ts     |  7 +++----
 apps/api/src/types.ts                             |  2 --
 11 files changed, 19 insertions(+), 47 deletions(-)

diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
index e0f725e..644ad36 100644
--- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
@@ -88,7 +88,7 @@ describe("E2E Tests for API Routes", () => {
         .post("/v0/scrape")
         .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
         .set("Content-Type", "application/json")
-        .send({ url: "https://firecrawl.dev", includeHtml: true });
+        .send({ url: "https://firecrawl.dev", pageOptions: { includeHtml: true }});
       expect(response.statusCode).toBe(200);
       expect(response.body).toHaveProperty("data");
       expect(response.body.data).toHaveProperty("content");
@@ -270,12 +270,12 @@ describe("E2E Tests for API Routes", () => {
       );
     }, 60000); // 60 seconds

-  it("should return a successful response for a valid crawl job with toMarkdown set to false option", async () => {
+  it("should return a successful response for a valid crawl job with includeHtml set to true option", async () => {
     const crawlResponse = await request(TEST_URL)
       .post("/v0/crawl")
       .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
       .set("Content-Type", "application/json")
-      .send({ url: "https://firecrawl.dev", includeHtml: true });
+      .send({ url: "https://firecrawl.dev", pageOptions: { includeHtml: true } });
     expect(crawlResponse.statusCode).toBe(200);

     const response = await request(TEST_URL)
diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts
index d432092..3ba9213 100644
--- a/apps/api/src/controllers/crawl.ts
+++ b/apps/api/src/controllers/crawl.ts
@@ -35,8 +35,7 @@ export async function crawlController(req: Request, res: Response) {

     const mode = req.body.mode ?? "crawl";
     const crawlerOptions = req.body.crawlerOptions ?? {};
-    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
-    const includeHtml = req.body.includeHtml || false;
+    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false };

     if (mode === "single_urls" && !url.includes(",")) {
       try {
@@ -48,7 +47,6 @@ export async function crawlController(req: Request, res: Response) {
             returnOnlyUrls: true,
           },
           pageOptions: pageOptions,
-          includeHtml: includeHtml,
         });

         const docs = await a.getDocuments(false, (progress) => {
@@ -75,7 +73,6 @@ export async function crawlController(req: Request, res: Response) {
       team_id: team_id,
       pageOptions: pageOptions,
       origin: req.body.origin ?? "api",
-      includeHtml: includeHtml,
     });

     res.json({ jobId: job.id });
diff --git a/apps/api/src/controllers/crawlPreview.ts b/apps/api/src/controllers/crawlPreview.ts
index 2b1b676..d3e9afe 100644
--- a/apps/api/src/controllers/crawlPreview.ts
+++ b/apps/api/src/controllers/crawlPreview.ts
@@ -26,8 +26,7 @@ export async function crawlPreviewController(req: Request, res: Response) {

     const mode = req.body.mode ?? "crawl";
     const crawlerOptions = req.body.crawlerOptions ?? {};
-    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
-    const includeHtml = req.body.includeHtml ?? false;
+    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false };

     const job = await addWebScraperJob({
       url: url,
@@ -36,7 +35,6 @@ export async function crawlPreviewController(req: Request, res: Response) {
       team_id: "preview",
       pageOptions: pageOptions,
       origin: "website-preview",
-      includeHtml: includeHtml,
     });

     res.json({ jobId: job.id });
diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts
index 5bd61a5..021a9d0 100644
--- a/apps/api/src/controllers/scrape.ts
+++ b/apps/api/src/controllers/scrape.ts
@@ -15,7 +15,6 @@ export async function scrapeHelper(
   crawlerOptions: any,
   pageOptions: PageOptions,
   extractorOptions: ExtractorOptions,
-  includeHtml: boolean = false
 ): Promise<{
   success: boolean;
   error?: string;
@@ -41,7 +40,6 @@ export async function scrapeHelper(
     },
     pageOptions: pageOptions,
     extractorOptions: extractorOptions,
-    includeHtml: includeHtml
   });

   const docs = await a.getDocuments(false);
@@ -93,12 +91,11 @@ export async function scrapeController(req: Request, res: Response) {
     return res.status(status).json({ error });
   }
   const crawlerOptions = req.body.crawlerOptions ?? {};
-  const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
+  const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false };
   const extractorOptions = req.body.extractorOptions ?? {
     mode: "markdown"
   }
   const origin = req.body.origin ?? "api";
-  const includeHtml = req.body.includeHtml ?? false;

   try {
     const { success: creditsCheckSuccess, message: creditsCheckMessage } =
@@ -117,7 +114,6 @@ export async function scrapeController(req: Request, res: Response) {
       crawlerOptions,
       pageOptions,
       extractorOptions,
-      includeHtml
     );
     const endTime = new Date().getTime();
     const timeTakenInSeconds = (endTime - startTime) / 1000;
@@ -137,7 +133,6 @@ export async function scrapeController(req: Request, res: Response) {
       origin: origin,
       extractor_options: extractorOptions,
       num_tokens: numTokens,
-      includeHtml: includeHtml
     });
     return res.status(result.returnCode).json(result);
   } catch (error) {
diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts
index 314e475..d98c08d 100644
--- a/apps/api/src/controllers/search.ts
+++ b/apps/api/src/controllers/search.ts
@@ -14,7 +14,6 @@ export async function searchHelper(
   crawlerOptions: any,
   pageOptions: PageOptions,
   searchOptions: SearchOptions,
-  includeHtml: boolean = false
 ): Promise<{
   success: boolean;
   error?: string;
@@ -60,7 +59,6 @@ export async function searchHelper(
   await a.setOptions({
     mode: "single_urls",
     urls: res.map((r) => r.url).slice(0, searchOptions.limit ?? 7),
-    includeHtml,
     crawlerOptions: {
       ...crawlerOptions,
     },
@@ -68,6 +66,7 @@ export async function searchHelper(
       ...pageOptions,
       onlyMainContent: pageOptions?.onlyMainContent ?? true,
       fetchPageContent: pageOptions?.fetchPageContent ?? true,
+      includeHtml: pageOptions?.includeHtml ?? false,
       fallback: false,
     },
   });
@@ -119,6 +118,7 @@ export async function searchController(req: Request, res: Response) {
   }
   const crawlerOptions = req.body.crawlerOptions ?? {};
   const pageOptions = req.body.pageOptions ?? {
+    includeHtml: false,
     onlyMainContent: true,
     fetchPageContent: true,
     fallback: false,
@@ -126,7 +126,6 @@ export async function searchController(req: Request, res: Response) {
   };
   const origin = req.body.origin ?? "api";
   const searchOptions = req.body.searchOptions ?? { limit: 7 };
-  const includeHtml = req.body.includeHtml ?? false;

   try {
     const { success: creditsCheckSuccess, message: creditsCheckMessage } =
@@ -145,7 +144,6 @@ export async function searchController(req: Request, res: Response) {
       crawlerOptions,
       pageOptions,
       searchOptions,
-      includeHtml
     );
     const endTime = new Date().getTime();
     const timeTakenInSeconds = (endTime - startTime) / 1000;
@@ -161,7 +159,6 @@ export async function searchController(req: Request, res: Response) {
       crawlerOptions: crawlerOptions,
       pageOptions: pageOptions,
       origin: origin,
-      includeHtml,
     });
     return res.status(result.returnCode).json(result);
   } catch (error) {
diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts
index b6340d8..0a6a90e 100644
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -12,8 +12,9 @@ export interface Progress {

 export type PageOptions = {
   onlyMainContent?: boolean;
+  includeHtml?: boolean;
   fallback?: boolean;
-  fetchPageContent?: boolean;
+  fetchPageContent?: boolean;
 };

 export type ExtractorOptions = {
@@ -46,7 +47,6 @@ export type WebScraperOptions = {
   pageOptions?: PageOptions;
   extractorOptions?: ExtractorOptions;
   concurrentRequests?: number;
-  includeHtml?: boolean;
 };

 export interface DocumentUrl {
diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts
index 798bb65..189d500 100644
--- a/apps/api/src/main/runWebScraper.ts
+++ b/apps/api/src/main/runWebScraper.ts
@@ -26,8 +26,7 @@ export async function startWebScraperPipeline({
     onError: (error) => {
       job.moveToFailed(error);
     },
-    team_id: job.data.team_id,
-    includeHtml: job.data.includeHtml,
+    team_id: job.data.team_id
   })) as { success: boolean; message: string; docs: Document[] };
 }
 export async function runWebScraper({
@@ -39,7 +38,6 @@ export async function runWebScraper({
   onSuccess,
   onError,
   team_id,
-  includeHtml = false,
 }: {
   url: string;
   mode: "crawl" | "single_urls" | "sitemap";
@@ -49,7 +47,6 @@ export async function runWebScraper({
   onSuccess: (result: any) => void;
   onError: (error: any) => void;
   team_id: string;
-  includeHtml?: boolean;
 }): Promise<{
   success: boolean;
   message: string;
@@ -62,16 +59,14 @@ export async function runWebScraper({
         mode: mode,
         urls: [url],
         crawlerOptions: crawlerOptions,
-        pageOptions: pageOptions,
-        includeHtml: includeHtml,
+        pageOptions: pageOptions
       });
     } else {
       await provider.setOptions({
         mode: mode,
         urls: url.split(","),
         crawlerOptions: crawlerOptions,
-        pageOptions: pageOptions,
-        includeHtml: includeHtml,
+        pageOptions: pageOptions
       });
     }
     const docs = (await provider.getDocuments(false, (progress: Progress) => {
diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index d3877b3..23cb629 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -19,7 +19,6 @@ export class WebCrawler {
   private robotsTxtUrl: string;
   private robots: any;
   private generateImgAltText: boolean;
-  private includeHtml: boolean;

   constructor({
     initialUrl,
@@ -28,7 +27,6 @@ export class WebCrawler {
     maxCrawledLinks,
     limit = 10000,
     generateImgAltText = false,
-    includeHtml = false,
   }: {
     initialUrl: string;
     includes?: string[];
@@ -36,7 +34,6 @@ export class WebCrawler {
     maxCrawledLinks?: number;
     limit?: number;
     generateImgAltText?: boolean;
-    includeHtml?: boolean;
   }) {
     this.initialUrl = initialUrl;
     this.baseUrl = new URL(initialUrl).origin;
@@ -48,7 +45,6 @@ export class WebCrawler {
     // Deprecated, use limit instead
     this.maxCrawledLinks = maxCrawledLinks ?? limit;
     this.generateImgAltText = generateImgAltText ?? false;
-    this.includeHtml = includeHtml ?? false;
   }

diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index 2a3916b..ed49f1d 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -24,7 +24,6 @@ export class WebScraperDataProvider {
   private extractorOptions?: ExtractorOptions;
   private replaceAllPathsWithAbsolutePaths?: boolean = false;
   private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" = "gpt-4-turbo";
-  private includeHtml: boolean = false;

   authorize(): void {
     throw new Error("Method not implemented.");
@@ -46,7 +45,7 @@ export class WebScraperDataProvider {
       const batchUrls = urls.slice(i, i + this.concurrentRequests);
       await Promise.all(
         batchUrls.map(async (url, index) => {
-          const result = await scrapSingleUrl(url, this.pageOptions, this.includeHtml);
+          const result = await scrapSingleUrl(url, this.pageOptions);
           processedUrls++;
           if (inProgress) {
             inProgress({
@@ -109,7 +108,6 @@ export class WebScraperDataProvider {
       maxCrawledLinks: this.maxCrawledLinks,
       limit: this.limit,
       generateImgAltText: this.generateImgAltText,
-      includeHtml: this.includeHtml,
     });
     let links = await crawler.start(inProgress, 5, this.limit);
     if (this.returnOnlyUrls) {
@@ -144,7 +142,7 @@ export class WebScraperDataProvider {
       });
       return links.map(url => ({
         content: "",
-        html: this.includeHtml ? "" : undefined,
+        html: this.pageOptions?.includeHtml ? "" : undefined,
         markdown: "",
         metadata: { sourceURL: url },
       }));
@@ -326,10 +324,9 @@ export class WebScraperDataProvider {
     this.limit = options.crawlerOptions?.limit ?? 10000;
     this.generateImgAltText =
       options.crawlerOptions?.generateImgAltText ?? false;
-    this.pageOptions = options.pageOptions ?? {onlyMainContent: false };
+    this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false };
     this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
     this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
-    this.includeHtml = options?.includeHtml ?? false;

     //! @nicolas, for some reason this was being injected and breakign everything. Don't have time to find source of the issue so adding this check
     this.excludes = this.excludes.filter((item) => item !== "");
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index 4d071db..a67ce31 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -103,8 +103,7 @@ export async function scrapWithPlaywright(url: string): Promise<string> {

 export async function scrapSingleUrl(
   urlToScrap: string,
-  pageOptions: PageOptions = { onlyMainContent: true },
-  includeHtml: boolean = false
+  pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false },
 ): Promise<Document> {
   urlToScrap = urlToScrap.trim();

@@ -193,7 +192,7 @@ export async function scrapSingleUrl(
       url: urlToScrap,
       content: text,
       markdown: text,
-      html: includeHtml ? html : undefined,
+      html: pageOptions.includeHtml ? html : undefined,
       metadata: { ...metadata, sourceURL: urlToScrap },
     } as Document;
   }
@@ -217,7 +216,7 @@ export async function scrapSingleUrl(
     return {
       content: text,
       markdown: text,
-      html: includeHtml ? html : undefined,
+      html: pageOptions.includeHtml ? html : undefined,
       metadata: { ...metadata, sourceURL: urlToScrap },
     } as Document;
   } catch (error) {
diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts
index 3fbdcdd..b9b5463 100644
--- a/apps/api/src/types.ts
+++ b/apps/api/src/types.ts
@@ -25,7 +25,6 @@ export interface WebScraperOptions {
   pageOptions: any;
   team_id: string;
   origin?: string;
-  includeHtml?: boolean;
 }

 export interface FirecrawlJob {
@@ -42,7 +41,6 @@ export interface FirecrawlJob {
   origin: string;
   extractor_options?: ExtractorOptions,
   num_tokens?: number,
-  includeHtml?: boolean;
 }

 export enum RateLimiterMode {
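
Usage note: after this patch, includeHtml is read only from the nested pageOptions object; a top-level includeHtml field in the request body no longer has any effect. A minimal sketch of the new request shape, mirroring the updated e2e tests and assuming the suite's TEST_URL host and TEST_API_KEY environment variable:

import request from "supertest";

// includeHtml now nests under pageOptions; the old top-level shape
// ({ url, includeHtml: true }) is no longer read by the controllers.
it("returns html when pageOptions.includeHtml is true", async () => {
  const response = await request(TEST_URL)
    .post("/v0/scrape")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send({ url: "https://firecrawl.dev", pageOptions: { includeHtml: true } });
  expect(response.statusCode).toBe(200);
  // scrapSingleUrl only populates the html field when includeHtml is true.
  expect(response.body.data).toHaveProperty("html");
});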