diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 3044938..8403fd7 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -54,6 +54,7 @@ describe("E2E Tests for API Routes", () => { expect(response.body.data.metadata.pageError).toBeUndefined(); }, 30000); // 30 seconds timeout + it.concurrent("should return a successful response with a valid API key and includeHtml set to true", async () => { const response: FirecrawlScrapeResponse = await request(TEST_URL) .post("/v0/scrape") diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index db8f0ae..e112cd4 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -4,10 +4,10 @@ import { extractMetadata } from "./utils/metadata"; import dotenv from "dotenv"; import { Document, PageOptions, FireEngineResponse } from "../../lib/entities"; import { parseMarkdown } from "../../lib/html-to-markdown"; -import { excludeNonMainTags } from "./utils/excludeTags"; import { urlSpecificParams } from "./utils/custom/website_params"; import { fetchAndProcessPdf } from "./utils/pdfProcessor"; import { handleCustomScraping } from "./custom/handleCustomScraping"; +import { removeUnwantedElements } from "./utils/removeUnwantedElements"; import axios from "axios"; dotenv.config(); @@ -313,31 +313,6 @@ export async function scrapSingleUrl( ): Promise { urlToScrap = urlToScrap.trim(); - const removeUnwantedElements = (html: string, pageOptions: PageOptions) => { - const soup = cheerio.load(html); - soup("script, style, iframe, noscript, meta, head").remove(); - - if (pageOptions.removeTags) { - if (typeof pageOptions.removeTags === 'string') { - pageOptions.removeTags.split(',').forEach((tag) => { - soup(tag.trim()).remove(); - }); - } else if (Array.isArray(pageOptions.removeTags)) { - pageOptions.removeTags.forEach((tag) => { - soup(tag).remove(); - }); - } - } - - if (pageOptions.onlyMainContent) { - // remove any other tags that are not in the main content - excludeNonMainTags.forEach((tag) => { - soup(tag).remove(); - }); - } - return soup.html(); - }; - const attemptScraping = async ( url: string, method: (typeof baseScrapers)[number] diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/removeUnwantedElements.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/removeUnwantedElements.test.ts new file mode 100644 index 0000000..0dc24c8 --- /dev/null +++ b/apps/api/src/scraper/WebScraper/utils/__tests__/removeUnwantedElements.test.ts @@ -0,0 +1,103 @@ +import { removeUnwantedElements } from "../removeUnwantedElements"; +import { PageOptions } from "../../../../lib/entities"; + +describe('removeUnwantedElements', () => { + it('should remove script, style, iframe, noscript, meta, and head tags', () => { + const html = `Test
Content
`; + const options: PageOptions = {}; + const result = removeUnwantedElements(html, options); + expect(result).not.toContain('