From 676d6e8ab5f7a1fd14ff5b76f8289db7543082c4 Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Thu, 13 Jun 2024 10:51:05 -0300
Subject: [PATCH] Added pageOptions.removeTags

---
 apps/api/openapi.json                         | 19 +++++++++++
 .../src/__tests__/e2e_withAuth/index.test.ts  | 34 +++++++++++++++++++
 apps/api/src/controllers/crawl.ts             | 10 ++++--
 apps/api/src/controllers/crawlPreview.ts      |  2 +-
 apps/api/src/controllers/search.ts            |  2 ++
 apps/api/src/lib/entities.ts                  |  1 +
 apps/api/src/scraper/WebScraper/index.ts      |  7 +++-
 apps/api/src/scraper/WebScraper/single_url.ts | 13 +++++++
 8 files changed, 84 insertions(+), 4 deletions(-)

diff --git a/apps/api/openapi.json b/apps/api/openapi.json
index a755e37..b07e43f 100644
--- a/apps/api/openapi.json
+++ b/apps/api/openapi.json
@@ -61,6 +61,13 @@
                         "description": "Wait x amount of milliseconds for the page to load to fetch content",
                         "default": 0
                       },
+                      "removeTags": {
+                        "type": "array",
+                        "items": {
+                          "type": "string"
+                        },
+                        "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
+                      },
                       "headers": {
                         "type": "object",
                         "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
@@ -194,6 +201,11 @@
                         "type": "integer",
                         "description": "Maximum number of pages to crawl",
                         "default": 10000
+                      },
+                      "allowBackwardCrawling": {
+                        "type": "boolean",
+                        "description": "Allow backward crawling (crawl from the base URL to the previous URLs)",
+                        "default": false
                       }
                     }
                   },
@@ -219,6 +231,13 @@
                         "type": "object",
                         "description": "Headers to send with the request when scraping. Can be used to send cookies, user-agent, etc."
                       },
+                      "removeTags": {
+                        "type": "array",
+                        "items": {
+                          "type": "string"
+                        },
+                        "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
+                      },
                       "replaceAllPathsWithAbsolutePaths": {
                         "type": "boolean",
                         "description": "Replace all relative paths with absolute paths for images and links",
diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
index 02e4a47..3423b3a 100644
--- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
@@ -136,6 +136,40 @@ describe("E2E Tests for API Routes", () => {
       expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
     }, 60000); // 60 seconds
 
+    it.concurrent("should return a successful response with a valid API key with removeTags option", async () => {
+      const responseWithoutRemoveTags = await request(TEST_URL)
+        .post("/v0/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({ url: "https://www.scrapethissite.com/" });
+      expect(responseWithoutRemoveTags.statusCode).toBe(200);
+      expect(responseWithoutRemoveTags.body).toHaveProperty("data");
+      expect(responseWithoutRemoveTags.body.data).toHaveProperty("content");
+      expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown");
+      expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata");
+      expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html");
+      expect(responseWithoutRemoveTags.body.data.content).toContain("Scrape This Site");
+      expect(responseWithoutRemoveTags.body.data.content).toContain("Lessons and Videos"); // #footer
+      expect(responseWithoutRemoveTags.body.data.content).toContain("[Sandbox]("); // .nav
+      expect(responseWithoutRemoveTags.body.data.content).toContain("web scraping"); // strong
+
+      const response = await request(TEST_URL)
+        .post("/v0/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({ url: "https://www.scrapethissite.com/", pageOptions: { removeTags: ['.nav', '#footer', 'strong'] } });
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("data");
+      expect(response.body.data).toHaveProperty("content");
+      expect(response.body.data).toHaveProperty("markdown");
+      expect(response.body.data).toHaveProperty("metadata");
+      expect(response.body.data).not.toHaveProperty("html");
+      expect(response.body.data.content).toContain("Scrape This Site");
+      expect(response.body.data.content).not.toContain("Lessons and Videos"); // #footer
+      expect(response.body.data.content).not.toContain("[Sandbox]("); // .nav
+      expect(response.body.data.content).not.toContain("web scraping"); // strong
+    }, 30000); // 30 seconds timeout
+
     // TODO: add this test back once we nail the waitFor option to be more deterministic
     // it.concurrent("should return a successful response with a valid API key and waitFor option", async () => {
     //   const startTime = Date.now();
diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts
index 58d01e2..7eab78f 100644
--- a/apps/api/src/controllers/crawl.ts
+++ b/apps/api/src/controllers/crawl.ts
@@ -55,8 +55,14 @@ export async function crawlController(req: Request, res: Response) {
     }
 
     const mode = req.body.mode ?? "crawl";
-    const crawlerOptions = req.body.crawlerOptions ?? { allowBackwardCrawling: false };
-    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false };
+    const crawlerOptions = req.body.crawlerOptions ?? {
+      allowBackwardCrawling: false
+    };
+    const pageOptions = req.body.pageOptions ?? {
+      onlyMainContent: false,
+      includeHtml: false,
+      removeTags: []
+    };
 
     if (mode === "single_urls" && !url.includes(",")) {
       try {
diff --git a/apps/api/src/controllers/crawlPreview.ts b/apps/api/src/controllers/crawlPreview.ts
index d3e9afe..2c3dc4e 100644
--- a/apps/api/src/controllers/crawlPreview.ts
+++ b/apps/api/src/controllers/crawlPreview.ts
@@ -26,7 +26,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
 
     const mode = req.body.mode ?? "crawl";
     const crawlerOptions = req.body.crawlerOptions ?? {};
-    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false };
+    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, removeTags: [] };
 
     const job = await addWebScraperJob({
       url: url,
diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts
index 7474aae..abbc357 100644
--- a/apps/api/src/controllers/search.ts
+++ b/apps/api/src/controllers/search.ts
@@ -85,6 +85,7 @@ export async function searchHelper(
       onlyMainContent: pageOptions?.onlyMainContent ?? true,
       fetchPageContent: pageOptions?.fetchPageContent ?? true,
       includeHtml: pageOptions?.includeHtml ?? false,
+      removeTags: pageOptions?.removeTags ?? [],
       fallback: false,
     },
   });
@@ -139,6 +140,7 @@ export async function searchController(req: Request, res: Response) {
       includeHtml: false,
       onlyMainContent: true,
       fetchPageContent: true,
+      removeTags: [],
       fallback: false,
     };
     const origin = req.body.origin ?? "api";
diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts
index 81bf12c..92170c1 100644
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -19,6 +19,7 @@ export type PageOptions = {
   screenshot?: boolean;
   headers?: Record<string, string>;
   replaceAllPathsWithAbsolutePaths?: boolean;
+  removeTags?: string | string[];
 };
 
 export type ExtractorOptions = {
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index f432f43..1a6ffd0 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -475,7 +475,12 @@ export class WebScraperDataProvider {
     this.limit = options.crawlerOptions?.limit ?? 10000;
     this.generateImgAltText =
       options.crawlerOptions?.generateImgAltText ?? false;
-    this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false, replaceAllPathsWithAbsolutePaths: false };
+    this.pageOptions = options.pageOptions ?? {
+      onlyMainContent: false,
+      includeHtml: false,
+      replaceAllPathsWithAbsolutePaths: false,
+      removeTags: []
+    };
     this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
     this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false;
     //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index c2dcea1..a16f6f0 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -304,6 +304,19 @@ export async function scrapSingleUrl(
   const removeUnwantedElements = (html: string, pageOptions: PageOptions) => {
     const soup = cheerio.load(html);
     soup("script, style, iframe, noscript, meta, head").remove();
+
+    if (pageOptions.removeTags) {
+      if (typeof pageOptions.removeTags === 'string') {
+        pageOptions.removeTags.split(',').forEach((tag) => {
+          soup(tag.trim()).remove();
+        });
+      } else if (Array.isArray(pageOptions.removeTags)) {
+        pageOptions.removeTags.forEach((tag) => {
+          soup(tag).remove();
+        });
+      }
+    }
+    
     if (pageOptions.onlyMainContent) {
       // remove any other tags that are not in the main content
       excludeNonMainTags.forEach((tag) => {