From 99f2ffd6d591398a4baef347306d25371b381793 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 10 Jun 2024 17:03:10 -0700 Subject: [PATCH 1/8] Update webhook.ts --- apps/api/src/services/webhook.ts | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/apps/api/src/services/webhook.ts b/apps/api/src/services/webhook.ts index eca7d09..1f8d647 100644 --- a/apps/api/src/services/webhook.ts +++ b/apps/api/src/services/webhook.ts @@ -3,15 +3,17 @@ import { supabase_service } from "./supabase"; export const callWebhook = async (teamId: string, data: any) => { try { const selfHostedUrl = process.env.SELF_HOSTED_WEBHOOK_URL; + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; let webhookUrl = selfHostedUrl; - if (!selfHostedUrl) { + // Only fetch the webhook URL from the database if the self-hosted webhook URL is not set + // and the USE_DB_AUTHENTICATION environment variable is set to true + if (!selfHostedUrl && useDbAuthentication) { const { data: webhooksData, error } = await supabase_service .from("webhooks") .select("url") .eq("team_id", teamId) .limit(1); - if (error) { console.error( `Error fetching webhook URL for team ID: ${teamId}`, From a9f93c2f1e9d02303b24dd49862602d0fd5828dd Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 11 Jun 2024 14:18:05 -0300 Subject: [PATCH 2/8] Added route to clean completed jobs and a github action cron that triggers every 24h --- .../clean-before-24h-complete-jobs.yml | 17 +++++++++++++++ apps/api/src/index.ts | 21 +++++++++++++++++++ 2 files changed, 38 insertions(+) create mode 100644 .github/workflows/clean-before-24h-complete-jobs.yml diff --git a/.github/workflows/clean-before-24h-complete-jobs.yml b/.github/workflows/clean-before-24h-complete-jobs.yml new file mode 100644 index 0000000..2fd3b22 --- /dev/null +++ b/.github/workflows/clean-before-24h-complete-jobs.yml @@ -0,0 +1,17 @@ +name: Clean Before 24h Completed Jobs +on: + schedule: + - cron: '0 0 * * *' + +jobs: + clean-jobs: + runs-on: ubuntu-latest + steps: + - name: Send GET request to clean jobs + run: | + response=$(curl --write-out '%{http_code}' --silent --output /dev/null https://api.firecrawl.dev/clean-before-24h-complete-jobs) + if [ "$response" -ne 200 ]; then + echo "Failed to clean jobs. Response: $response" + exit 1 + fi + echo "Successfully cleaned jobs. Response: $response" diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 0246a1e..eac8204 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -164,6 +164,27 @@ app.get('/serverHealthCheck/notify', async (req, res) => { } }); +app.get('/clean-before-24h-complete-jobs', async (req, res) => { + try { + const webScraperQueue = getWebScraperQueue(); + const completedJobs = await webScraperQueue.getJobs(['completed']); + const before24hJobs = completedJobs.filter(job => job.finishedOn < Date.now() - 24 * 60 * 60 * 1000); + const jobIds = before24hJobs.map(job => job.id) as string[]; + let count = 0; + for (const jobId of jobIds) { + try { + await webScraperQueue.removeJobs(jobId); + count++; + } catch (jobError) { + console.error(`Failed to remove job with ID ${jobId}:`, jobError); + } + } + res.status(200).send(`Removed ${count} completed jobs.`); + } catch (error) { + console.error('Failed to clean last 24h complete jobs:', error); + res.status(500).send('Failed to clean jobs'); + } +}); app.get("/is-production", (req, res) => { res.send({ isProduction: global.isProduction }); From b87725c683fff5ac4bdaeb6464a6b6dd1755e3b7 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 11 Jun 2024 12:08:49 -0700 Subject: [PATCH 3/8] Update openapi.json --- apps/api/openapi.json | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/apps/api/openapi.json b/apps/api/openapi.json index 55bfe1c..7147af1 100644 --- a/apps/api/openapi.json +++ b/apps/api/openapi.json @@ -190,6 +190,11 @@ "description": "Ignore the website sitemap when crawling", "default": false }, + "replaceAllPathsWithAbsolutePaths": { + "type": "boolean", + "description": "Replace all relative paths with absolute paths for images and links", + "default": false + }, "limit": { "type": "integer", "description": "Maximum number of pages to crawl", From 520739c9f44b77d94288f3ea9e0433330ae1bc12 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 11 Jun 2024 12:43:16 -0700 Subject: [PATCH 4/8] Nick: fixed bugs associated with absolute path replacements --- apps/api/openapi.json | 10 +++++----- apps/api/src/lib/entities.ts | 1 + apps/api/src/scraper/WebScraper/index.ts | 11 +++++----- .../scraper/WebScraper/utils/replacePaths.ts | 20 +++++++++++-------- 4 files changed, 24 insertions(+), 18 deletions(-) diff --git a/apps/api/openapi.json b/apps/api/openapi.json index 7147af1..a755e37 100644 --- a/apps/api/openapi.json +++ b/apps/api/openapi.json @@ -190,11 +190,6 @@ "description": "Ignore the website sitemap when crawling", "default": false }, - "replaceAllPathsWithAbsolutePaths": { - "type": "boolean", - "description": "Replace all relative paths with absolute paths for images and links", - "default": false - }, "limit": { "type": "integer", "description": "Maximum number of pages to crawl", @@ -223,6 +218,11 @@ "headers": { "type": "object", "description": "Headers to send with the request when scraping. Can be used to send cookies, user-agent, etc." + }, + "replaceAllPathsWithAbsolutePaths": { + "type": "boolean", + "description": "Replace all relative paths with absolute paths for images and links", + "default": false } } } diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 744c07b..d5002c7 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -18,6 +18,7 @@ export type PageOptions = { waitFor?: number; screenshot?: boolean; headers?: Record; + replaceAllPathsWithAbsolutePaths?: boolean; }; export type ExtractorOptions = { diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 7dcd175..54897f1 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -302,9 +302,10 @@ export class WebScraperDataProvider { } private applyPathReplacements(documents: Document[]): Document[] { - return this.replaceAllPathsWithAbsolutePaths - ? replacePathsWithAbsolutePaths(documents) - : replaceImgPathsWithAbsolutePaths(documents); + if (this.replaceAllPathsWithAbsolutePaths) { + documents = replacePathsWithAbsolutePaths(documents); + } + return replaceImgPathsWithAbsolutePaths(documents); } private async applyImgAltText(documents: Document[]): Promise { @@ -473,9 +474,9 @@ export class WebScraperDataProvider { this.limit = options.crawlerOptions?.limit ?? 10000; this.generateImgAltText = options.crawlerOptions?.generateImgAltText ?? false; - this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false }; + this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false, replaceAllPathsWithAbsolutePaths: false }; this.extractorOptions = options.extractorOptions ?? {mode: "markdown"} - this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false; + this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false; //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check this.excludes = this.excludes.filter((item) => item !== ""); this.crawlerMode = options.crawlerOptions?.mode ?? "default"; diff --git a/apps/api/src/scraper/WebScraper/utils/replacePaths.ts b/apps/api/src/scraper/WebScraper/utils/replacePaths.ts index d652611..788916c 100644 --- a/apps/api/src/scraper/WebScraper/utils/replacePaths.ts +++ b/apps/api/src/scraper/WebScraper/utils/replacePaths.ts @@ -10,7 +10,8 @@ export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[] ) || []; paths.forEach((path: string) => { - const isImage = path.startsWith("!"); + try { + const isImage = path.startsWith("!"); let matchedUrl = path.match(/\(([^)]+)\)/) || path.match(/href="([^"]+)"/); let url = matchedUrl[1]; @@ -22,18 +23,18 @@ export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[] } const markdownLinkOrImageText = path.match(/(!?\[.*?\])/)[0]; - if (isImage) { - document.content = document.content.replace( - path, - `${markdownLinkOrImageText}(${url})` - ); - } else { + // Image is handled afterwards + if (!isImage) { document.content = document.content.replace( path, `${markdownLinkOrImageText}(${url})` ); + } + } catch (error) { + } }); + document.markdown = document.content; }); return documents; @@ -60,8 +61,10 @@ export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Documen if (!imageUrl.startsWith("http")) { if (imageUrl.startsWith("/")) { imageUrl = imageUrl.substring(1); + imageUrl = new URL(imageUrl, baseUrl).toString(); + } else { + imageUrl = new URL(imageUrl, document.metadata.sourceURL).toString(); } - imageUrl = new URL(imageUrl, baseUrl).toString(); } } @@ -70,6 +73,7 @@ export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Documen `![${altText}](${imageUrl})` ); }); + document.markdown = document.content; }); return documents; From 2239e03269ec8ef3c3dba2596ac8994fa4562b05 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 11 Jun 2024 12:54:02 -0700 Subject: [PATCH 5/8] Update replacePaths.test.ts --- .../WebScraper/utils/__tests__/replacePaths.test.ts | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts index aae567c..6ecd990 100644 --- a/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts +++ b/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts @@ -6,12 +6,12 @@ describe('replacePaths', () => { it('should replace relative paths with absolute paths', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'This is a [link](/path/to/resource) and an image ![alt text](/path/to/image.jpg).' + content: 'This is a [link](/path/to/resource).' }]; const expectedDocuments: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'This is a [link](https://example.com/path/to/resource) and an image ![alt text](https://example.com/path/to/image.jpg).' + content: 'This is a [link](https://example.com/path/to/resource).' }]; const result = replacePathsWithAbsolutePaths(documents); @@ -21,7 +21,7 @@ describe('replacePaths', () => { it('should not alter absolute URLs', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'This is an [external link](https://external.com/path) and an image ![alt text](https://example.com/path/to/image.jpg).' + content: 'This is an [external link](https://external.com/path).' }]; const result = replacePathsWithAbsolutePaths(documents); @@ -41,12 +41,12 @@ describe('replacePaths', () => { it('should handle multiple links and images correctly', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Here are two links: [link1](/path1) and [link2](/path2), and two images: ![img1](/img1.jpg) ![img2](/img2.jpg).' + content: 'Here are two links: [link1](/path1) and [link2](/path2).' }]; const expectedDocuments: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2), and two images: ![img1](https://example.com/img1.jpg) ![img2](https://example.com/img2.jpg).' + content: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2).' }]; const result = replacePathsWithAbsolutePaths(documents); From 1e3e06a1d57bffdafb7f562ca9fd5a4cb15ad05f Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 11 Jun 2024 13:02:39 -0700 Subject: [PATCH 6/8] Update replacePaths.test.ts --- .../utils/__tests__/replacePaths.test.ts | 39 ++++++++++++------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts index 6ecd990..e201926 100644 --- a/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts +++ b/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts @@ -6,12 +6,14 @@ describe('replacePaths', () => { it('should replace relative paths with absolute paths', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'This is a [link](/path/to/resource).' + content: 'This is a [link](/path/to/resource).', + markdown: 'This is a [link](/path/to/resource).' }]; const expectedDocuments: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'This is a [link](https://example.com/path/to/resource).' + content: 'This is a [link](https://example.com/path/to/resource).', + markdown: 'This is a [link](https://example.com/path/to/resource).' }]; const result = replacePathsWithAbsolutePaths(documents); @@ -21,7 +23,8 @@ describe('replacePaths', () => { it('should not alter absolute URLs', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'This is an [external link](https://external.com/path).' + content: 'This is an [external link](https://external.com/path).', + markdown: 'This is an [external link](https://external.com/path).' }]; const result = replacePathsWithAbsolutePaths(documents); @@ -31,7 +34,8 @@ describe('replacePaths', () => { it('should not alter data URLs for images', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'This is an image: ![alt text](data:image/png;base64,ABC123==).' + content: 'This is an image: ![alt text](data:image/png;base64,ABC123==).', + markdown: 'This is an image: ![alt text](data:image/png;base64,ABC123==).' }]; const result = replacePathsWithAbsolutePaths(documents); @@ -41,12 +45,14 @@ describe('replacePaths', () => { it('should handle multiple links and images correctly', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Here are two links: [link1](/path1) and [link2](/path2).' + content: 'Here are two links: [link1](/path1) and [link2](/path2).', + markdown: 'Here are two links: [link1](/path1) and [link2](/path2).' }]; const expectedDocuments: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2).' + content: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2).', + markdown: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2).' }]; const result = replacePathsWithAbsolutePaths(documents); @@ -56,12 +62,14 @@ describe('replacePaths', () => { it('should correctly handle a mix of absolute and relative paths', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Mixed paths: [relative](/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).' + content: 'Mixed paths: [relative](/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).', + markdown: 'Mixed paths: [relative](/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).' }]; const expectedDocuments: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Mixed paths: [relative](https://example.com/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).' + content: 'Mixed paths: [relative](https://example.com/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).', + markdown: 'Mixed paths: [relative](https://example.com/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).' }]; const result = replacePathsWithAbsolutePaths(documents); @@ -74,12 +82,14 @@ describe('replacePaths', () => { it('should replace relative image paths with absolute paths', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Here is an image: ![alt text](/path/to/image.jpg).' + content: 'Here is an image: ![alt text](/path/to/image.jpg).', + markdown: 'Here is an image: ![alt text](/path/to/image.jpg).' }]; const expectedDocuments: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Here is an image: ![alt text](https://example.com/path/to/image.jpg).' + content: 'Here is an image: ![alt text](https://example.com/path/to/image.jpg).', + markdown: 'Here is an image: ![alt text](https://example.com/path/to/image.jpg).' }]; const result = replaceImgPathsWithAbsolutePaths(documents); @@ -89,7 +99,8 @@ describe('replacePaths', () => { it('should not alter data:image URLs', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'An image with a data URL: ![alt text](data:image/png;base64,ABC123==).' + content: 'An image with a data URL: ![alt text](data:image/png;base64,ABC123==).', + markdown: 'An image with a data URL: ![alt text](data:image/png;base4,ABC123==).' }]; const result = replaceImgPathsWithAbsolutePaths(documents); @@ -99,12 +110,14 @@ describe('replacePaths', () => { it('should handle multiple images with a mix of data and relative URLs', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Multiple images: ![img1](/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](/img3.jpg).' + content: 'Multiple images: ![img1](/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](/img3.jpg).', + markdown: 'Multiple images: ![img1](/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](/img3.jpg).' }]; const expectedDocuments: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Multiple images: ![img1](https://example.com/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](https://example.com/img3.jpg).' + content: 'Multiple images: ![img1](https://example.com/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](https://example.com/img3.jpg).', + markdown: 'Multiple images: ![img1](https://example.com/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](https://example.com/img3.jpg).' }]; const result = replaceImgPathsWithAbsolutePaths(documents); From 157fbe4a1ea67e4807426696b5f9b3de446641c8 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 11 Jun 2024 17:52:01 -0300 Subject: [PATCH 7/8] added bull auth key --- .github/workflows/clean-before-24h-complete-jobs.yml | 5 ++++- apps/api/src/index.ts | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/clean-before-24h-complete-jobs.yml b/.github/workflows/clean-before-24h-complete-jobs.yml index 2fd3b22..2ced537 100644 --- a/.github/workflows/clean-before-24h-complete-jobs.yml +++ b/.github/workflows/clean-before-24h-complete-jobs.yml @@ -3,13 +3,16 @@ on: schedule: - cron: '0 0 * * *' +env: + BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }} + jobs: clean-jobs: runs-on: ubuntu-latest steps: - name: Send GET request to clean jobs run: | - response=$(curl --write-out '%{http_code}' --silent --output /dev/null https://api.firecrawl.dev/clean-before-24h-complete-jobs) + response=$(curl --write-out '%{http_code}' --silent --output /dev/null https://api.firecrawl.dev/admin/${{ secrets.BULL_AUTH_KEY }}/clean-before-24h-complete-jobs) if [ "$response" -ne 200 ]; then echo "Failed to clean jobs. Response: $response" exit 1 diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index eac8204..cc8376b 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -164,7 +164,7 @@ app.get('/serverHealthCheck/notify', async (req, res) => { } }); -app.get('/clean-before-24h-complete-jobs', async (req, res) => { +app.get(`/admin/${process.env.BULL_AUTH_KEY}/clean-before-24h-complete-jobs`, async (req, res) => { try { const webScraperQueue = getWebScraperQueue(); const completedJobs = await webScraperQueue.getJobs(['completed']); From d4df6f049d842c975fc5df15e24fd80fb031f322 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 11 Jun 2024 15:49:30 -0700 Subject: [PATCH 8/8] Nick: --- .github/{workflows => archive}/js-sdk.yml | 0 .github/{workflows => archive}/publish-js-sdk.yml | 0 .github/{workflows => archive}/publish-python-sdk.yml | 0 .github/{workflows => archive}/python-sdk.yml | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename .github/{workflows => archive}/js-sdk.yml (100%) rename .github/{workflows => archive}/publish-js-sdk.yml (100%) rename .github/{workflows => archive}/publish-python-sdk.yml (100%) rename .github/{workflows => archive}/python-sdk.yml (100%) diff --git a/.github/workflows/js-sdk.yml b/.github/archive/js-sdk.yml similarity index 100% rename from .github/workflows/js-sdk.yml rename to .github/archive/js-sdk.yml diff --git a/.github/workflows/publish-js-sdk.yml b/.github/archive/publish-js-sdk.yml similarity index 100% rename from .github/workflows/publish-js-sdk.yml rename to .github/archive/publish-js-sdk.yml diff --git a/.github/workflows/publish-python-sdk.yml b/.github/archive/publish-python-sdk.yml similarity index 100% rename from .github/workflows/publish-python-sdk.yml rename to .github/archive/publish-python-sdk.yml diff --git a/.github/workflows/python-sdk.yml b/.github/archive/python-sdk.yml similarity index 100% rename from .github/workflows/python-sdk.yml rename to .github/archive/python-sdk.yml