diff --git a/.github/workflows/js-sdk.yml b/.github/archive/js-sdk.yml similarity index 100% rename from .github/workflows/js-sdk.yml rename to .github/archive/js-sdk.yml diff --git a/.github/workflows/publish-js-sdk.yml b/.github/archive/publish-js-sdk.yml similarity index 100% rename from .github/workflows/publish-js-sdk.yml rename to .github/archive/publish-js-sdk.yml diff --git a/.github/workflows/publish-python-sdk.yml b/.github/archive/publish-python-sdk.yml similarity index 100% rename from .github/workflows/publish-python-sdk.yml rename to .github/archive/publish-python-sdk.yml diff --git a/.github/workflows/python-sdk.yml b/.github/archive/python-sdk.yml similarity index 100% rename from .github/workflows/python-sdk.yml rename to .github/archive/python-sdk.yml diff --git a/.github/workflows/clean-before-24h-complete-jobs.yml b/.github/workflows/clean-before-24h-complete-jobs.yml new file mode 100644 index 0000000..2ced537 --- /dev/null +++ b/.github/workflows/clean-before-24h-complete-jobs.yml @@ -0,0 +1,20 @@ +name: Clean Before 24h Completed Jobs +on: + schedule: + - cron: '0 0 * * *' + +env: + BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }} + +jobs: + clean-jobs: + runs-on: ubuntu-latest + steps: + - name: Send GET request to clean jobs + run: | + response=$(curl --write-out '%{http_code}' --silent --output /dev/null https://api.firecrawl.dev/admin/${{ secrets.BULL_AUTH_KEY }}/clean-before-24h-complete-jobs) + if [ "$response" -ne 200 ]; then + echo "Failed to clean jobs. Response: $response" + exit 1 + fi + echo "Successfully cleaned jobs. Response: $response" diff --git a/apps/api/openapi.json b/apps/api/openapi.json index 55bfe1c..a755e37 100644 --- a/apps/api/openapi.json +++ b/apps/api/openapi.json @@ -218,6 +218,11 @@ "headers": { "type": "object", "description": "Headers to send with the request when scraping. Can be used to send cookies, user-agent, etc." 
+ }, + "replaceAllPathsWithAbsolutePaths": { + "type": "boolean", + "description": "Replace all relative paths with absolute paths for images and links", + "default": false } } } diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 0246a1e..cc8376b 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -164,6 +164,27 @@ app.get('/serverHealthCheck/notify', async (req, res) => { } }); +app.get(`/admin/${process.env.BULL_AUTH_KEY}/clean-before-24h-complete-jobs`, async (req, res) => { + try { + const webScraperQueue = getWebScraperQueue(); + const completedJobs = await webScraperQueue.getJobs(['completed']); + const before24hJobs = completedJobs.filter(job => job.finishedOn < Date.now() - 24 * 60 * 60 * 1000); + const jobIds = before24hJobs.map(job => job.id) as string[]; + let count = 0; + for (const jobId of jobIds) { + try { + await webScraperQueue.removeJobs(jobId); + count++; + } catch (jobError) { + console.error(`Failed to remove job with ID ${jobId}:`, jobError); + } + } + res.status(200).send(`Removed ${count} completed jobs.`); + } catch (error) { + console.error('Failed to clean last 24h complete jobs:', error); + res.status(500).send('Failed to clean jobs'); + } +}); app.get("/is-production", (req, res) => { res.send({ isProduction: global.isProduction }); diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index facc81e..81bf12c 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -18,6 +18,7 @@ export type PageOptions = { waitFor?: number; screenshot?: boolean; headers?: Record; + replaceAllPathsWithAbsolutePaths?: boolean; }; export type ExtractorOptions = { diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 5344320..f432f43 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -303,9 +303,10 @@ export class WebScraperDataProvider { } private applyPathReplacements(documents: 
Document[]): Document[] { - return this.replaceAllPathsWithAbsolutePaths - ? replacePathsWithAbsolutePaths(documents) - : replaceImgPathsWithAbsolutePaths(documents); + if (this.replaceAllPathsWithAbsolutePaths) { + documents = replacePathsWithAbsolutePaths(documents); + } + return replaceImgPathsWithAbsolutePaths(documents); } private async applyImgAltText(documents: Document[]): Promise { @@ -474,9 +475,9 @@ export class WebScraperDataProvider { this.limit = options.crawlerOptions?.limit ?? 10000; this.generateImgAltText = options.crawlerOptions?.generateImgAltText ?? false; - this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false }; + this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false, replaceAllPathsWithAbsolutePaths: false }; this.extractorOptions = options.extractorOptions ?? {mode: "markdown"} - this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false; + this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false; //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check this.excludes = this.excludes.filter((item) => item !== ""); this.crawlerMode = options.crawlerOptions?.mode ?? 
"default"; diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts index aae567c..e201926 100644 --- a/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts +++ b/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts @@ -6,12 +6,14 @@ describe('replacePaths', () => { it('should replace relative paths with absolute paths', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'This is a [link](/path/to/resource) and an image ![alt text](/path/to/image.jpg).' + content: 'This is a [link](/path/to/resource).', + markdown: 'This is a [link](/path/to/resource).' }]; const expectedDocuments: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'This is a [link](https://example.com/path/to/resource) and an image ![alt text](https://example.com/path/to/image.jpg).' + content: 'This is a [link](https://example.com/path/to/resource).', + markdown: 'This is a [link](https://example.com/path/to/resource).' }]; const result = replacePathsWithAbsolutePaths(documents); @@ -21,7 +23,8 @@ describe('replacePaths', () => { it('should not alter absolute URLs', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'This is an [external link](https://external.com/path) and an image ![alt text](https://example.com/path/to/image.jpg).' + content: 'This is an [external link](https://external.com/path).', + markdown: 'This is an [external link](https://external.com/path).' }]; const result = replacePathsWithAbsolutePaths(documents); @@ -31,7 +34,8 @@ describe('replacePaths', () => { it('should not alter data URLs for images', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'This is an image: ![alt text](data:image/png;base64,ABC123==).' 
+ content: 'This is an image: ![alt text](data:image/png;base64,ABC123==).', + markdown: 'This is an image: ![alt text](data:image/png;base64,ABC123==).' }]; const result = replacePathsWithAbsolutePaths(documents); @@ -41,12 +45,14 @@ describe('replacePaths', () => { it('should handle multiple links and images correctly', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Here are two links: [link1](/path1) and [link2](/path2), and two images: ![img1](/img1.jpg) ![img2](/img2.jpg).' + content: 'Here are two links: [link1](/path1) and [link2](/path2).', + markdown: 'Here are two links: [link1](/path1) and [link2](/path2).' }]; const expectedDocuments: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2), and two images: ![img1](https://example.com/img1.jpg) ![img2](https://example.com/img2.jpg).' + content: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2).', + markdown: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2).' }]; const result = replacePathsWithAbsolutePaths(documents); @@ -56,12 +62,14 @@ describe('replacePaths', () => { it('should correctly handle a mix of absolute and relative paths', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Mixed paths: [relative](/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).' + content: 'Mixed paths: [relative](/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).', + markdown: 'Mixed paths: [relative](/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).' 
}]; const expectedDocuments: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Mixed paths: [relative](https://example.com/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).' + content: 'Mixed paths: [relative](https://example.com/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).', + markdown: 'Mixed paths: [relative](https://example.com/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).' }]; const result = replacePathsWithAbsolutePaths(documents); @@ -74,12 +82,14 @@ describe('replacePaths', () => { it('should replace relative image paths with absolute paths', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Here is an image: ![alt text](/path/to/image.jpg).' + content: 'Here is an image: ![alt text](/path/to/image.jpg).', + markdown: 'Here is an image: ![alt text](/path/to/image.jpg).' }]; const expectedDocuments: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Here is an image: ![alt text](https://example.com/path/to/image.jpg).' + content: 'Here is an image: ![alt text](https://example.com/path/to/image.jpg).', + markdown: 'Here is an image: ![alt text](https://example.com/path/to/image.jpg).' }]; const result = replaceImgPathsWithAbsolutePaths(documents); @@ -89,7 +99,8 @@ describe('replacePaths', () => { it('should not alter data:image URLs', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'An image with a data URL: ![alt text](data:image/png;base64,ABC123==).' + content: 'An image with a data URL: ![alt text](data:image/png;base64,ABC123==).', + markdown: 'An image with a data URL: ![alt text](data:image/png;base64,ABC123==).'
}]; const result = replaceImgPathsWithAbsolutePaths(documents); @@ -99,12 +110,14 @@ describe('replacePaths', () => { it('should handle multiple images with a mix of data and relative URLs', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Multiple images: ![img1](/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](/img3.jpg).' + content: 'Multiple images: ![img1](/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](/img3.jpg).', + markdown: 'Multiple images: ![img1](/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](/img3.jpg).' }]; const expectedDocuments: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Multiple images: ![img1](https://example.com/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](https://example.com/img3.jpg).' + content: 'Multiple images: ![img1](https://example.com/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](https://example.com/img3.jpg).', + markdown: 'Multiple images: ![img1](https://example.com/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](https://example.com/img3.jpg).' 
}]; const result = replaceImgPathsWithAbsolutePaths(documents); diff --git a/apps/api/src/scraper/WebScraper/utils/replacePaths.ts b/apps/api/src/scraper/WebScraper/utils/replacePaths.ts index d652611..788916c 100644 --- a/apps/api/src/scraper/WebScraper/utils/replacePaths.ts +++ b/apps/api/src/scraper/WebScraper/utils/replacePaths.ts @@ -10,7 +10,8 @@ export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[] ) || []; paths.forEach((path: string) => { - const isImage = path.startsWith("!"); + try { + const isImage = path.startsWith("!"); let matchedUrl = path.match(/\(([^)]+)\)/) || path.match(/href="([^"]+)"/); let url = matchedUrl[1]; @@ -22,18 +23,18 @@ export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[] } const markdownLinkOrImageText = path.match(/(!?\[.*?\])/)[0]; - if (isImage) { - document.content = document.content.replace( - path, - `${markdownLinkOrImageText}(${url})` - ); - } else { + // Image is handled afterwards + if (!isImage) { document.content = document.content.replace( path, `${markdownLinkOrImageText}(${url})` ); + } + } catch (error) { + } }); + document.markdown = document.content; }); return documents; @@ -60,8 +61,10 @@ export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Documen if (!imageUrl.startsWith("http")) { if (imageUrl.startsWith("/")) { imageUrl = imageUrl.substring(1); + imageUrl = new URL(imageUrl, baseUrl).toString(); + } else { + imageUrl = new URL(imageUrl, document.metadata.sourceURL).toString(); } - imageUrl = new URL(imageUrl, baseUrl).toString(); } } @@ -70,6 +73,7 @@ export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Documen `![${altText}](${imageUrl})` ); }); + document.markdown = document.content; }); return documents; diff --git a/apps/api/src/services/webhook.ts b/apps/api/src/services/webhook.ts index eca7d09..1f8d647 100644 --- a/apps/api/src/services/webhook.ts +++ b/apps/api/src/services/webhook.ts @@ -3,15 +3,17 @@ 
import { supabase_service } from "./supabase"; export const callWebhook = async (teamId: string, data: any) => { try { const selfHostedUrl = process.env.SELF_HOSTED_WEBHOOK_URL; + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; let webhookUrl = selfHostedUrl; - if (!selfHostedUrl) { + // Only fetch the webhook URL from the database if the self-hosted webhook URL is not set + // and the USE_DB_AUTHENTICATION environment variable is set to true + if (!selfHostedUrl && useDbAuthentication) { const { data: webhooksData, error } = await supabase_service .from("webhooks") .select("url") .eq("team_id", teamId) .limit(1); - if (error) { console.error( `Error fetching webhook URL for team ID: ${teamId}`,