From a04610302a7c0812183c240a2644d0c81de86597 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 16 Apr 2024 16:31:33 -0300 Subject: [PATCH 001/102] Spliting relative paths for images --- apps/api/src/index.ts | 4 +++ apps/api/src/scraper/WebScraper/index.ts | 37 ++++++++++++++++++------ 2 files changed, 32 insertions(+), 9 deletions(-) diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 7198988..26fb2a9 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -90,6 +90,7 @@ app.post("/v0/scrape", async (req, res) => { try { // make sure to authenticate user first, Bearer const team_id = await authenticateUser(req, res, "scrape"); + const crawlerOptions = req.body.crawlerOptions ?? {}; try { const { success: creditsCheckSuccess, message: creditsCheckMessage } = @@ -113,6 +114,9 @@ app.post("/v0/scrape", async (req, res) => { await a.setOptions({ mode: "single_urls", urls: [url], + crawlerOptions: { + ...crawlerOptions, + }, }); const docs = await a.getDocuments(false); diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index b54d9e6..8290762 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -74,7 +74,7 @@ export class WebScraperDataProvider { throw new Error("Url is required"); } - if (!useCaching) { + if (true) {//!useCaching) { if (this.mode === "crawl") { const crawler = new WebCrawler({ initialUrl: this.urls[0], @@ -95,7 +95,7 @@ export class WebScraperDataProvider { } let documents = await this.convertUrlsToDocuments(links, inProgress); documents = await this.getSitemapData(this.urls[0], documents); - console.log("documents", documents) + documents = this.replaceImgPathsWithAbsolutePaths(documents); if (this.generateImgAltText) { documents = await this.generatesImgAltText(documents); } @@ -122,6 +122,7 @@ export class WebScraperDataProvider { if (this.mode === "single_urls") { let documents = await this.convertUrlsToDocuments(this.urls, inProgress); + documents = this.replaceImgPathsWithAbsolutePaths(documents); if (this.generateImgAltText) { documents = await this.generatesImgAltText(documents); } @@ -138,6 +139,7 @@ export class WebScraperDataProvider { let documents = await this.convertUrlsToDocuments(links.slice(0, this.limit), inProgress); documents = await this.getSitemapData(this.urls[0], documents); + documents = this.replaceImgPathsWithAbsolutePaths(documents); if (this.generateImgAltText) { documents = await this.generatesImgAltText(documents); } @@ -297,29 +299,46 @@ export class WebScraperDataProvider { } generatesImgAltText = async (documents: Document[]): Promise => { await Promise.all(documents.map(async (document) => { - const baseUrl = new URL(document.metadata.sourceURL).origin; - const images = document.content.match(/!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g) || []; + const images = document.content.match(/!\[.*?\]\((.*?)\)/g) || []; - await Promise.all(images.map(async (image) => { + await Promise.all(images.map(async (image: string) => { let imageUrl = image.match(/\(([^)]+)\)/)[1]; let altText = image.match(/\[(.*?)\]/)[1]; - let newImageUrl = ''; if (!altText && !imageUrl.startsWith("data:image") && /\.(png|jpeg|gif|webp)$/.test(imageUrl)) { - newImageUrl = baseUrl + imageUrl; const imageIndex = document.content.indexOf(image); const contentLength = document.content.length; let backText = document.content.substring(imageIndex + image.length, Math.min(imageIndex + image.length + 1000, contentLength)); let frontTextStartIndex = Math.max(imageIndex - 1000, 0); let frontText = document.content.substring(frontTextStartIndex, imageIndex); - altText = await getImageDescription(newImageUrl, backText, frontText); + altText = await getImageDescription(imageUrl, backText, frontText); } - document.content = document.content.replace(image, `![${altText}](${newImageUrl})`); + document.content = document.content.replace(image, `![${altText}](${imageUrl})`); })); })); return documents; } + + replaceImgPathsWithAbsolutePaths = (documents: Document[]): Document[] => { + documents.forEach(document => { + const baseUrl = new URL(document.metadata.sourceURL).origin; + const images = document.content.match(/!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g) || []; + + images.forEach(image => { + let imageUrl = image.match(/\(([^)]+)\)/)[1]; + let altText = image.match(/\[(.*?)\]/)[1]; + + if (!imageUrl.startsWith("data:image")) { + imageUrl = baseUrl + imageUrl; + } + + document.content = document.content.replace(image, `![${altText}](${imageUrl})`); + }); + }); + + return documents; + } } From d23a7ae591fb21c28ec303bf160c6a51bede2635 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 16 Apr 2024 16:34:01 -0300 Subject: [PATCH 002/102] improving relative paths --- apps/api/src/scraper/WebScraper/index.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 8290762..6f368a1 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -74,7 +74,7 @@ export class WebScraperDataProvider { throw new Error("Url is required"); } - if (true) {//!useCaching) { + if (!useCaching) { if (this.mode === "crawl") { const crawler = new WebCrawler({ initialUrl: this.urls[0], From 00941d94a40ade640c6dfacbc567af8d4f04d426 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 16 Apr 2024 18:03:48 -0300 Subject: [PATCH 003/102] Added anthropic vision to getImageDescription function --- apps/api/.env.local | 1 + apps/api/package.json | 1 + apps/api/pnpm-lock.yaml | 18 ++++ apps/api/src/scraper/WebScraper/index.ts | 6 +- .../src/scraper/WebScraper/utils/gptVision.ts | 41 -------- .../WebScraper/utils/imageDescription.ts | 98 +++++++++++++++++++ 6 files changed, 122 insertions(+), 43 deletions(-) delete mode 100644 apps/api/src/scraper/WebScraper/utils/gptVision.ts create mode 100644 apps/api/src/scraper/WebScraper/utils/imageDescription.ts diff --git a/apps/api/.env.local b/apps/api/.env.local index 301c64b..88133b7 100644 --- a/apps/api/.env.local +++ b/apps/api/.env.local @@ -7,6 +7,7 @@ SUPABASE_SERVICE_TOKEN= REDIS_URL= SCRAPING_BEE_API_KEY= OPENAI_API_KEY= +ANTHROPIC_API_KEY= BULL_AUTH_KEY= LOGTAIL_KEY= PLAYWRIGHT_MICROSERVICE_URL= diff --git a/apps/api/package.json b/apps/api/package.json index 9e3a3d8..a951aaf 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -39,6 +39,7 @@ "typescript": "^5.4.2" }, "dependencies": { + "@anthropic-ai/sdk": "^0.20.5", "@brillout/import": "^0.2.2", "@bull-board/api": "^5.14.2", "@bull-board/express": "^5.8.0", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index 3539868..08b1de2 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -5,6 +5,9 @@ settings: excludeLinksFromLockfile: false dependencies: + '@anthropic-ai/sdk': + specifier: ^0.20.5 + version: 0.20.5 '@brillout/import': specifier: ^0.2.2 version: 0.2.3 @@ -213,6 +216,21 @@ packages: '@jridgewell/trace-mapping': 0.3.25 dev: true + /@anthropic-ai/sdk@0.20.5: + resolution: {integrity: sha512-d0ch+zp6/gHR4+2wqWV7JU1EJ7PpHc3r3F6hebovJTouY+pkaId1FuYYaVsG3l/gyqhOZUwKCMSMqcFNf+ZmWg==} + dependencies: + '@types/node': 18.19.22 + '@types/node-fetch': 2.6.11 + abort-controller: 3.0.0 + agentkeepalive: 4.5.0 + form-data-encoder: 1.7.2 + formdata-node: 4.4.1 + node-fetch: 2.7.0 + web-streams-polyfill: 3.3.3 + transitivePeerDependencies: + - encoding + dev: false + /@anthropic-ai/sdk@0.9.1: resolution: {integrity: sha512-wa1meQ2WSfoY8Uor3EdrJq0jTiZJoKoSii2ZVWRY1oN4Tlr5s59pADg9T79FTbPe1/se5c3pBeZgJL63wmuoBA==} dependencies: diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index b54d9e6..62ea16c 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -4,7 +4,7 @@ import { scrapSingleUrl } from "./single_url"; import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap"; import { WebCrawler } from "./crawler"; import { getValue, setValue } from "../../services/redis"; -import { getImageDescription } from "./utils/gptVision"; +import { getImageDescription } from "./utils/imageDescription"; export type WebScraperOptions = { urls: string[]; @@ -16,6 +16,7 @@ export type WebScraperOptions = { maxCrawledLinks?: number; limit?: number; generateImgAltText?: boolean; + generateImgAltTextModel?: "gpt-4-turbo" | "anthropic"; }; concurrentRequests?: number; }; @@ -29,6 +30,7 @@ export class WebScraperDataProvider { private limit: number = 10000; private concurrentRequests: number = 20; private generateImgAltText: boolean = false; + private generateImgAltTextModel: "gpt-4-turbo" | "anthropic" = "gpt-4-turbo"; authorize(): void { throw new Error("Method not implemented."); @@ -312,7 +314,7 @@ export class WebScraperDataProvider { let backText = document.content.substring(imageIndex + image.length, Math.min(imageIndex + image.length + 1000, contentLength)); let frontTextStartIndex = Math.max(imageIndex - 1000, 0); let frontText = document.content.substring(frontTextStartIndex, imageIndex); - altText = await getImageDescription(newImageUrl, backText, frontText); + altText = await getImageDescription(newImageUrl, backText, frontText, this.generateImgAltTextModel); } document.content = document.content.replace(image, `![${altText}](${newImageUrl})`); diff --git a/apps/api/src/scraper/WebScraper/utils/gptVision.ts b/apps/api/src/scraper/WebScraper/utils/gptVision.ts deleted file mode 100644 index 7458a56..0000000 --- a/apps/api/src/scraper/WebScraper/utils/gptVision.ts +++ /dev/null @@ -1,41 +0,0 @@ -export async function getImageDescription( - imageUrl: string, - backText: string, - frontText: string -): Promise { - const { OpenAI } = require("openai"); - const openai = new OpenAI(); - - try { - const response = await openai.chat.completions.create({ - model: "gpt-4-turbo", - messages: [ - { - role: "user", - content: [ - { - type: "text", - text: - "What's in the image? You need to answer with the content for the alt tag of the image. To help you with the context, the image is in the following text: " + - backText + - " and the following text: " + - frontText + - ". Be super concise.", - }, - { - type: "image_url", - image_url: { - url: imageUrl, - }, - }, - ], - }, - ], - }); - - return response.choices[0].message.content; - } catch (error) { - console.error("Error generating image alt text:", error?.message); - return ""; - } -} diff --git a/apps/api/src/scraper/WebScraper/utils/imageDescription.ts b/apps/api/src/scraper/WebScraper/utils/imageDescription.ts new file mode 100644 index 0000000..d2db37b --- /dev/null +++ b/apps/api/src/scraper/WebScraper/utils/imageDescription.ts @@ -0,0 +1,98 @@ +import Anthropic from '@anthropic-ai/sdk'; +import axios from 'axios'; + +export async function getImageDescription( + imageUrl: string, + backText: string, + frontText: string, + model: string = "gpt-4-turbo" +): Promise { + try { + const prompt = "What's in the image? You need to answer with the content for the alt tag of the image. To help you with the context, the image is in the following text: " + + backText + + " and the following text: " + + frontText + + ". Be super concise." + + switch (model) { + case 'anthropic': { + if (!process.env.ANTHROPIC_API_KEY) { + throw new Error("No Anthropic API key provided"); + } + const imageRequest = await axios.get(imageUrl, { responseType: 'arraybuffer' }); + const imageMediaType = 'image/png'; + const imageData = Buffer.from(imageRequest.data, 'binary').toString('base64'); + + const anthropic = new Anthropic(); + const response = await anthropic.messages.create({ + model: "claude-3-opus-20240229", + max_tokens: 1024, + messages: [ + { + role: "user", + content: [ + { + type: "image", + source: { + type: "base64", + media_type: imageMediaType, + data: imageData, + }, + }, + { + type: "text", + text: prompt + } + ], + } + ] + }); + + return response.content[0].text; + + // const response = await anthropic.messages.create({ + // messages: [ + // { + // role: "user", + // content: prompt, + // }, + // ], + // }); + + } + default: { + if (!process.env.OPENAI_API_KEY) { + throw new Error("No OpenAI API key provided"); + } + + const { OpenAI } = require("openai"); + const openai = new OpenAI(); + + const response = await openai.chat.completions.create({ + model: "gpt-4-turbo", + messages: [ + { + role: "user", + content: [ + { + type: "text", + text: prompt, + }, + { + type: "image_url", + image_url: { + url: imageUrl, + }, + }, + ], + }, + ], + }); + return response.choices[0].message.content; + } + } + } catch (error) { + console.error("Error generating image alt text:", error?.message); + return ""; + } +} From ed5dc808c7f356d7a5f63a38ed42d2d087463d23 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 16 Apr 2024 18:05:07 -0300 Subject: [PATCH 004/102] Update imageDescription.ts --- .../src/scraper/WebScraper/utils/imageDescription.ts | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/utils/imageDescription.ts b/apps/api/src/scraper/WebScraper/utils/imageDescription.ts index d2db37b..a01c757 100644 --- a/apps/api/src/scraper/WebScraper/utils/imageDescription.ts +++ b/apps/api/src/scraper/WebScraper/utils/imageDescription.ts @@ -49,16 +49,6 @@ export async function getImageDescription( }); return response.content[0].text; - - // const response = await anthropic.messages.create({ - // messages: [ - // { - // role: "user", - // content: prompt, - // }, - // ], - // }); - } default: { if (!process.env.OPENAI_API_KEY) { From 23d391bb2b053616ad61f17d1e8aad2c04984935 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Viktor=20Sz=C3=A9pe?= Date: Wed, 17 Apr 2024 07:09:14 +0200 Subject: [PATCH 005/102] Delete .DS_Store --- .DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 .DS_Store diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index 820b40ca6d639737ccf8e2f95a9ffc4639ea2ef3..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKO>fgc5S>i}aa1AY08)>ZxVAL36{zB3g&=xB959pvpkOCaYjC`lKd?{~$!GXO z{1Sc-Z)P_YB}Kg<1Uu5~+nJpi@4ijEULsPXdUl^^KtvvtvC~I$h447*hIFi_3sh3$ zoN9{mDt=a%8_~}3H!{F`*Q0xMMhR8))BaTjMouWC&s1V$P7B(nQ__@Tj~B2@*kg(? z7NPeF_M8gwW~eol$O!fn;|pjZabx(WwnBznuJTl#eZs#TSPkmLv?rOIlS|z_`XI4a z&Gva+%4~mG2j<_E{G7_TDf2khF?`?RJWa}KIQ*_UTiqKsdtT4m_TGi(x(TbOs>@MS zykYC5)@i)%NAYPgZzhA=k9Af>Nj5JeA(_n)^7eI-&2%%;bv7#{H#7rY-|J5XJIm$a z@zJh-^zdM{>o1QFhUgzYSgrcroxA&wUQ9lwi%fsP+#m}7B#~Pd*YFjci}id8i!9aI z8GP&bb(}-TfMdWha6JR|&EaidAL=3;1CD|Ji~)W>_)x~cVrx)d9cbhV0OZd|5%l>k z!8yLgz+!67?xCL)j-QdqYw3bkz4HoK#@YwT=PD zz-0!uj5*-*|H<#~|Cf`THeuq1o7ZY++^S|9oz%EEE2!HX1h Date: Wed, 17 Apr 2024 07:10:06 +0200 Subject: [PATCH 006/102] Update .gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 2a7c2a5..cbfb076 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,8 @@ +.DS_Store /node_modules/ /dist/ .env *.csv dump.rdb /mongo-data -apps/js-sdk/node_modules/ \ No newline at end of file +apps/js-sdk/node_modules/ From 34ab21db5964c97f3b27cbf6462d1151d4f258e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Viktor=20Sz=C3=A9pe?= Date: Wed, 17 Apr 2024 05:13:27 +0000 Subject: [PATCH 007/102] Fix typos --- apps/api/src/index.ts | 6 +++--- apps/api/src/main/runWebScraper.ts | 4 ++-- apps/api/src/services/rate-limiter.ts | 4 ++-- apps/playwright-service/.gitignore | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 7198988..476eeb3 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -129,15 +129,15 @@ app.post("/v0/scrape", async (req, res) => { filteredDocs.length ); if (!success) { - // throw new Error("Failed to bill team, no subscribtion was found"); + // throw new Error("Failed to bill team, no subscription was found"); // return { // success: false, - // message: "Failed to bill team, no subscribtion was found", + // message: "Failed to bill team, no subscription was found", // docs: [], // }; return res .status(402) - .json({ error: "Failed to bill, no subscribtion was found" }); + .json({ error: "Failed to bill, no subscription was found" }); } return res.json({ success: true, diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index 9798297..762e153 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -79,10 +79,10 @@ export async function runWebScraper({ filteredDocs.length ); if (!success) { - // throw new Error("Failed to bill team, no subscribtion was found"); + // throw new Error("Failed to bill team, no subscription was found"); return { success: false, - message: "Failed to bill team, no subscribtion was found", + message: "Failed to bill team, no subscription was found", docs: [], }; } diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts index a12e6d8..8e2fe3b 100644 --- a/apps/api/src/services/rate-limiter.ts +++ b/apps/api/src/services/rate-limiter.ts @@ -3,7 +3,7 @@ import * as redis from "redis"; const MAX_REQUESTS_PER_MINUTE_PREVIEW = 5; const MAX_CRAWLS_PER_MINUTE_STARTER = 2; -const MAX_CRAWLS_PER_MINUTE_STANDAR = 4; +const MAX_CRAWLS_PER_MINUTE_STANDARD = 4; const MAX_CRAWLS_PER_MINUTE_SCALE = 20; const MAX_REQUESTS_PER_MINUTE_ACCOUNT = 40; @@ -35,7 +35,7 @@ export function crawlRateLimit(plan: string){ return new RateLimiterRedis({ storeClient: redisClient, keyPrefix: "middleware", - points: MAX_CRAWLS_PER_MINUTE_STANDAR, + points: MAX_CRAWLS_PER_MINUTE_STANDARD, duration: 60, // Duration in seconds }); }else if(plan === "scale"){ diff --git a/apps/playwright-service/.gitignore b/apps/playwright-service/.gitignore index d9005f2..de2d5e0 100644 --- a/apps/playwright-service/.gitignore +++ b/apps/playwright-service/.gitignore @@ -145,7 +145,7 @@ dmypy.json cython_debug/ # PyCharm -# JetBrains specific template is maintainted in a separate JetBrains.gitignore that can +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. From 51f94e9e413df0ba818385d5ae115658f0f56283 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Viktor=20Sz=C3=A9pe?= Date: Wed, 17 Apr 2024 08:53:01 +0200 Subject: [PATCH 008/102] Delete apps/.DS_Store --- apps/.DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 apps/.DS_Store diff --git a/apps/.DS_Store b/apps/.DS_Store deleted file mode 100644 index 61e35a269e128530319b78e078816f56129b60ae..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKQA@)x5WZ~Fb&4=VVUGo073Ve-@uk%H2dwCW%4}`f%4###Ze@%?pY;#a<<576cACX9Wa z*?biHiHLRNhEXvpooa15ZJxIF&DKd{wr@^P8+Ejs$Fo_**xfrkyX^F%r;`qNn($j<~{v>tGXfq3zuzao5mncJuQ=lp40Ee6y*F*a8# zEuD>KKr`?Q4AA>vq7qsPbBW^Vz(KSCi1ZsF1a0aiD8^K1Da<9}4hoZ@h%%I^7K6!f zTsM`Ur7)K$!-1*hgQ+hw)uAx)bv)k`?!YXGdejVP1{N78%4wDA|Hb$H|6-7yX$CX{ zE5!gSwVk$&DXH4JlpNJs6ZHX=g#2=eA_NCjj)g&`xP~eO_e?SnErq#6TtU%40)hrT JXa@e2fp>Rfl}G>p From 11394ef23600915694fd64779d1ba3f2a9adc323 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Viktor=20Sz=C3=A9pe?= Date: Wed, 17 Apr 2024 08:53:12 +0200 Subject: [PATCH 009/102] Delete apps/api/src/.DS_Store --- apps/api/src/.DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 apps/api/src/.DS_Store diff --git a/apps/api/src/.DS_Store b/apps/api/src/.DS_Store deleted file mode 100644 index bbd93bc18e19db173a03e7b011153a92b1dd3766..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKOHRWu5S>Yb6x2$WRMijN=XX->JxyDdRozyw6tCIE-P_H_RoA;b;$1&>SC#h+FVlwZ>4~0Eb%EKY zUpc=#6w}><|9p8r*F6l@YTEs7j>Yx&==;m zgM$kCgh4TWIxxf|05FF+3g+=259q`I^a+C^ED+XIpr&#T&IjDbD_BfXqR|G)nJ{NGQqD`UVI_)`pUUe3!IZYlNF!Ocmp sb`o>?pD-xG0hNU1!VX}@Bjb+ From d628511b57bea46b8eb142e89e15e39e8fd8f8ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Viktor=20Sz=C3=A9pe?= Date: Wed, 17 Apr 2024 08:53:23 +0200 Subject: [PATCH 010/102] Delete apps/playwright-service/.DS_Store --- apps/playwright-service/.DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 apps/playwright-service/.DS_Store diff --git a/apps/playwright-service/.DS_Store b/apps/playwright-service/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0 Date: Wed, 17 Apr 2024 12:59:49 -0300 Subject: [PATCH 011/102] [bugfix] added normalized apikey to craw/status route --- apps/api/src/index.ts | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 7198988..0e89c6a 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -256,11 +256,13 @@ app.get("/v0/crawl/status/:jobId", async (req, res) => { return res.status(401).json({ error: "Unauthorized: Token missing" }); } + const normalizedApi = parseApi(token); // make sure api key is valid, based on the api_keys table in supabase const { data, error } = await supabase_service .from("api_keys") .select("*") - .eq("key", token); + .eq("key", normalizedApi); + if (error || !data || data.length === 0) { return res.status(401).json({ error: "Unauthorized: Invalid token" }); } From 27674a624d93de19928f1e89a3db1e134cf300c8 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 17 Apr 2024 10:39:00 -0700 Subject: [PATCH 012/102] Update index.ts --- apps/api/src/scraper/WebScraper/index.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 62ea16c..ce9c7bf 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -16,7 +16,7 @@ export type WebScraperOptions = { maxCrawledLinks?: number; limit?: number; generateImgAltText?: boolean; - generateImgAltTextModel?: "gpt-4-turbo" | "anthropic"; + generateImgAltTextModel?: "gpt-4-turbo" | "claude-3-opus"; }; concurrentRequests?: number; }; @@ -30,7 +30,7 @@ export class WebScraperDataProvider { private limit: number = 10000; private concurrentRequests: number = 20; private generateImgAltText: boolean = false; - private generateImgAltTextModel: "gpt-4-turbo" | "anthropic" = "gpt-4-turbo"; + private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" = "gpt-4-turbo"; authorize(): void { throw new Error("Method not implemented."); From db15724b0c9573ad0c463fca76e96b1239be9df3 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 17 Apr 2024 10:39:29 -0700 Subject: [PATCH 013/102] Update imageDescription.ts --- apps/api/src/scraper/WebScraper/utils/imageDescription.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/scraper/WebScraper/utils/imageDescription.ts b/apps/api/src/scraper/WebScraper/utils/imageDescription.ts index a01c757..3d780ab 100644 --- a/apps/api/src/scraper/WebScraper/utils/imageDescription.ts +++ b/apps/api/src/scraper/WebScraper/utils/imageDescription.ts @@ -15,7 +15,7 @@ export async function getImageDescription( ". Be super concise." switch (model) { - case 'anthropic': { + case 'claude-3-opus': { if (!process.env.ANTHROPIC_API_KEY) { throw new Error("No Anthropic API key provided"); } From 82ed9515f1f2e25578ce31d7b8ae71ecf9241e6e Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 17 Apr 2024 10:52:10 -0700 Subject: [PATCH 014/102] Update index.ts --- apps/api/src/index.ts | 47 +++++++++++++++++-------------------------- 1 file changed, 18 insertions(+), 29 deletions(-) diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 0e89c6a..0663c5c 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -45,14 +45,14 @@ app.get("/test", async (req, res) => { res.send("Hello, world!"); }); -async function authenticateUser(req, res, mode?: string): Promise { +async function authenticateUser(req, res, mode?: string): Promise<{ success: boolean, team_id?: string, error?: string, status?: number }> { const authHeader = req.headers.authorization; if (!authHeader) { - return res.status(401).json({ error: "Unauthorized" }); + return { success: false, error: "Unauthorized", status: 401 }; } const token = authHeader.split(" ")[1]; // Extract the token from "Bearer " if (!token) { - return res.status(401).json({ error: "Unauthorized: Token missing" }); + return { success: false, error: "Unauthorized: Token missing", status: 401 }; } try { @@ -64,13 +64,11 @@ async function authenticateUser(req, res, mode?: string): Promise { ).consume(iptoken); } catch (rateLimiterRes) { console.error(rateLimiterRes); - return res.status(429).json({ - error: "Rate limit exceeded. Too many requests, try again in 1 minute.", - }); + return { success: false, error: "Rate limit exceeded. Too many requests, try again in 1 minute.", status: 429 }; } if (token === "this_is_just_a_preview_token" && mode === "scrape") { - return "preview"; + return { success: true, team_id: "preview" }; } const normalizedApi = parseApi(token); @@ -80,16 +78,19 @@ async function authenticateUser(req, res, mode?: string): Promise { .select("*") .eq("key", normalizedApi); if (error || !data || data.length === 0) { - return res.status(401).json({ error: "Unauthorized: Invalid token" }); + return { success: false, error: "Unauthorized: Invalid token", status: 401 }; } - return data[0].team_id; + return { success: true, team_id: data[0].team_id }; } app.post("/v0/scrape", async (req, res) => { try { // make sure to authenticate user first, Bearer - const team_id = await authenticateUser(req, res, "scrape"); + const { success, team_id, error, status } = await authenticateUser(req, res, "scrape"); + if (!success) { + return res.status(status).json({ error }); + } try { const { success: creditsCheckSuccess, message: creditsCheckMessage } = @@ -155,7 +156,10 @@ app.post("/v0/scrape", async (req, res) => { app.post("/v0/crawl", async (req, res) => { try { - const team_id = await authenticateUser(req, res); + const { success, team_id, error, status } = await authenticateUser(req, res, "scrape"); + if (!success) { + return res.status(status).json({ error }); + } const { success: creditsCheckSuccess, message: creditsCheckMessage } = await checkTeamCredits(team_id, 1); @@ -247,24 +251,9 @@ app.post("/v0/crawlWebsitePreview", async (req, res) => { app.get("/v0/crawl/status/:jobId", async (req, res) => { try { - const authHeader = req.headers.authorization; - if (!authHeader) { - return res.status(401).json({ error: "Unauthorized" }); - } - const token = authHeader.split(" ")[1]; // Extract the token from "Bearer " - if (!token) { - return res.status(401).json({ error: "Unauthorized: Token missing" }); - } - - const normalizedApi = parseApi(token); - // make sure api key is valid, based on the api_keys table in supabase - const { data, error } = await supabase_service - .from("api_keys") - .select("*") - .eq("key", normalizedApi); - - if (error || !data || data.length === 0) { - return res.status(401).json({ error: "Unauthorized: Invalid token" }); + const { success, team_id, error, status } = await authenticateUser(req, res, "scrape"); + if (!success) { + return res.status(status).json({ error }); } const job = await getWebScraperQueue().getJob(req.params.jobId); if (!job) { From b375ce3e39df3ce0a44bf1778ca389b0fe04bdf2 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Wed, 17 Apr 2024 14:54:54 -0300 Subject: [PATCH 015/102] adding unit tests and bugfixing --- .../WebScraper/__tests__/index.test.ts | 97 +++++++++++++++++++ apps/api/src/scraper/WebScraper/index.ts | 15 ++- 2 files changed, 107 insertions(+), 5 deletions(-) create mode 100644 apps/api/src/scraper/WebScraper/__tests__/index.test.ts diff --git a/apps/api/src/scraper/WebScraper/__tests__/index.test.ts b/apps/api/src/scraper/WebScraper/__tests__/index.test.ts new file mode 100644 index 0000000..e060d16 --- /dev/null +++ b/apps/api/src/scraper/WebScraper/__tests__/index.test.ts @@ -0,0 +1,97 @@ +import { WebScraperDataProvider } from '../index'; + +describe('WebScraperDataProvider', () => { + describe('replaceImgPathsWithAbsolutePaths', () => { + it('should replace image paths with absolute paths', () => { + const webScraperDataProvider = new WebScraperDataProvider(); + const documents = [ + { + metadata: { sourceURL: 'https://example.com/page' }, + content: '![alt text](/image.png)', + }, + { + metadata: { sourceURL: 'https://example.com/another-page' }, + content: '![another alt text](./another-image.png)', + }, + { + metadata: { sourceURL: 'https://example.com/data-image' }, + content: '![data image](data:image/png;base64,...)', + } + ]; + + const expectedDocuments = [ + { + metadata: { sourceURL: 'https://example.com/page' }, + content: '![alt text](https://example.com/image.png)', + }, + { + metadata: { sourceURL: 'https://example.com/another-page' }, + content: '![another alt text](https://example.com/another-image.png)', + }, + { + metadata: { sourceURL: 'https://example.com/data-image' }, + content: '![data image](data:image/png;base64,...)', + } + ]; + + const result = webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents); + expect(result).toEqual(expectedDocuments); + }); + + it('should handle absolute URLs without modification', () => { + const webScraperDataProvider = new WebScraperDataProvider(); + const documents = [ + { + metadata: { sourceURL: 'https://example.com/page' }, + content: '![alt text](https://example.com/image.png)', + }, + { + metadata: { sourceURL: 'https://example.com/another-page' }, + content: '![another alt text](http://anotherexample.com/another-image.png)', + } + ]; + + const expectedDocuments = [ + { + metadata: { sourceURL: 'https://example.com/page' }, + content: '![alt text](https://example.com/image.png)', + }, + { + metadata: { sourceURL: 'https://example.com/another-page' }, + content: '![another alt text](http://anotherexample.com/another-image.png)', + } + ]; + + const result = webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents); + expect(result).toEqual(expectedDocuments); + }); + + it('should not replace non-image content within the documents', () => { + const webScraperDataProvider = new WebScraperDataProvider(); + const documents = [ + { + metadata: { sourceURL: 'https://example.com/page' }, + content: 'This is a test. ![alt text](/image.png) Here is a link: [Example](https://example.com).', + }, + { + metadata: { sourceURL: 'https://example.com/another-page' }, + content: 'Another test. ![another alt text](./another-image.png) Here is some **bold text**.', + } + ]; + + const expectedDocuments = [ + { + metadata: { sourceURL: 'https://example.com/page' }, + content: 'This is a test. ![alt text](https://example.com/image.png) Here is a link: [Example](https://example.com).', + }, + { + metadata: { sourceURL: 'https://example.com/another-page' }, + content: 'Another test. ![another alt text](https://example.com/another-image.png) Here is some **bold text**.', + } + ]; + + const result = webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents); + expect(result).toEqual(expectedDocuments); + }); + }); +}); \ No newline at end of file diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 6f368a1..727b597 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -325,19 +325,24 @@ export class WebScraperDataProvider { documents.forEach(document => { const baseUrl = new URL(document.metadata.sourceURL).origin; const images = document.content.match(/!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g) || []; - + images.forEach(image => { let imageUrl = image.match(/\(([^)]+)\)/)[1]; let altText = image.match(/\[(.*?)\]/)[1]; - + if (!imageUrl.startsWith("data:image")) { - imageUrl = baseUrl + imageUrl; + if (!imageUrl.startsWith("http")) { + if (imageUrl.startsWith("/")) { + imageUrl = imageUrl.substring(1); + } + imageUrl = new URL(imageUrl, baseUrl).toString(); + } } - + document.content = document.content.replace(image, `![${altText}](${imageUrl})`); }); }); - + return documents; } } From 417921ea339f5252bec5864acfbbee1f05ce6368 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 17 Apr 2024 10:57:01 -0700 Subject: [PATCH 016/102] Update index.ts --- apps/api/src/index.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 0663c5c..9358672 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -87,7 +87,7 @@ async function authenticateUser(req, res, mode?: string): Promise<{ success: boo app.post("/v0/scrape", async (req, res) => { try { // make sure to authenticate user first, Bearer - const { success, team_id, error, status } = await authenticateUser(req, res, "scrape"); + const { success, team_id, error, status } = await authenticateUser(req, res, "crawl"); if (!success) { return res.status(status).json({ error }); } From 2eb81545fa50e8aee61c855bc00fe3f1625c41e2 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 17 Apr 2024 11:04:03 -0700 Subject: [PATCH 017/102] Update index.test.ts --- .../WebScraper/__tests__/index.test.ts | 166 +++++++++++++----- 1 file changed, 120 insertions(+), 46 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/__tests__/index.test.ts b/apps/api/src/scraper/WebScraper/__tests__/index.test.ts index e060d16..49b3926 100644 --- a/apps/api/src/scraper/WebScraper/__tests__/index.test.ts +++ b/apps/api/src/scraper/WebScraper/__tests__/index.test.ts @@ -1,97 +1,171 @@ -import { WebScraperDataProvider } from '../index'; +import { WebScraperDataProvider } from "../index"; -describe('WebScraperDataProvider', () => { - describe('replaceImgPathsWithAbsolutePaths', () => { - it('should replace image paths with absolute paths', () => { +describe("WebScraperDataProvider", () => { + describe("replaceImgPathsWithAbsolutePaths", () => { + it("should replace image paths with absolute paths", () => { const webScraperDataProvider = new WebScraperDataProvider(); const documents = [ { - metadata: { sourceURL: 'https://example.com/page' }, - content: '![alt text](/image.png)', + metadata: { sourceURL: "https://example.com/page" }, + content: "![alt text](/image.png)", }, { - metadata: { sourceURL: 'https://example.com/another-page' }, - content: '![another alt text](./another-image.png)', + metadata: { sourceURL: "https://example.com/another-page" }, + content: "![another alt text](./another-image.png)", }, { - metadata: { sourceURL: 'https://example.com/data-image' }, - content: '![data image](data:image/png;base64,...)', - } + metadata: { sourceURL: "https://example.com/data-image" }, + content: "![data image](data:image/png;base64,...)", + }, ]; const expectedDocuments = [ { - metadata: { sourceURL: 'https://example.com/page' }, - content: '![alt text](https://example.com/image.png)', + metadata: { sourceURL: "https://example.com/page" }, + content: "![alt text](https://example.com/image.png)", }, { - metadata: { sourceURL: 'https://example.com/another-page' }, - content: '![another alt text](https://example.com/another-image.png)', + metadata: { sourceURL: "https://example.com/another-page" }, + content: "![another alt text](https://example.com/another-image.png)", }, { - metadata: { sourceURL: 'https://example.com/data-image' }, - content: '![data image](data:image/png;base64,...)', - } + metadata: { sourceURL: "https://example.com/data-image" }, + content: "![data image](data:image/png;base64,...)", + }, ]; - const result = webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents); + const result = + webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents); expect(result).toEqual(expectedDocuments); }); - it('should handle absolute URLs without modification', () => { + it("should handle absolute URLs without modification", () => { const webScraperDataProvider = new WebScraperDataProvider(); const documents = [ { - metadata: { sourceURL: 'https://example.com/page' }, - content: '![alt text](https://example.com/image.png)', + metadata: { sourceURL: "https://example.com/page" }, + content: "![alt text](https://example.com/image.png)", }, { - metadata: { sourceURL: 'https://example.com/another-page' }, - content: '![another alt text](http://anotherexample.com/another-image.png)', - } + metadata: { sourceURL: "https://example.com/another-page" }, + content: + "![another alt text](http://anotherexample.com/another-image.png)", + }, ]; const expectedDocuments = [ { - metadata: { sourceURL: 'https://example.com/page' }, - content: '![alt text](https://example.com/image.png)', + metadata: { sourceURL: "https://example.com/page" }, + content: "![alt text](https://example.com/image.png)", }, { - metadata: { sourceURL: 'https://example.com/another-page' }, - content: '![another alt text](http://anotherexample.com/another-image.png)', - } + metadata: { sourceURL: "https://example.com/another-page" }, + content: + "![another alt text](http://anotherexample.com/another-image.png)", + }, ]; - const result = webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents); + const result = + webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents); expect(result).toEqual(expectedDocuments); }); - it('should not replace non-image content within the documents', () => { + it("should not replace non-image content within the documents", () => { const webScraperDataProvider = new WebScraperDataProvider(); const documents = [ { - metadata: { sourceURL: 'https://example.com/page' }, - content: 'This is a test. ![alt text](/image.png) Here is a link: [Example](https://example.com).', + metadata: { sourceURL: "https://example.com/page" }, + content: + "This is a test. ![alt text](/image.png) Here is a link: [Example](https://example.com).", }, { - metadata: { sourceURL: 'https://example.com/another-page' }, - content: 'Another test. ![another alt text](./another-image.png) Here is some **bold text**.', - } + metadata: { sourceURL: "https://example.com/another-page" }, + content: + "Another test. ![another alt text](./another-image.png) Here is some **bold text**.", + }, ]; - + const expectedDocuments = [ { - metadata: { sourceURL: 'https://example.com/page' }, - content: 'This is a test. ![alt text](https://example.com/image.png) Here is a link: [Example](https://example.com).', + metadata: { sourceURL: "https://example.com/page" }, + content: + "This is a test. ![alt text](https://example.com/image.png) Here is a link: [Example](https://example.com).", }, { - metadata: { sourceURL: 'https://example.com/another-page' }, - content: 'Another test. ![another alt text](https://example.com/another-image.png) Here is some **bold text**.', - } + metadata: { sourceURL: "https://example.com/another-page" }, + content: + "Another test. ![another alt text](https://example.com/another-image.png) Here is some **bold text**.", + }, ]; - - const result = webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents); + + const result = + webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents); + expect(result).toEqual(expectedDocuments); + }); + it("should replace multiple image paths within the documents", () => { + const webScraperDataProvider = new WebScraperDataProvider(); + const documents = [ + { + metadata: { sourceURL: "https://example.com/page" }, + content: + "This is a test. ![alt text](/image1.png) Here is a link: [Example](https://example.com). ![alt text](/image2.png)", + }, + { + metadata: { sourceURL: "https://example.com/another-page" }, + content: + "Another test. ![another alt text](./another-image1.png) Here is some **bold text**. ![another alt text](./another-image2.png)", + }, + ]; + + const expectedDocuments = [ + { + metadata: { sourceURL: "https://example.com/page" }, + content: + "This is a test. ![alt text](https://example.com/image1.png) Here is a link: [Example](https://example.com). ![alt text](https://example.com/image2.png)", + }, + { + metadata: { sourceURL: "https://example.com/another-page" }, + content: + "Another test. ![another alt text](https://example.com/another-image1.png) Here is some **bold text**. ![another alt text](https://example.com/another-image2.png)", + }, + ]; + + const result = + webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents); + expect(result).toEqual(expectedDocuments); + }); + + it("should replace image paths within the documents with complex URLs", () => { + const webScraperDataProvider = new WebScraperDataProvider(); + const documents = [ + { + metadata: { sourceURL: "https://example.com/page/subpage" }, + content: + "This is a test. ![alt text](/image1.png) Here is a link: [Example](https://example.com). ![alt text](/sub/image2.png)", + }, + { + metadata: { sourceURL: "https://example.com/another-page/subpage" }, + content: + "Another test. ![another alt text](/another-page/another-image1.png) Here is some **bold text**. ![another alt text](/another-page/sub/another-image2.png)", + }, + ]; + + const expectedDocuments = [ + { + metadata: { sourceURL: "https://example.com/page/subpage" }, + content: + "This is a test. ![alt text](https://example.com/image1.png) Here is a link: [Example](https://example.com). ![alt text](https://example.com/sub/image2.png)", + }, + { + metadata: { sourceURL: "https://example.com/another-page/subpage" }, + content: + "Another test. ![another alt text](https://example.com/another-page/another-image1.png) Here is some **bold text**. ![another alt text](https://example.com/another-page/sub/another-image2.png)", + }, + ]; + + const result = + webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents); expect(result).toEqual(expectedDocuments); }); }); -}); \ No newline at end of file +}); From 08ed68ff5592bc31897f7849f625f277a24a9bf1 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 17 Apr 2024 12:44:23 -0700 Subject: [PATCH 018/102] Nick: fixes --- apps/api/package.json | 1 + apps/api/pnpm-lock.yaml | 7 +++++++ apps/api/src/lib/html-to-markdown.ts | 4 +++- apps/api/src/scraper/WebScraper/single_url.ts | 1 - apps/api/src/scraper/WebScraper/utils/parseTable.ts | 1 - 5 files changed, 11 insertions(+), 3 deletions(-) diff --git a/apps/api/package.json b/apps/api/package.json index 9e3a3d8..e8e5e02 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -82,6 +82,7 @@ "scrapingbee": "^1.7.4", "stripe": "^12.2.0", "turndown": "^7.1.3", + "turndown-plugin-gfm": "^1.0.2", "typesense": "^1.5.4", "unstructured-client": "^0.9.4", "uuid": "^9.0.1", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index 3539868..8142189 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -134,6 +134,9 @@ dependencies: turndown: specifier: ^7.1.3 version: 7.1.3 + turndown-plugin-gfm: + specifier: ^1.0.2 + version: 1.0.2 typesense: specifier: ^1.5.4 version: 1.7.2(@babel/runtime@7.24.0) @@ -5783,6 +5786,10 @@ packages: resolution: {integrity: sha512-AEYxH93jGFPn/a2iVAwW87VuUIkR1FVUKB77NwMF7nBTDkDrrT/Hpt/IrCJ0QXhW27jTBDcf5ZY7w6RiqTMw2Q==} dev: false + /turndown-plugin-gfm@1.0.2: + resolution: {integrity: sha512-vwz9tfvF7XN/jE0dGoBei3FXWuvll78ohzCZQuOb+ZjWrs3a0XhQVomJEb2Qh4VHTPNRO4GPZh0V7VRbiWwkRg==} + dev: false + /turndown@7.1.3: resolution: {integrity: sha512-Z3/iJ6IWh8VBiACWQJaA5ulPQE5E1QwvBHj00uGzdQxdRnd8fh1DPqNOJqzQDu6DkOstORrtXzf/9adB+vMtEA==} dependencies: diff --git a/apps/api/src/lib/html-to-markdown.ts b/apps/api/src/lib/html-to-markdown.ts index 6c816ab..0fd8c93 100644 --- a/apps/api/src/lib/html-to-markdown.ts +++ b/apps/api/src/lib/html-to-markdown.ts @@ -1,5 +1,6 @@ export function parseMarkdown(html: string) { var TurndownService = require("turndown"); + var turndownPluginGfm = require("turndown-plugin-gfm"); const turndownService = new TurndownService(); turndownService.addRule("inlineLink", { @@ -16,7 +17,8 @@ export function parseMarkdown(html: string) { return "[" + content.trim() + "](" + href + title + ")\n"; }, }); - + var gfm = turndownPluginGfm.gfm; + turndownService.use(gfm); let markdownContent = turndownService.turndown(html); // multiple line links diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index faba56c..f71221c 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -134,7 +134,6 @@ export async function scrapSingleUrl( break; } let cleanedHtml = removeUnwantedElements(text); - cleanedHtml = await parseTablesToMarkdown(cleanedHtml); return [await parseMarkdown(cleanedHtml), text]; }; diff --git a/apps/api/src/scraper/WebScraper/utils/parseTable.ts b/apps/api/src/scraper/WebScraper/utils/parseTable.ts index 7d0a602..9855650 100644 --- a/apps/api/src/scraper/WebScraper/utils/parseTable.ts +++ b/apps/api/src/scraper/WebScraper/utils/parseTable.ts @@ -24,7 +24,6 @@ export const parseTablesToMarkdown = async (html: string): Promise => { if (isTableEmpty) { markdownTable = ''; } - console.log({markdownTable}) replacements.push({ start, end, markdownTable }); }); } From 871d5d91b0fa0d477425bbdc512edcbaa4a33f56 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 17 Apr 2024 12:51:12 -0700 Subject: [PATCH 019/102] Update index.ts --- apps/api/src/scraper/WebScraper/index.ts | 284 +++++++++++++++-------- 1 file changed, 190 insertions(+), 94 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 727b597..ecb2fff 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -49,19 +49,21 @@ export class WebScraperDataProvider { const results: (Document | null)[] = new Array(urls.length).fill(null); for (let i = 0; i < urls.length; i += this.concurrentRequests) { const batchUrls = urls.slice(i, i + this.concurrentRequests); - await Promise.all(batchUrls.map(async (url, index) => { - const result = await scrapSingleUrl(url, true); - processedUrls++; - if (inProgress) { - inProgress({ - current: processedUrls, - total: totalUrls, - status: "SCRAPING", - currentDocumentUrl: url, - }); - } - results[i + index] = result; - })); + await Promise.all( + batchUrls.map(async (url, index) => { + const result = await scrapSingleUrl(url, true); + processedUrls++; + if (inProgress) { + inProgress({ + current: processedUrls, + total: totalUrls, + status: "SCRAPING", + currentDocumentUrl: url, + }); + } + results[i + index] = result; + }) + ); } return results.filter((result) => result !== null) as Document[]; } @@ -102,33 +104,58 @@ export class WebScraperDataProvider { // CACHING DOCUMENTS // - parent document - const cachedParentDocumentString = await getValue('web-scraper-cache:' + this.normalizeUrl(this.urls[0])); + const cachedParentDocumentString = await getValue( + "web-scraper-cache:" + this.normalizeUrl(this.urls[0]) + ); if (cachedParentDocumentString != null) { let cachedParentDocument = JSON.parse(cachedParentDocumentString); - if (!cachedParentDocument.childrenLinks || cachedParentDocument.childrenLinks.length < links.length - 1) { - cachedParentDocument.childrenLinks = links.filter((link) => link !== this.urls[0]); - await setValue('web-scraper-cache:' + this.normalizeUrl(this.urls[0]), JSON.stringify(cachedParentDocument), 60 * 60 * 24 * 10); // 10 days + if ( + !cachedParentDocument.childrenLinks || + cachedParentDocument.childrenLinks.length < links.length - 1 + ) { + cachedParentDocument.childrenLinks = links.filter( + (link) => link !== this.urls[0] + ); + await setValue( + "web-scraper-cache:" + this.normalizeUrl(this.urls[0]), + JSON.stringify(cachedParentDocument), + 60 * 60 * 24 * 10 + ); // 10 days } } else { - let parentDocument = documents.filter((document) => this.normalizeUrl(document.metadata.sourceURL) === this.normalizeUrl(this.urls[0])) + let parentDocument = documents.filter( + (document) => + this.normalizeUrl(document.metadata.sourceURL) === + this.normalizeUrl(this.urls[0]) + ); await this.setCachedDocuments(parentDocument, links); } - await this.setCachedDocuments(documents.filter((document) => this.normalizeUrl(document.metadata.sourceURL) !== this.normalizeUrl(this.urls[0])), []); + await this.setCachedDocuments( + documents.filter( + (document) => + this.normalizeUrl(document.metadata.sourceURL) !== + this.normalizeUrl(this.urls[0]) + ), + [] + ); documents = this.removeChildLinks(documents); documents = documents.splice(0, this.limit); return documents; } if (this.mode === "single_urls") { - let documents = await this.convertUrlsToDocuments(this.urls, inProgress); + let documents = await this.convertUrlsToDocuments( + this.urls, + inProgress + ); documents = this.replaceImgPathsWithAbsolutePaths(documents); if (this.generateImgAltText) { documents = await this.generatesImgAltText(documents); } const baseUrl = new URL(this.urls[0]).origin; documents = await this.getSitemapData(baseUrl, documents); - + await this.setCachedDocuments(documents); documents = this.removeChildLinks(documents); documents = documents.splice(0, this.limit); @@ -136,14 +163,17 @@ export class WebScraperDataProvider { } if (this.mode === "sitemap") { const links = await getLinksFromSitemap(this.urls[0]); - let documents = await this.convertUrlsToDocuments(links.slice(0, this.limit), inProgress); + let documents = await this.convertUrlsToDocuments( + links.slice(0, this.limit), + inProgress + ); documents = await this.getSitemapData(this.urls[0], documents); documents = this.replaceImgPathsWithAbsolutePaths(documents); if (this.generateImgAltText) { documents = await this.generatesImgAltText(documents); } - + await this.setCachedDocuments(documents); documents = this.removeChildLinks(documents); documents = documents.splice(0, this.limit); @@ -153,11 +183,22 @@ export class WebScraperDataProvider { return []; } - let documents = await this.getCachedDocuments(this.urls.slice(0, this.limit)); + let documents = await this.getCachedDocuments( + this.urls.slice(0, this.limit) + ); if (documents.length < this.limit) { - const newDocuments: Document[] = await this.getDocuments(false, inProgress); - newDocuments.forEach(doc => { - if (!documents.some(d => this.normalizeUrl(d.metadata.sourceURL) === this.normalizeUrl(doc.metadata?.sourceURL))) { + const newDocuments: Document[] = await this.getDocuments( + false, + inProgress + ); + newDocuments.forEach((doc) => { + if ( + !documents.some( + (d) => + this.normalizeUrl(d.metadata.sourceURL) === + this.normalizeUrl(doc.metadata?.sourceURL) + ) + ) { documents.push(doc); } }); @@ -173,17 +214,23 @@ export class WebScraperDataProvider { const url = new URL(document.metadata.sourceURL); const path = url.pathname; - if (this.excludes.length > 0 && this.excludes[0] !== '') { + if (this.excludes.length > 0 && this.excludes[0] !== "") { // Check if the link should be excluded - if (this.excludes.some(excludePattern => new RegExp(excludePattern).test(path))) { + if ( + this.excludes.some((excludePattern) => + new RegExp(excludePattern).test(path) + ) + ) { return false; } } - - if (this.includes.length > 0 && this.includes[0] !== '') { + + if (this.includes.length > 0 && this.includes[0] !== "") { // Check if the link matches the include patterns, if any are specified if (this.includes.length > 0) { - return this.includes.some(includePattern => new RegExp(includePattern).test(path)); + return this.includes.some((includePattern) => + new RegExp(includePattern).test(path) + ); } } return true; @@ -200,7 +247,7 @@ export class WebScraperDataProvider { private removeChildLinks(documents: Document[]): Document[] { for (let document of documents) { if (document?.childrenLinks) delete document.childrenLinks; - }; + } return documents; } @@ -210,10 +257,14 @@ export class WebScraperDataProvider { continue; } const normalizedUrl = this.normalizeUrl(document.metadata.sourceURL); - await setValue('web-scraper-cache:' + normalizedUrl, JSON.stringify({ - ...document, - childrenLinks: childrenLinks || [] - }), 60 * 60 * 24 * 10); // 10 days + await setValue( + "web-scraper-cache:" + normalizedUrl, + JSON.stringify({ + ...document, + childrenLinks: childrenLinks || [], + }), + 60 * 60 * 24 * 10 + ); // 10 days } } @@ -221,8 +272,12 @@ export class WebScraperDataProvider { let documents: Document[] = []; for (const url of urls) { const normalizedUrl = this.normalizeUrl(url); - console.log("Getting cached document for web-scraper-cache:" + normalizedUrl) - const cachedDocumentString = await getValue('web-scraper-cache:' + normalizedUrl); + console.log( + "Getting cached document for web-scraper-cache:" + normalizedUrl + ); + const cachedDocumentString = await getValue( + "web-scraper-cache:" + normalizedUrl + ); if (cachedDocumentString) { const cachedDocument = JSON.parse(cachedDocumentString); documents.push(cachedDocument); @@ -230,10 +285,18 @@ export class WebScraperDataProvider { // get children documents for (const childUrl of cachedDocument.childrenLinks) { const normalizedChildUrl = this.normalizeUrl(childUrl); - const childCachedDocumentString = await getValue('web-scraper-cache:' + normalizedChildUrl); + const childCachedDocumentString = await getValue( + "web-scraper-cache:" + normalizedChildUrl + ); if (childCachedDocumentString) { const childCachedDocument = JSON.parse(childCachedDocumentString); - if (!documents.find((doc) => doc.metadata.sourceURL === childCachedDocument.metadata.sourceURL)) { + if ( + !documents.find( + (doc) => + doc.metadata.sourceURL === + childCachedDocument.metadata.sourceURL + ) + ) { documents.push(childCachedDocument); } } @@ -248,7 +311,7 @@ export class WebScraperDataProvider { throw new Error("Urls are required"); } - console.log("options", options.crawlerOptions?.excludes) + console.log("options", options.crawlerOptions?.excludes); this.urls = options.urls; this.mode = options.mode; this.concurrentRequests = options.concurrentRequests ?? 20; @@ -257,13 +320,12 @@ export class WebScraperDataProvider { this.maxCrawledLinks = options.crawlerOptions?.maxCrawledLinks ?? 1000; this.returnOnlyUrls = options.crawlerOptions?.returnOnlyUrls ?? false; this.limit = options.crawlerOptions?.limit ?? 10000; - this.generateImgAltText = options.crawlerOptions?.generateImgAltText ?? false; - + this.generateImgAltText = + options.crawlerOptions?.generateImgAltText ?? false; //! @nicolas, for some reason this was being injected and breakign everything. Don't have time to find source of the issue so adding this check - this.excludes = this.excludes.filter(item => item !== ''); - - + this.excludes = this.excludes.filter((item) => item !== ""); + // make sure all urls start with https:// this.urls = this.urls.map((url) => { if (!url.trim().startsWith("http")) { @@ -274,10 +336,14 @@ export class WebScraperDataProvider { } private async getSitemapData(baseUrl: string, documents: Document[]) { - const sitemapData = await fetchSitemapData(baseUrl) + const sitemapData = await fetchSitemapData(baseUrl); if (sitemapData) { for (let i = 0; i < documents.length; i++) { - const docInSitemapData = sitemapData.find((data) => this.normalizeUrl(data.loc) === this.normalizeUrl(documents[i].metadata.sourceURL)) + const docInSitemapData = sitemapData.find( + (data) => + this.normalizeUrl(data.loc) === + this.normalizeUrl(documents[i].metadata.sourceURL) + ); if (docInSitemapData) { let sitemapDocData: Partial = {}; if (docInSitemapData.changefreq) { @@ -298,52 +364,82 @@ export class WebScraperDataProvider { return documents; } generatesImgAltText = async (documents: Document[]): Promise => { - await Promise.all(documents.map(async (document) => { - const images = document.content.match(/!\[.*?\]\((.*?)\)/g) || []; + await Promise.all( + documents.map(async (document) => { + const images = document.content.match(/!\[.*?\]\((.*?)\)/g) || []; - await Promise.all(images.map(async (image: string) => { - let imageUrl = image.match(/\(([^)]+)\)/)[1]; - let altText = image.match(/\[(.*?)\]/)[1]; + await Promise.all( + images.map(async (image: string) => { + let imageUrl = image.match(/\(([^)]+)\)/)[1]; + let altText = image.match(/\[(.*?)\]/)[1]; - if (!altText && !imageUrl.startsWith("data:image") && /\.(png|jpeg|gif|webp)$/.test(imageUrl)) { - const imageIndex = document.content.indexOf(image); - const contentLength = document.content.length; - let backText = document.content.substring(imageIndex + image.length, Math.min(imageIndex + image.length + 1000, contentLength)); - let frontTextStartIndex = Math.max(imageIndex - 1000, 0); - let frontText = document.content.substring(frontTextStartIndex, imageIndex); - altText = await getImageDescription(imageUrl, backText, frontText); - } - - document.content = document.content.replace(image, `![${altText}](${imageUrl})`); - })); - })); - - return documents; - } - - replaceImgPathsWithAbsolutePaths = (documents: Document[]): Document[] => { - documents.forEach(document => { - const baseUrl = new URL(document.metadata.sourceURL).origin; - const images = document.content.match(/!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g) || []; - - images.forEach(image => { - let imageUrl = image.match(/\(([^)]+)\)/)[1]; - let altText = image.match(/\[(.*?)\]/)[1]; - - if (!imageUrl.startsWith("data:image")) { - if (!imageUrl.startsWith("http")) { - if (imageUrl.startsWith("/")) { - imageUrl = imageUrl.substring(1); + if ( + !altText && + !imageUrl.startsWith("data:image") && + /\.(png|jpeg|gif|webp)$/.test(imageUrl) + ) { + const imageIndex = document.content.indexOf(image); + const contentLength = document.content.length; + let backText = document.content.substring( + imageIndex + image.length, + Math.min(imageIndex + image.length + 1000, contentLength) + ); + let frontTextStartIndex = Math.max(imageIndex - 1000, 0); + let frontText = document.content.substring( + frontTextStartIndex, + imageIndex + ); + altText = await getImageDescription( + imageUrl, + backText, + frontText + ); } - imageUrl = new URL(imageUrl, baseUrl).toString(); - } - } - - document.content = document.content.replace(image, `![${altText}](${imageUrl})`); - }); - }); - - return documents; - } -} + document.content = document.content.replace( + image, + `![${altText}](${imageUrl})` + ); + }) + ); + }) + ); + + return documents; + }; + + replaceImgPathsWithAbsolutePaths = (documents: Document[]): Document[] => { + try { + documents.forEach((document) => { + const baseUrl = new URL(document.metadata.sourceURL).origin; + const images = + document.content.match( + /!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g + ) || []; + + images.forEach((image: string) => { + let imageUrl = image.match(/\(([^)]+)\)/)[1]; + let altText = image.match(/\[(.*?)\]/)[1]; + + if (!imageUrl.startsWith("data:image")) { + if (!imageUrl.startsWith("http")) { + if (imageUrl.startsWith("/")) { + imageUrl = imageUrl.substring(1); + } + imageUrl = new URL(imageUrl, baseUrl).toString(); + } + } + + document.content = document.content.replace( + image, + `![${altText}](${imageUrl})` + ); + }); + }); + + return documents; + } catch (error) { + return documents; + } + }; +} From de439f6529111b2f839dc8a8ef126310e1a0b31d Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 17 Apr 2024 12:51:29 -0700 Subject: [PATCH 020/102] Update index.ts --- apps/api/src/scraper/WebScraper/index.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index ecb2fff..501dde0 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -439,6 +439,7 @@ export class WebScraperDataProvider { return documents; } catch (error) { + console.error("Error replacing img paths with absolute paths", error); return documents; } }; From 52fb28bc1a943d6489f85fb93061f8c01bf6c0f1 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 17 Apr 2024 12:52:15 -0700 Subject: [PATCH 021/102] Update index.ts --- apps/api/src/scraper/WebScraper/index.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 501dde0..e1bd425 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -311,7 +311,6 @@ export class WebScraperDataProvider { throw new Error("Urls are required"); } - console.log("options", options.crawlerOptions?.excludes); this.urls = options.urls; this.mode = options.mode; this.concurrentRequests = options.concurrentRequests ?? 20; From 36abe0f7f9d990a07966edf59da1c8326b31b6de Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 17 Apr 2024 18:24:46 -0700 Subject: [PATCH 022/102] Nick: --- apps/api/src/index.ts | 9 +++ apps/api/src/lib/entities.ts | 18 ++++++ apps/api/src/main/runWebScraper.ts | 6 +- .../WebScraper/__tests__/index.test.ts | 8 +++ apps/api/src/scraper/WebScraper/index.ts | 20 ++----- apps/api/src/scraper/WebScraper/single_url.ts | 17 ++++-- .../scraper/WebScraper/utils/excludeTags.ts | 60 +++++++++++++++++++ apps/api/src/types.ts | 2 + 8 files changed, 120 insertions(+), 20 deletions(-) create mode 100644 apps/api/src/scraper/WebScraper/utils/excludeTags.ts diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index db08587..fad8fa3 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -110,6 +110,8 @@ app.post("/v0/scrape", async (req, res) => { return res.status(400).json({ error: "Url is required" }); } + const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; + try { const a = new WebScraperDataProvider(); await a.setOptions({ @@ -118,6 +120,7 @@ app.post("/v0/scrape", async (req, res) => { crawlerOptions: { ...crawlerOptions, }, + pageOptions: pageOptions, }); const docs = await a.getDocuments(false); @@ -178,6 +181,7 @@ app.post("/v0/crawl", async (req, res) => { } const mode = req.body.mode ?? "crawl"; const crawlerOptions = req.body.crawlerOptions ?? {}; + const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; if (mode === "single_urls" && !url.includes(",")) { try { @@ -188,6 +192,7 @@ app.post("/v0/crawl", async (req, res) => { crawlerOptions: { returnOnlyUrls: true, }, + pageOptions: pageOptions, }); const docs = await a.getDocuments(false, (progress) => { @@ -212,6 +217,8 @@ app.post("/v0/crawl", async (req, res) => { mode: mode ?? "crawl", // fix for single urls not working crawlerOptions: { ...crawlerOptions }, team_id: team_id, + pageOptions: pageOptions, + }); res.json({ jobId: job.id }); @@ -239,11 +246,13 @@ app.post("/v0/crawlWebsitePreview", async (req, res) => { } const mode = req.body.mode ?? "crawl"; const crawlerOptions = req.body.crawlerOptions ?? {}; + const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; const job = await addWebScraperJob({ url: url, mode: mode ?? "crawl", // fix for single urls not working crawlerOptions: { ...crawlerOptions, limit: 5, maxCrawledLinks: 5 }, team_id: "preview", + pageOptions: pageOptions, }); res.json({ jobId: job.id }); diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 1e681a9..c332914 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -9,6 +9,24 @@ export interface Progress { currentDocumentUrl?: string; } +export type PageOptions = { + onlyMainContent?: boolean; +}; +export type WebScraperOptions = { + urls: string[]; + mode: "single_urls" | "sitemap" | "crawl"; + crawlerOptions?: { + returnOnlyUrls?: boolean; + includes?: string[]; + excludes?: string[]; + maxCrawledLinks?: number; + limit?: number; + generateImgAltText?: boolean; + }; + pageOptions?: PageOptions; + concurrentRequests?: number; +}; + export class Document { id?: string; content: string; diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index 762e153..c43b1b3 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -13,6 +13,7 @@ export async function startWebScraperPipeline({ url: job.data.url, mode: job.data.mode, crawlerOptions: job.data.crawlerOptions, + pageOptions: job.data.pageOptions, inProgress: (progress) => { job.progress(progress); }, @@ -29,6 +30,7 @@ export async function runWebScraper({ url, mode, crawlerOptions, + pageOptions, inProgress, onSuccess, onError, @@ -37,6 +39,7 @@ export async function runWebScraper({ url: string; mode: "crawl" | "single_urls" | "sitemap"; crawlerOptions: any; + pageOptions?: any; inProgress: (progress: any) => void; onSuccess: (result: any) => void; onError: (error: any) => void; @@ -44,18 +47,19 @@ export async function runWebScraper({ }): Promise<{ success: boolean; message: string; docs: CrawlResult[] }> { try { const provider = new WebScraperDataProvider(); - if (mode === "crawl") { await provider.setOptions({ mode: mode, urls: [url], crawlerOptions: crawlerOptions, + pageOptions: pageOptions, }); } else { await provider.setOptions({ mode: mode, urls: url.split(","), crawlerOptions: crawlerOptions, + pageOptions: pageOptions, }); } const docs = (await provider.getDocuments(false, (progress: Progress) => { diff --git a/apps/api/src/scraper/WebScraper/__tests__/index.test.ts b/apps/api/src/scraper/WebScraper/__tests__/index.test.ts index 49b3926..42d9513 100644 --- a/apps/api/src/scraper/WebScraper/__tests__/index.test.ts +++ b/apps/api/src/scraper/WebScraper/__tests__/index.test.ts @@ -13,6 +13,10 @@ describe("WebScraperDataProvider", () => { metadata: { sourceURL: "https://example.com/another-page" }, content: "![another alt text](./another-image.png)", }, + { + metadata: { sourceURL: "https://example.com/another-page" }, + content: "![another alt text](./another-image.webp)", + }, { metadata: { sourceURL: "https://example.com/data-image" }, content: "![data image](data:image/png;base64,...)", @@ -28,6 +32,10 @@ describe("WebScraperDataProvider", () => { metadata: { sourceURL: "https://example.com/another-page" }, content: "![another alt text](https://example.com/another-image.png)", }, + { + metadata: { sourceURL: "https://example.com/another-page" }, + content: "![another alt text](https://example.com/another-image.webp)", + }, { metadata: { sourceURL: "https://example.com/data-image" }, content: "![data image](data:image/png;base64,...)", diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index e1bd425..fbfaa7b 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -1,4 +1,4 @@ -import { Document } from "../../lib/entities"; +import { Document, PageOptions, WebScraperOptions } from "../../lib/entities"; import { Progress } from "../../lib/entities"; import { scrapSingleUrl } from "./single_url"; import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap"; @@ -6,19 +6,7 @@ import { WebCrawler } from "./crawler"; import { getValue, setValue } from "../../services/redis"; import { getImageDescription } from "./utils/gptVision"; -export type WebScraperOptions = { - urls: string[]; - mode: "single_urls" | "sitemap" | "crawl"; - crawlerOptions?: { - returnOnlyUrls?: boolean; - includes?: string[]; - excludes?: string[]; - maxCrawledLinks?: number; - limit?: number; - generateImgAltText?: boolean; - }; - concurrentRequests?: number; -}; + export class WebScraperDataProvider { private urls: string[] = [""]; private mode: "single_urls" | "sitemap" | "crawl" = "single_urls"; @@ -29,6 +17,7 @@ export class WebScraperDataProvider { private limit: number = 10000; private concurrentRequests: number = 20; private generateImgAltText: boolean = false; + private pageOptions?: PageOptions; authorize(): void { throw new Error("Method not implemented."); @@ -51,7 +40,7 @@ export class WebScraperDataProvider { const batchUrls = urls.slice(i, i + this.concurrentRequests); await Promise.all( batchUrls.map(async (url, index) => { - const result = await scrapSingleUrl(url, true); + const result = await scrapSingleUrl(url, true, this.pageOptions); processedUrls++; if (inProgress) { inProgress({ @@ -321,6 +310,7 @@ export class WebScraperDataProvider { this.limit = options.crawlerOptions?.limit ?? 10000; this.generateImgAltText = options.crawlerOptions?.generateImgAltText ?? false; + this.pageOptions = options.pageOptions ?? {onlyMainContent: false}; //! @nicolas, for some reason this was being injected and breakign everything. Don't have time to find source of the issue so adding this check this.excludes = this.excludes.filter((item) => item !== ""); diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index f71221c..b97152d 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -2,9 +2,10 @@ import * as cheerio from "cheerio"; import { ScrapingBeeClient } from "scrapingbee"; import { extractMetadata } from "./utils/metadata"; import dotenv from "dotenv"; -import { Document } from "../../lib/entities"; +import { Document, PageOptions } from "../../lib/entities"; import { parseMarkdown } from "../../lib/html-to-markdown"; import { parseTablesToMarkdown } from "./utils/parseTable"; +import { excludeNonMainTags } from "./utils/excludeTags"; // import puppeteer from "puppeteer"; dotenv.config(); @@ -77,14 +78,22 @@ export async function scrapWithPlaywright(url: string): Promise { export async function scrapSingleUrl( urlToScrap: string, - toMarkdown: boolean = true + toMarkdown: boolean = true, + pageOptions: PageOptions = { onlyMainContent: true } ): Promise { console.log(`Scraping URL: ${urlToScrap}`); urlToScrap = urlToScrap.trim(); - const removeUnwantedElements = (html: string) => { + const removeUnwantedElements = (html: string, pageOptions: PageOptions) => { const soup = cheerio.load(html); soup("script, style, iframe, noscript, meta, head").remove(); + if (pageOptions.onlyMainContent) { + // remove any other tags that are not in the main content + soup("").remove(); + excludeNonMainTags.forEach((tag) => { + soup(tag).remove(); + }); + } return soup.html(); }; @@ -133,7 +142,7 @@ export async function scrapSingleUrl( } break; } - let cleanedHtml = removeUnwantedElements(text); + let cleanedHtml = removeUnwantedElements(text, pageOptions); return [await parseMarkdown(cleanedHtml), text]; }; diff --git a/apps/api/src/scraper/WebScraper/utils/excludeTags.ts b/apps/api/src/scraper/WebScraper/utils/excludeTags.ts new file mode 100644 index 0000000..142bcef --- /dev/null +++ b/apps/api/src/scraper/WebScraper/utils/excludeTags.ts @@ -0,0 +1,60 @@ +export const excludeNonMainTags = [ + "header", + "footer", + "nav", + "aside", + ".header", + ".top", + ".navbar", + "#header", + ".footer", + ".bottom", + "#footer", + ".sidebar", + ".side", + ".aside", + "#sidebar", + ".modal", + ".popup", + "#modal", + ".overlay", + ".ad", + ".ads", + ".advert", + "#ad", + ".lang-selector", + ".language", + "#language-selector", + ".social", + ".social-media", + ".social-links", + "#social", + ".menu", + ".navigation", + "#nav", + ".breadcrumbs", + "#breadcrumbs", + ".form", + "form", + "#search-form", + ".search", + "#search", + ".share", + "#share", + ".pagination", + "#pagination", + ".widget", + "#widget", + ".related", + "#related", + ".tag", + "#tag", + ".category", + "#category", + ".comment", + "#comment", + ".reply", + "#reply", + ".author", + "#author", +]; diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index a3de049..2123e0c 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -20,7 +20,9 @@ export interface WebScraperOptions { url: string; mode: "crawl" | "single_urls" | "sitemap"; crawlerOptions: any; + pageOptions: any; team_id: string; } + From ca2bf9cc126fab7ee6efc99e3cf5c9c8cc6a1f21 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 17 Apr 2024 18:27:08 -0700 Subject: [PATCH 023/102] Update single_url.ts --- apps/api/src/scraper/WebScraper/single_url.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index b97152d..fbcd923 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -89,7 +89,6 @@ export async function scrapSingleUrl( soup("script, style, iframe, noscript, meta, head").remove(); if (pageOptions.onlyMainContent) { // remove any other tags that are not in the main content - soup("").remove(); excludeNonMainTags.forEach((tag) => { soup(tag).remove(); }); From 2bed55a3b42133efa75df1f319b123b796b2a9e1 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 17 Apr 2024 19:05:28 -0700 Subject: [PATCH 024/102] Nick: --- apps/js-sdk/firecrawl/README.md | 14 ++++++++++---- apps/js-sdk/firecrawl/package-lock.json | 4 ++-- apps/python-sdk/README.md | 17 +++++++++-------- 3 files changed, 21 insertions(+), 14 deletions(-) diff --git a/apps/js-sdk/firecrawl/README.md b/apps/js-sdk/firecrawl/README.md index 0757511..3f92c32 100644 --- a/apps/js-sdk/firecrawl/README.md +++ b/apps/js-sdk/firecrawl/README.md @@ -33,15 +33,18 @@ Here's an example of how to use the SDK with error handling: // Crawl a website const crawlUrl = 'https://mendable.ai'; - const crawlParams = { + const params = { crawlerOptions: { excludes: ['blog/'], includes: [], // leave empty for all pages limit: 1000, + }, + pageOptions: { + onlyMainContent: true } }; - const crawlResult = await app.crawlUrl(crawlUrl, crawlParams); + const crawlResult = await app.crawlUrl(crawlUrl, params); console.log(crawlResult); } catch (error) { @@ -83,18 +86,21 @@ To crawl a website with error handling, use the `crawlUrl` method. It takes the async function crawlExample() { try { const crawlUrl = 'https://example.com'; - const crawlParams = { + const params = { crawlerOptions: { excludes: ['blog/'], includes: [], // leave empty for all pages limit: 1000, + }, + pageOptions: { + onlyMainContent: true } }; const waitUntilDone = true; const timeout = 5; const crawlResult = await app.crawlUrl( crawlUrl, - crawlParams, + params, waitUntilDone, timeout ); diff --git a/apps/js-sdk/firecrawl/package-lock.json b/apps/js-sdk/firecrawl/package-lock.json index 98fafc5..0497c6e 100644 --- a/apps/js-sdk/firecrawl/package-lock.json +++ b/apps/js-sdk/firecrawl/package-lock.json @@ -1,12 +1,12 @@ { "name": "@mendable/firecrawl-js", - "version": "0.0.7", + "version": "0.0.9", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@mendable/firecrawl-js", - "version": "0.0.7", + "version": "0.0.9", "license": "MIT", "dependencies": { "axios": "^1.6.8", diff --git a/apps/python-sdk/README.md b/apps/python-sdk/README.md index 3ce405d..0a80202 100644 --- a/apps/python-sdk/README.md +++ b/apps/python-sdk/README.md @@ -30,14 +30,12 @@ scraped_data = app.scrape_url(url) # Crawl a website crawl_url = 'https://mendable.ai' -crawl_params = { - 'crawlerOptions': { - 'excludes': ['blog/*'], - 'includes': [], # leave empty for all pages - 'limit': 1000, +params = { + 'pageOptions': { + 'onlyMainContent': True } } -crawl_result = app.crawl_url(crawl_url, params=crawl_params) +crawl_result = app.crawl_url(crawl_url, params=params) ``` ### Scraping a URL @@ -57,14 +55,17 @@ The `wait_until_done` parameter determines whether the method should wait for th ```python crawl_url = 'https://example.com' -crawl_params = { +params = { 'crawlerOptions': { 'excludes': ['blog/*'], 'includes': [], # leave empty for all pages 'limit': 1000, + }, + 'pageOptions': { + 'onlyMainContent': True } } -crawl_result = app.crawl_url(crawl_url, params=crawl_params, wait_until_done=True, timeout=5) +crawl_result = app.crawl_url(crawl_url, params=params, wait_until_done=True, timeout=5) ``` If `wait_until_done` is set to `True`, the `crawl_url` method will return the crawl result once the job is completed. If the job fails or is stopped, an exception will be raised. From d55c23ec07eb6baba8ec3b1df3676118c43dce08 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 17 Apr 2024 21:50:41 -0700 Subject: [PATCH 025/102] Update README.md --- README.md | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index b35adfe..56f8c5c 100644 --- a/README.md +++ b/README.md @@ -15,9 +15,9 @@ We provide an easy to use API with our hosted version. You can find the playgrou - [x] [API](https://firecrawl.dev/playground) - [x] [Python SDK](https://github.com/mendableai/firecrawl/tree/main/apps/python-sdk) +- [X] [Node SDK](https://github.com/mendableai/firecrawl/tree/main/apps/js-sdk) - [x] [Langchain Integration 🦜🔗](https://python.langchain.com/docs/integrations/document_loaders/firecrawl/) -- [x] [Llama Index Integration 🦙](https://docs.llamaindex.ai/en/stable/) -- [X] [JS SDK](https://github.com/mendableai/firecrawl/tree/main/apps/js-sdk) +- [x] [Llama Index Integration 🦙](https://docs.llamaindex.ai/en/latest/examples/data_connectors/WebPageDemo/#using-firecrawl-reader) - [ ] LangchainJS - Coming Soon @@ -63,15 +63,16 @@ curl -X GET https://api.firecrawl.dev/v0/crawl/status/1234-5678-9101 \ "total": 22, "data": [ { - "content": "Raw Content ", - "markdown": "# Markdown Content", - "provider": "web-scraper", - "metadata": { - "title": "Mendable | AI for CX and Sales", - "description": "AI for CX and Sales", - "language": null, - "sourceURL": "https://www.mendable.ai/", - } + "content": "Raw Content ", + "markdown": "# Markdown Content", + "provider": "web-scraper", + "metadata": { + "title": "Mendable | AI for CX and Sales", + "description": "AI for CX and Sales", + "language": null, + "sourceURL": "https://www.mendable.ai/", + } + } ] } ``` From e3a6bc4de7704c12187bf9c54bf2200a9cb2732e Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 17 Apr 2024 22:23:10 -0700 Subject: [PATCH 026/102] Create openapi.json --- apps/api/openapi.json | 295 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 295 insertions(+) create mode 100644 apps/api/openapi.json diff --git a/apps/api/openapi.json b/apps/api/openapi.json new file mode 100644 index 0000000..bb58ae3 --- /dev/null +++ b/apps/api/openapi.json @@ -0,0 +1,295 @@ +{ + "openapi": "3.0.0", + "info": { + "title": "Firecrawl API", + "version": "1.0.0", + "description": "API for interacting with Firecrawl services to convert websites to LLM-ready data.", + "contact": { + "name": "Firecrawl Support", + "url": "https://firecrawl.dev/support", + "email": "help@mendable.ai" + } + }, + "servers": [ + { + "url": "https://api.firecrawl.dev/v0" + } + ], + "paths": { + "/scrape": { + "post": { + "summary": "Scrape a single URL", + "operationId": "scrapeSingleUrl", + "tags": ["Scraping"], + "security": [ + { + "bearerAuth": [] + } + ], + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "url": { + "type": "string", + "format": "uri", + "description": "The URL to scrape" + } + }, + "required": ["url"] + } + } + } + }, + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ScrapeResponse" + } + } + } + }, + "402": { + "description": "Payment required" + }, + "429": { + "description": "Too many requests" + }, + "500": { + "description": "Server error" + } + } + } + }, + "/crawl": { + "post": { + "summary": "Crawl multiple URLs based on options", + "operationId": "crawlUrls", + "tags": ["Crawling"], + "security": [ + { + "bearerAuth": [] + } + ], + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "url": { + "type": "string", + "format": "uri", + "description": "The base URL to start crawling from" + }, + "crawlerOptions": { + "type": "object", + "properties": { + "includes": { + "type": "array", + "items": { + "type": "string" + }, + "description": "URL patterns to include" + }, + "excludes": { + "type": "array", + "items": { + "type": "string" + }, + "description": "URL patterns to exclude" + }, + "generateImgAltText": { + "type": "boolean", + "description": "Generate alt text for images using LLMs (must have a paid plan)", + "default": false + }, + "limit": { + "type": "integer", + "description": "Maximum number of pages to crawl" + } + } + }, + "pageOptions": { + "type": "object", + "properties": { + "onlyMainContent": { + "type": "boolean", + "description": "Only return the main content of the page excluding headers, navs, footers, etc.", + "default": false + } + } + } + }, + "required": ["url"] + } + } + } + }, + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CrawlResponse" + } + } + } + }, + "402": { + "description": "Payment required" + }, + "429": { + "description": "Too many requests" + }, + "500": { + "description": "Server error" + } + } + } + }, + "/crawl/status/{jobId}": { + "get": { + "tags": ["Crawl"], + "summary": "Get the status of a crawl job", + "operationId": "getCrawlStatus", + "security": [ + { + "bearerAuth": [] + } + ], + "parameters": [ + { + "name": "jobId", + "in": "path", + "description": "ID of the crawl job", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "status": { + "type": "string", + "description": "Status of the job (completed, active, failed, paused)" + }, + "current": { + "type": "integer", + "description": "Current page number" + }, + "current_url": { + "type": "string", + "description": "Current URL being scraped" + }, + "current_step": { + "type": "string", + "description": "Current step in the process" + }, + "total": { + "type": "integer", + "description": "Total number of pages" + }, + "data": { + "type": "array", + "items": { + "$ref": "#/components/schemas/ScrapeResponse" + }, + "description": "Data returned from the job (null when it is in progress)" + } + } + } + } + } + }, + "402": { + "description": "Payment required" + }, + "429": { + "description": "Too many requests" + }, + "500": { + "description": "Server error" + } + } + } + } + }, + "components": { + "securitySchemes": { + "bearerAuth": { + "type": "http", + "scheme": "bearer" + } + }, + "schemas": { + "ScrapeResponse": { + "type": "object", + "properties": { + "success": { + "type": "boolean" + }, + "data": { + "type": "object", + "properties": { + "content": { + "type": "string" + }, + "markdown": { + "type": "string" + }, + "metadata": { + "type": "object", + "properties": { + "title": { + "type": "string" + }, + "description": { + "type": "string" + }, + "language": { + "type": "string", + "nullable": true + }, + "sourceURL": { + "type": "string", + "format": "uri" + } + } + } + } + } + } + }, + "CrawlResponse": { + "type": "object", + "properties": { + "jobId": { + "type": "string" + } + } + } + } + }, + "security": [ + { + "bearerAuth": [] + } + ] + } + \ No newline at end of file From 57e5b360142087390c6061408ccc1eb448bc6e78 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Thu, 18 Apr 2024 11:43:57 -0300 Subject: [PATCH 027/102] [Feat] Adding pdf parser --- apps/api/.env.local | 2 +- apps/api/package.json | 2 + apps/api/pnpm-lock.yaml | 26 ++++- apps/api/src/lib/entities.ts | 2 + apps/api/src/scraper/WebScraper/crawler.ts | 2 +- apps/api/src/scraper/WebScraper/index.ts | 51 ++++++++- .../utils/__tests__/pdfProcessor.test.ts | 40 +++++++ .../scraper/WebScraper/utils/pdfProcessor.ts | 108 ++++++++++++++++++ 8 files changed, 224 insertions(+), 9 deletions(-) create mode 100644 apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts create mode 100644 apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts diff --git a/apps/api/.env.local b/apps/api/.env.local index 301c64b..852c5ed 100644 --- a/apps/api/.env.local +++ b/apps/api/.env.local @@ -10,4 +10,4 @@ OPENAI_API_KEY= BULL_AUTH_KEY= LOGTAIL_KEY= PLAYWRIGHT_MICROSERVICE_URL= - +LLAMAPARSE_API_KEY= diff --git a/apps/api/package.json b/apps/api/package.json index e8e5e02..1d12a96 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -60,6 +60,7 @@ "date-fns": "^2.29.3", "dotenv": "^16.3.1", "express-rate-limit": "^6.7.0", + "form-data": "^4.0.0", "glob": "^10.3.12", "gpt3-tokenizer": "^1.1.5", "ioredis": "^5.3.2", @@ -73,6 +74,7 @@ "mongoose": "^8.0.3", "natural": "^6.3.0", "openai": "^4.28.4", + "pdf-parse": "^1.1.1", "pos": "^0.4.2", "promptable": "^0.0.9", "puppeteer": "^22.6.3", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index 8142189..fd0ffa0 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -68,6 +68,9 @@ dependencies: express-rate-limit: specifier: ^6.7.0 version: 6.11.2(express@4.18.3) + form-data: + specifier: ^4.0.0 + version: 4.0.0 glob: specifier: ^10.3.12 version: 10.3.12 @@ -82,7 +85,7 @@ dependencies: version: 0.0.25 langchain: specifier: ^0.1.25 - version: 0.1.25(@supabase/supabase-js@2.39.7)(axios@1.6.7)(cheerio@1.0.0-rc.12)(ioredis@5.3.2)(puppeteer@22.6.3)(redis@4.6.13)(typesense@1.7.2) + version: 0.1.25(@supabase/supabase-js@2.39.7)(axios@1.6.7)(cheerio@1.0.0-rc.12)(ioredis@5.3.2)(pdf-parse@1.1.1)(puppeteer@22.6.3)(redis@4.6.13)(typesense@1.7.2) languagedetect: specifier: ^2.0.0 version: 2.0.0 @@ -107,6 +110,9 @@ dependencies: openai: specifier: ^4.28.4 version: 4.28.4 + pdf-parse: + specifier: ^1.1.1 + version: 1.1.1 pos: specifier: ^0.4.2 version: 0.4.2 @@ -2498,7 +2504,6 @@ packages: dependencies: ms: 2.1.3 supports-color: 5.5.0 - dev: true /debug@4.3.4: resolution: {integrity: sha512-PRWFHuSU3eDtQJPvnNY7Jcket1j0t5OuOsFzPPzsekD52Zl8qUfFIPEiswXqIvHWGVHOgX+7G/vCNNhehwxfkQ==} @@ -3997,7 +4002,7 @@ packages: engines: {node: '>=6'} dev: true - /langchain@0.1.25(@supabase/supabase-js@2.39.7)(axios@1.6.7)(cheerio@1.0.0-rc.12)(ioredis@5.3.2)(puppeteer@22.6.3)(redis@4.6.13)(typesense@1.7.2): + /langchain@0.1.25(@supabase/supabase-js@2.39.7)(axios@1.6.7)(cheerio@1.0.0-rc.12)(ioredis@5.3.2)(pdf-parse@1.1.1)(puppeteer@22.6.3)(redis@4.6.13)(typesense@1.7.2): resolution: {integrity: sha512-sfEChvr4H2CklHdSByNBbytwBrFhgtA5kPOnwcBrxuXGg1iOaTzhVxQA0QcNcQucI3hZrsNbZjxGp+Can1ooZQ==} engines: {node: '>=18'} peerDependencies: @@ -4174,6 +4179,7 @@ packages: ml-distance: 4.0.1 openapi-types: 12.1.3 p-retry: 4.6.2 + pdf-parse: 1.1.1 puppeteer: 22.6.3(typescript@5.4.2) redis: 4.6.13 uuid: 9.0.1 @@ -4653,6 +4659,10 @@ packages: resolution: {integrity: sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ==} engines: {node: '>=10.5.0'} + /node-ensure@0.0.0: + resolution: {integrity: sha512-DRI60hzo2oKN1ma0ckc6nQWlHU69RH6xN0sjQTjMpChPfTYvKZdcQFfdYK2RWbJcKyUizSIy/l8OTGxMAM1QDw==} + dev: false + /node-fetch@2.7.0: resolution: {integrity: sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==} engines: {node: 4.x || >=6.0.0} @@ -4951,6 +4961,16 @@ packages: /path-to-regexp@0.1.7: resolution: {integrity: sha512-5DFkuoqlv1uYQKxy8omFBeJPQcdoE07Kv2sferDCrAq1ohOU+MSDswDIbnx3YAM60qIOnYa53wBhXW0EbMonrQ==} + /pdf-parse@1.1.1: + resolution: {integrity: sha512-v6ZJ/efsBpGrGGknjtq9J/oC8tZWq0KWL5vQrk2GlzLEQPUDB1ex+13Rmidl1neNN358Jn9EHZw5y07FFtaC7A==} + engines: {node: '>=6.8.1'} + dependencies: + debug: 3.2.7(supports-color@5.5.0) + node-ensure: 0.0.0 + transitivePeerDependencies: + - supports-color + dev: false + /pend@1.2.0: resolution: {integrity: sha512-F3asv42UuXchdzt+xXqfW1OGlVBe+mxa2mqI0pg5yAHZPvFmY3Y6drSf/GQ1A86WgWEN9Kzh/WrgKa6iGcHXLg==} dev: false diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index c332914..d608756 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -39,6 +39,7 @@ export class Document { [key: string]: any; }; childrenLinks?: string[]; + provider?: string; constructor(data: Partial) { if (!data.content) { @@ -51,5 +52,6 @@ export class Document { this.metadata = data.metadata || { sourceURL: "" }; this.markdown = data.markdown || ""; this.childrenLinks = data.childrenLinks || undefined; + this.provider = data.provider || undefined; } } diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 886efab..23cb629 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -257,7 +257,7 @@ export class WebCrawler { ".js", ".ico", ".svg", - ".pdf", + // ".pdf", ".zip", ".exe", ".dmg", diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index fbfaa7b..4dda41c 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -5,6 +5,7 @@ import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap"; import { WebCrawler } from "./crawler"; import { getValue, setValue } from "../../services/redis"; import { getImageDescription } from "./utils/gptVision"; +import { fetchAndProcessPdf } from "./utils/pdfProcessor"; export class WebScraperDataProvider { @@ -65,7 +66,7 @@ export class WebScraperDataProvider { throw new Error("Url is required"); } - if (!useCaching) { + if (true) { // !useCaching) { if (this.mode === "crawl") { const crawler = new WebCrawler({ initialUrl: this.urls[0], @@ -75,7 +76,7 @@ export class WebScraperDataProvider { limit: this.limit, generateImgAltText: this.generateImgAltText, }); - const links = await crawler.start(inProgress, 5, this.limit); + let links = await crawler.start(inProgress, 5, this.limit); if (this.returnOnlyUrls) { return links.map((url) => ({ content: "", @@ -84,12 +85,27 @@ export class WebScraperDataProvider { type: "text", })); } + + let pdfLinks = links.filter((link) => link.endsWith(".pdf")); + let pdfDocuments: Document[] = []; + for (let pdfLink of pdfLinks) { + const pdfContent = await fetchAndProcessPdf(pdfLink); + pdfDocuments.push({ + content: pdfContent, + metadata: { sourceURL: pdfLink }, + provider: "web", + type: "text", + }); + } + links = links.filter((link) => !link.endsWith(".pdf")); + let documents = await this.convertUrlsToDocuments(links, inProgress); documents = await this.getSitemapData(this.urls[0], documents); documents = this.replaceImgPathsWithAbsolutePaths(documents); if (this.generateImgAltText) { documents = await this.generatesImgAltText(documents); } + documents = documents.concat(pdfDocuments); // CACHING DOCUMENTS // - parent document @@ -134,8 +150,20 @@ export class WebScraperDataProvider { } if (this.mode === "single_urls") { + let pdfLinks = this.urls.filter((link) => link.endsWith(".pdf")); + let pdfDocuments: Document[] = []; + for (let pdfLink of pdfLinks) { + const pdfContent = await fetchAndProcessPdf(pdfLink); + pdfDocuments.push({ + content: pdfContent, + metadata: { sourceURL: pdfLink }, + provider: "web", + type: "text", + }); + } + let documents = await this.convertUrlsToDocuments( - this.urls, + this.urls.filter((link) => !link.endsWith(".pdf")), inProgress ); documents = this.replaceImgPathsWithAbsolutePaths(documents); @@ -144,6 +172,7 @@ export class WebScraperDataProvider { } const baseUrl = new URL(this.urls[0]).origin; documents = await this.getSitemapData(baseUrl, documents); + documents = documents.concat(pdfDocuments); await this.setCachedDocuments(documents); documents = this.removeChildLinks(documents); @@ -151,7 +180,20 @@ export class WebScraperDataProvider { return documents; } if (this.mode === "sitemap") { - const links = await getLinksFromSitemap(this.urls[0]); + let links = await getLinksFromSitemap(this.urls[0]); + let pdfLinks = links.filter((link) => link.endsWith(".pdf")); + let pdfDocuments: Document[] = []; + for (let pdfLink of pdfLinks) { + const pdfContent = await fetchAndProcessPdf(pdfLink); + pdfDocuments.push({ + content: pdfContent, + metadata: { sourceURL: pdfLink }, + provider: "web", + type: "text", + }); + } + links = links.filter((link) => !link.endsWith(".pdf")); + let documents = await this.convertUrlsToDocuments( links.slice(0, this.limit), inProgress @@ -162,6 +204,7 @@ export class WebScraperDataProvider { if (this.generateImgAltText) { documents = await this.generatesImgAltText(documents); } + documents = documents.concat(pdfDocuments); await this.setCachedDocuments(documents); documents = this.removeChildLinks(documents); diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts new file mode 100644 index 0000000..7d25aec --- /dev/null +++ b/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts @@ -0,0 +1,40 @@ +import * as pdfProcessor from '../pdfProcessor'; + +describe('PDF Processing Module - Integration Test', () => { + it('should download and read a simple PDF file by URL', async () => { + const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf'); + expect(pdfContent).toEqual("Dummy PDF file"); + }); + + it('should download and read a complex PDF file by URL', async () => { + const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://arxiv.org/pdf/2307.06435.pdf'); + + const expectedContent = 'A Comprehensive Overview of Large Language Models\n' + + ' a a,∗ b,∗ c,d,∗ e,f e,f g,i\n' + + ' Humza Naveed , Asad Ullah Khan , Shi Qiu , Muhammad Saqib , Saeed Anwar , Muhammad Usman , Naveed Akhtar ,\n' + + ' Nick Barnes h, Ajmal Mian i\n' + + ' aUniversity of Engineering and Technology (UET), Lahore, Pakistan\n' + + ' bThe Chinese University of Hong Kong (CUHK), HKSAR, China\n' + + ' cUniversity of Technology Sydney (UTS), Sydney, Australia\n' + + ' dCommonwealth Scientific and Industrial Research Organisation (CSIRO), Sydney, Australia\n' + + ' eKing Fahd University of Petroleum and Minerals (KFUPM), Dhahran, Saudi Arabia\n' + + ' fSDAIA-KFUPM Joint Research Center for Artificial Intelligence (JRCAI), Dhahran, Saudi Arabia\n' + + ' gThe University of Melbourne (UoM), Melbourne, Australia\n' + + ' hAustralian National University (ANU), Canberra, Australia\n' + + ' iThe University of Western Australia (UWA), Perth, Australia\n' + + ' Abstract\n' + + ' Large Language Models (LLMs) have recently demonstrated remarkable capabilities in natural language processing tasks and\n' + + ' beyond. This success of LLMs has led to a large influx of research contributions in this direction. These works encompass diverse\n' + + ' topics such as architectural innovations, better training strategies, context length improvements, fine-tuning, multi-modal LLMs,\n' + + ' robotics, datasets, benchmarking, efficiency, and more. With the rapid development of techniques and regular breakthroughs in\n' + + ' LLM research, it has become considerably challenging to perceive the bigger picture of the advances in this direction. Considering\n' + + ' the rapidly emerging plethora of literature on LLMs, it is imperative that the research community is able to benefit from a concise\n' + + ' yet comprehensive overview of the recent developments in this field. This article provides an overview of the existing literature\n' + + ' on a broad range of LLM-related concepts. Our self-contained comprehensive overview of LLMs discusses relevant background\n' + + ' concepts along with covering the advanced topics at the frontier of research in LLMs. This review article is intended to not only\n' + + ' provide a systematic survey but also a quick comprehensive reference for the researchers and practitioners to draw insights from\n' + + ' extensive informative summaries of the existing works to advance the LLM research.\n' + expect(pdfContent).toContain(expectedContent); + }, 60000); + +}); \ No newline at end of file diff --git a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts new file mode 100644 index 0000000..fb08d9c --- /dev/null +++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts @@ -0,0 +1,108 @@ +import axios, { AxiosResponse } from "axios"; +import fs from "fs"; +import { createReadStream, createWriteStream } from "node:fs"; +import FormData from "form-data"; +import dotenv from "dotenv"; +import pdf from "pdf-parse"; +import path from "path"; +import os from "os"; + +dotenv.config(); + +export async function fetchAndProcessPdf(url: string): Promise { + const tempFilePath = await downloadPdf(url); + const content = await processPdfToText(tempFilePath); + fs.unlinkSync(tempFilePath); // Clean up the temporary file + return content; +} + +async function downloadPdf(url: string): Promise { + const response = await axios({ + url, + method: 'GET', + responseType: 'stream', + }); + + const tempFilePath = path.join(os.tmpdir(), `tempPdf-${Date.now()}.pdf`); + const writer = createWriteStream(tempFilePath); + + response.data.pipe(writer); + + return new Promise((resolve, reject) => { + writer.on('finish', () => resolve(tempFilePath)); + writer.on('error', reject); + }); +} + +export async function processPdfToText(filePath: string): Promise { + let content = ""; + + if (process.env.LLAMAPARSE_API_KEY) { + const apiKey = process.env.LLAMAPARSE_API_KEY; + const headers = { + Authorization: `Bearer ${apiKey}`, + }; + const base_url = "https://api.cloud.llamaindex.ai/api/parsing"; + const fileType2 = "application/pdf"; + + try { + const formData = new FormData(); + formData.append("file", createReadStream(filePath), { + filename: filePath, + contentType: fileType2, + }); + + const uploadUrl = `${base_url}/upload`; + const uploadResponse = await axios.post(uploadUrl, formData, { + headers: { + ...headers, + ...formData.getHeaders(), + }, + }); + + const jobId = uploadResponse.data.id; + const resultType = "text"; + const resultUrl = `${base_url}/job/${jobId}/result/${resultType}`; + + let resultResponse: AxiosResponse; + let attempt = 0; + const maxAttempts = 10; // Maximum number of attempts + let resultAvailable = false; + + while (attempt < maxAttempts && !resultAvailable) { + try { + resultResponse = await axios.get(resultUrl, { headers }); + if (resultResponse.status === 200) { + resultAvailable = true; // Exit condition met + } else { + // If the status code is not 200, increment the attempt counter and wait + attempt++; + await new Promise((resolve) => setTimeout(resolve, 250)); // Wait for 2 seconds + } + } catch (error) { + console.error("Error fetching result:", error); + attempt++; + await new Promise((resolve) => setTimeout(resolve, 250)); // Wait for 2 seconds before retrying + // You may want to handle specific errors differently + } + } + + if (!resultAvailable) { + content = await processPdf(filePath); + } + content = resultResponse.data[resultType]; + } catch (error) { + console.error("Error processing document:", filePath, error); + content = await processPdf(filePath); + } + } else { + content = await processPdf(filePath); + } + return content; +} + +async function processPdf(file: string){ + const fileContent = fs.readFileSync(file); + const data = await pdf(fileContent); + return data.text; +} \ No newline at end of file From 704a059448729189948f4168550217c4bae3efde Mon Sep 17 00:00:00 2001 From: Rafael Miller <150964962+rafaelsideguide@users.noreply.github.com> Date: Thu, 18 Apr 2024 13:53:11 -0300 Subject: [PATCH 028/102] Update index.ts --- apps/api/src/scraper/WebScraper/index.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 4dda41c..3c6e414 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -66,7 +66,7 @@ export class WebScraperDataProvider { throw new Error("Url is required"); } - if (true) { // !useCaching) { + if (!useCaching) { if (this.mode === "crawl") { const crawler = new WebCrawler({ initialUrl: this.urls[0], From c4cc4b9262945515f83fffe0d343a64cc500d57a Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Thu, 18 Apr 2024 14:12:39 -0300 Subject: [PATCH 029/102] fixing document response --- apps/api/src/scraper/WebScraper/index.ts | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 3c6e414..551c8d8 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -93,8 +93,7 @@ export class WebScraperDataProvider { pdfDocuments.push({ content: pdfContent, metadata: { sourceURL: pdfLink }, - provider: "web", - type: "text", + provider: "web-scraper" }); } links = links.filter((link) => !link.endsWith(".pdf")); @@ -157,8 +156,7 @@ export class WebScraperDataProvider { pdfDocuments.push({ content: pdfContent, metadata: { sourceURL: pdfLink }, - provider: "web", - type: "text", + provider: "web-scraper" }); } @@ -188,8 +186,7 @@ export class WebScraperDataProvider { pdfDocuments.push({ content: pdfContent, metadata: { sourceURL: pdfLink }, - provider: "web", - type: "text", + provider: "web-scraper" }); } links = links.filter((link) => !link.endsWith(".pdf")); From 6112cc1c2c640d3a2e9bcffb56dc835777c8e6bb Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 18 Apr 2024 10:34:41 -0700 Subject: [PATCH 030/102] Update index.ts --- apps/api/src/index.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index fad8fa3..7a34214 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -87,7 +87,7 @@ async function authenticateUser(req, res, mode?: string): Promise<{ success: boo app.post("/v0/scrape", async (req, res) => { try { // make sure to authenticate user first, Bearer - const { success, team_id, error, status } = await authenticateUser(req, res, "crawl"); + const { success, team_id, error, status } = await authenticateUser(req, res, "scrape"); if (!success) { return res.status(status).json({ error }); } From 0f7ab4107f094e6002fc29eb5d56fa4ca2d79ad7 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 18 Apr 2024 10:41:06 -0700 Subject: [PATCH 031/102] Update index.ts --- apps/api/src/index.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 7a34214..db7f45c 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -16,6 +16,7 @@ const { ExpressAdapter } = require("@bull-board/express"); export const app = express(); + global.isProduction = process.env.IS_PRODUCTION === "true"; app.use(bodyParser.urlencoded({ extended: true })); From 3e9e24aaf137db4653f5ffd149e78390c9f76782 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 18 Apr 2024 11:01:24 -0700 Subject: [PATCH 032/102] Update index.ts --- apps/api/src/index.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index db7f45c..b586cd9 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -164,7 +164,7 @@ app.post("/v0/scrape", async (req, res) => { app.post("/v0/crawl", async (req, res) => { try { - const { success, team_id, error, status } = await authenticateUser(req, res, "scrape"); + const { success, team_id, error, status } = await authenticateUser(req, res, "crawl"); if (!success) { return res.status(status).json({ error }); } From ddb3b25171988590d86eb0600e18ffa9139490f8 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Thu, 18 Apr 2024 16:28:01 -0300 Subject: [PATCH 033/102] adding ci-cd workflow --- apps/api/.env.local | 1 + apps/api/package.json | 2 +- apps/api/pnpm-lock.yaml | 2 +- apps/api/src/__tests__/e2e/index.test.ts | 170 +++++++++++++++++++++++ apps/api/src/index.ts | 14 +- 5 files changed, 177 insertions(+), 12 deletions(-) create mode 100644 apps/api/src/__tests__/e2e/index.test.ts diff --git a/apps/api/.env.local b/apps/api/.env.local index 852c5ed..f5c625f 100644 --- a/apps/api/.env.local +++ b/apps/api/.env.local @@ -11,3 +11,4 @@ BULL_AUTH_KEY= LOGTAIL_KEY= PLAYWRIGHT_MICROSERVICE_URL= LLAMAPARSE_API_KEY= +TEST_API_KEY= \ No newline at end of file diff --git a/apps/api/package.json b/apps/api/package.json index 1d12a96..cbce4be 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -26,7 +26,7 @@ "@types/bull": "^4.10.0", "@types/cors": "^2.8.13", "@types/express": "^4.17.17", - "@types/jest": "^29.5.6", + "@types/jest": "^29.5.12", "body-parser": "^1.20.1", "express": "^4.18.2", "jest": "^29.6.3", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index fd0ffa0..df669d5 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -179,7 +179,7 @@ devDependencies: specifier: ^4.17.17 version: 4.17.21 '@types/jest': - specifier: ^29.5.6 + specifier: ^29.5.12 version: 29.5.12 body-parser: specifier: ^1.20.1 diff --git a/apps/api/src/__tests__/e2e/index.test.ts b/apps/api/src/__tests__/e2e/index.test.ts new file mode 100644 index 0000000..ce11207 --- /dev/null +++ b/apps/api/src/__tests__/e2e/index.test.ts @@ -0,0 +1,170 @@ +import request from 'supertest'; +import { app } from '../../index'; +import dotenv from 'dotenv'; + +dotenv.config(); +const TEST_URL = 'http://localhost:3002' + +describe('E2E Tests for API Routes', () => { + describe('GET /', () => { + it('should return Hello, world! message', async () => { + const response = await request(TEST_URL).get('/'); + expect(response.statusCode).toBe(200); + expect(response.text).toContain('SCRAPERS-JS: Hello, world! Fly.io'); + }); + }); + + describe('GET /test', () => { + it('should return Hello, world! message', async () => { + const response = await request(TEST_URL).get('/test'); + expect(response.statusCode).toBe(200); + expect(response.text).toContain('Hello, world!'); + }); + }); + + describe('POST /v0/scrape', () => { + it('should require authorization', async () => { + const response = await request(app).post('/v0/scrape'); + expect(response.statusCode).toBe(401); + }); + + it('should return an error response with an invalid API key', async () => { + const response = await request(TEST_URL) + .post('/v0/scrape') + .set('Authorization', `Bearer invalid-api-key`) + .set('Content-Type', 'application/json') + .send({ url: 'https://firecrawl.dev' }); + expect(response.statusCode).toBe(401); + }); + + it('should return a successful response with a valid API key', async () => { + const response = await request(TEST_URL) + .post('/v0/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://firecrawl.dev' }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + expect(response.body.data).toHaveProperty('content'); + expect(response.body.data).toHaveProperty('markdown'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.content).toContain('🔥 FireCrawl'); + }, 30000); // 30 seconds timeout + }); + + describe('POST /v0/crawl', () => { + it('should require authorization', async () => { + const response = await request(TEST_URL).post('/v0/crawl'); + expect(response.statusCode).toBe(401); + }); + + it('should return an error response with an invalid API key', async () => { + const response = await request(TEST_URL) + .post('/v0/crawl') + .set('Authorization', `Bearer invalid-api-key`) + .set('Content-Type', 'application/json') + .send({ url: 'https://firecrawl.dev' }); + expect(response.statusCode).toBe(401); + }); + + it('should return a successful response with a valid API key', async () => { + const response = await request(TEST_URL) + .post('/v0/crawl') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://firecrawl.dev' }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('jobId'); + expect(response.body.jobId).toMatch(/^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/); + }); + + // Additional tests for insufficient credits? + }); + + describe('POST /v0/crawlWebsitePreview', () => { + it('should require authorization', async () => { + const response = await request(TEST_URL).post('/v0/crawlWebsitePreview'); + expect(response.statusCode).toBe(401); + }); + + it('should return an error response with an invalid API key', async () => { + const response = await request(TEST_URL) + .post('/v0/crawlWebsitePreview') + .set('Authorization', `Bearer invalid-api-key`) + .set('Content-Type', 'application/json') + .send({ url: 'https://firecrawl.dev' }); + expect(response.statusCode).toBe(401); + }); + + it('should return a successful response with a valid API key', async () => { + const response = await request(TEST_URL) + .post('/v0/crawlWebsitePreview') + .set('Authorization', `Bearer this_is_just_a_preview_token`) + .set('Content-Type', 'application/json') + .send({ url: 'https://firecrawl.dev' }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('jobId'); + expect(response.body.jobId).toMatch(/^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/); + }); + }); + + describe('GET /v0/crawl/status/:jobId', () => { + it('should require authorization', async () => { + const response = await request(TEST_URL).get('/v0/crawl/status/123'); + expect(response.statusCode).toBe(401); + }); + + it('should return an error response with an invalid API key', async () => { + const response = await request(TEST_URL) + .get('/v0/crawl/status/123') + .set('Authorization', `Bearer invalid-api-key`); + expect(response.statusCode).toBe(401); + }); + + it('should return Job not found for invalid job ID', async () => { + const response = await request(TEST_URL) + .get('/v0/crawl/status/invalidJobId') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(404); + }); + + it('should return a successful response for a valid crawl job', async () => { + const crawlResponse = await request(TEST_URL) + .post('/v0/crawl') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://firecrawl.dev' }); + expect(crawlResponse.statusCode).toBe(200); + + + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('status'); + expect(response.body.status).toBe('active'); + + setTimeout(async () => { + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('status'); + expect(response.body.status).toBe('completed'); + expect(response.body).toHaveProperty('data'); + expect(response.body.data).toHaveProperty('content'); + expect(response.body.data).toHaveProperty('markdown'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.content).toContain('🔥 FireCrawl'); + }, 30000); // 30 seconds + }, 60000); // 60 seconds + }); + + describe('GET /is-production', () => { + it('should return the production status', async () => { + const response = await request(TEST_URL).get('/is-production'); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('isProduction'); + }); + }); +}); \ No newline at end of file diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index b586cd9..98be945 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -230,16 +230,10 @@ app.post("/v0/crawl", async (req, res) => { }); app.post("/v0/crawlWebsitePreview", async (req, res) => { try { - // make sure to authenticate user first, Bearer - const authHeader = req.headers.authorization; - if (!authHeader) { - return res.status(401).json({ error: "Unauthorized" }); - } - const token = authHeader.split(" ")[1]; // Extract the token from "Bearer " - if (!token) { - return res.status(401).json({ error: "Unauthorized: Token missing" }); - } - + const { success, team_id, error, status } = await authenticateUser(req, res, "scrape"); + if (!success) { + return res.status(status).json({ error }); + } // authenticate on supabase const url = req.body.url; if (!url) { From 42de846013f11f10185d4ad1ef9ff0c99ccd3c25 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Thu, 18 Apr 2024 16:41:15 -0300 Subject: [PATCH 034/102] adding workflow --- .github/workflows/ci.yml | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 .github/workflows/ci.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..258542e --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,39 @@ +name: CI/CD +on: + pull_request: + branches: + - main + # schedule: + # - cron: '0 */4 * * *' + +jobs: + pre-deploy: + name: Pre-deploy checks + runs-on: ubuntu-latest + services: + redis: + image: redis + ports: + - 6379:6379 + steps: + - uses: actions/checkout@v3 + - name: Set up Node.js + uses: actions/setup-node@v3 + with: + node-version: '20' + - name: Install pnpm + run: npm install -g pnpm + - name: Install dependencies + run: pnpm install + working-directory: ./apps/api + - name: Start the application + run: npm start & + working-directory: ./apps/api + - name: Start workers + run: npm run workers & + working-directory: ./apps/api + - name: Run E2E tests + run: npx jest -- src/__tests__/e2e/index.test.ts + working-directory: ./apps/api + env: + REDIS_URL: redis://localhost:6379 \ No newline at end of file From d7c797d0f3ff48a3066cf26d59f88a4ddb5f0ed0 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Thu, 18 Apr 2024 16:49:31 -0300 Subject: [PATCH 035/102] adding env secrets --- .github/workflows/ci.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 258542e..69699ed 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -36,4 +36,9 @@ jobs: run: npx jest -- src/__tests__/e2e/index.test.ts working-directory: ./apps/api env: - REDIS_URL: redis://localhost:6379 \ No newline at end of file + REDIS_URL: redis://localhost:6379 + FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} + SUPABASE_URL: ${{ secrets.SUPABASE_URL }} + SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }} + TEST_API_KEY: ${{ secrets.TEST_API_KEY }} + From dcccfab4cdd4a06a8d27523f7063eb33e80c52af Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Thu, 18 Apr 2024 16:55:49 -0300 Subject: [PATCH 036/102] adding other env secrets --- .github/workflows/ci.yml | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 69699ed..731b30d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -36,9 +36,21 @@ jobs: run: npx jest -- src/__tests__/e2e/index.test.ts working-directory: ./apps/api env: - REDIS_URL: redis://localhost:6379 + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }} FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} - SUPABASE_URL: ${{ secrets.SUPABASE_URL }} + HOST: ${{ secrets.HOST }} + LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }} + LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }} + NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + PLAYWRIGHT_MICROSERVICE_URL: ${{ secrets.PLAYWRIGHT_MICROSERVICE_URL }} + PORT: ${{ secrets.PORT }} + REDIS_URL: ${{ secrets.REDIS_URL }} + SCRAPING_BEE_API_KEY: ${{ secrets.SCRAPING_BEE_API_KEY }} + SUPABASE_ANON_TOKEN: ${{ secrets.SUPABASE_ANON_TOKEN }} SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }} + SUPABASE_URL: ${{ secrets.SUPABASE_URL }} TEST_API_KEY: ${{ secrets.TEST_API_KEY }} + From 2ca00d81866063aef302fa0960d68889c4aee9c6 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Thu, 18 Apr 2024 17:03:52 -0300 Subject: [PATCH 037/102] adding openHandlesTimeout --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 731b30d..6017a54 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -33,7 +33,7 @@ jobs: run: npm run workers & working-directory: ./apps/api - name: Run E2E tests - run: npx jest -- src/__tests__/e2e/index.test.ts + run: npx jest --openHandlesTimeout=60000 -- src/__tests__/e2e/index.test.ts working-directory: ./apps/api env: ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} From dae024adc17386118a5a9e7d93b9e5fe244af9c9 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Thu, 18 Apr 2024 17:08:48 -0300 Subject: [PATCH 038/102] fixing env position --- .github/workflows/ci.yml | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6017a54..86ccc12 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -6,6 +6,24 @@ on: # schedule: # - cron: '0 */4 * * *' +env: + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }} + FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} + HOST: ${{ secrets.HOST }} + LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }} + LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }} + NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + PLAYWRIGHT_MICROSERVICE_URL: ${{ secrets.PLAYWRIGHT_MICROSERVICE_URL }} + PORT: ${{ secrets.PORT }} + REDIS_URL: ${{ secrets.REDIS_URL }} + SCRAPING_BEE_API_KEY: ${{ secrets.SCRAPING_BEE_API_KEY }} + SUPABASE_ANON_TOKEN: ${{ secrets.SUPABASE_ANON_TOKEN }} + SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }} + SUPABASE_URL: ${{ secrets.SUPABASE_URL }} + TEST_API_KEY: ${{ secrets.TEST_API_KEY }} + jobs: pre-deploy: name: Pre-deploy checks @@ -35,22 +53,6 @@ jobs: - name: Run E2E tests run: npx jest --openHandlesTimeout=60000 -- src/__tests__/e2e/index.test.ts working-directory: ./apps/api - env: - ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} - BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }} - FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} - HOST: ${{ secrets.HOST }} - LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }} - LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }} - NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }} - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - PLAYWRIGHT_MICROSERVICE_URL: ${{ secrets.PLAYWRIGHT_MICROSERVICE_URL }} - PORT: ${{ secrets.PORT }} - REDIS_URL: ${{ secrets.REDIS_URL }} - SCRAPING_BEE_API_KEY: ${{ secrets.SCRAPING_BEE_API_KEY }} - SUPABASE_ANON_TOKEN: ${{ secrets.SUPABASE_ANON_TOKEN }} - SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }} - SUPABASE_URL: ${{ secrets.SUPABASE_URL }} - TEST_API_KEY: ${{ secrets.TEST_API_KEY }} + From efbb4e8905782dbe347763750091d72215ac239c Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Thu, 18 Apr 2024 17:18:15 -0300 Subject: [PATCH 039/102] fixing jest parameters --- .github/workflows/ci.yml | 2 +- apps/api/src/__tests__/e2e/index.test.ts | 23 ++++++++++++----------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 86ccc12..8c85782 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -51,7 +51,7 @@ jobs: run: npm run workers & working-directory: ./apps/api - name: Run E2E tests - run: npx jest --openHandlesTimeout=60000 -- src/__tests__/e2e/index.test.ts + run: npx jest --openHandlesTimeout=60000 --watchAll=false -- src/__tests__/e2e/index.test.ts working-directory: ./apps/api diff --git a/apps/api/src/__tests__/e2e/index.test.ts b/apps/api/src/__tests__/e2e/index.test.ts index ce11207..3bc3e99 100644 --- a/apps/api/src/__tests__/e2e/index.test.ts +++ b/apps/api/src/__tests__/e2e/index.test.ts @@ -144,19 +144,20 @@ describe('E2E Tests for API Routes', () => { expect(response.body).toHaveProperty('status'); expect(response.body.status).toBe('active'); - setTimeout(async () => { - const response = await request(TEST_URL) + // wait for 30 seconds + await new Promise((r) => setTimeout(r, 30000)); + + const completedResponse = await request(TEST_URL) .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('status'); - expect(response.body.status).toBe('completed'); - expect(response.body).toHaveProperty('data'); - expect(response.body.data).toHaveProperty('content'); - expect(response.body.data).toHaveProperty('markdown'); - expect(response.body.data).toHaveProperty('metadata'); - expect(response.body.data.content).toContain('🔥 FireCrawl'); - }, 30000); // 30 seconds + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty('status'); + expect(completedResponse.body.status).toBe('completed'); + expect(completedResponse.body).toHaveProperty('data'); + expect(completedResponse.body.data).toHaveProperty('content'); + expect(completedResponse.body.data).toHaveProperty('markdown'); + expect(completedResponse.body.data).toHaveProperty('metadata'); + expect(completedResponse.body.data.content).toContain('🔥 FireCrawl'); }, 60000); // 60 seconds }); From 3f833737f3cafe502bd394fde524aa8ac0bfd77c Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Thu, 18 Apr 2024 17:25:25 -0300 Subject: [PATCH 040/102] fixing test --- apps/api/src/__tests__/e2e/index.test.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/apps/api/src/__tests__/e2e/index.test.ts b/apps/api/src/__tests__/e2e/index.test.ts index 3bc3e99..608d887 100644 --- a/apps/api/src/__tests__/e2e/index.test.ts +++ b/apps/api/src/__tests__/e2e/index.test.ts @@ -154,10 +154,10 @@ describe('E2E Tests for API Routes', () => { expect(completedResponse.body).toHaveProperty('status'); expect(completedResponse.body.status).toBe('completed'); expect(completedResponse.body).toHaveProperty('data'); - expect(completedResponse.body.data).toHaveProperty('content'); - expect(completedResponse.body.data).toHaveProperty('markdown'); - expect(completedResponse.body.data).toHaveProperty('metadata'); - expect(completedResponse.body.data.content).toContain('🔥 FireCrawl'); + expect(completedResponse.body.data[0]).toHaveProperty('content'); + expect(completedResponse.body.data[0]).toHaveProperty('markdown'); + expect(completedResponse.body.data[0]).toHaveProperty('metadata'); + expect(completedResponse.body.data[0].content).toContain('🔥 FireCrawl'); }, 60000); // 60 seconds }); From ca8c8b87dcf242da776af24ded2762a1ae12b8d6 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Thu, 18 Apr 2024 17:29:06 -0300 Subject: [PATCH 041/102] fixing workflow --- .github/workflows/ci.yml | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8c85782..042d0c3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -47,12 +47,17 @@ jobs: - name: Start the application run: npm start & working-directory: ./apps/api + id: start_app - name: Start workers run: npm run workers & working-directory: ./apps/api + id: start_workers - name: Run E2E tests - run: npx jest --openHandlesTimeout=60000 --watchAll=false -- src/__tests__/e2e/index.test.ts + run: | + npx jest --openHandlesTimeout=60000 --watchAll=false -- src/__tests__/e2e/index.test.ts working-directory: ./apps/api - - - + - name: Kill background processes + if: always() + run: | + kill $(jobs -p) + working-directory: ./apps/api \ No newline at end of file From 4f1179db99daf45a1f7d25e92beb735b99eb3b9d Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Thu, 18 Apr 2024 17:32:28 -0300 Subject: [PATCH 042/102] fixing workflow --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 042d0c3..eafdf6f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -54,7 +54,7 @@ jobs: id: start_workers - name: Run E2E tests run: | - npx jest --openHandlesTimeout=60000 --watchAll=false -- src/__tests__/e2e/index.test.ts + npx jest --detectOpenHandles --forceExit --openHandlesTimeout=60000 --watchAll=false -- src/__tests__/e2e/index.test.ts working-directory: ./apps/api - name: Kill background processes if: always() From 4018d7ca17ec13642d196954dbba006a9cc28d0e Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Thu, 18 Apr 2024 17:35:00 -0300 Subject: [PATCH 043/102] fixing workflow --- .github/workflows/ci.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index eafdf6f..d039085 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -55,9 +55,4 @@ jobs: - name: Run E2E tests run: | npx jest --detectOpenHandles --forceExit --openHandlesTimeout=60000 --watchAll=false -- src/__tests__/e2e/index.test.ts - working-directory: ./apps/api - - name: Kill background processes - if: always() - run: | - kill $(jobs -p) working-directory: ./apps/api \ No newline at end of file From dab0568c435e145cae69c47127477ed2e3478f2c Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Thu, 18 Apr 2024 17:38:12 -0300 Subject: [PATCH 044/102] testing tests --- apps/api/src/__tests__/e2e/index.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/__tests__/e2e/index.test.ts b/apps/api/src/__tests__/e2e/index.test.ts index 608d887..b2af6f0 100644 --- a/apps/api/src/__tests__/e2e/index.test.ts +++ b/apps/api/src/__tests__/e2e/index.test.ts @@ -74,7 +74,7 @@ describe('E2E Tests for API Routes', () => { .set('Content-Type', 'application/json') .send({ url: 'https://firecrawl.dev' }); expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('jobId'); + expect(response.body).toHaveProperty('asudhaopsidjp'); expect(response.body.jobId).toMatch(/^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/); }); From c627d2217921e1e75ba992f2c8b82f03334ed35f Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Thu, 18 Apr 2024 17:41:23 -0300 Subject: [PATCH 045/102] all working now --- apps/api/src/__tests__/e2e/index.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/__tests__/e2e/index.test.ts b/apps/api/src/__tests__/e2e/index.test.ts index b2af6f0..608d887 100644 --- a/apps/api/src/__tests__/e2e/index.test.ts +++ b/apps/api/src/__tests__/e2e/index.test.ts @@ -74,7 +74,7 @@ describe('E2E Tests for API Routes', () => { .set('Content-Type', 'application/json') .send({ url: 'https://firecrawl.dev' }); expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('asudhaopsidjp'); + expect(response.body).toHaveProperty('jobId'); expect(response.body.jobId).toMatch(/^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/); }); From 1c8ffc9faa5c9f612070cc9ac0b4e7d2dffe0f34 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Thu, 18 Apr 2024 17:44:33 -0300 Subject: [PATCH 046/102] fixing deploy workflow --- .github/workflows/fly.yml | 50 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/.github/workflows/fly.yml b/.github/workflows/fly.yml index df29d1b..3bb215b 100644 --- a/.github/workflows/fly.yml +++ b/.github/workflows/fly.yml @@ -6,10 +6,60 @@ on: # schedule: # - cron: '0 */4 * * *' +env: + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }} + FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} + HOST: ${{ secrets.HOST }} + LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }} + LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }} + NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + PLAYWRIGHT_MICROSERVICE_URL: ${{ secrets.PLAYWRIGHT_MICROSERVICE_URL }} + PORT: ${{ secrets.PORT }} + REDIS_URL: ${{ secrets.REDIS_URL }} + SCRAPING_BEE_API_KEY: ${{ secrets.SCRAPING_BEE_API_KEY }} + SUPABASE_ANON_TOKEN: ${{ secrets.SUPABASE_ANON_TOKEN }} + SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }} + SUPABASE_URL: ${{ secrets.SUPABASE_URL }} + TEST_API_KEY: ${{ secrets.TEST_API_KEY }} + jobs: + pre-deploy: + name: Pre-deploy checks + runs-on: ubuntu-latest + services: + redis: + image: redis + ports: + - 6379:6379 + steps: + - uses: actions/checkout@v3 + - name: Set up Node.js + uses: actions/setup-node@v3 + with: + node-version: '20' + - name: Install pnpm + run: npm install -g pnpm + - name: Install dependencies + run: pnpm install + working-directory: ./apps/api + - name: Start the application + run: npm start & + working-directory: ./apps/api + id: start_app + - name: Start workers + run: npm run workers & + working-directory: ./apps/api + id: start_workers + - name: Run E2E tests + run: | + npx jest --detectOpenHandles --forceExit --openHandlesTimeout=60000 --watchAll=false -- src/__tests__/e2e/index.test.ts + working-directory: ./apps/api deploy: name: Deploy app runs-on: ubuntu-latest + needs: pre-deploy steps: - uses: actions/checkout@v3 - name: Change directory From 70497804268d4624f974072f2f3a2370ef6cb902 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Thu, 18 Apr 2024 17:47:15 -0300 Subject: [PATCH 047/102] adding all tests --- .github/workflows/ci.yml | 2 +- .github/workflows/fly.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d039085..b9a5b79 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -54,5 +54,5 @@ jobs: id: start_workers - name: Run E2E tests run: | - npx jest --detectOpenHandles --forceExit --openHandlesTimeout=60000 --watchAll=false -- src/__tests__/e2e/index.test.ts + npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false working-directory: ./apps/api \ No newline at end of file diff --git a/.github/workflows/fly.yml b/.github/workflows/fly.yml index 3bb215b..fe042c6 100644 --- a/.github/workflows/fly.yml +++ b/.github/workflows/fly.yml @@ -54,7 +54,7 @@ jobs: id: start_workers - name: Run E2E tests run: | - npx jest --detectOpenHandles --forceExit --openHandlesTimeout=60000 --watchAll=false -- src/__tests__/e2e/index.test.ts + npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false working-directory: ./apps/api deploy: name: Deploy app From be35b32306e5f0e1e98e5821983db308cda7efa4 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 18 Apr 2024 13:55:55 -0700 Subject: [PATCH 048/102] Nick: preview token tests --- apps/api/src/__tests__/e2e/index.test.ts | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/apps/api/src/__tests__/e2e/index.test.ts b/apps/api/src/__tests__/e2e/index.test.ts index ce11207..7a7600f 100644 --- a/apps/api/src/__tests__/e2e/index.test.ts +++ b/apps/api/src/__tests__/e2e/index.test.ts @@ -36,6 +36,14 @@ describe('E2E Tests for API Routes', () => { .send({ url: 'https://firecrawl.dev' }); expect(response.statusCode).toBe(401); }); + it('should return a successful response with a valid preview token', async () => { + const response = await request(TEST_URL) + .post('/v0/scrape') + .set('Authorization', `Bearer this_is_just_a_preview_token`) + .set('Content-Type', 'application/json') + .send({ url: 'https://firecrawl.dev' }); + expect(response.statusCode).toBe(200); + }); it('should return a successful response with a valid API key', async () => { const response = await request(TEST_URL) From 72e1dadccd33214e3a25b92a41c15a680847dd11 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 19 Apr 2024 11:47:20 -0300 Subject: [PATCH 049/102] adding option to replace all relative paths with absolute paths --- apps/api/src/lib/entities.ts | 1 + .../WebScraper/__tests__/index.test.ts | 179 ------------------ apps/api/src/scraper/WebScraper/index.ts | 63 +++--- .../utils/__tests__/pdfProcessor.test.ts | 69 ++++--- .../utils/__tests__/replacePaths.test.ts | 114 +++++++++++ .../scraper/WebScraper/utils/replacePaths.ts | 80 ++++++++ apps/api/src/services/queue-worker.ts | 1 - 7 files changed, 257 insertions(+), 250 deletions(-) delete mode 100644 apps/api/src/scraper/WebScraper/__tests__/index.test.ts create mode 100644 apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts create mode 100644 apps/api/src/scraper/WebScraper/utils/replacePaths.ts diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index d608756..e261dd4 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -22,6 +22,7 @@ export type WebScraperOptions = { maxCrawledLinks?: number; limit?: number; generateImgAltText?: boolean; + replaceAllPathsWithAbsolutePaths?: boolean; }; pageOptions?: PageOptions; concurrentRequests?: number; diff --git a/apps/api/src/scraper/WebScraper/__tests__/index.test.ts b/apps/api/src/scraper/WebScraper/__tests__/index.test.ts deleted file mode 100644 index 42d9513..0000000 --- a/apps/api/src/scraper/WebScraper/__tests__/index.test.ts +++ /dev/null @@ -1,179 +0,0 @@ -import { WebScraperDataProvider } from "../index"; - -describe("WebScraperDataProvider", () => { - describe("replaceImgPathsWithAbsolutePaths", () => { - it("should replace image paths with absolute paths", () => { - const webScraperDataProvider = new WebScraperDataProvider(); - const documents = [ - { - metadata: { sourceURL: "https://example.com/page" }, - content: "![alt text](/image.png)", - }, - { - metadata: { sourceURL: "https://example.com/another-page" }, - content: "![another alt text](./another-image.png)", - }, - { - metadata: { sourceURL: "https://example.com/another-page" }, - content: "![another alt text](./another-image.webp)", - }, - { - metadata: { sourceURL: "https://example.com/data-image" }, - content: "![data image](data:image/png;base64,...)", - }, - ]; - - const expectedDocuments = [ - { - metadata: { sourceURL: "https://example.com/page" }, - content: "![alt text](https://example.com/image.png)", - }, - { - metadata: { sourceURL: "https://example.com/another-page" }, - content: "![another alt text](https://example.com/another-image.png)", - }, - { - metadata: { sourceURL: "https://example.com/another-page" }, - content: "![another alt text](https://example.com/another-image.webp)", - }, - { - metadata: { sourceURL: "https://example.com/data-image" }, - content: "![data image](data:image/png;base64,...)", - }, - ]; - - const result = - webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents); - expect(result).toEqual(expectedDocuments); - }); - - it("should handle absolute URLs without modification", () => { - const webScraperDataProvider = new WebScraperDataProvider(); - const documents = [ - { - metadata: { sourceURL: "https://example.com/page" }, - content: "![alt text](https://example.com/image.png)", - }, - { - metadata: { sourceURL: "https://example.com/another-page" }, - content: - "![another alt text](http://anotherexample.com/another-image.png)", - }, - ]; - - const expectedDocuments = [ - { - metadata: { sourceURL: "https://example.com/page" }, - content: "![alt text](https://example.com/image.png)", - }, - { - metadata: { sourceURL: "https://example.com/another-page" }, - content: - "![another alt text](http://anotherexample.com/another-image.png)", - }, - ]; - - const result = - webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents); - expect(result).toEqual(expectedDocuments); - }); - - it("should not replace non-image content within the documents", () => { - const webScraperDataProvider = new WebScraperDataProvider(); - const documents = [ - { - metadata: { sourceURL: "https://example.com/page" }, - content: - "This is a test. ![alt text](/image.png) Here is a link: [Example](https://example.com).", - }, - { - metadata: { sourceURL: "https://example.com/another-page" }, - content: - "Another test. ![another alt text](./another-image.png) Here is some **bold text**.", - }, - ]; - - const expectedDocuments = [ - { - metadata: { sourceURL: "https://example.com/page" }, - content: - "This is a test. ![alt text](https://example.com/image.png) Here is a link: [Example](https://example.com).", - }, - { - metadata: { sourceURL: "https://example.com/another-page" }, - content: - "Another test. ![another alt text](https://example.com/another-image.png) Here is some **bold text**.", - }, - ]; - - const result = - webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents); - expect(result).toEqual(expectedDocuments); - }); - it("should replace multiple image paths within the documents", () => { - const webScraperDataProvider = new WebScraperDataProvider(); - const documents = [ - { - metadata: { sourceURL: "https://example.com/page" }, - content: - "This is a test. ![alt text](/image1.png) Here is a link: [Example](https://example.com). ![alt text](/image2.png)", - }, - { - metadata: { sourceURL: "https://example.com/another-page" }, - content: - "Another test. ![another alt text](./another-image1.png) Here is some **bold text**. ![another alt text](./another-image2.png)", - }, - ]; - - const expectedDocuments = [ - { - metadata: { sourceURL: "https://example.com/page" }, - content: - "This is a test. ![alt text](https://example.com/image1.png) Here is a link: [Example](https://example.com). ![alt text](https://example.com/image2.png)", - }, - { - metadata: { sourceURL: "https://example.com/another-page" }, - content: - "Another test. ![another alt text](https://example.com/another-image1.png) Here is some **bold text**. ![another alt text](https://example.com/another-image2.png)", - }, - ]; - - const result = - webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents); - expect(result).toEqual(expectedDocuments); - }); - - it("should replace image paths within the documents with complex URLs", () => { - const webScraperDataProvider = new WebScraperDataProvider(); - const documents = [ - { - metadata: { sourceURL: "https://example.com/page/subpage" }, - content: - "This is a test. ![alt text](/image1.png) Here is a link: [Example](https://example.com). ![alt text](/sub/image2.png)", - }, - { - metadata: { sourceURL: "https://example.com/another-page/subpage" }, - content: - "Another test. ![another alt text](/another-page/another-image1.png) Here is some **bold text**. ![another alt text](/another-page/sub/another-image2.png)", - }, - ]; - - const expectedDocuments = [ - { - metadata: { sourceURL: "https://example.com/page/subpage" }, - content: - "This is a test. ![alt text](https://example.com/image1.png) Here is a link: [Example](https://example.com). ![alt text](https://example.com/sub/image2.png)", - }, - { - metadata: { sourceURL: "https://example.com/another-page/subpage" }, - content: - "Another test. ![another alt text](https://example.com/another-page/another-image1.png) Here is some **bold text**. ![another alt text](https://example.com/another-page/sub/another-image2.png)", - }, - ]; - - const result = - webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents); - expect(result).toEqual(expectedDocuments); - }); - }); -}); diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 551c8d8..c2146be 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -6,6 +6,7 @@ import { WebCrawler } from "./crawler"; import { getValue, setValue } from "../../services/redis"; import { getImageDescription } from "./utils/gptVision"; import { fetchAndProcessPdf } from "./utils/pdfProcessor"; +import { replaceImgPathsWithAbsolutePaths, replacePathsWithAbsolutePaths } from "./utils/replacePaths"; export class WebScraperDataProvider { @@ -19,6 +20,7 @@ export class WebScraperDataProvider { private concurrentRequests: number = 20; private generateImgAltText: boolean = false; private pageOptions?: PageOptions; + private replaceAllPathsWithAbsolutePaths?: boolean = false; authorize(): void { throw new Error("Method not implemented."); @@ -100,7 +102,13 @@ export class WebScraperDataProvider { let documents = await this.convertUrlsToDocuments(links, inProgress); documents = await this.getSitemapData(this.urls[0], documents); - documents = this.replaceImgPathsWithAbsolutePaths(documents); + + if (this.replaceAllPathsWithAbsolutePaths) { + documents = replacePathsWithAbsolutePaths(documents); + } else { + documents = replaceImgPathsWithAbsolutePaths(documents); + } + if (this.generateImgAltText) { documents = await this.generatesImgAltText(documents); } @@ -164,7 +172,13 @@ export class WebScraperDataProvider { this.urls.filter((link) => !link.endsWith(".pdf")), inProgress ); - documents = this.replaceImgPathsWithAbsolutePaths(documents); + + if (this.replaceAllPathsWithAbsolutePaths) { + documents = replacePathsWithAbsolutePaths(documents); + } else { + documents = replaceImgPathsWithAbsolutePaths(documents); + } + if (this.generateImgAltText) { documents = await this.generatesImgAltText(documents); } @@ -197,7 +211,13 @@ export class WebScraperDataProvider { ); documents = await this.getSitemapData(this.urls[0], documents); - documents = this.replaceImgPathsWithAbsolutePaths(documents); + + if (this.replaceAllPathsWithAbsolutePaths) { + documents = replacePathsWithAbsolutePaths(documents); + } else { + documents = replaceImgPathsWithAbsolutePaths(documents); + } + if (this.generateImgAltText) { documents = await this.generatesImgAltText(documents); } @@ -351,6 +371,7 @@ export class WebScraperDataProvider { this.generateImgAltText = options.crawlerOptions?.generateImgAltText ?? false; this.pageOptions = options.pageOptions ?? {onlyMainContent: false}; + this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false; //! @nicolas, for some reason this was being injected and breakign everything. Don't have time to find source of the issue so adding this check this.excludes = this.excludes.filter((item) => item !== ""); @@ -436,40 +457,4 @@ export class WebScraperDataProvider { return documents; }; - - replaceImgPathsWithAbsolutePaths = (documents: Document[]): Document[] => { - try { - documents.forEach((document) => { - const baseUrl = new URL(document.metadata.sourceURL).origin; - const images = - document.content.match( - /!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g - ) || []; - - images.forEach((image: string) => { - let imageUrl = image.match(/\(([^)]+)\)/)[1]; - let altText = image.match(/\[(.*?)\]/)[1]; - - if (!imageUrl.startsWith("data:image")) { - if (!imageUrl.startsWith("http")) { - if (imageUrl.startsWith("/")) { - imageUrl = imageUrl.substring(1); - } - imageUrl = new URL(imageUrl, baseUrl).toString(); - } - } - - document.content = document.content.replace( - image, - `![${altText}](${imageUrl})` - ); - }); - }); - - return documents; - } catch (error) { - console.error("Error replacing img paths with absolute paths", error); - return documents; - } - }; } diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts index 7d25aec..f14c8d4 100644 --- a/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts +++ b/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts @@ -1,40 +1,47 @@ import * as pdfProcessor from '../pdfProcessor'; describe('PDF Processing Module - Integration Test', () => { - it('should download and read a simple PDF file by URL', async () => { + it('should correctly process a simple PDF file without the LLAMAPARSE_API_KEY', async () => { + delete process.env.LLAMAPARSE_API_KEY; const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf'); - expect(pdfContent).toEqual("Dummy PDF file"); + expect(pdfContent.trim()).toEqual("Dummy PDF file"); }); - it('should download and read a complex PDF file by URL', async () => { - const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://arxiv.org/pdf/2307.06435.pdf'); +// We're hitting the LLAMAPARSE rate limit 🫠 +// it('should download and read a simple PDF file by URL', async () => { +// const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf'); +// expect(pdfContent).toEqual("Dummy PDF file"); +// }); - const expectedContent = 'A Comprehensive Overview of Large Language Models\n' + - ' a a,∗ b,∗ c,d,∗ e,f e,f g,i\n' + - ' Humza Naveed , Asad Ullah Khan , Shi Qiu , Muhammad Saqib , Saeed Anwar , Muhammad Usman , Naveed Akhtar ,\n' + - ' Nick Barnes h, Ajmal Mian i\n' + - ' aUniversity of Engineering and Technology (UET), Lahore, Pakistan\n' + - ' bThe Chinese University of Hong Kong (CUHK), HKSAR, China\n' + - ' cUniversity of Technology Sydney (UTS), Sydney, Australia\n' + - ' dCommonwealth Scientific and Industrial Research Organisation (CSIRO), Sydney, Australia\n' + - ' eKing Fahd University of Petroleum and Minerals (KFUPM), Dhahran, Saudi Arabia\n' + - ' fSDAIA-KFUPM Joint Research Center for Artificial Intelligence (JRCAI), Dhahran, Saudi Arabia\n' + - ' gThe University of Melbourne (UoM), Melbourne, Australia\n' + - ' hAustralian National University (ANU), Canberra, Australia\n' + - ' iThe University of Western Australia (UWA), Perth, Australia\n' + - ' Abstract\n' + - ' Large Language Models (LLMs) have recently demonstrated remarkable capabilities in natural language processing tasks and\n' + - ' beyond. This success of LLMs has led to a large influx of research contributions in this direction. These works encompass diverse\n' + - ' topics such as architectural innovations, better training strategies, context length improvements, fine-tuning, multi-modal LLMs,\n' + - ' robotics, datasets, benchmarking, efficiency, and more. With the rapid development of techniques and regular breakthroughs in\n' + - ' LLM research, it has become considerably challenging to perceive the bigger picture of the advances in this direction. Considering\n' + - ' the rapidly emerging plethora of literature on LLMs, it is imperative that the research community is able to benefit from a concise\n' + - ' yet comprehensive overview of the recent developments in this field. This article provides an overview of the existing literature\n' + - ' on a broad range of LLM-related concepts. Our self-contained comprehensive overview of LLMs discusses relevant background\n' + - ' concepts along with covering the advanced topics at the frontier of research in LLMs. This review article is intended to not only\n' + - ' provide a systematic survey but also a quick comprehensive reference for the researchers and practitioners to draw insights from\n' + - ' extensive informative summaries of the existing works to advance the LLM research.\n' - expect(pdfContent).toContain(expectedContent); - }, 60000); +// it('should download and read a complex PDF file by URL', async () => { +// const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://arxiv.org/pdf/2307.06435.pdf'); + +// const expectedContent = 'A Comprehensive Overview of Large Language Models\n' + +// ' a a,∗ b,∗ c,d,∗ e,f e,f g,i\n' + +// ' Humza Naveed , Asad Ullah Khan , Shi Qiu , Muhammad Saqib , Saeed Anwar , Muhammad Usman , Naveed Akhtar ,\n' + +// ' Nick Barnes h, Ajmal Mian i\n' + +// ' aUniversity of Engineering and Technology (UET), Lahore, Pakistan\n' + +// ' bThe Chinese University of Hong Kong (CUHK), HKSAR, China\n' + +// ' cUniversity of Technology Sydney (UTS), Sydney, Australia\n' + +// ' dCommonwealth Scientific and Industrial Research Organisation (CSIRO), Sydney, Australia\n' + +// ' eKing Fahd University of Petroleum and Minerals (KFUPM), Dhahran, Saudi Arabia\n' + +// ' fSDAIA-KFUPM Joint Research Center for Artificial Intelligence (JRCAI), Dhahran, Saudi Arabia\n' + +// ' gThe University of Melbourne (UoM), Melbourne, Australia\n' + +// ' hAustralian National University (ANU), Canberra, Australia\n' + +// ' iThe University of Western Australia (UWA), Perth, Australia\n' + +// ' Abstract\n' + +// ' Large Language Models (LLMs) have recently demonstrated remarkable capabilities in natural language processing tasks and\n' + +// ' beyond. This success of LLMs has led to a large influx of research contributions in this direction. These works encompass diverse\n' + +// ' topics such as architectural innovations, better training strategies, context length improvements, fine-tuning, multi-modal LLMs,\n' + +// ' robotics, datasets, benchmarking, efficiency, and more. With the rapid development of techniques and regular breakthroughs in\n' + +// ' LLM research, it has become considerably challenging to perceive the bigger picture of the advances in this direction. Considering\n' + +// ' the rapidly emerging plethora of literature on LLMs, it is imperative that the research community is able to benefit from a concise\n' + +// ' yet comprehensive overview of the recent developments in this field. This article provides an overview of the existing literature\n' + +// ' on a broad range of LLM-related concepts. Our self-contained comprehensive overview of LLMs discusses relevant background\n' + +// ' concepts along with covering the advanced topics at the frontier of research in LLMs. This review article is intended to not only\n' + +// ' provide a systematic survey but also a quick comprehensive reference for the researchers and practitioners to draw insights from\n' + +// ' extensive informative summaries of the existing works to advance the LLM research.\n' +// expect(pdfContent).toContain(expectedContent); +// }, 60000); }); \ No newline at end of file diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts new file mode 100644 index 0000000..aae567c --- /dev/null +++ b/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts @@ -0,0 +1,114 @@ +import { Document } from "../../../../lib/entities"; +import { replacePathsWithAbsolutePaths, replaceImgPathsWithAbsolutePaths } from "../replacePaths"; + +describe('replacePaths', () => { + describe('replacePathsWithAbsolutePaths', () => { + it('should replace relative paths with absolute paths', () => { + const documents: Document[] = [{ + metadata: { sourceURL: 'https://example.com' }, + content: 'This is a [link](/path/to/resource) and an image ![alt text](/path/to/image.jpg).' + }]; + + const expectedDocuments: Document[] = [{ + metadata: { sourceURL: 'https://example.com' }, + content: 'This is a [link](https://example.com/path/to/resource) and an image ![alt text](https://example.com/path/to/image.jpg).' + }]; + + const result = replacePathsWithAbsolutePaths(documents); + expect(result).toEqual(expectedDocuments); + }); + + it('should not alter absolute URLs', () => { + const documents: Document[] = [{ + metadata: { sourceURL: 'https://example.com' }, + content: 'This is an [external link](https://external.com/path) and an image ![alt text](https://example.com/path/to/image.jpg).' + }]; + + const result = replacePathsWithAbsolutePaths(documents); + expect(result).toEqual(documents); // Expect no change + }); + + it('should not alter data URLs for images', () => { + const documents: Document[] = [{ + metadata: { sourceURL: 'https://example.com' }, + content: 'This is an image: ![alt text]().' + }]; + + const result = replacePathsWithAbsolutePaths(documents); + expect(result).toEqual(documents); // Expect no change + }); + + it('should handle multiple links and images correctly', () => { + const documents: Document[] = [{ + metadata: { sourceURL: 'https://example.com' }, + content: 'Here are two links: [link1](/path1) and [link2](/path2), and two images: ![img1](/img1.jpg) ![img2](/img2.jpg).' + }]; + + const expectedDocuments: Document[] = [{ + metadata: { sourceURL: 'https://example.com' }, + content: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2), and two images: ![img1](https://example.com/img1.jpg) ![img2](https://example.com/img2.jpg).' + }]; + + const result = replacePathsWithAbsolutePaths(documents); + expect(result).toEqual(expectedDocuments); + }); + + it('should correctly handle a mix of absolute and relative paths', () => { + const documents: Document[] = [{ + metadata: { sourceURL: 'https://example.com' }, + content: 'Mixed paths: [relative](/path), [absolute](https://example.com/path), and [data image]().' + }]; + + const expectedDocuments: Document[] = [{ + metadata: { sourceURL: 'https://example.com' }, + content: 'Mixed paths: [relative](https://example.com/path), [absolute](https://example.com/path), and [data image]().' + }]; + + const result = replacePathsWithAbsolutePaths(documents); + expect(result).toEqual(expectedDocuments); + }); + + }); + + describe('replaceImgPathsWithAbsolutePaths', () => { + it('should replace relative image paths with absolute paths', () => { + const documents: Document[] = [{ + metadata: { sourceURL: 'https://example.com' }, + content: 'Here is an image: ![alt text](/path/to/image.jpg).' + }]; + + const expectedDocuments: Document[] = [{ + metadata: { sourceURL: 'https://example.com' }, + content: 'Here is an image: ![alt text](https://example.com/path/to/image.jpg).' + }]; + + const result = replaceImgPathsWithAbsolutePaths(documents); + expect(result).toEqual(expectedDocuments); + }); + + it('should not alter data:image URLs', () => { + const documents: Document[] = [{ + metadata: { sourceURL: 'https://example.com' }, + content: 'An image with a data URL: ![alt text]().' + }]; + + const result = replaceImgPathsWithAbsolutePaths(documents); + expect(result).toEqual(documents); // Expect no change + }); + + it('should handle multiple images with a mix of data and relative URLs', () => { + const documents: Document[] = [{ + metadata: { sourceURL: 'https://example.com' }, + content: 'Multiple images: ![img1](/img1.jpg) ![img2]() ![img3](/img3.jpg).' + }]; + + const expectedDocuments: Document[] = [{ + metadata: { sourceURL: 'https://example.com' }, + content: 'Multiple images: ![img1](https://example.com/img1.jpg) ![img2]() ![img3](https://example.com/img3.jpg).' + }]; + + const result = replaceImgPathsWithAbsolutePaths(documents); + expect(result).toEqual(expectedDocuments); + }); + }); +}); \ No newline at end of file diff --git a/apps/api/src/scraper/WebScraper/utils/replacePaths.ts b/apps/api/src/scraper/WebScraper/utils/replacePaths.ts new file mode 100644 index 0000000..d652611 --- /dev/null +++ b/apps/api/src/scraper/WebScraper/utils/replacePaths.ts @@ -0,0 +1,80 @@ +import { Document } from "../../../lib/entities"; + +export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[] => { + try { + documents.forEach((document) => { + const baseUrl = new URL(document.metadata.sourceURL).origin; + const paths = + document.content.match( + /(!?\[.*?\])\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)|href="([^"]+)"/g + ) || []; + + paths.forEach((path: string) => { + const isImage = path.startsWith("!"); + let matchedUrl = path.match(/\(([^)]+)\)/) || path.match(/href="([^"]+)"/); + let url = matchedUrl[1]; + + if (!url.startsWith("data:") && !url.startsWith("http")) { + if (url.startsWith("/")) { + url = url.substring(1); + } + url = new URL(url, baseUrl).toString(); + } + + const markdownLinkOrImageText = path.match(/(!?\[.*?\])/)[0]; + if (isImage) { + document.content = document.content.replace( + path, + `${markdownLinkOrImageText}(${url})` + ); + } else { + document.content = document.content.replace( + path, + `${markdownLinkOrImageText}(${url})` + ); + } + }); + }); + + return documents; + } catch (error) { + console.error("Error replacing paths with absolute paths", error); + return documents; + } +}; + +export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Document[] => { + try { + documents.forEach((document) => { + const baseUrl = new URL(document.metadata.sourceURL).origin; + const images = + document.content.match( + /!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g + ) || []; + + images.forEach((image: string) => { + let imageUrl = image.match(/\(([^)]+)\)/)[1]; + let altText = image.match(/\[(.*?)\]/)[1]; + + if (!imageUrl.startsWith("data:image")) { + if (!imageUrl.startsWith("http")) { + if (imageUrl.startsWith("/")) { + imageUrl = imageUrl.substring(1); + } + imageUrl = new URL(imageUrl, baseUrl).toString(); + } + } + + document.content = document.content.replace( + image, + `![${altText}](${imageUrl})` + ); + }); + }); + + return documents; + } catch (error) { + console.error("Error replacing img paths with absolute paths", error); + return documents; + } +}; \ No newline at end of file diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index f3a971a..c9c5f73 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -3,7 +3,6 @@ import { getWebScraperQueue } from "./queue-service"; import "dotenv/config"; import { logtail } from "./logtail"; import { startWebScraperPipeline } from "../main/runWebScraper"; -import { WebScraperDataProvider } from "../scraper/WebScraper"; import { callWebhook } from "./webhook"; getWebScraperQueue().process( From 3ddff62a56d8201fb907b09dbb5e41b57f458623 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 19 Apr 2024 14:49:35 -0300 Subject: [PATCH 050/102] adding better doc and types for js-sdk --- apps/js-sdk/firecrawl/src/index.ts | 108 +++++++++++++++++++++++++++-- 1 file changed, 101 insertions(+), 7 deletions(-) diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 3d105e7..be55066 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -2,17 +2,60 @@ import axios, { AxiosResponse, AxiosRequestHeaders } from 'axios'; import dotenv from 'dotenv'; dotenv.config(); -interface FirecrawlAppConfig { +/** + * Configuration interface for FirecrawlApp. + */ +export interface FirecrawlAppConfig { apiKey?: string | null; } -interface Params { +/** + * Generic parameter interface. + */ +export interface Params { [key: string]: any; } +/** + * Response interface for scraping operations. + */ +export interface ScrapeResponse { + success: boolean; + data?: any; + error?: string; +} + +/** + * Response interface for crawling operations. + */ +export interface CrawlResponse { + success: boolean; + jobId?: string; + data?: any; + error?: string; +} + +/** + * Response interface for job status checks. + */ +export interface JobStatusResponse { + success: boolean; + status: string; + jobId?: string; + data?: any; + error?: string; +} + +/** + * Main class for interacting with the Firecrawl API. + */ export default class FirecrawlApp { private apiKey: string; + /** + * Initializes a new instance of the FirecrawlApp class. + * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance. + */ constructor({ apiKey = null }: FirecrawlAppConfig) { this.apiKey = apiKey || process.env.FIRECRAWL_API_KEY || ''; if (!this.apiKey) { @@ -20,7 +63,13 @@ export default class FirecrawlApp { } } - async scrapeUrl(url: string, params: Params | null = null): Promise { + /** + * Scrapes a URL using the Firecrawl API. + * @param {string} url - The URL to scrape. + * @param {Params | null} params - Additional parameters for the scrape request. + * @returns {Promise} The response from the scrape operation. + */ + async scrapeUrl(url: string, params: Params | null = null): Promise { const headers: AxiosRequestHeaders = { 'Content-Type': 'application/json', 'Authorization': `Bearer ${this.apiKey}`, @@ -34,7 +83,7 @@ export default class FirecrawlApp { if (response.status === 200) { const responseData = response.data; if (responseData.success) { - return responseData.data; + return responseData; } else { throw new Error(`Failed to scrape URL. Error: ${responseData.error}`); } @@ -44,9 +93,18 @@ export default class FirecrawlApp { } catch (error: any) { throw new Error(error.message); } + return { success: false, error: 'Internal server error.' }; } - async crawlUrl(url: string, params: Params | null = null, waitUntilDone: boolean = true, timeout: number = 2): Promise { + /** + * Initiates a crawl job for a URL using the Firecrawl API. + * @param {string} url - The URL to crawl. + * @param {Params | null} params - Additional parameters for the crawl request. + * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete. + * @param {number} timeout - Timeout in seconds for job status checks. + * @returns {Promise} The response from the crawl operation. + */ + async crawlUrl(url: string, params: Params | null = null, waitUntilDone: boolean = true, timeout: number = 2): Promise { const headers = this.prepareHeaders(); let jsonData: Params = { url }; if (params) { @@ -59,7 +117,7 @@ export default class FirecrawlApp { if (waitUntilDone) { return this.monitorJobStatus(jobId, headers, timeout); } else { - return { jobId }; + return { success: true, jobId }; } } else { this.handleError(response, 'start crawl job'); @@ -68,9 +126,15 @@ export default class FirecrawlApp { console.log(error) throw new Error(error.message); } + return { success: false, error: 'Internal server error.' }; } - async checkCrawlStatus(jobId: string): Promise { + /** + * Checks the status of a crawl job using the Firecrawl API. + * @param {string} jobId - The job ID of the crawl operation. + * @returns {Promise} The response containing the job status. + */ + async checkCrawlStatus(jobId: string): Promise { const headers: AxiosRequestHeaders = this.prepareHeaders(); try { const response: AxiosResponse = await this.getRequest(`https://api.firecrawl.dev/v0/crawl/status/${jobId}`, headers); @@ -82,8 +146,13 @@ export default class FirecrawlApp { } catch (error: any) { throw new Error(error.message); } + return { success: false, status: 'unknown', error: 'Internal server error.' }; } + /** + * Prepares the headers for an API request. + * @returns {AxiosRequestHeaders} The prepared headers. + */ prepareHeaders(): AxiosRequestHeaders { return { 'Content-Type': 'application/json', @@ -91,14 +160,34 @@ export default class FirecrawlApp { } as AxiosRequestHeaders; } + /** + * Sends a POST request to the specified URL. + * @param {string} url - The URL to send the request to. + * @param {Params} data - The data to send in the request. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @returns {Promise} The response from the POST request. + */ postRequest(url: string, data: Params, headers: AxiosRequestHeaders): Promise { return axios.post(url, data, { headers }); } + /** + * Sends a GET request to the specified URL. + * @param {string} url - The URL to send the request to. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @returns {Promise} The response from the GET request. + */ getRequest(url: string, headers: AxiosRequestHeaders): Promise { return axios.get(url, { headers }); } + /** + * Monitors the status of a crawl job until completion or failure. + * @param {string} jobId - The job ID of the crawl operation. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @param {number} timeout - Timeout in seconds for job status checks. + * @returns {Promise} The final job status or data. + */ async monitorJobStatus(jobId: string, headers: AxiosRequestHeaders, timeout: number): Promise { while (true) { const statusResponse: AxiosResponse = await this.getRequest(`https://api.firecrawl.dev/v0/crawl/status/${jobId}`, headers); @@ -124,6 +213,11 @@ export default class FirecrawlApp { } } + /** + * Handles errors from API responses. + * @param {AxiosResponse} response - The response from the API. + * @param {string} action - The action being performed when the error occurred. + */ handleError(response: AxiosResponse, action: string): void { if ([402, 409, 500].includes(response.status)) { const errorMessage: string = response.data.error || 'Unknown error occurred'; From 384fb1db1868bf2e3e2bf9c5c1e105216faa5ae8 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 19 Apr 2024 15:27:54 -0300 Subject: [PATCH 051/102] updating version --- apps/js-sdk/firecrawl/build/index.js | 62 +++++++++++++++++++++++++++- apps/js-sdk/firecrawl/package.json | 2 +- 2 files changed, 61 insertions(+), 3 deletions(-) diff --git a/apps/js-sdk/firecrawl/build/index.js b/apps/js-sdk/firecrawl/build/index.js index be4223f..25ae999 100644 --- a/apps/js-sdk/firecrawl/build/index.js +++ b/apps/js-sdk/firecrawl/build/index.js @@ -10,13 +10,26 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge import axios from 'axios'; import dotenv from 'dotenv'; dotenv.config(); +/** + * Main class for interacting with the Firecrawl API. + */ export default class FirecrawlApp { + /** + * Initializes a new instance of the FirecrawlApp class. + * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance. + */ constructor({ apiKey = null }) { this.apiKey = apiKey || process.env.FIRECRAWL_API_KEY || ''; if (!this.apiKey) { throw new Error('No API key provided'); } } + /** + * Scrapes a URL using the Firecrawl API. + * @param {string} url - The URL to scrape. + * @param {Params | null} params - Additional parameters for the scrape request. + * @returns {Promise} The response from the scrape operation. + */ scrapeUrl(url_1) { return __awaiter(this, arguments, void 0, function* (url, params = null) { const headers = { @@ -32,7 +45,7 @@ export default class FirecrawlApp { if (response.status === 200) { const responseData = response.data; if (responseData.success) { - return responseData.data; + return responseData; } else { throw new Error(`Failed to scrape URL. Error: ${responseData.error}`); @@ -45,8 +58,17 @@ export default class FirecrawlApp { catch (error) { throw new Error(error.message); } + return { success: false, error: 'Internal server error.' }; }); } + /** + * Initiates a crawl job for a URL using the Firecrawl API. + * @param {string} url - The URL to crawl. + * @param {Params | null} params - Additional parameters for the crawl request. + * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete. + * @param {number} timeout - Timeout in seconds for job status checks. + * @returns {Promise} The response from the crawl operation. + */ crawlUrl(url_1) { return __awaiter(this, arguments, void 0, function* (url, params = null, waitUntilDone = true, timeout = 2) { const headers = this.prepareHeaders(); @@ -62,7 +84,7 @@ export default class FirecrawlApp { return this.monitorJobStatus(jobId, headers, timeout); } else { - return { jobId }; + return { success: true, jobId }; } } else { @@ -73,8 +95,14 @@ export default class FirecrawlApp { console.log(error); throw new Error(error.message); } + return { success: false, error: 'Internal server error.' }; }); } + /** + * Checks the status of a crawl job using the Firecrawl API. + * @param {string} jobId - The job ID of the crawl operation. + * @returns {Promise} The response containing the job status. + */ checkCrawlStatus(jobId) { return __awaiter(this, void 0, void 0, function* () { const headers = this.prepareHeaders(); @@ -90,20 +118,45 @@ export default class FirecrawlApp { catch (error) { throw new Error(error.message); } + return { success: false, status: 'unknown', error: 'Internal server error.' }; }); } + /** + * Prepares the headers for an API request. + * @returns {AxiosRequestHeaders} The prepared headers. + */ prepareHeaders() { return { 'Content-Type': 'application/json', 'Authorization': `Bearer ${this.apiKey}`, }; } + /** + * Sends a POST request to the specified URL. + * @param {string} url - The URL to send the request to. + * @param {Params} data - The data to send in the request. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @returns {Promise} The response from the POST request. + */ postRequest(url, data, headers) { return axios.post(url, data, { headers }); } + /** + * Sends a GET request to the specified URL. + * @param {string} url - The URL to send the request to. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @returns {Promise} The response from the GET request. + */ getRequest(url, headers) { return axios.get(url, { headers }); } + /** + * Monitors the status of a crawl job until completion or failure. + * @param {string} jobId - The job ID of the crawl operation. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @param {number} timeout - Timeout in seconds for job status checks. + * @returns {Promise} The final job status or data. + */ monitorJobStatus(jobId, headers, timeout) { return __awaiter(this, void 0, void 0, function* () { while (true) { @@ -134,6 +187,11 @@ export default class FirecrawlApp { } }); } + /** + * Handles errors from API responses. + * @param {AxiosResponse} response - The response from the API. + * @param {string} action - The action being performed when the error occurred. + */ handleError(response, action) { if ([402, 409, 500].includes(response.status)) { const errorMessage = response.data.error || 'Unknown error occurred'; diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 89e6d3f..58aa5ac 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "0.0.9", + "version": "0.0.10", "description": "JavaScript SDK for Firecrawl API", "main": "build/index.js", "type": "module", From a144e13e30a47dfb8fed8f9412247e1a8e5ba7b6 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 19 Apr 2024 12:23:13 -0700 Subject: [PATCH 052/102] Update rate-limiter.ts --- apps/api/src/services/rate-limiter.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts index 8e2fe3b..5812f5d 100644 --- a/apps/api/src/services/rate-limiter.ts +++ b/apps/api/src/services/rate-limiter.ts @@ -6,7 +6,7 @@ const MAX_CRAWLS_PER_MINUTE_STARTER = 2; const MAX_CRAWLS_PER_MINUTE_STANDARD = 4; const MAX_CRAWLS_PER_MINUTE_SCALE = 20; -const MAX_REQUESTS_PER_MINUTE_ACCOUNT = 40; +const MAX_REQUESTS_PER_MINUTE_ACCOUNT = 20; From 37ef8a015c71c9fbf28c29cada97a8f211d740c7 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 19 Apr 2024 17:55:35 -0300 Subject: [PATCH 053/102] fixing scrape preview test --- apps/api/src/__tests__/e2e/index.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/__tests__/e2e/index.test.ts b/apps/api/src/__tests__/e2e/index.test.ts index 0c36511..554453b 100644 --- a/apps/api/src/__tests__/e2e/index.test.ts +++ b/apps/api/src/__tests__/e2e/index.test.ts @@ -43,7 +43,7 @@ describe('E2E Tests for API Routes', () => { .set('Content-Type', 'application/json') .send({ url: 'https://firecrawl.dev' }); expect(response.statusCode).toBe(200); - }); + }, 10000); // 10 seconds timeout it('should return a successful response with a valid API key', async () => { const response = await request(TEST_URL) From 890bde686f5bb7e94137a2a5b5aa51f1d999994d Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 19 Apr 2024 19:10:05 -0300 Subject: [PATCH 054/102] added type declarations --- apps/js-sdk/firecrawl/package.json | 3 +- apps/js-sdk/firecrawl/tsconfig.json | 8 +- apps/js-sdk/firecrawl/types/index.d.ts | 107 +++++++++++++++++++++++++ 3 files changed, 114 insertions(+), 4 deletions(-) create mode 100644 apps/js-sdk/firecrawl/types/index.d.ts diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 58aa5ac..811f87f 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,8 +1,9 @@ { "name": "@mendable/firecrawl-js", - "version": "0.0.10", + "version": "0.0.11", "description": "JavaScript SDK for Firecrawl API", "main": "build/index.js", + "types": "types/index.d.ts", "type": "module", "scripts": { "test": "echo \"Error: no test specified\" && exit 1" diff --git a/apps/js-sdk/firecrawl/tsconfig.json b/apps/js-sdk/firecrawl/tsconfig.json index 5bca86d..d7764a4 100644 --- a/apps/js-sdk/firecrawl/tsconfig.json +++ b/apps/js-sdk/firecrawl/tsconfig.json @@ -49,7 +49,7 @@ // "maxNodeModuleJsDepth": 1, /* Specify the maximum folder depth used for checking JavaScript files from 'node_modules'. Only applicable with 'allowJs'. */ /* Emit */ - // "declaration": true, /* Generate .d.ts files from TypeScript and JavaScript files in your project. */ + "declaration": true, /* Generate .d.ts files from TypeScript and JavaScript files in your project. */ // "declarationMap": true, /* Create sourcemaps for d.ts files. */ // "emitDeclarationOnly": true, /* Only output d.ts files and not JavaScript files. */ // "sourceMap": true, /* Create source map files for emitted JavaScript files. */ @@ -70,7 +70,7 @@ // "noEmitHelpers": true, /* Disable generating custom helper functions like '__extends' in compiled output. */ // "noEmitOnError": true, /* Disable emitting files if any type checking errors are reported. */ // "preserveConstEnums": true, /* Disable erasing 'const enum' declarations in generated code. */ - // "declarationDir": "./", /* Specify the output directory for generated declaration files. */ + "declarationDir": "./types", /* Specify the output directory for generated declaration files. */ // "preserveValueImports": true, /* Preserve unused imported values in the JavaScript output that would otherwise be removed. */ /* Interop Constraints */ @@ -105,5 +105,7 @@ /* Completeness */ // "skipDefaultLibCheck": true, /* Skip type checking .d.ts files that are included with TypeScript. */ "skipLibCheck": true /* Skip type checking all .d.ts files. */ - } + }, + "include": ["src/**/*"], + "exclude": ["node_modules", "dist", "**/__tests__/*"] } diff --git a/apps/js-sdk/firecrawl/types/index.d.ts b/apps/js-sdk/firecrawl/types/index.d.ts new file mode 100644 index 0000000..a9d04ba --- /dev/null +++ b/apps/js-sdk/firecrawl/types/index.d.ts @@ -0,0 +1,107 @@ +import { AxiosResponse, AxiosRequestHeaders } from 'axios'; +/** + * Configuration interface for FirecrawlApp. + */ +export interface FirecrawlAppConfig { + apiKey?: string | null; +} +/** + * Generic parameter interface. + */ +export interface Params { + [key: string]: any; +} +/** + * Response interface for scraping operations. + */ +export interface ScrapeResponse { + success: boolean; + data?: any; + error?: string; +} +/** + * Response interface for crawling operations. + */ +export interface CrawlResponse { + success: boolean; + jobId?: string; + data?: any; + error?: string; +} +/** + * Response interface for job status checks. + */ +export interface JobStatusResponse { + success: boolean; + status: string; + jobId?: string; + data?: any; + error?: string; +} +/** + * Main class for interacting with the Firecrawl API. + */ +export default class FirecrawlApp { + private apiKey; + /** + * Initializes a new instance of the FirecrawlApp class. + * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance. + */ + constructor({ apiKey }: FirecrawlAppConfig); + /** + * Scrapes a URL using the Firecrawl API. + * @param {string} url - The URL to scrape. + * @param {Params | null} params - Additional parameters for the scrape request. + * @returns {Promise} The response from the scrape operation. + */ + scrapeUrl(url: string, params?: Params | null): Promise; + /** + * Initiates a crawl job for a URL using the Firecrawl API. + * @param {string} url - The URL to crawl. + * @param {Params | null} params - Additional parameters for the crawl request. + * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete. + * @param {number} timeout - Timeout in seconds for job status checks. + * @returns {Promise} The response from the crawl operation. + */ + crawlUrl(url: string, params?: Params | null, waitUntilDone?: boolean, timeout?: number): Promise; + /** + * Checks the status of a crawl job using the Firecrawl API. + * @param {string} jobId - The job ID of the crawl operation. + * @returns {Promise} The response containing the job status. + */ + checkCrawlStatus(jobId: string): Promise; + /** + * Prepares the headers for an API request. + * @returns {AxiosRequestHeaders} The prepared headers. + */ + prepareHeaders(): AxiosRequestHeaders; + /** + * Sends a POST request to the specified URL. + * @param {string} url - The URL to send the request to. + * @param {Params} data - The data to send in the request. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @returns {Promise} The response from the POST request. + */ + postRequest(url: string, data: Params, headers: AxiosRequestHeaders): Promise; + /** + * Sends a GET request to the specified URL. + * @param {string} url - The URL to send the request to. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @returns {Promise} The response from the GET request. + */ + getRequest(url: string, headers: AxiosRequestHeaders): Promise; + /** + * Monitors the status of a crawl job until completion or failure. + * @param {string} jobId - The job ID of the crawl operation. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @param {number} timeout - Timeout in seconds for job status checks. + * @returns {Promise} The final job status or data. + */ + monitorJobStatus(jobId: string, headers: AxiosRequestHeaders, timeout: number): Promise; + /** + * Handles errors from API responses. + * @param {AxiosResponse} response - The response from the API. + * @param {string} action - The action being performed when the error occurred. + */ + handleError(response: AxiosResponse, action: string): void; +} From 389ac90f51339ad8da2396f170ebbdfcd6914fb7 Mon Sep 17 00:00:00 2001 From: Caleb Peffer <44934913+calebpeffer@users.noreply.github.com> Date: Sat, 20 Apr 2024 09:19:09 -0700 Subject: [PATCH 055/102] Caleb: fixing some documentation and rebuilding the server --- CONTRIBUTING.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e11dae7..224eb57 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,4 +1,8 @@ # Contributing -We love contributions! Please read our [contributing guide](CONTRIBUTING.md) before submitting a pull request. +We love contributions! Our contribution guide will be coming soon! + + + + From 6aa3cc3ce85c0d71fe6e0ae0e6f92fb007f04431 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 20 Apr 2024 13:53:11 -0700 Subject: [PATCH 056/102] Nick: --- apps/api/src/main/runWebScraper.ts | 12 ++++++--- apps/api/src/services/logging/log_job.ts | 33 ++++++++++++++++++++++++ apps/api/src/services/queue-worker.ts | 19 +++++++++++++- apps/api/src/types.ts | 14 ++++++++++ 4 files changed, 73 insertions(+), 5 deletions(-) create mode 100644 apps/api/src/services/logging/log_job.ts diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index c43b1b3..0f562a0 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -3,7 +3,7 @@ import { CrawlResult, WebScraperOptions } from "../types"; import { WebScraperDataProvider } from "../scraper/WebScraper"; import { Progress } from "../lib/entities"; import { billTeam } from "../services/billing/credit_billing"; - +import { Document } from "../lib/entities"; export async function startWebScraperPipeline({ job, }: { @@ -24,7 +24,7 @@ export async function startWebScraperPipeline({ job.moveToFailed(error); }, team_id: job.data.team_id, - })) as { success: boolean; message: string; docs: CrawlResult[] }; + })) as { success: boolean; message: string; docs: Document[] }; } export async function runWebScraper({ url, @@ -76,12 +76,12 @@ export async function runWebScraper({ // remove docs with empty content const filteredDocs = docs.filter((doc) => doc.content.trim().length > 0); - onSuccess(filteredDocs); const { success, credit_usage } = await billTeam( team_id, filteredDocs.length ); + if (!success) { // throw new Error("Failed to bill team, no subscription was found"); return { @@ -91,7 +91,11 @@ export async function runWebScraper({ }; } - return { success: true, message: "", docs: filteredDocs as CrawlResult[] }; + // This is where the returnvalue from the job is set + onSuccess(filteredDocs); + + // this return doesn't matter too much for the job completion result + return { success: true, message: "", docs: filteredDocs }; } catch (error) { console.error("Error running web scraper", error); onError(error); diff --git a/apps/api/src/services/logging/log_job.ts b/apps/api/src/services/logging/log_job.ts new file mode 100644 index 0000000..cb7e648 --- /dev/null +++ b/apps/api/src/services/logging/log_job.ts @@ -0,0 +1,33 @@ +import { supabase_service } from "../supabase"; +import { FirecrawlJob } from "../../types"; +import "dotenv/config"; + +export async function logJob(job: FirecrawlJob) { + try { + // Only log jobs in production + if (process.env.ENV !== "production") { + return; + } + const { data, error } = await supabase_service + .from("firecrawl_jobs") + .insert([ + { + success: job.success, + message: job.message, + num_docs: job.num_docs, + docs: job.docs, + time_taken: job.time_taken, + team_id: job.team_id, + mode: job.mode, + url: job.url, + crawler_options: job.crawlerOptions, + page_options: job.pageOptions, + }, + ]); + if (error) { + console.error("Error logging job:\n", error); + } + } catch (error) { + console.error("Error logging job:\n", error); + } +} diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index c9c5f73..d436401 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -4,6 +4,7 @@ import "dotenv/config"; import { logtail } from "./logtail"; import { startWebScraperPipeline } from "../main/runWebScraper"; import { callWebhook } from "./webhook"; +import { logJob } from "./logging/log_job"; getWebScraperQueue().process( Math.floor(Number(process.env.NUM_WORKERS_PER_QUEUE ?? 8)), @@ -15,8 +16,11 @@ getWebScraperQueue().process( current_step: "SCRAPING", current_url: "", }); + const start = Date.now(); const { success, message, docs } = await startWebScraperPipeline({ job }); - + const end = Date.now(); + const timeTakenInSeconds = (end - start) / 1000; + const data = { success: success, result: { @@ -29,6 +33,19 @@ getWebScraperQueue().process( }; await callWebhook(job.data.team_id, data); + + await logJob({ + success: success, + message: message, + num_docs: docs.length, + docs: docs, + time_taken: timeTakenInSeconds, + team_id: job.data.team_id, + mode: "crawl", + url: job.data.url, + crawlerOptions: job.data.crawlerOptions, + pageOptions: job.data.pageOptions, + }); done(null, data); } catch (error) { if (error instanceof CustomError) { diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index 2123e0c..7803d93 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -25,4 +25,18 @@ export interface WebScraperOptions { } +export interface FirecrawlJob { + success: boolean; + message: string; + num_docs: number; + docs: any[]; + time_taken: number; + team_id: string; + mode: string; + url: string; + crawlerOptions?: any; + pageOptions?: any; +} + + From 408c7a479f62dd0a50c72481c524a6a18d95432f Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 20 Apr 2024 14:02:22 -0700 Subject: [PATCH 057/102] Nick: rate limit fixes --- apps/api/src/index.ts | 16 +++++++++------- apps/api/src/services/rate-limiter.ts | 19 +++++++++++++++++-- apps/api/src/types.ts | 8 ++++++++ 3 files changed, 34 insertions(+), 9 deletions(-) diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 98be945..fcd26b7 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -9,6 +9,7 @@ import { WebScraperDataProvider } from "./scraper/WebScraper"; import { billTeam, checkTeamCredits } from "./services/billing/credit_billing"; import { getRateLimiter, redisClient } from "./services/rate-limiter"; import { parseApi } from "./lib/parseApi"; +import { RateLimiterMode } from "./types"; const { createBullBoard } = require("@bull-board/api"); const { BullAdapter } = require("@bull-board/api/bullAdapter"); @@ -46,7 +47,7 @@ app.get("/test", async (req, res) => { res.send("Hello, world!"); }); -async function authenticateUser(req, res, mode?: string): Promise<{ success: boolean, team_id?: string, error?: string, status?: number }> { +async function authenticateUser(req, res, mode?: RateLimiterMode): Promise<{ success: boolean, team_id?: string, error?: string, status?: number }> { const authHeader = req.headers.authorization; if (!authHeader) { return { success: false, error: "Unauthorized", status: 401 }; @@ -56,12 +57,13 @@ async function authenticateUser(req, res, mode?: string): Promise<{ success: boo return { success: false, error: "Unauthorized: Token missing", status: 401 }; } + + try { const incomingIP = (req.headers["x-forwarded-for"] || req.socket.remoteAddress) as string; const iptoken = incomingIP + token; - await getRateLimiter( - token === "this_is_just_a_preview_token" ? true : false + await getRateLimiter((token === "this_is_just_a_preview_token") ? RateLimiterMode.Preview : mode ).consume(iptoken); } catch (rateLimiterRes) { console.error(rateLimiterRes); @@ -88,7 +90,7 @@ async function authenticateUser(req, res, mode?: string): Promise<{ success: boo app.post("/v0/scrape", async (req, res) => { try { // make sure to authenticate user first, Bearer - const { success, team_id, error, status } = await authenticateUser(req, res, "scrape"); + const { success, team_id, error, status } = await authenticateUser(req, res, RateLimiterMode.Scrape); if (!success) { return res.status(status).json({ error }); } @@ -164,7 +166,7 @@ app.post("/v0/scrape", async (req, res) => { app.post("/v0/crawl", async (req, res) => { try { - const { success, team_id, error, status } = await authenticateUser(req, res, "crawl"); + const { success, team_id, error, status } = await authenticateUser(req, res, RateLimiterMode.Crawl); if (!success) { return res.status(status).json({ error }); } @@ -230,7 +232,7 @@ app.post("/v0/crawl", async (req, res) => { }); app.post("/v0/crawlWebsitePreview", async (req, res) => { try { - const { success, team_id, error, status } = await authenticateUser(req, res, "scrape"); + const { success, team_id, error, status } = await authenticateUser(req, res, RateLimiterMode.Crawl); if (!success) { return res.status(status).json({ error }); } @@ -259,7 +261,7 @@ app.post("/v0/crawlWebsitePreview", async (req, res) => { app.get("/v0/crawl/status/:jobId", async (req, res) => { try { - const { success, team_id, error, status } = await authenticateUser(req, res, "scrape"); + const { success, team_id, error, status } = await authenticateUser(req, res, RateLimiterMode.CrawlStatus); if (!success) { return res.status(status).json({ error }); } diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts index 5812f5d..dcd05da 100644 --- a/apps/api/src/services/rate-limiter.ts +++ b/apps/api/src/services/rate-limiter.ts @@ -1,5 +1,6 @@ import { RateLimiterRedis } from "rate-limiter-flexible"; import * as redis from "redis"; +import { RateLimiterMode } from "../../src/types"; const MAX_REQUESTS_PER_MINUTE_PREVIEW = 5; const MAX_CRAWLS_PER_MINUTE_STARTER = 2; @@ -8,6 +9,9 @@ const MAX_CRAWLS_PER_MINUTE_SCALE = 20; const MAX_REQUESTS_PER_MINUTE_ACCOUNT = 20; +const MAX_REQUESTS_PER_MINUTE_CRAWL_STATUS = 120; + + export const redisClient = redis.createClient({ @@ -29,6 +33,13 @@ export const serverRateLimiter = new RateLimiterRedis({ duration: 60, // Duration in seconds }); +export const crawlStatusRateLimiter = new RateLimiterRedis({ + storeClient: redisClient, + keyPrefix: "middleware", + points: MAX_REQUESTS_PER_MINUTE_CRAWL_STATUS, + duration: 60, // Duration in seconds +}); + export function crawlRateLimit(plan: string){ if(plan === "standard"){ @@ -56,9 +67,13 @@ export function crawlRateLimit(plan: string){ } -export function getRateLimiter(preview: boolean){ - if(preview){ + + +export function getRateLimiter(mode: RateLimiterMode){ + if(mode === RateLimiterMode.Preview){ return previewRateLimiter; + }else if(mode === RateLimiterMode.CrawlStatus){ + return crawlStatusRateLimiter; }else{ return serverRateLimiter; } diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index 2123e0c..9442176 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -26,3 +26,11 @@ export interface WebScraperOptions { +export enum RateLimiterMode { + Crawl = "crawl", + CrawlStatus = "crawl-status", + Scrape = "scrape", + Preview = "preview", +} + + From 43c2e877e7a40add2a20bf86603bd7e27b668249 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 20 Apr 2024 14:05:01 -0700 Subject: [PATCH 058/102] Update index.ts --- apps/api/src/index.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index fcd26b7..271d96d 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -232,7 +232,7 @@ app.post("/v0/crawl", async (req, res) => { }); app.post("/v0/crawlWebsitePreview", async (req, res) => { try { - const { success, team_id, error, status } = await authenticateUser(req, res, RateLimiterMode.Crawl); + const { success, team_id, error, status } = await authenticateUser(req, res, RateLimiterMode.Preview); if (!success) { return res.status(status).json({ error }); } From 5b3c75b06e3756bfc09a469ee9f029582bbc16c7 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 20 Apr 2024 14:10:29 -0700 Subject: [PATCH 059/102] Nick: --- apps/api/src/index.ts | 2 +- apps/api/src/services/rate-limiter.ts | 13 +++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 271d96d..0fbd91e 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -70,7 +70,7 @@ async function authenticateUser(req, res, mode?: RateLimiterMode): Promise<{ suc return { success: false, error: "Rate limit exceeded. Too many requests, try again in 1 minute.", status: 429 }; } - if (token === "this_is_just_a_preview_token" && mode === "scrape") { + if (token === "this_is_just_a_preview_token" && (mode === RateLimiterMode.Scrape || mode === RateLimiterMode.Preview)) { return { success: true, team_id: "preview" }; } diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts index dcd05da..b1ee562 100644 --- a/apps/api/src/services/rate-limiter.ts +++ b/apps/api/src/services/rate-limiter.ts @@ -70,11 +70,12 @@ export function crawlRateLimit(plan: string){ export function getRateLimiter(mode: RateLimiterMode){ - if(mode === RateLimiterMode.Preview){ - return previewRateLimiter; - }else if(mode === RateLimiterMode.CrawlStatus){ - return crawlStatusRateLimiter; - }else{ - return serverRateLimiter; + switch(mode) { + case RateLimiterMode.Preview: + return previewRateLimiter; + case RateLimiterMode.CrawlStatus: + return crawlStatusRateLimiter; + default: + return serverRateLimiter; } } From 23b2190e5df0b7559a634b412b97a6a23152eeaa Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 20 Apr 2024 16:38:05 -0700 Subject: [PATCH 060/102] Nick: --- apps/api/jest.config.js | 3 + apps/api/src/controllers/auth.ts | 67 ++++++ apps/api/src/controllers/crawl-status.ts | 36 +++ apps/api/src/controllers/crawl.ts | 77 +++++++ apps/api/src/controllers/crawlPreview.ts | 37 ++++ apps/api/src/controllers/scrape.ts | 104 +++++++++ apps/api/src/controllers/status.ts | 25 +++ apps/api/src/index.ts | 270 +---------------------- apps/api/src/routes/v0.ts | 14 ++ 9 files changed, 369 insertions(+), 264 deletions(-) create mode 100644 apps/api/src/controllers/auth.ts create mode 100644 apps/api/src/controllers/crawl-status.ts create mode 100644 apps/api/src/controllers/crawl.ts create mode 100644 apps/api/src/controllers/crawlPreview.ts create mode 100644 apps/api/src/controllers/scrape.ts create mode 100644 apps/api/src/controllers/status.ts create mode 100644 apps/api/src/routes/v0.ts diff --git a/apps/api/jest.config.js b/apps/api/jest.config.js index c099257..2854452 100644 --- a/apps/api/jest.config.js +++ b/apps/api/jest.config.js @@ -2,4 +2,7 @@ module.exports = { preset: "ts-jest", testEnvironment: "node", setupFiles: ["./jest.setup.js"], + // ignore dist folder root dir + modulePathIgnorePatterns: ["/dist/"], + }; diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts new file mode 100644 index 0000000..76bacbe --- /dev/null +++ b/apps/api/src/controllers/auth.ts @@ -0,0 +1,67 @@ +import { parseApi } from "../../src/lib/parseApi"; +import { getRateLimiter } from "../../src/services/rate-limiter"; +import { RateLimiterMode } from "../../src/types"; +import { supabase_service } from "../../src/services/supabase"; + +export async function authenticateUser( + req, + res, + mode?: RateLimiterMode +): Promise<{ + success: boolean; + team_id?: string; + error?: string; + status?: number; +}> { + const authHeader = req.headers.authorization; + if (!authHeader) { + return { success: false, error: "Unauthorized", status: 401 }; + } + const token = authHeader.split(" ")[1]; // Extract the token from "Bearer " + if (!token) { + return { + success: false, + error: "Unauthorized: Token missing", + status: 401, + }; + } + + try { + const incomingIP = (req.headers["x-forwarded-for"] || + req.socket.remoteAddress) as string; + const iptoken = incomingIP + token; + await getRateLimiter( + token === "this_is_just_a_preview_token" ? RateLimiterMode.Preview : mode + ).consume(iptoken); + } catch (rateLimiterRes) { + console.error(rateLimiterRes); + return { + success: false, + error: "Rate limit exceeded. Too many requests, try again in 1 minute.", + status: 429, + }; + } + + if ( + token === "this_is_just_a_preview_token" && + (mode === RateLimiterMode.Scrape || mode === RateLimiterMode.Preview) + ) { + return { success: true, team_id: "preview" }; + } + + const normalizedApi = parseApi(token); + // make sure api key is valid, based on the api_keys table in supabase + const { data, error } = await supabase_service + .from("api_keys") + .select("*") + .eq("key", normalizedApi); + if (error || !data || data.length === 0) { + return { + success: false, + error: "Unauthorized: Invalid token", + status: 401, + }; + } + + return { success: true, team_id: data[0].team_id }; +} diff --git a/apps/api/src/controllers/crawl-status.ts b/apps/api/src/controllers/crawl-status.ts new file mode 100644 index 0000000..3534cd1 --- /dev/null +++ b/apps/api/src/controllers/crawl-status.ts @@ -0,0 +1,36 @@ +import { Request, Response } from "express"; +import { authenticateUser } from "./auth"; +import { RateLimiterMode } from "../../src/types"; +import { addWebScraperJob } from "../../src/services/queue-jobs"; +import { getWebScraperQueue } from "../../src/services/queue-service"; + +export async function crawlStatusController(req: Request, res: Response) { + try { + const { success, team_id, error, status } = await authenticateUser( + req, + res, + RateLimiterMode.CrawlStatus + ); + if (!success) { + return res.status(status).json({ error }); + } + const job = await getWebScraperQueue().getJob(req.params.jobId); + if (!job) { + return res.status(404).json({ error: "Job not found" }); + } + + const { current, current_url, total, current_step } = await job.progress(); + res.json({ + status: await job.getState(), + // progress: job.progress(), + current: current, + current_url: current_url, + current_step: current_step, + total: total, + data: job.returnvalue, + }); + } catch (error) { + console.error(error); + return res.status(500).json({ error: error.message }); + } +} diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts new file mode 100644 index 0000000..2f7f842 --- /dev/null +++ b/apps/api/src/controllers/crawl.ts @@ -0,0 +1,77 @@ +import { Request, Response } from "express"; +import { WebScraperDataProvider } from "../../src/scraper/WebScraper"; +import { billTeam } from "../../src/services/billing/credit_billing"; +import { checkTeamCredits } from "../../src/services/billing/credit_billing"; +import { authenticateUser } from "./auth"; +import { RateLimiterMode } from "../../src/types"; +import { addWebScraperJob } from "../../src/services/queue-jobs"; + +export async function crawlController(req: Request, res: Response) { + try { + const { success, team_id, error, status } = await authenticateUser( + req, + res, + RateLimiterMode.Crawl + ); + if (!success) { + return res.status(status).json({ error }); + } + + const { success: creditsCheckSuccess, message: creditsCheckMessage } = + await checkTeamCredits(team_id, 1); + if (!creditsCheckSuccess) { + return res.status(402).json({ error: "Insufficient credits" }); + } + + // authenticate on supabase + const url = req.body.url; + if (!url) { + return res.status(400).json({ error: "Url is required" }); + } + const mode = req.body.mode ?? "crawl"; + const crawlerOptions = req.body.crawlerOptions ?? {}; + const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; + + if (mode === "single_urls" && !url.includes(",")) { + try { + const a = new WebScraperDataProvider(); + await a.setOptions({ + mode: "single_urls", + urls: [url], + crawlerOptions: { + returnOnlyUrls: true, + }, + pageOptions: pageOptions, + }); + + const docs = await a.getDocuments(false, (progress) => { + job.progress({ + current: progress.current, + total: progress.total, + current_step: "SCRAPING", + current_url: progress.currentDocumentUrl, + }); + }); + return res.json({ + success: true, + documents: docs, + }); + } catch (error) { + console.error(error); + return res.status(500).json({ error: error.message }); + } + } + const job = await addWebScraperJob({ + url: url, + mode: mode ?? "crawl", // fix for single urls not working + crawlerOptions: { ...crawlerOptions }, + team_id: team_id, + pageOptions: pageOptions, + }); + + res.json({ jobId: job.id }); + } catch (error) { + console.error(error); + return res.status(500).json({ error: error.message }); + } +} diff --git a/apps/api/src/controllers/crawlPreview.ts b/apps/api/src/controllers/crawlPreview.ts new file mode 100644 index 0000000..641468c --- /dev/null +++ b/apps/api/src/controllers/crawlPreview.ts @@ -0,0 +1,37 @@ +import { Request, Response } from "express"; +import { authenticateUser } from "./auth"; +import { RateLimiterMode } from "../../src/types"; +import { addWebScraperJob } from "../../src/services/queue-jobs"; + +export async function crawlPreviewController(req: Request, res: Response) { + try { + const { success, team_id, error, status } = await authenticateUser( + req, + res, + RateLimiterMode.Preview + ); + if (!success) { + return res.status(status).json({ error }); + } + // authenticate on supabase + const url = req.body.url; + if (!url) { + return res.status(400).json({ error: "Url is required" }); + } + const mode = req.body.mode ?? "crawl"; + const crawlerOptions = req.body.crawlerOptions ?? {}; + const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; + const job = await addWebScraperJob({ + url: url, + mode: mode ?? "crawl", // fix for single urls not working + crawlerOptions: { ...crawlerOptions, limit: 5, maxCrawledLinks: 5 }, + team_id: "preview", + pageOptions: pageOptions, + }); + + res.json({ jobId: job.id }); + } catch (error) { + console.error(error); + return res.status(500).json({ error: error.message }); + } +} diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts new file mode 100644 index 0000000..9173533 --- /dev/null +++ b/apps/api/src/controllers/scrape.ts @@ -0,0 +1,104 @@ +import { Request, Response } from "express"; +import { WebScraperDataProvider } from "../../src/scraper/WebScraper"; +import { billTeam } from "../../src/services/billing/credit_billing"; +import { checkTeamCredits } from "../../src/services/billing/credit_billing"; +import { authenticateUser } from "./auth"; +import { RateLimiterMode } from "../../src/types"; +import { logJob } from "../../src/services/logging/log_job"; +import { Document } from "../../src/lib/entities"; + +export async function scrapeHelper( + req: Request, + team_id: string, + crawlerOptions: any, + pageOptions: any +) : Promise<{ success: boolean; error?: string; data?: Document }> { + const url = req.body.url; + if (!url) { + throw new Error("Url is required"); + } + + const a = new WebScraperDataProvider(); + await a.setOptions({ + mode: "single_urls", + urls: [url], + crawlerOptions: { + ...crawlerOptions, + }, + pageOptions: pageOptions, + }); + + const docs = await a.getDocuments(false); + // make sure doc.content is not empty + const filteredDocs = docs.filter( + (doc: { content?: string }) => doc.content && doc.content.trim().length > 0 + ); + if (filteredDocs.length === 0) { + return { success: true, error: "No pages found" }; + } + const { success, credit_usage } = await billTeam( + team_id, + filteredDocs.length + ); + if (!success) { + return { + success: false, + error: "Failed to bill team. Insufficient credits or subscription not found.", + }; + } + return { + success: true, + data: filteredDocs[0], + }; +} + +export async function scrapeController(req: Request, res: Response) { + try { + // make sure to authenticate user first, Bearer + const { success, team_id, error, status } = await authenticateUser( + req, + res, + RateLimiterMode.Scrape + ); + if (!success) { + return res.status(status).json({ error }); + } + const crawlerOptions = req.body.crawlerOptions ?? {}; + const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; + + try { + const { success: creditsCheckSuccess, message: creditsCheckMessage } = + await checkTeamCredits(team_id, 1); + if (!creditsCheckSuccess) { + return res.status(402).json({ error: "Insufficient credits" }); + } + } catch (error) { + console.error(error); + return res.status(500).json({ error: "Internal server error" }); + } + + const result = await scrapeHelper( + req, + team_id, + crawlerOptions, + pageOptions + ); + logJob({ + success: result.success, + message: result.error, + num_docs: result.data.length, + docs: result.data, + time_taken: 0, + team_id: team_id, + mode: "scrape", + url: req.body.url, + crawlerOptions: crawlerOptions, + pageOptions: pageOptions, + }); + return res.json(result); + + } catch (error) { + console.error(error); + return res.status(500).json({ error: error.message }); + } +} diff --git a/apps/api/src/controllers/status.ts b/apps/api/src/controllers/status.ts new file mode 100644 index 0000000..bd1d2ea --- /dev/null +++ b/apps/api/src/controllers/status.ts @@ -0,0 +1,25 @@ +import { Request, Response } from "express"; +import { getWebScraperQueue } from "../../src/services/queue-service"; + +export async function crawlJobStatusPreviewController(req: Request, res: Response) { + try { + const job = await getWebScraperQueue().getJob(req.params.jobId); + if (!job) { + return res.status(404).json({ error: "Job not found" }); + } + + const { current, current_url, total, current_step } = await job.progress(); + res.json({ + status: await job.getState(), + // progress: job.progress(), + current: current, + current_url: current_url, + current_step: current_step, + total: total, + data: job.returnvalue, + }); + } catch (error) { + console.error(error); + return res.status(500).json({ error: error.message }); + } +} diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 0fbd91e..57a05f2 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -10,6 +10,7 @@ import { billTeam, checkTeamCredits } from "./services/billing/credit_billing"; import { getRateLimiter, redisClient } from "./services/rate-limiter"; import { parseApi } from "./lib/parseApi"; import { RateLimiterMode } from "./types"; +import { v0Router } from "./routes/v0"; const { createBullBoard } = require("@bull-board/api"); const { BullAdapter } = require("@bull-board/api/bullAdapter"); @@ -17,7 +18,6 @@ const { ExpressAdapter } = require("@bull-board/express"); export const app = express(); - global.isProduction = process.env.IS_PRODUCTION === "true"; app.use(bodyParser.urlencoded({ extended: true })); @@ -47,267 +47,8 @@ app.get("/test", async (req, res) => { res.send("Hello, world!"); }); -async function authenticateUser(req, res, mode?: RateLimiterMode): Promise<{ success: boolean, team_id?: string, error?: string, status?: number }> { - const authHeader = req.headers.authorization; - if (!authHeader) { - return { success: false, error: "Unauthorized", status: 401 }; - } - const token = authHeader.split(" ")[1]; // Extract the token from "Bearer " - if (!token) { - return { success: false, error: "Unauthorized: Token missing", status: 401 }; - } - - - - try { - const incomingIP = (req.headers["x-forwarded-for"] || - req.socket.remoteAddress) as string; - const iptoken = incomingIP + token; - await getRateLimiter((token === "this_is_just_a_preview_token") ? RateLimiterMode.Preview : mode - ).consume(iptoken); - } catch (rateLimiterRes) { - console.error(rateLimiterRes); - return { success: false, error: "Rate limit exceeded. Too many requests, try again in 1 minute.", status: 429 }; - } - - if (token === "this_is_just_a_preview_token" && (mode === RateLimiterMode.Scrape || mode === RateLimiterMode.Preview)) { - return { success: true, team_id: "preview" }; - } - - const normalizedApi = parseApi(token); - // make sure api key is valid, based on the api_keys table in supabase - const { data, error } = await supabase_service - .from("api_keys") - .select("*") - .eq("key", normalizedApi); - if (error || !data || data.length === 0) { - return { success: false, error: "Unauthorized: Invalid token", status: 401 }; - } - - return { success: true, team_id: data[0].team_id }; -} - -app.post("/v0/scrape", async (req, res) => { - try { - // make sure to authenticate user first, Bearer - const { success, team_id, error, status } = await authenticateUser(req, res, RateLimiterMode.Scrape); - if (!success) { - return res.status(status).json({ error }); - } - const crawlerOptions = req.body.crawlerOptions ?? {}; - - try { - const { success: creditsCheckSuccess, message: creditsCheckMessage } = - await checkTeamCredits(team_id, 1); - if (!creditsCheckSuccess) { - return res.status(402).json({ error: "Insufficient credits" }); - } - } catch (error) { - console.error(error); - return res.status(500).json({ error: "Internal server error" }); - } - - // authenticate on supabase - const url = req.body.url; - if (!url) { - return res.status(400).json({ error: "Url is required" }); - } - - const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; - - try { - const a = new WebScraperDataProvider(); - await a.setOptions({ - mode: "single_urls", - urls: [url], - crawlerOptions: { - ...crawlerOptions, - }, - pageOptions: pageOptions, - }); - - const docs = await a.getDocuments(false); - // make sure doc.content is not empty - const filteredDocs = docs.filter( - (doc: { content?: string }) => - doc.content && doc.content.trim().length > 0 - ); - if (filteredDocs.length === 0) { - return res.status(200).json({ success: true, data: [] }); - } - const { success, credit_usage } = await billTeam( - team_id, - filteredDocs.length - ); - if (!success) { - // throw new Error("Failed to bill team, no subscription was found"); - // return { - // success: false, - // message: "Failed to bill team, no subscription was found", - // docs: [], - // }; - return res - .status(402) - .json({ error: "Failed to bill, no subscription was found" }); - } - return res.json({ - success: true, - data: filteredDocs[0], - }); - } catch (error) { - console.error(error); - return res.status(500).json({ error: error.message }); - } - } catch (error) { - console.error(error); - return res.status(500).json({ error: error.message }); - } -}); - -app.post("/v0/crawl", async (req, res) => { - try { - const { success, team_id, error, status } = await authenticateUser(req, res, RateLimiterMode.Crawl); - if (!success) { - return res.status(status).json({ error }); - } - - const { success: creditsCheckSuccess, message: creditsCheckMessage } = - await checkTeamCredits(team_id, 1); - if (!creditsCheckSuccess) { - return res.status(402).json({ error: "Insufficient credits" }); - } - - // authenticate on supabase - const url = req.body.url; - if (!url) { - return res.status(400).json({ error: "Url is required" }); - } - const mode = req.body.mode ?? "crawl"; - const crawlerOptions = req.body.crawlerOptions ?? {}; - const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; - - if (mode === "single_urls" && !url.includes(",")) { - try { - const a = new WebScraperDataProvider(); - await a.setOptions({ - mode: "single_urls", - urls: [url], - crawlerOptions: { - returnOnlyUrls: true, - }, - pageOptions: pageOptions, - }); - - const docs = await a.getDocuments(false, (progress) => { - job.progress({ - current: progress.current, - total: progress.total, - current_step: "SCRAPING", - current_url: progress.currentDocumentUrl, - }); - }); - return res.json({ - success: true, - documents: docs, - }); - } catch (error) { - console.error(error); - return res.status(500).json({ error: error.message }); - } - } - const job = await addWebScraperJob({ - url: url, - mode: mode ?? "crawl", // fix for single urls not working - crawlerOptions: { ...crawlerOptions }, - team_id: team_id, - pageOptions: pageOptions, - - }); - - res.json({ jobId: job.id }); - } catch (error) { - console.error(error); - return res.status(500).json({ error: error.message }); - } -}); -app.post("/v0/crawlWebsitePreview", async (req, res) => { - try { - const { success, team_id, error, status } = await authenticateUser(req, res, RateLimiterMode.Preview); - if (!success) { - return res.status(status).json({ error }); - } - // authenticate on supabase - const url = req.body.url; - if (!url) { - return res.status(400).json({ error: "Url is required" }); - } - const mode = req.body.mode ?? "crawl"; - const crawlerOptions = req.body.crawlerOptions ?? {}; - const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; - const job = await addWebScraperJob({ - url: url, - mode: mode ?? "crawl", // fix for single urls not working - crawlerOptions: { ...crawlerOptions, limit: 5, maxCrawledLinks: 5 }, - team_id: "preview", - pageOptions: pageOptions, - }); - - res.json({ jobId: job.id }); - } catch (error) { - console.error(error); - return res.status(500).json({ error: error.message }); - } -}); - -app.get("/v0/crawl/status/:jobId", async (req, res) => { - try { - const { success, team_id, error, status } = await authenticateUser(req, res, RateLimiterMode.CrawlStatus); - if (!success) { - return res.status(status).json({ error }); - } - const job = await getWebScraperQueue().getJob(req.params.jobId); - if (!job) { - return res.status(404).json({ error: "Job not found" }); - } - - const { current, current_url, total, current_step } = await job.progress(); - res.json({ - status: await job.getState(), - // progress: job.progress(), - current: current, - current_url: current_url, - current_step: current_step, - total: total, - data: job.returnvalue, - }); - } catch (error) { - console.error(error); - return res.status(500).json({ error: error.message }); - } -}); - -app.get("/v0/checkJobStatus/:jobId", async (req, res) => { - try { - const job = await getWebScraperQueue().getJob(req.params.jobId); - if (!job) { - return res.status(404).json({ error: "Job not found" }); - } - - const { current, current_url, total, current_step } = await job.progress(); - res.json({ - status: await job.getState(), - // progress: job.progress(), - current: current, - current_url: current_url, - current_step: current_step, - total: total, - data: job.returnvalue, - }); - } catch (error) { - console.error(error); - return res.status(500).json({ error: error.message }); - } -}); +// register router +app.use(v0Router); const DEFAULT_PORT = process.env.PORT ?? 3002; const HOST = process.env.HOST ?? "localhost"; @@ -316,7 +57,9 @@ redisClient.connect(); export function startServer(port = DEFAULT_PORT) { const server = app.listen(Number(port), HOST, () => { console.log(`Server listening on port ${port}`); - console.log(`For the UI, open http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues`); + console.log( + `For the UI, open http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues` + ); console.log(""); console.log("1. Make sure Redis is running on port 6379 by default"); console.log( @@ -353,4 +96,3 @@ app.get(`/admin/${process.env.BULL_AUTH_KEY}/queues`, async (req, res) => { app.get("/is-production", (req, res) => { res.send({ isProduction: global.isProduction }); }); - diff --git a/apps/api/src/routes/v0.ts b/apps/api/src/routes/v0.ts new file mode 100644 index 0000000..023282a --- /dev/null +++ b/apps/api/src/routes/v0.ts @@ -0,0 +1,14 @@ +import express from "express"; +import { crawlController } from "../../src/controllers/crawl"; +import { crawlStatusController } from "../../src/controllers/crawl-status"; +import { scrapeController } from "../../src/controllers/scrape"; +import { crawlPreviewController } from "../../src/controllers/crawlPreview"; +import { crawlJobStatusPreviewController } from "../../src/controllers/status"; + +export const v0Router = express.Router(); + +v0Router.post("/v0/scrape", scrapeController); +v0Router.post("/v0/crawl", crawlController); +v0Router.post("/v0/crawlWebsitePreview", crawlPreviewController); +v0Router.get("/v0/crawl/status/:jobId", crawlStatusController); +v0Router.get("/v0/checkJobStatus/:jobId", crawlJobStatusPreviewController); From 5b8aed26dd85a9d5f23e0fd865882dfd5b14a865 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 20 Apr 2024 18:55:39 -0700 Subject: [PATCH 061/102] Update scrape.ts --- apps/api/src/controllers/scrape.ts | 57 +++++++++++++++++------------- 1 file changed, 32 insertions(+), 25 deletions(-) diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index 9173533..04fe525 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -5,17 +5,22 @@ import { checkTeamCredits } from "../../src/services/billing/credit_billing"; import { authenticateUser } from "./auth"; import { RateLimiterMode } from "../../src/types"; import { logJob } from "../../src/services/logging/log_job"; -import { Document } from "../../src/lib/entities"; +import { Document } from "../../src/lib/entities"; export async function scrapeHelper( req: Request, team_id: string, crawlerOptions: any, pageOptions: any -) : Promise<{ success: boolean; error?: string; data?: Document }> { +): Promise<{ + success: boolean; + error?: string; + data?: Document; + returnCode?: number; +}> { const url = req.body.url; if (!url) { - throw new Error("Url is required"); + return { success: false, error: "Url is required", returnCode: 400 }; } const a = new WebScraperDataProvider(); @@ -34,7 +39,7 @@ export async function scrapeHelper( (doc: { content?: string }) => doc.content && doc.content.trim().length > 0 ); if (filteredDocs.length === 0) { - return { success: true, error: "No pages found" }; + return { success: true, error: "No page found", returnCode: 200 }; } const { success, credit_usage } = await billTeam( team_id, @@ -43,12 +48,15 @@ export async function scrapeHelper( if (!success) { return { success: false, - error: "Failed to bill team. Insufficient credits or subscription not found.", + error: + "Failed to bill team. Insufficient credits or subscription not found.", + returnCode: 402, }; } return { success: true, data: filteredDocs[0], + returnCode: 200, }; } @@ -77,26 +85,25 @@ export async function scrapeController(req: Request, res: Response) { return res.status(500).json({ error: "Internal server error" }); } - const result = await scrapeHelper( - req, - team_id, - crawlerOptions, - pageOptions - ); - logJob({ - success: result.success, - message: result.error, - num_docs: result.data.length, - docs: result.data, - time_taken: 0, - team_id: team_id, - mode: "scrape", - url: req.body.url, - crawlerOptions: crawlerOptions, - pageOptions: pageOptions, - }); - return res.json(result); - + const result = await scrapeHelper( + req, + team_id, + crawlerOptions, + pageOptions + ); + logJob({ + success: result.success, + message: result.error, + num_docs: 1, + docs: [result.data], + time_taken: 0, + team_id: team_id, + mode: "scrape", + url: req.body.url, + crawlerOptions: crawlerOptions, + pageOptions: pageOptions, + }); + return res.json(result); } catch (error) { console.error(error); return res.status(500).json({ error: error.message }); From 4543c57e4e70dfe072c86c01c77f90a4df535979 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 20 Apr 2024 19:04:27 -0700 Subject: [PATCH 062/102] Nick: --- apps/api/.env.local | 1 + apps/api/src/controllers/scrape.ts | 15 +++++++-------- apps/api/src/index.ts | 8 +------- 3 files changed, 9 insertions(+), 15 deletions(-) diff --git a/apps/api/.env.local b/apps/api/.env.local index f5c625f..6c58f19 100644 --- a/apps/api/.env.local +++ b/apps/api/.env.local @@ -1,3 +1,4 @@ +ENV= NUM_WORKERS_PER_QUEUE=8 PORT= HOST= diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index 04fe525..51d14f2 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -1,11 +1,10 @@ import { Request, Response } from "express"; -import { WebScraperDataProvider } from "../../src/scraper/WebScraper"; -import { billTeam } from "../../src/services/billing/credit_billing"; -import { checkTeamCredits } from "../../src/services/billing/credit_billing"; +import { WebScraperDataProvider } from "../scraper/WebScraper"; +import { billTeam, checkTeamCredits } from "../services/billing/credit_billing"; import { authenticateUser } from "./auth"; -import { RateLimiterMode } from "../../src/types"; -import { logJob } from "../../src/services/logging/log_job"; -import { Document } from "../../src/lib/entities"; +import { RateLimiterMode } from "../types"; +import { logJob } from "../services/logging/log_job"; +import { Document } from "../lib/entities"; export async function scrapeHelper( req: Request, @@ -16,7 +15,7 @@ export async function scrapeHelper( success: boolean; error?: string; data?: Document; - returnCode?: number; + returnCode: number; }> { const url = req.body.url; if (!url) { @@ -103,7 +102,7 @@ export async function scrapeController(req: Request, res: Response) { crawlerOptions: crawlerOptions, pageOptions: pageOptions, }); - return res.json(result); + return res.status(result.returnCode).json(result); } catch (error) { console.error(error); return res.status(500).json({ error: error.message }); diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 57a05f2..1a42eb4 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -3,13 +3,7 @@ import bodyParser from "body-parser"; import cors from "cors"; import "dotenv/config"; import { getWebScraperQueue } from "./services/queue-service"; -import { addWebScraperJob } from "./services/queue-jobs"; -import { supabase_service } from "./services/supabase"; -import { WebScraperDataProvider } from "./scraper/WebScraper"; -import { billTeam, checkTeamCredits } from "./services/billing/credit_billing"; -import { getRateLimiter, redisClient } from "./services/rate-limiter"; -import { parseApi } from "./lib/parseApi"; -import { RateLimiterMode } from "./types"; +import { redisClient } from "./services/rate-limiter"; import { v0Router } from "./routes/v0"; const { createBullBoard } = require("@bull-board/api"); From 0db0874b00742e7e7a6439a975501a397da5d6b8 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 20 Apr 2024 19:37:45 -0700 Subject: [PATCH 063/102] Nick: --- apps/api/src/controllers/crawl.ts | 2 ++ apps/api/src/controllers/crawlPreview.ts | 2 ++ apps/api/src/controllers/scrape.ts | 8 ++++++-- apps/api/src/main/runWebScraper.ts | 10 +++++++--- apps/api/src/services/logging/log_job.ts | 3 ++- apps/api/src/services/queue-worker.ts | 6 ++++-- apps/api/src/services/webhook.ts | 9 +++++++-- apps/api/src/types.ts | 2 ++ 8 files changed, 32 insertions(+), 10 deletions(-) diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts index 2f7f842..17cfa62 100644 --- a/apps/api/src/controllers/crawl.ts +++ b/apps/api/src/controllers/crawl.ts @@ -42,6 +42,7 @@ export async function crawlController(req: Request, res: Response) { returnOnlyUrls: true, }, pageOptions: pageOptions, + }); const docs = await a.getDocuments(false, (progress) => { @@ -67,6 +68,7 @@ export async function crawlController(req: Request, res: Response) { crawlerOptions: { ...crawlerOptions }, team_id: team_id, pageOptions: pageOptions, + origin: req.body.origin ?? "api", }); res.json({ jobId: job.id }); diff --git a/apps/api/src/controllers/crawlPreview.ts b/apps/api/src/controllers/crawlPreview.ts index 641468c..3f28ef6 100644 --- a/apps/api/src/controllers/crawlPreview.ts +++ b/apps/api/src/controllers/crawlPreview.ts @@ -21,12 +21,14 @@ export async function crawlPreviewController(req: Request, res: Response) { const mode = req.body.mode ?? "crawl"; const crawlerOptions = req.body.crawlerOptions ?? {}; const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; + const job = await addWebScraperJob({ url: url, mode: mode ?? "crawl", // fix for single urls not working crawlerOptions: { ...crawlerOptions, limit: 5, maxCrawledLinks: 5 }, team_id: "preview", pageOptions: pageOptions, + origin: "website-preview", }); res.json({ jobId: job.id }); diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index 51d14f2..632fff5 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -72,6 +72,7 @@ export async function scrapeController(req: Request, res: Response) { } const crawlerOptions = req.body.crawlerOptions ?? {}; const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; + const origin = req.body.origin ?? "api"; try { const { success: creditsCheckSuccess, message: creditsCheckMessage } = @@ -83,24 +84,27 @@ export async function scrapeController(req: Request, res: Response) { console.error(error); return res.status(500).json({ error: "Internal server error" }); } - + const startTime = new Date().getTime(); const result = await scrapeHelper( req, team_id, crawlerOptions, pageOptions ); + const endTime = new Date().getTime(); + const timeTakenInSeconds = (endTime - startTime) / 1000; logJob({ success: result.success, message: result.error, num_docs: 1, docs: [result.data], - time_taken: 0, + time_taken: timeTakenInSeconds, team_id: team_id, mode: "scrape", url: req.body.url, crawlerOptions: crawlerOptions, pageOptions: pageOptions, + origin: origin, }); return res.status(result.returnCode).json(result); } catch (error) { diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index 0f562a0..d943429 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -44,7 +44,11 @@ export async function runWebScraper({ onSuccess: (result: any) => void; onError: (error: any) => void; team_id: string; -}): Promise<{ success: boolean; message: string; docs: CrawlResult[] }> { +}): Promise<{ + success: boolean; + message: string; + docs: CrawlResult[]; +}> { try { const provider = new WebScraperDataProvider(); if (mode === "crawl") { @@ -70,7 +74,7 @@ export async function runWebScraper({ return { success: true, message: "No pages found", - docs: [], + docs: [] }; } @@ -87,7 +91,7 @@ export async function runWebScraper({ return { success: false, message: "Failed to bill team, no subscription was found", - docs: [], + docs: [] }; } diff --git a/apps/api/src/services/logging/log_job.ts b/apps/api/src/services/logging/log_job.ts index cb7e648..639b3a8 100644 --- a/apps/api/src/services/logging/log_job.ts +++ b/apps/api/src/services/logging/log_job.ts @@ -17,11 +17,12 @@ export async function logJob(job: FirecrawlJob) { num_docs: job.num_docs, docs: job.docs, time_taken: job.time_taken, - team_id: job.team_id, + team_id: job.team_id === "preview" ? null : job.team_id, mode: job.mode, url: job.url, crawler_options: job.crawlerOptions, page_options: job.pageOptions, + origin: job.origin, }, ]); if (error) { diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index d436401..dda876a 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -17,10 +17,11 @@ getWebScraperQueue().process( current_url: "", }); const start = Date.now(); + console.log("Processing job", job.data); const { success, message, docs } = await startWebScraperPipeline({ job }); const end = Date.now(); const timeTakenInSeconds = (end - start) / 1000; - + const data = { success: success, result: { @@ -33,7 +34,7 @@ getWebScraperQueue().process( }; await callWebhook(job.data.team_id, data); - + await logJob({ success: success, message: message, @@ -45,6 +46,7 @@ getWebScraperQueue().process( url: job.data.url, crawlerOptions: job.data.crawlerOptions, pageOptions: job.data.pageOptions, + origin: job.data.origin, }); done(null, data); } catch (error) { diff --git a/apps/api/src/services/webhook.ts b/apps/api/src/services/webhook.ts index a086425..ab1f90e 100644 --- a/apps/api/src/services/webhook.ts +++ b/apps/api/src/services/webhook.ts @@ -1,6 +1,7 @@ import { supabase_service } from "./supabase"; export const callWebhook = async (teamId: string, data: any) => { + try { const { data: webhooksData, error } = await supabase_service .from('webhooks') .select('url') @@ -37,5 +38,9 @@ export const callWebhook = async (teamId: string, data: any) => { data: dataToSend, error: data.error || undefined, }), - }); -} \ No newline at end of file + }); + } catch (error) { + console.error(`Error sending webhook for team ID: ${teamId}`, error.message); + } +}; + diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index e3fc5dc..f9e5c73 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -22,6 +22,7 @@ export interface WebScraperOptions { crawlerOptions: any; pageOptions: any; team_id: string; + origin?: string; } @@ -36,6 +37,7 @@ export interface FirecrawlJob { url: string; crawlerOptions?: any; pageOptions?: any; + origin: string; } From 9b31e68a7ef64ededa0531bece1fb340e72a9e70 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 20 Apr 2024 19:38:44 -0700 Subject: [PATCH 064/102] Update queue-worker.ts --- apps/api/src/services/queue-worker.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index dda876a..8d7a7bd 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -17,7 +17,7 @@ getWebScraperQueue().process( current_url: "", }); const start = Date.now(); - console.log("Processing job", job.data); + const { success, message, docs } = await startWebScraperPipeline({ job }); const end = Date.now(); const timeTakenInSeconds = (end - start) / 1000; From b361a76282e88b678d31306d1469f609f4d135a1 Mon Sep 17 00:00:00 2001 From: Caleb Peffer <44934913+calebpeffer@users.noreply.github.com> Date: Sat, 20 Apr 2024 19:53:04 -0700 Subject: [PATCH 065/102] Caleb: added logging improvement --- .gitignore | 2 ++ apps/api/.env.local | 14 -------------- apps/api/src/__tests__/e2e/index.test.ts | 14 ++++++++++++-- apps/api/src/services/logtail.ts | 23 +++++++++++++++++++---- 4 files changed, 33 insertions(+), 20 deletions(-) delete mode 100644 apps/api/.env.local diff --git a/.gitignore b/.gitignore index cbfb076..9029012 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,5 @@ dump.rdb /mongo-data apps/js-sdk/node_modules/ + +apps/api/.env.local diff --git a/apps/api/.env.local b/apps/api/.env.local deleted file mode 100644 index f5c625f..0000000 --- a/apps/api/.env.local +++ /dev/null @@ -1,14 +0,0 @@ -NUM_WORKERS_PER_QUEUE=8 -PORT= -HOST= -SUPABASE_ANON_TOKEN= -SUPABASE_URL= -SUPABASE_SERVICE_TOKEN= -REDIS_URL= -SCRAPING_BEE_API_KEY= -OPENAI_API_KEY= -BULL_AUTH_KEY= -LOGTAIL_KEY= -PLAYWRIGHT_MICROSERVICE_URL= -LLAMAPARSE_API_KEY= -TEST_API_KEY= \ No newline at end of file diff --git a/apps/api/src/__tests__/e2e/index.test.ts b/apps/api/src/__tests__/e2e/index.test.ts index 554453b..ebf87c6 100644 --- a/apps/api/src/__tests__/e2e/index.test.ts +++ b/apps/api/src/__tests__/e2e/index.test.ts @@ -3,12 +3,20 @@ import { app } from '../../index'; import dotenv from 'dotenv'; dotenv.config(); -const TEST_URL = 'http://localhost:3002' + +// const TEST_URL = 'http://localhost:3002' +const TEST_URL = 'http://127.0.0.1:3002' + + + + describe('E2E Tests for API Routes', () => { describe('GET /', () => { it('should return Hello, world! message', async () => { - const response = await request(TEST_URL).get('/'); + + const response = await request(TEST_URL).get('/'); + expect(response.statusCode).toBe(200); expect(response.text).toContain('SCRAPERS-JS: Hello, world! Fly.io'); }); @@ -16,6 +24,8 @@ describe('E2E Tests for API Routes', () => { describe('GET /test', () => { it('should return Hello, world! message', async () => { + + const response = await request(TEST_URL).get('/test'); expect(response.statusCode).toBe(200); expect(response.text).toContain('Hello, world!'); diff --git a/apps/api/src/services/logtail.ts b/apps/api/src/services/logtail.ts index 19ab773..8b86a6b 100644 --- a/apps/api/src/services/logtail.ts +++ b/apps/api/src/services/logtail.ts @@ -1,4 +1,19 @@ -const { Logtail } = require("@logtail/node"); -//dot env -require("dotenv").config(); -export const logtail = new Logtail(process.env.LOGTAIL_KEY); +import { Logtail } from "@logtail/node"; +import "dotenv/config"; + +// A mock Logtail class to handle cases where LOGTAIL_KEY is not provided +class MockLogtail { + info(message: string, context?: Record): void { + console.log(message, context); + } + error(message: string, context: Record = {}): void { + console.error(message, context); + } +} + +// Using the actual Logtail class if LOGTAIL_KEY exists, otherwise using the mock class +// Additionally, print a warning to the terminal if LOGTAIL_KEY is not provided +export const logtail = process.env.LOGTAIL_KEY ? new Logtail(process.env.LOGTAIL_KEY) : (() => { + console.warn("LOGTAIL_KEY is not provided - your events will not be logged. Using MockLogtail as a fallback. see logtail.ts for more."); + return new MockLogtail(); +})(); From e6b46178ddbe9678036e2c11e51030007a2998ee Mon Sep 17 00:00:00 2001 From: Caleb Peffer <44934913+calebpeffer@users.noreply.github.com> Date: Sat, 20 Apr 2024 19:53:27 -0700 Subject: [PATCH 066/102] Caleb: added .env.example --- apps/api/.env.example | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 apps/api/.env.example diff --git a/apps/api/.env.example b/apps/api/.env.example new file mode 100644 index 0000000..392db9a --- /dev/null +++ b/apps/api/.env.example @@ -0,0 +1,18 @@ +# Required +NUM_WORKERS_PER_QUEUE=8 +PORT= +HOST= +SUPABASE_ANON_TOKEN= +SUPABASE_URL= +SUPABASE_SERVICE_TOKEN= +REDIS_URL= + +# Optional + +SCRAPING_BEE_API_KEY= +OPENAI_API_KEY= +BULL_AUTH_KEY= +LOGTAIL_KEY= +PLAYWRIGHT_MICROSERVICE_URL= +LLAMAPARSE_API_KEY= +TEST_API_KEY= \ No newline at end of file From d2f808a5fd272f7a9fd845980d2ac0e21147fb99 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 20 Apr 2024 19:54:37 -0700 Subject: [PATCH 067/102] Update queue-worker.ts --- apps/api/src/services/queue-worker.ts | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 8d7a7bd..78ea030 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -17,7 +17,7 @@ getWebScraperQueue().process( current_url: "", }); const start = Date.now(); - + const { success, message, docs } = await startWebScraperPipeline({ job }); const end = Date.now(); const timeTakenInSeconds = (end - start) / 1000; @@ -74,6 +74,19 @@ getWebScraperQueue().process( "Something went wrong... Contact help@mendable.ai or try again." /* etc... */, }; await callWebhook(job.data.team_id, data); + await logJob({ + success: false, + message: typeof error === 'string' ? error : (error.message ?? "Something went wrong... Contact help@mendable.ai"), + num_docs: 0, + docs: [], + time_taken: 0, + team_id: job.data.team_id, + mode: "crawl", + url: job.data.url, + crawlerOptions: job.data.crawlerOptions, + pageOptions: job.data.pageOptions, + origin: job.data.origin, + }); done(null, data); } } From be75aaa195ade4e41e8225cad7ba06e5df661385 Mon Sep 17 00:00:00 2001 From: Caleb Peffer <44934913+calebpeffer@users.noreply.github.com> Date: Sun, 21 Apr 2024 09:31:22 -0700 Subject: [PATCH 068/102] Caleb: first version of supabase proxy to make db authentication optional --- apps/api/src/controllers/auth.ts | 11 +++++++ apps/api/src/controllers/crawl.ts | 15 +++++---- apps/api/src/controllers/scrape.ts | 26 ++++++++------- apps/api/src/services/supabase.ts | 53 +++++++++++++++++++++++++++--- 4 files changed, 83 insertions(+), 22 deletions(-) diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index 76bacbe..6ae234d 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -3,6 +3,7 @@ import { getRateLimiter } from "../../src/services/rate-limiter"; import { RateLimiterMode } from "../../src/types"; import { supabase_service } from "../../src/services/supabase"; + export async function authenticateUser( req, res, @@ -13,6 +14,16 @@ export async function authenticateUser( error?: string; status?: number; }> { + + console.log(process.env) + + if(process.env.USE_DB_AUTHENTICATION === "false"){ + console.log("WARNING - YOU'RE bypassing Authentication"); + return { success: true}; + } + + console.log("USING SUPABASE AUTH"); + const authHeader = req.headers.authorization; if (!authHeader) { return { success: false, error: "Unauthorized", status: 401 }; diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts index 17cfa62..36c013e 100644 --- a/apps/api/src/controllers/crawl.ts +++ b/apps/api/src/controllers/crawl.ts @@ -8,6 +8,8 @@ import { addWebScraperJob } from "../../src/services/queue-jobs"; export async function crawlController(req: Request, res: Response) { try { + + console.log("hello") const { success, team_id, error, status } = await authenticateUser( req, res, @@ -16,14 +18,15 @@ export async function crawlController(req: Request, res: Response) { if (!success) { return res.status(status).json({ error }); } - - const { success: creditsCheckSuccess, message: creditsCheckMessage } = - await checkTeamCredits(team_id, 1); - if (!creditsCheckSuccess) { - return res.status(402).json({ error: "Insufficient credits" }); + + if (process.env.USE_DB_AUTHENTICATION === "true") { + const { success: creditsCheckSuccess, message: creditsCheckMessage } = + await checkTeamCredits(team_id, 1); + if (!creditsCheckSuccess) { + return res.status(402).json({ error: "Insufficient credits" }); + } } - // authenticate on supabase const url = req.body.url; if (!url) { return res.status(400).json({ error: "Url is required" }); diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index 632fff5..47b00f0 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -40,18 +40,22 @@ export async function scrapeHelper( if (filteredDocs.length === 0) { return { success: true, error: "No page found", returnCode: 200 }; } - const { success, credit_usage } = await billTeam( - team_id, - filteredDocs.length - ); - if (!success) { - return { - success: false, - error: - "Failed to bill team. Insufficient credits or subscription not found.", - returnCode: 402, - }; + + if (process.env.USE_DB_AUTHENTICATION === "true") { + const { success, credit_usage } = await billTeam( + team_id, + filteredDocs.length + ); + if (!success) { + return { + success: false, + error: + "Failed to bill team. Insufficient credits or subscription not found.", + returnCode: 402, + }; + } } + return { success: true, data: filteredDocs[0], diff --git a/apps/api/src/services/supabase.ts b/apps/api/src/services/supabase.ts index 49121fa..9a2366d 100644 --- a/apps/api/src/services/supabase.ts +++ b/apps/api/src/services/supabase.ts @@ -1,6 +1,49 @@ -import { createClient } from "@supabase/supabase-js"; +import { createClient, SupabaseClient } from '@supabase/supabase-js'; -export const supabase_service = createClient( - process.env.SUPABASE_URL, - process.env.SUPABASE_SERVICE_TOKEN, -); +// SupabaseService class initializes the Supabase client conditionally based on environment variables. +class SupabaseService { + private client: SupabaseClient | null = null; + + constructor() { + const supabaseUrl = process.env.SUPABASE_URL; + const supabaseServiceToken = process.env.SUPABASE_SERVICE_TOKEN; + + // Only initialize the Supabase client if both URL and Service Token are provided. + if (process.env.USE_DB_AUTHENTICATION === "false") { + + // Warn the user that Authentication is disabled by setting the client to null + console.warn("\x1b[33mAuthentication is disabled. Supabase client will not be initialized.\x1b[0m"); + this.client = null; + } else if (!supabaseUrl || !supabaseServiceToken) { + console.error("\x1b[31mSupabase environment variables aren't configured correctly. Supabase client will not be initialized. Fix ENV configuration or disable DB authentication with USE_DB_AUTHENTICATION env variable\x1b[0m"); + } else { + this.client = createClient(supabaseUrl, supabaseServiceToken); + } + } + + // Provides access to the initialized Supabase client, if available. + getClient(): SupabaseClient | null { + return this.client; + } +} + +// Using a Proxy to handle dynamic access to the Supabase client or service methods. +// This approach ensures that if Supabase is not configured, any attempt to use it will result in a clear error. +export const supabase_service: SupabaseClient = new Proxy(new SupabaseService(), { + get: function (target, prop, receiver) { + const client = target.getClient(); + // If the Supabase client is not initialized, intercept property access to provide meaningful error feedback. + if (client === null) { + console.error("Attempted to access Supabase client when it's not configured."); + return () => { + throw new Error("Supabase client is not configured."); + }; + } + // Direct access to SupabaseService properties takes precedence. + if (prop in target) { + return Reflect.get(target, prop, receiver); + } + // Otherwise, delegate access to the Supabase client. + return Reflect.get(client, prop, receiver); + } +}) as unknown as SupabaseClient; \ No newline at end of file From 5cdbf3a0ac1838219813e064b1bf8d35fc2d538f Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sun, 21 Apr 2024 10:36:48 -0700 Subject: [PATCH 069/102] Nick: cleaner functions to handle authenticated requests that dont require ifs everywhere --- apps/api/src/controllers/auth.ts | 18 +++---- apps/api/src/controllers/crawl.ts | 16 +++--- apps/api/src/controllers/scrape.ts | 2 - apps/api/src/lib/withAuth.ts | 19 +++++++ .../src/services/billing/credit_billing.ts | 10 +++- apps/api/src/services/supabase.ts | 51 +++++++++++-------- apps/api/src/types.ts | 10 ++-- 7 files changed, 76 insertions(+), 50 deletions(-) create mode 100644 apps/api/src/lib/withAuth.ts diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index 6ae234d..49b2146 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -1,10 +1,15 @@ import { parseApi } from "../../src/lib/parseApi"; import { getRateLimiter } from "../../src/services/rate-limiter"; -import { RateLimiterMode } from "../../src/types"; +import { AuthResponse, RateLimiterMode } from "../../src/types"; import { supabase_service } from "../../src/services/supabase"; +import { withAuth } from "../../src/lib/withAuth"; -export async function authenticateUser( +export async function authenticateUser(req, res, mode?: RateLimiterMode) : Promise { + return withAuth(supaAuthenticateUser)(req, res, mode); +} + +export async function supaAuthenticateUser( req, res, mode?: RateLimiterMode @@ -15,15 +20,6 @@ export async function authenticateUser( status?: number; }> { - console.log(process.env) - - if(process.env.USE_DB_AUTHENTICATION === "false"){ - console.log("WARNING - YOU'RE bypassing Authentication"); - return { success: true}; - } - - console.log("USING SUPABASE AUTH"); - const authHeader = req.headers.authorization; if (!authHeader) { return { success: false, error: "Unauthorized", status: 401 }; diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts index 36c013e..1fb2698 100644 --- a/apps/api/src/controllers/crawl.ts +++ b/apps/api/src/controllers/crawl.ts @@ -8,8 +8,7 @@ import { addWebScraperJob } from "../../src/services/queue-jobs"; export async function crawlController(req: Request, res: Response) { try { - - console.log("hello") + console.log("hello"); const { success, team_id, error, status } = await authenticateUser( req, res, @@ -18,13 +17,11 @@ export async function crawlController(req: Request, res: Response) { if (!success) { return res.status(status).json({ error }); } - - if (process.env.USE_DB_AUTHENTICATION === "true") { - const { success: creditsCheckSuccess, message: creditsCheckMessage } = - await checkTeamCredits(team_id, 1); - if (!creditsCheckSuccess) { - return res.status(402).json({ error: "Insufficient credits" }); - } + + const { success: creditsCheckSuccess, message: creditsCheckMessage } = + await checkTeamCredits(team_id, 1); + if (!creditsCheckSuccess) { + return res.status(402).json({ error: "Insufficient credits" }); } const url = req.body.url; @@ -45,7 +42,6 @@ export async function crawlController(req: Request, res: Response) { returnOnlyUrls: true, }, pageOptions: pageOptions, - }); const docs = await a.getDocuments(false, (progress) => { diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index 47b00f0..be70800 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -41,7 +41,6 @@ export async function scrapeHelper( return { success: true, error: "No page found", returnCode: 200 }; } - if (process.env.USE_DB_AUTHENTICATION === "true") { const { success, credit_usage } = await billTeam( team_id, filteredDocs.length @@ -54,7 +53,6 @@ export async function scrapeHelper( returnCode: 402, }; } - } return { success: true, diff --git a/apps/api/src/lib/withAuth.ts b/apps/api/src/lib/withAuth.ts new file mode 100644 index 0000000..3ed8906 --- /dev/null +++ b/apps/api/src/lib/withAuth.ts @@ -0,0 +1,19 @@ +import { AuthResponse } from "../../src/types"; + +export function withAuth( + originalFunction: (...args: U) => Promise +) { + return async function (...args: U): Promise { + if (process.env.USE_DB_AUTHENTICATION === "false") { + console.warn("WARNING - You're bypassing authentication"); + return { success: true } as T; + } else { + try { + return await originalFunction(...args); + } catch (error) { + console.error("Error in withAuth function: ", error); + return { success: false, error: error.message } as T; + } + } + }; +} diff --git a/apps/api/src/services/billing/credit_billing.ts b/apps/api/src/services/billing/credit_billing.ts index 6ac0843..bf5be60 100644 --- a/apps/api/src/services/billing/credit_billing.ts +++ b/apps/api/src/services/billing/credit_billing.ts @@ -1,7 +1,12 @@ +import { withAuth } from "../../lib/withAuth"; import { supabase_service } from "../supabase"; const FREE_CREDITS = 100; + export async function billTeam(team_id: string, credits: number) { + return withAuth(supaBillTeam)(team_id, credits); +} +export async function supaBillTeam(team_id: string, credits: number) { if (team_id === "preview") { return { success: true, message: "Preview team, no credits used" }; } @@ -52,8 +57,11 @@ export async function billTeam(team_id: string, credits: number) { return { success: true, credit_usage }; } -// if team has enough credits for the operation, return true, else return false export async function checkTeamCredits(team_id: string, credits: number) { + return withAuth(supaCheckTeamCredits)(team_id, credits); +} +// if team has enough credits for the operation, return true, else return false +export async function supaCheckTeamCredits(team_id: string, credits: number) { if (team_id === "preview") { return { success: true, message: "Preview team, no credits used" }; } diff --git a/apps/api/src/services/supabase.ts b/apps/api/src/services/supabase.ts index 9a2366d..fa6404d 100644 --- a/apps/api/src/services/supabase.ts +++ b/apps/api/src/services/supabase.ts @@ -1,4 +1,4 @@ -import { createClient, SupabaseClient } from '@supabase/supabase-js'; +import { createClient, SupabaseClient } from "@supabase/supabase-js"; // SupabaseService class initializes the Supabase client conditionally based on environment variables. class SupabaseService { @@ -7,15 +7,17 @@ class SupabaseService { constructor() { const supabaseUrl = process.env.SUPABASE_URL; const supabaseServiceToken = process.env.SUPABASE_SERVICE_TOKEN; - // Only initialize the Supabase client if both URL and Service Token are provided. if (process.env.USE_DB_AUTHENTICATION === "false") { - // Warn the user that Authentication is disabled by setting the client to null - console.warn("\x1b[33mAuthentication is disabled. Supabase client will not be initialized.\x1b[0m"); + console.warn( + "\x1b[33mAuthentication is disabled. Supabase client will not be initialized.\x1b[0m" + ); this.client = null; } else if (!supabaseUrl || !supabaseServiceToken) { - console.error("\x1b[31mSupabase environment variables aren't configured correctly. Supabase client will not be initialized. Fix ENV configuration or disable DB authentication with USE_DB_AUTHENTICATION env variable\x1b[0m"); + console.error( + "\x1b[31mSupabase environment variables aren't configured correctly. Supabase client will not be initialized. Fix ENV configuration or disable DB authentication with USE_DB_AUTHENTICATION env variable\x1b[0m" + ); } else { this.client = createClient(supabaseUrl, supabaseServiceToken); } @@ -29,21 +31,26 @@ class SupabaseService { // Using a Proxy to handle dynamic access to the Supabase client or service methods. // This approach ensures that if Supabase is not configured, any attempt to use it will result in a clear error. -export const supabase_service: SupabaseClient = new Proxy(new SupabaseService(), { - get: function (target, prop, receiver) { - const client = target.getClient(); - // If the Supabase client is not initialized, intercept property access to provide meaningful error feedback. - if (client === null) { - console.error("Attempted to access Supabase client when it's not configured."); - return () => { - throw new Error("Supabase client is not configured."); - }; - } - // Direct access to SupabaseService properties takes precedence. - if (prop in target) { - return Reflect.get(target, prop, receiver); - } - // Otherwise, delegate access to the Supabase client. - return Reflect.get(client, prop, receiver); +export const supabase_service: SupabaseClient = new Proxy( + new SupabaseService(), + { + get: function (target, prop, receiver) { + const client = target.getClient(); + // If the Supabase client is not initialized, intercept property access to provide meaningful error feedback. + if (client === null) { + console.error( + "Attempted to access Supabase client when it's not configured." + ); + return () => { + throw new Error("Supabase client is not configured."); + }; + } + // Direct access to SupabaseService properties takes precedence. + if (prop in target) { + return Reflect.get(target, prop, receiver); + } + // Otherwise, delegate access to the Supabase client. + return Reflect.get(client, prop, receiver); + }, } -}) as unknown as SupabaseClient; \ No newline at end of file +) as unknown as SupabaseClient; diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index f9e5c73..7f527fb 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -25,7 +25,6 @@ export interface WebScraperOptions { origin?: string; } - export interface FirecrawlJob { success: boolean; message: string; @@ -40,8 +39,6 @@ export interface FirecrawlJob { origin: string; } - - export enum RateLimiterMode { Crawl = "crawl", CrawlStatus = "crawl-status", @@ -49,4 +46,9 @@ export enum RateLimiterMode { Preview = "preview", } - +export interface AuthResponse { + success: boolean; + team_id?: string; + error?: string; + status?: number; +} From ef4ffd3a18e3b1c31a51d7fb3a53544f574a6c27 Mon Sep 17 00:00:00 2001 From: Caleb Peffer <44934913+calebpeffer@users.noreply.github.com> Date: Sun, 21 Apr 2024 10:56:30 -0700 Subject: [PATCH 070/102] Adding contributors guide --- apps/api/.env.example | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/apps/api/.env.example b/apps/api/.env.example index 9a4541c..34e24b1 100644 --- a/apps/api/.env.example +++ b/apps/api/.env.example @@ -1,18 +1,24 @@ -ENV= -NUM_WORKERS_PER_QUEUE=8 -PORT= -HOST= -SUPABASE_ANON_TOKEN= -SUPABASE_URL= +# ===== Required ENVS ====== +NUM_WORKERS_PER_QUEUE=8 +PORT=3002 +HOST=0.0.0.0 +REDIS_URL=redis://localhost:6379 + +## To turn on DB authentication, you need to set up supabase. +USE_DB_AUTHENTICATION=true + +# ===== Optional ENVS ====== + +# Supabase Setup (used to support DB authentication, advanced logging, etc.) +SUPABASE_ANON_TOKEN= +SUPABASE_URL= SUPABASE_SERVICE_TOKEN= -REDIS_URL= -# Optional - -SCRAPING_BEE_API_KEY= -OPENAI_API_KEY= -BULL_AUTH_KEY= -LOGTAIL_KEY= -PLAYWRIGHT_MICROSERVICE_URL= -LLAMAPARSE_API_KEY= -TEST_API_KEY= \ No newline at end of file +# Other Optionals +TEST_API_KEY= # use if you've set up authentication and want to test with a real API key +SCRAPING_BEE_API_KEY= #Set if you'd like to use scraping Be to handle JS blocking +OPENAI_API_KEY= # add for LLM dependednt features (image alt generation, etc.) +BULL_AUTH_KEY= # +LOGTAIL_KEY= # Use if you're configuring basic logging with logtail +PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback +LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs \ No newline at end of file From 401f992c562fd94cb6034bab882c7a70839d4468 Mon Sep 17 00:00:00 2001 From: Caleb Peffer <44934913+calebpeffer@users.noreply.github.com> Date: Sun, 21 Apr 2024 11:19:40 -0700 Subject: [PATCH 071/102] Caleb: added contributors guide --- CONTRIBUTING.md | 94 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 91 insertions(+), 3 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 224eb57..5d4b69e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,8 +1,96 @@ -# Contributing -We love contributions! Our contribution guide will be coming soon! +# Contributors guide: - +Welcome to firecrawl 🔥! Here are some instructions on how to get the project locally, so you can run it on your own (and contribute) + +If you're contributing, note that the process is similar to other open source repos i.e. (fork firecrawl, make changes, run tests, PR). If you have any questions, and would like help gettin on board, reach out to hello@mendable.ai for more or submit an issue! + + +## Hosting locally + +First, start by installing dependencies +1. node.js [instructions](https://nodejs.org/en/learn/getting-started/how-to-install-nodejs) +2. pnpm [instructions](https://pnpm.io/installation) +3. redis - [instructions](https://redis.io/docs/latest/operate/oss_and_stack/install/install-redis/) + + +Set environment variables in a .env in the /apps/api/ directoryyou can copy over the template in .env.example. + +To start, we wont set up authentication, or any optional sub services (pdf parsing, JS blocking support, AI features ) + +```.env +# ===== Required ENVS ====== +NUM_WORKERS_PER_QUEUE=8 +PORT=3002 +HOST=0.0.0.0 +REDIS_URL=redis://localhost:6379 + +## To turn on DB authentication, you need to set up supabase. +USE_DB_AUTHENTICATION=false + +# ===== Optional ENVS ====== + +# Supabase Setup (used to support DB authentication, advanced logging, etc.) +SUPABASE_ANON_TOKEN= +SUPABASE_URL= +SUPABASE_SERVICE_TOKEN= + +# Other Optionals +TEST_API_KEY= # use if you've set up authentication and want to test with a real API key +SCRAPING_BEE_API_KEY= #Set if you'd like to use scraping Be to handle JS blocking +OPENAI_API_KEY= # add for LLM dependednt features (image alt generation, etc.) +BULL_AUTH_KEY= # +LOGTAIL_KEY= # Use if you're configuring basic logging with logtail +PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback +LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs + +``` + +You're going to need to open 3 terminals. + +### Terminal 1 - setting up redis + +Run the command anywhere within your project + +`redis-server` + + +### Terminal 2 - setting up workers + +Now, navigate to the apps/api/ directory and run: +`pnpm run workers` + +### Terminal 3 - setting up the main server + + +To do this, navigate to the apps/api/ directory and run if you don’t have this already, install pnpm here: https://pnpm.io/installation +Next, run your server with`pnpm run start` + + + +### Terminal 3 - sending our first request. + +Alright: now let’s send our first request. + +```curl +curl -X GET http://localhost:3002/test +``` +This should return the response Hello, world! + + +If you’d like to test the crawl endpoint, you can run this + +```curl +curl -X POST http://localhost:3002/v0/crawl \ + -H 'Content-Type: application/json' \ + -d '{ + "url": "https://mendable.ai" + }' +``` + +## Tests: + +The best way to do this is run the test with npx:Once again, navigate to the `apps/api` directory`npx jest` From 898d729a8455785082e2015e695604d1c3c3ff0c Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sun, 21 Apr 2024 11:27:31 -0700 Subject: [PATCH 072/102] Nick: tests --- apps/api/src/__tests__/e2e/index.test.ts | 346 +++++++++--------- .../src/__tests__/e2e_noAuth/index.test.ts | 156 ++++++++ apps/api/src/controllers/crawl.ts | 1 - apps/api/src/index.ts | 2 +- 4 files changed, 334 insertions(+), 171 deletions(-) create mode 100644 apps/api/src/__tests__/e2e_noAuth/index.test.ts diff --git a/apps/api/src/__tests__/e2e/index.test.ts b/apps/api/src/__tests__/e2e/index.test.ts index ebf87c6..ba01a7c 100644 --- a/apps/api/src/__tests__/e2e/index.test.ts +++ b/apps/api/src/__tests__/e2e/index.test.ts @@ -1,189 +1,197 @@ -import request from 'supertest'; -import { app } from '../../index'; -import dotenv from 'dotenv'; +import request from "supertest"; +import { app } from "../../index"; +import dotenv from "dotenv"; dotenv.config(); // const TEST_URL = 'http://localhost:3002' -const TEST_URL = 'http://127.0.0.1:3002' +const TEST_URL = "http://127.0.0.1:3002"; - - - -describe('E2E Tests for API Routes', () => { - describe('GET /', () => { - it('should return Hello, world! message', async () => { - - const response = await request(TEST_URL).get('/'); - - expect(response.statusCode).toBe(200); - expect(response.text).toContain('SCRAPERS-JS: Hello, world! Fly.io'); - }); - }); - - describe('GET /test', () => { - it('should return Hello, world! message', async () => { - - - const response = await request(TEST_URL).get('/test'); - expect(response.statusCode).toBe(200); - expect(response.text).toContain('Hello, world!'); - }); - }); - - describe('POST /v0/scrape', () => { - it('should require authorization', async () => { - const response = await request(app).post('/v0/scrape'); - expect(response.statusCode).toBe(401); + describe("E2E Tests for API Routes", () => { + beforeAll(() => { + process.env.USE_DB_AUTHENTICATION = "true"; }); - it('should return an error response with an invalid API key', async () => { - const response = await request(TEST_URL) - .post('/v0/scrape') - .set('Authorization', `Bearer invalid-api-key`) - .set('Content-Type', 'application/json') - .send({ url: 'https://firecrawl.dev' }); - expect(response.statusCode).toBe(401); + afterAll(() => { + delete process.env.USE_DB_AUTHENTICATION; }); - it('should return a successful response with a valid preview token', async () => { - const response = await request(TEST_URL) - .post('/v0/scrape') - .set('Authorization', `Bearer this_is_just_a_preview_token`) - .set('Content-Type', 'application/json') - .send({ url: 'https://firecrawl.dev' }); - expect(response.statusCode).toBe(200); - }, 10000); // 10 seconds timeout + describe("GET /", () => { + it("should return Hello, world! message", async () => { + const response = await request(TEST_URL).get("/"); - it('should return a successful response with a valid API key', async () => { - const response = await request(TEST_URL) - .post('/v0/scrape') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') - .send({ url: 'https://firecrawl.dev' }); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('data'); - expect(response.body.data).toHaveProperty('content'); - expect(response.body.data).toHaveProperty('markdown'); - expect(response.body.data).toHaveProperty('metadata'); - expect(response.body.data.content).toContain('🔥 FireCrawl'); - }, 30000); // 30 seconds timeout - }); - - describe('POST /v0/crawl', () => { - it('should require authorization', async () => { - const response = await request(TEST_URL).post('/v0/crawl'); - expect(response.statusCode).toBe(401); - }); - - it('should return an error response with an invalid API key', async () => { - const response = await request(TEST_URL) - .post('/v0/crawl') - .set('Authorization', `Bearer invalid-api-key`) - .set('Content-Type', 'application/json') - .send({ url: 'https://firecrawl.dev' }); - expect(response.statusCode).toBe(401); - }); - - it('should return a successful response with a valid API key', async () => { - const response = await request(TEST_URL) - .post('/v0/crawl') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') - .send({ url: 'https://firecrawl.dev' }); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('jobId'); - expect(response.body.jobId).toMatch(/^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/); - }); - - // Additional tests for insufficient credits? - }); - - describe('POST /v0/crawlWebsitePreview', () => { - it('should require authorization', async () => { - const response = await request(TEST_URL).post('/v0/crawlWebsitePreview'); - expect(response.statusCode).toBe(401); - }); - - it('should return an error response with an invalid API key', async () => { - const response = await request(TEST_URL) - .post('/v0/crawlWebsitePreview') - .set('Authorization', `Bearer invalid-api-key`) - .set('Content-Type', 'application/json') - .send({ url: 'https://firecrawl.dev' }); - expect(response.statusCode).toBe(401); - }); - - it('should return a successful response with a valid API key', async () => { - const response = await request(TEST_URL) - .post('/v0/crawlWebsitePreview') - .set('Authorization', `Bearer this_is_just_a_preview_token`) - .set('Content-Type', 'application/json') - .send({ url: 'https://firecrawl.dev' }); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('jobId'); - expect(response.body.jobId).toMatch(/^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/); - }); - }); - - describe('GET /v0/crawl/status/:jobId', () => { - it('should require authorization', async () => { - const response = await request(TEST_URL).get('/v0/crawl/status/123'); - expect(response.statusCode).toBe(401); - }); - - it('should return an error response with an invalid API key', async () => { - const response = await request(TEST_URL) - .get('/v0/crawl/status/123') - .set('Authorization', `Bearer invalid-api-key`); - expect(response.statusCode).toBe(401); - }); - - it('should return Job not found for invalid job ID', async () => { - const response = await request(TEST_URL) - .get('/v0/crawl/status/invalidJobId') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(404); - }); - - it('should return a successful response for a valid crawl job', async () => { - const crawlResponse = await request(TEST_URL) - .post('/v0/crawl') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') - .send({ url: 'https://firecrawl.dev' }); - expect(crawlResponse.statusCode).toBe(200); - - - const response = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`); expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('status'); - expect(response.body.status).toBe('active'); + expect(response.text).toContain("SCRAPERS-JS: Hello, world! Fly.io"); + }); + }); + + describe("GET /test", () => { + it("should return Hello, world! message", async () => { + const response = await request(TEST_URL).get("/test"); + expect(response.statusCode).toBe(200); + expect(response.text).toContain("Hello, world!"); + }); + }); + + describe("POST /v0/scrape", () => { + it("should require authorization", async () => { + const response = await request(app).post("/v0/scrape"); + expect(response.statusCode).toBe(401); + }); + + it("should return an error response with an invalid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(401); + }); + it("should return a successful response with a valid preview token", async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer this_is_just_a_preview_token`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(200); + }, 10000); // 10 seconds timeout + + it("should return a successful response with a valid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.content).toContain("🔥 FireCrawl"); + }, 30000); // 30 seconds timeout + }); + + describe("POST /v0/crawl", () => { + it("should require authorization", async () => { + const response = await request(TEST_URL).post("/v0/crawl"); + expect(response.statusCode).toBe(401); + }); + + it("should return an error response with an invalid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(401); + }); + + it("should return a successful response with a valid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("jobId"); + expect(response.body.jobId).toMatch( + /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/ + ); + }); + + // Additional tests for insufficient credits? + }); + + describe("POST /v0/crawlWebsitePreview", () => { + it("should require authorization", async () => { + const response = await request(TEST_URL).post( + "/v0/crawlWebsitePreview" + ); + expect(response.statusCode).toBe(401); + }); + + it("should return an error response with an invalid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/crawlWebsitePreview") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(401); + }); + + it("should return a successful response with a valid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/crawlWebsitePreview") + .set("Authorization", `Bearer this_is_just_a_preview_token`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("jobId"); + expect(response.body.jobId).toMatch( + /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/ + ); + }); + }); + + describe("GET /v0/crawl/status/:jobId", () => { + it("should require authorization", async () => { + const response = await request(TEST_URL).get("/v0/crawl/status/123"); + expect(response.statusCode).toBe(401); + }); + + it("should return an error response with an invalid API key", async () => { + const response = await request(TEST_URL) + .get("/v0/crawl/status/123") + .set("Authorization", `Bearer invalid-api-key`); + expect(response.statusCode).toBe(401); + }); + + it("should return Job not found for invalid job ID", async () => { + const response = await request(TEST_URL) + .get("/v0/crawl/status/invalidJobId") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(404); + }); + + it("should return a successful response for a valid crawl job", async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(crawlResponse.statusCode).toBe(200); + + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + expect(response.body.status).toBe("active"); // wait for 30 seconds await new Promise((r) => setTimeout(r, 30000)); const completedResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`); + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); expect(completedResponse.statusCode).toBe(200); - expect(completedResponse.body).toHaveProperty('status'); - expect(completedResponse.body.status).toBe('completed'); - expect(completedResponse.body).toHaveProperty('data'); - expect(completedResponse.body.data[0]).toHaveProperty('content'); - expect(completedResponse.body.data[0]).toHaveProperty('markdown'); - expect(completedResponse.body.data[0]).toHaveProperty('metadata'); - expect(completedResponse.body.data[0].content).toContain('🔥 FireCrawl'); - }, 60000); // 60 seconds - }); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].content).toContain( + "🔥 FireCrawl" + ); + }, 60000); // 60 seconds + }); - describe('GET /is-production', () => { - it('should return the production status', async () => { - const response = await request(TEST_URL).get('/is-production'); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('isProduction'); + describe("GET /is-production", () => { + it("should return the production status", async () => { + const response = await request(TEST_URL).get("/is-production"); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("isProduction"); + }); }); }); -}); \ No newline at end of file diff --git a/apps/api/src/__tests__/e2e_noAuth/index.test.ts b/apps/api/src/__tests__/e2e_noAuth/index.test.ts new file mode 100644 index 0000000..e0aca36 --- /dev/null +++ b/apps/api/src/__tests__/e2e_noAuth/index.test.ts @@ -0,0 +1,156 @@ +import request from "supertest"; +import { app } from "../../index"; +import dotenv from "dotenv"; +const fs = require("fs"); +const path = require("path"); + +dotenv.config(); + +const TEST_URL = "http://127.0.0.1:3002"; + +describe("E2E Tests for API Routes with No Authentication", () => { + let originalEnv: NodeJS.ProcessEnv; + + // save original process.env + beforeAll(() => { + originalEnv = { ...process.env }; + process.env.USE_DB_AUTHENTICATION = "false"; + process.env.SUPABASE_ANON_TOKEN = ""; + process.env.SUPABASE_URL = ""; + process.env.SUPABASE_SERVICE_TOKEN = ""; + process.env.SCRAPING_BEE_API_KEY = ""; + process.env.OPENAI_API_KEY = ""; + process.env.BULL_AUTH_KEY = ""; + process.env.LOGTAIL_KEY = ""; + process.env.PLAYWRIGHT_MICROSERVICE_URL = ""; + process.env.LLAMAPARSE_API_KEY = ""; + process.env.TEST_API_KEY = ""; + }); + + // restore original process.env + afterAll(() => { + process.env = originalEnv; + }); + + + describe("GET /", () => { + it("should return Hello, world! message", async () => { + const response = await request(TEST_URL).get("/"); + expect(response.statusCode).toBe(200); + expect(response.text).toContain("SCRAPERS-JS: Hello, world! Fly.io"); + }); + }); + + describe("GET /test", () => { + it("should return Hello, world! message", async () => { + const response = await request(TEST_URL).get("/test"); + expect(response.statusCode).toBe(200); + expect(response.text).toContain("Hello, world!"); + }); + }); + + describe("POST /v0/scrape", () => { + it("should not require authorization", async () => { + const response = await request(TEST_URL).post("/v0/scrape"); + expect(response.statusCode).not.toBe(401); + }); + + it("should return a successful response", async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(200); + }, 10000); // 10 seconds timeout + }); + + describe("POST /v0/crawl", () => { + it("should not require authorization", async () => { + const response = await request(TEST_URL).post("/v0/crawl"); + expect(response.statusCode).not.toBe(401); + }); + + it("should return a successful response", async () => { + const response = await request(TEST_URL) + .post("/v0/crawl") + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("jobId"); + expect(response.body.jobId).toMatch( + /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/ + ); + }); + }); + + describe("POST /v0/crawlWebsitePreview", () => { + it("should not require authorization", async () => { + const response = await request(TEST_URL).post("/v0/crawlWebsitePreview"); + expect(response.statusCode).not.toBe(401); + }); + + it("should return a successful response", async () => { + const response = await request(TEST_URL) + .post("/v0/crawlWebsitePreview") + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("jobId"); + expect(response.body.jobId).toMatch( + /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/ + ); + }); + }); + + describe("GET /v0/crawl/status/:jobId", () => { + it("should not require authorization", async () => { + const response = await request(TEST_URL).get("/v0/crawl/status/123"); + expect(response.statusCode).not.toBe(401); + }); + + it("should return Job not found for invalid job ID", async () => { + const response = await request(TEST_URL).get( + "/v0/crawl/status/invalidJobId" + ); + expect(response.statusCode).toBe(404); + }); + + it("should return a successful response for a valid crawl job", async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(crawlResponse.statusCode).toBe(200); + + const response = await request(TEST_URL).get( + `/v0/crawl/status/${crawlResponse.body.jobId}` + ); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + expect(response.body.status).toBe("active"); + + // wait for 30 seconds + await new Promise((r) => setTimeout(r, 30000)); + + const completedResponse = await request(TEST_URL).get( + `/v0/crawl/status/${crawlResponse.body.jobId}` + ); + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].content).toContain("🔥 FireCrawl"); + }, 60000); // 60 seconds + }); + + describe("GET /is-production", () => { + it("should return the production status", async () => { + const response = await request(TEST_URL).get("/is-production"); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("isProduction"); + }); + }); +}); diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts index 1fb2698..bd3feca 100644 --- a/apps/api/src/controllers/crawl.ts +++ b/apps/api/src/controllers/crawl.ts @@ -8,7 +8,6 @@ import { addWebScraperJob } from "../../src/services/queue-jobs"; export async function crawlController(req: Request, res: Response) { try { - console.log("hello"); const { success, team_id, error, status } = await authenticateUser( req, res, diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 1a42eb4..a2e5c51 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -5,7 +5,6 @@ import "dotenv/config"; import { getWebScraperQueue } from "./services/queue-service"; import { redisClient } from "./services/rate-limiter"; import { v0Router } from "./routes/v0"; - const { createBullBoard } = require("@bull-board/api"); const { BullAdapter } = require("@bull-board/api/bullAdapter"); const { ExpressAdapter } = require("@bull-board/express"); @@ -48,6 +47,7 @@ const DEFAULT_PORT = process.env.PORT ?? 3002; const HOST = process.env.HOST ?? "localhost"; redisClient.connect(); + export function startServer(port = DEFAULT_PORT) { const server = app.listen(Number(port), HOST, () => { console.log(`Server listening on port ${port}`); From 52620bab16e087bfa4c9d1f11ca91af8f1f79632 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sun, 21 Apr 2024 11:39:36 -0700 Subject: [PATCH 073/102] Nick: prod and local-no-auth tests --- .github/workflows/ci.yml | 2 +- apps/api/package.json | 2 ++ apps/api/src/__tests__/{e2e => e2e_withAuth}/index.test.ts | 0 apps/api/src/lib/withAuth.ts | 7 ++++++- 4 files changed, 9 insertions(+), 2 deletions(-) rename apps/api/src/__tests__/{e2e => e2e_withAuth}/index.test.ts (100%) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b9a5b79..69a8a24 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -54,5 +54,5 @@ jobs: id: start_workers - name: Run E2E tests run: | - npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false + npm run test:prod working-directory: ./apps/api \ No newline at end of file diff --git a/apps/api/package.json b/apps/api/package.json index cbce4be..0b533f9 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -11,6 +11,8 @@ "start:dev": "nodemon --exec ts-node src/index.ts", "build": "tsc", "test": "jest --verbose", + "test:local-no-auth":"npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_withAuth/*'", + "test:prod":"npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_noAuth/*'", "workers": "nodemon --exec ts-node src/services/queue-worker.ts", "worker:production": "node dist/src/services/queue-worker.js", "mongo-docker": "docker run -d -p 2717:27017 -v ./mongo-data:/data/db --name mongodb mongo:latest", diff --git a/apps/api/src/__tests__/e2e/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts similarity index 100% rename from apps/api/src/__tests__/e2e/index.test.ts rename to apps/api/src/__tests__/e2e_withAuth/index.test.ts diff --git a/apps/api/src/lib/withAuth.ts b/apps/api/src/lib/withAuth.ts index 3ed8906..ea5aa4d 100644 --- a/apps/api/src/lib/withAuth.ts +++ b/apps/api/src/lib/withAuth.ts @@ -1,11 +1,16 @@ import { AuthResponse } from "../../src/types"; +let warningCount = 0; + export function withAuth( originalFunction: (...args: U) => Promise ) { return async function (...args: U): Promise { if (process.env.USE_DB_AUTHENTICATION === "false") { - console.warn("WARNING - You're bypassing authentication"); + if (warningCount < 5) { + console.warn("WARNING - You're bypassing authentication"); + warningCount++; + } return { success: true } as T; } else { try { From 30a8482a68e42084a36a892921854fa49144b524 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sun, 21 Apr 2024 11:41:34 -0700 Subject: [PATCH 074/102] Nick: --- README.md | 2 +- SELF_HOST.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 56f8c5c..f6b67b7 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ We provide an easy to use API with our hosted version. You can find the playgrou - [ ] LangchainJS - Coming Soon -Self-host. To self-host refer to guide [here](https://github.com/mendableai/firecrawl/blob/main/SELF_HOST.md). +To run locally, refer to guide [here](https://github.com/mendableai/firecrawl/blob/main/CONTRIBUTING.md). ### API Key diff --git a/SELF_HOST.md b/SELF_HOST.md index ba0ae23..8d1d490 100644 --- a/SELF_HOST.md +++ b/SELF_HOST.md @@ -1,6 +1,6 @@ # Self-hosting Firecrawl -Guide coming soon. +Refer to [CONTRIBUTING.md](https://github.com/mendableai/firecrawl/blob/main/CONTRIBUTING.md) for instructions on how to run it locally. *This repository is currently in its early stages of development. We are in the process of merging custom modules into this mono repository. The primary objective is to enhance the accuracy of LLM responses by utilizing clean data. It is not ready for full self-host yet - we're working on it* From 2f29a4da8eb3b10b9c9782e17e46d662ec3010c9 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sun, 21 Apr 2024 11:45:15 -0700 Subject: [PATCH 075/102] Update CONTRIBUTING.md --- CONTRIBUTING.md | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 5d4b69e..abd3027 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,12 +1,12 @@ # Contributors guide: -Welcome to firecrawl 🔥! Here are some instructions on how to get the project locally, so you can run it on your own (and contribute) +Welcome to [Firecrawl](https://firecrawl.dev) 🔥! Here are some instructions on how to get the project locally, so you can run it on your own (and contribute) If you're contributing, note that the process is similar to other open source repos i.e. (fork firecrawl, make changes, run tests, PR). If you have any questions, and would like help gettin on board, reach out to hello@mendable.ai for more or submit an issue! -## Hosting locally +## Running the project locally First, start by installing dependencies 1. node.js [instructions](https://nodejs.org/en/learn/getting-started/how-to-install-nodejs) @@ -46,6 +46,16 @@ LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse ``` +### Installing dependencies + +First, install the dependencies using pnpm. + +```bash +pnpm install +``` + +### Running the project + You're going to need to open 3 terminals. ### Terminal 1 - setting up redis @@ -64,7 +74,7 @@ Now, navigate to the apps/api/ directory and run: To do this, navigate to the apps/api/ directory and run if you don’t have this already, install pnpm here: https://pnpm.io/installation -Next, run your server with`pnpm run start` +Next, run your server with `pnpm run start` @@ -90,7 +100,8 @@ curl -X POST http://localhost:3002/v0/crawl \ ## Tests: -The best way to do this is run the test with npx:Once again, navigate to the `apps/api` directory`npx jest` - +The best way to do this is run the test with `npm run test:local-no-auth` if you'd like to run the tests without authentication. + +If you'd like to run the tests with authentication, run `npm run test:prod` From 84be3d2bcaa6af7263b8aaff5b71bdb805eb28e0 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sun, 21 Apr 2024 11:51:39 -0700 Subject: [PATCH 076/102] Update CONTRIBUTING.md --- CONTRIBUTING.md | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index abd3027..733c787 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,4 +1,3 @@ - # Contributors guide: Welcome to [Firecrawl](https://firecrawl.dev) 🔥! Here are some instructions on how to get the project locally, so you can run it on your own (and contribute) @@ -11,14 +10,15 @@ If you're contributing, note that the process is similar to other open source re First, start by installing dependencies 1. node.js [instructions](https://nodejs.org/en/learn/getting-started/how-to-install-nodejs) 2. pnpm [instructions](https://pnpm.io/installation) -3. redis - [instructions](https://redis.io/docs/latest/operate/oss_and_stack/install/install-redis/) +3. redis [instructions](https://redis.io/docs/latest/operate/oss_and_stack/install/install-redis/) Set environment variables in a .env in the /apps/api/ directoryyou can copy over the template in .env.example. To start, we wont set up authentication, or any optional sub services (pdf parsing, JS blocking support, AI features ) -```.env +.env: +``` # ===== Required ENVS ====== NUM_WORKERS_PER_QUEUE=8 PORT=3002 @@ -62,21 +62,28 @@ You're going to need to open 3 terminals. Run the command anywhere within your project -`redis-server` +```bash +redis-server +``` - ### Terminal 2 - setting up workers Now, navigate to the apps/api/ directory and run: -`pnpm run workers` - +```bash +pnpm run workers +``` + +This will start the workers who are responsible for processing crawl jobs. + ### Terminal 3 - setting up the main server To do this, navigate to the apps/api/ directory and run if you don’t have this already, install pnpm here: https://pnpm.io/installation -Next, run your server with `pnpm run start` - +Next, run your server with: +```bash +pnpm run start +``` ### Terminal 3 - sending our first request. From 6560c968e1ba182dfff2765bf7751e623e2d175f Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sun, 21 Apr 2024 12:02:11 -0700 Subject: [PATCH 077/102] Update types.ts --- apps/api/src/types.ts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index 7f527fb..5d778a2 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -52,3 +52,5 @@ export interface AuthResponse { error?: string; status?: number; } + + From 001bf0c504df8cb9ff08673fab69cdbeb3413dd7 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sun, 21 Apr 2024 12:05:12 -0700 Subject: [PATCH 078/102] Update package.json --- apps/api/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/package.json b/apps/api/package.json index 0b533f9..8ae1609 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -10,7 +10,7 @@ "flyio": "node dist/src/index.js", "start:dev": "nodemon --exec ts-node src/index.ts", "build": "tsc", - "test": "jest --verbose", + "test": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_noAuth/*'", "test:local-no-auth":"npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_withAuth/*'", "test:prod":"npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_noAuth/*'", "workers": "nodemon --exec ts-node src/services/queue-worker.ts", From 3ead2efdcaab6fd407eddb01e3654c88ac6bfba9 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sun, 21 Apr 2024 12:05:30 -0700 Subject: [PATCH 079/102] Update fly.yml --- .github/workflows/fly.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/fly.yml b/.github/workflows/fly.yml index fe042c6..ddeee55 100644 --- a/.github/workflows/fly.yml +++ b/.github/workflows/fly.yml @@ -54,7 +54,7 @@ jobs: id: start_workers - name: Run E2E tests run: | - npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false + npm run test:prod working-directory: ./apps/api deploy: name: Deploy app From 572b7e8dc57a7768321d15ef34dc688ad6337a94 Mon Sep 17 00:00:00 2001 From: Matt <77928207+mattzcarey@users.noreply.github.com> Date: Mon, 22 Apr 2024 16:38:05 +0100 Subject: [PATCH 080/102] chore: add context.close --- apps/playwright-service/main.py | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/playwright-service/main.py b/apps/playwright-service/main.py index 5d6f331..b4b83de 100644 --- a/apps/playwright-service/main.py +++ b/apps/playwright-service/main.py @@ -21,6 +21,7 @@ async def root(body: UrlModel): # Using Pydantic model for request body await page.goto(body.url) # Adjusted to use the url from the request body model page_content = await page.content() # Get the HTML content of the page + await context.close() await browser.close() json_compatible_item_data = {"content": page_content} From de7e1f501bc9708a7d018dce81abd40944eadd6a Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 22 Apr 2024 08:41:54 -0700 Subject: [PATCH 081/102] Update openapi.json --- apps/api/openapi.json | 566 ++++++++++++++++++++++-------------------- 1 file changed, 290 insertions(+), 276 deletions(-) diff --git a/apps/api/openapi.json b/apps/api/openapi.json index bb58ae3..3916738 100644 --- a/apps/api/openapi.json +++ b/apps/api/openapi.json @@ -1,295 +1,309 @@ { - "openapi": "3.0.0", - "info": { - "title": "Firecrawl API", - "version": "1.0.0", - "description": "API for interacting with Firecrawl services to convert websites to LLM-ready data.", - "contact": { - "name": "Firecrawl Support", - "url": "https://firecrawl.dev/support", - "email": "help@mendable.ai" - } - }, - "servers": [ - { - "url": "https://api.firecrawl.dev/v0" - } - ], - "paths": { - "/scrape": { - "post": { - "summary": "Scrape a single URL", - "operationId": "scrapeSingleUrl", - "tags": ["Scraping"], - "security": [ - { - "bearerAuth": [] - } - ], - "requestBody": { - "required": true, - "content": { - "application/json": { - "schema": { - "type": "object", - "properties": { - "url": { - "type": "string", - "format": "uri", - "description": "The URL to scrape" - } - }, - "required": ["url"] - } - } - } - }, - "responses": { - "200": { - "description": "Successful response", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ScrapeResponse" - } - } - } - }, - "402": { - "description": "Payment required" - }, - "429": { - "description": "Too many requests" - }, - "500": { - "description": "Server error" - } + "openapi": "3.0.0", + "info": { + "title": "Firecrawl API", + "version": "1.0.0", + "description": "API for interacting with Firecrawl services to perform web scraping and crawling tasks.", + "contact": { + "name": "Firecrawl Support", + "url": "https://firecrawl.dev/support", + "email": "support@firecrawl.dev" + } + }, + "servers": [ + { + "url": "https://api.firecrawl.dev/v0" + } + ], + "paths": { + "/scrape": { + "post": { + "summary": "Scrape a single URL", + "operationId": "scrapeSingleUrl", + "tags": ["Scraping"], + "security": [ + { + "bearerAuth": [] } - } - }, - "/crawl": { - "post": { - "summary": "Crawl multiple URLs based on options", - "operationId": "crawlUrls", - "tags": ["Crawling"], - "security": [ - { - "bearerAuth": [] - } - ], - "requestBody": { - "required": true, - "content": { - "application/json": { - "schema": { - "type": "object", - "properties": { - "url": { - "type": "string", - "format": "uri", - "description": "The base URL to start crawling from" - }, - "crawlerOptions": { - "type": "object", - "properties": { - "includes": { - "type": "array", - "items": { - "type": "string" - }, - "description": "URL patterns to include" - }, - "excludes": { - "type": "array", - "items": { - "type": "string" - }, - "description": "URL patterns to exclude" - }, - "generateImgAltText": { - "type": "boolean", - "description": "Generate alt text for images using LLMs (must have a paid plan)", - "default": false - }, - "limit": { - "type": "integer", - "description": "Maximum number of pages to crawl" - } - } - }, - "pageOptions": { - "type": "object", - "properties": { - "onlyMainContent": { - "type": "boolean", - "description": "Only return the main content of the page excluding headers, navs, footers, etc.", - "default": false - } - } - } - }, - "required": ["url"] - } - } - } - }, - "responses": { - "200": { - "description": "Successful response", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/CrawlResponse" - } - } - } - }, - "402": { - "description": "Payment required" - }, - "429": { - "description": "Too many requests" - }, - "500": { - "description": "Server error" - } - } - } - }, - "/crawl/status/{jobId}": { - "get": { - "tags": ["Crawl"], - "summary": "Get the status of a crawl job", - "operationId": "getCrawlStatus", - "security": [ - { - "bearerAuth": [] - } - ], - "parameters": [ - { - "name": "jobId", - "in": "path", - "description": "ID of the crawl job", - "required": true, + ], + "requestBody": { + "required": true, + "content": { + "application/json": { "schema": { - "type": "string" - } - } - ], - "responses": { - "200": { - "description": "Successful response", - "content": { - "application/json": { - "schema": { + "type": "object", + "properties": { + "url": { + "type": "string", + "format": "uri", + "description": "The URL to scrape" + }, + "pageOptions": { "type": "object", "properties": { - "status": { - "type": "string", - "description": "Status of the job (completed, active, failed, paused)" - }, - "current": { - "type": "integer", - "description": "Current page number" - }, - "current_url": { - "type": "string", - "description": "Current URL being scraped" - }, - "current_step": { - "type": "string", - "description": "Current step in the process" - }, - "total": { - "type": "integer", - "description": "Total number of pages" - }, - "data": { - "type": "array", - "items": { - "$ref": "#/components/schemas/ScrapeResponse" - }, - "description": "Data returned from the job (null when it is in progress)" + "onlyMainContent": { + "type": "boolean", + "description": "Only return the main content of the page excluding headers, navs, footers, etc.", + "default": false } } } - } - } - }, - "402": { - "description": "Payment required" - }, - "429": { - "description": "Too many requests" - }, - "500": { - "description": "Server error" - } - } - } - } - }, - "components": { - "securitySchemes": { - "bearerAuth": { - "type": "http", - "scheme": "bearer" - } - }, - "schemas": { - "ScrapeResponse": { - "type": "object", - "properties": { - "success": { - "type": "boolean" - }, - "data": { - "type": "object", - "properties": { - "content": { - "type": "string" }, - "markdown": { - "type": "string" - }, - "metadata": { - "type": "object", - "properties": { - "title": { - "type": "string" - }, - "description": { - "type": "string" - }, - "language": { - "type": "string", - "nullable": true - }, - "sourceURL": { - "type": "string", - "format": "uri" - } - } - } + "required": ["url"] } } } }, - "CrawlResponse": { - "type": "object", - "properties": { - "jobId": { - "type": "string" + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ScrapeResponse" + } + } } + }, + "402": { + "description": "Payment required" + }, + "429": { + "description": "Too many requests" + }, + "500": { + "description": "Server error" } } } }, - "security": [ - { - "bearerAuth": [] + "/crawl": { + "post": { + "summary": "Crawl multiple URLs based on options", + "operationId": "crawlUrls", + "tags": ["Crawling"], + "security": [ + { + "bearerAuth": [] + } + ], + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "url": { + "type": "string", + "format": "uri", + "description": "The base URL to start crawling from" + }, + "crawlerOptions": { + "type": "object", + "properties": { + "includes": { + "type": "array", + "items": { + "type": "string" + }, + "description": "URL patterns to include" + }, + "excludes": { + "type": "array", + "items": { + "type": "string" + }, + "description": "URL patterns to exclude" + }, + "generateImgAltText": { + "type": "boolean", + "description": "Generate alt text for images using LLMs (must have a paid plan)", + "default": false + }, + "returnOnlyUrls": { + "type": "boolean", + "description": "If true, returns only the URLs as a list on the crawl status. Attention: the return response will be a list of URLs inside the data, not a list of documents.", + "default": false + }, + "limit": { + "type": "integer", + "description": "Maximum number of pages to crawl" + } + } + }, + "pageOptions": { + "type": "object", + "properties": { + "onlyMainContent": { + "type": "boolean", + "description": "Only return the main content of the page excluding headers, navs, footers, etc.", + "default": false + } + } + } + }, + "required": ["url"] + } + } + } + }, + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CrawlResponse" + } + } + } + }, + "402": { + "description": "Payment required" + }, + "429": { + "description": "Too many requests" + }, + "500": { + "description": "Server error" + } + } } - ] - } - \ No newline at end of file + }, + "/crawl/status/{jobId}": { + "get": { + "tags": ["Crawl"], + "summary": "Get the status of a crawl job", + "operationId": "getCrawlStatus", + "security": [ + { + "bearerAuth": [] + } + ], + "parameters": [ + { + "name": "jobId", + "in": "path", + "description": "ID of the crawl job", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "status": { + "type": "string", + "description": "Status of the job (completed, active, failed, paused)" + }, + "current": { + "type": "integer", + "description": "Current page number" + }, + "current_url": { + "type": "string", + "description": "Current URL being scraped" + }, + "current_step": { + "type": "string", + "description": "Current step in the process" + }, + "total": { + "type": "integer", + "description": "Total number of pages" + }, + "data": { + "type": "array", + "items": { + "$ref": "#/components/schemas/ScrapeResponse" + }, + "description": "Data returned from the job (null when it is in progress)" + } + } + } + } + } + }, + "402": { + "description": "Payment required" + }, + "429": { + "description": "Too many requests" + }, + "500": { + "description": "Server error" + } + } + } + } + }, + "components": { + "securitySchemes": { + "bearerAuth": { + "type": "http", + "scheme": "bearer" + } + }, + "schemas": { + "ScrapeResponse": { + "type": "object", + "properties": { + "success": { + "type": "boolean" + }, + "data": { + "type": "object", + "properties": { + "content": { + "type": "string" + }, + "markdown": { + "type": "string" + }, + "metadata": { + "type": "object", + "properties": { + "title": { + "type": "string" + }, + "description": { + "type": "string" + }, + "language": { + "type": "string", + "nullable": true + }, + "sourceURL": { + "type": "string", + "format": "uri" + } + } + } + } + } + } + }, + "CrawlResponse": { + "type": "object", + "properties": { + "jobId": { + "type": "string" + } + } + } + } + }, + "security": [ + { + "bearerAuth": [] + } + ] +} From 18450b5f9a51c20fa7464930c2065c0de478ae37 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 22 Apr 2024 12:42:46 -0700 Subject: [PATCH 082/102] Nick: tutorials --- tutorials/data-extraction-using-llms.mdx | 95 ++++++++++++++++++++++++ tutorials/rag-llama3.mdx | 91 +++++++++++++++++++++++ 2 files changed, 186 insertions(+) create mode 100644 tutorials/data-extraction-using-llms.mdx create mode 100644 tutorials/rag-llama3.mdx diff --git a/tutorials/data-extraction-using-llms.mdx b/tutorials/data-extraction-using-llms.mdx new file mode 100644 index 0000000..554e787 --- /dev/null +++ b/tutorials/data-extraction-using-llms.mdx @@ -0,0 +1,95 @@ +--- +title: "Extract website data using LLMs" +description: "Learn how to use Firecrawl and Groq to extract structured data from a web page in a few lines of code." +'og:image': "/images/og.png" +'twitter:image': "/images/og.png" +--- + +## Setup + +Install our python dependencies, including groq and firecrawl-py. + +```bash +pip install groq firecrawl-py +``` + +## Getting your Groq and Firecrawl API Keys + +To use Groq and Firecrawl, you will need to get your API keys. You can get your Groq API key from [here](https://groq.com) and your Firecrawl API key from [here](https://firecrawl.dev). + +## Load website with Firecrawl + +To be able to get all the data from a website page and make sure it is in the cleanest format, we will use [FireCrawl](https://firecrawl.dev). It handles by-passing JS-blocked websites, extracting the main content, and outputting in a LLM-readable format for increased accuracy. + +Here is how we will scrape a website url using Firecrawl. We will also set a `pageOptions` for only extracting the main content (`onlyMainContent: True`) of the website page - excluding the navs, footers, etc. + +```python +from firecrawl import FirecrawlApp # Importing the FireCrawlLoader + +url = "https://about.fb.com/news/2024/04/introducing-our-open-mixed-reality-ecosystem/" + +firecrawl = FirecrawlApp( + api_key="fc-YOUR_FIRECRAWL_API_KEY", +) +page_content = firecrawl.scrape_url(url=url, # Target URL to crawl + params={ + "pageOptions":{ + "onlyMainContent": True # Ignore navs, footers, etc. + } + }) +print(page_content) +``` + +Perfect, now we have clean data from the website - ready to be fed to the LLM for data extraction. + +## Extraction and Generation + +Now that we have the website data, let's use Groq to pull out the information we need. We'll use Groq Llama 3 model in JSON mode and pick out certain fields from the page content. + +We are using LLama 3 8b model for this example. Feel free to use bigger models for improved results. + +```python +import json +from groq import Groq + +client = Groq( + api_key="gsk_YOUR_GROQ_API_KEY", # Note: Replace 'API_KEY' with your actual Groq API key +) + +# Here we define the fields we want to extract from the page content +extract = ["summary","date","companies_building_with_quest","title_of_the_article","people_testimonials"] + +completion = client.chat.completions.create( + model="llama3-8b-8192", + messages=[ + { + "role": "system", + "content": "You are a legal advisor who extracts information from documents in JSON." + }, + { + "role": "user", + # Here we pass the page content and the fields we want to extract + "content": f"Extract the following information from the provided documentation:\Page content:\n\n{page_content}\n\nInformation to extract: {extract}" + } + ], + temperature=0, + max_tokens=1024, + top_p=1, + stream=False, + stop=None, + # We set the response format to JSON object + response_format={"type": "json_object"} +) + + +# Pretty print the JSON response +dataExtracted = json.dumps(str(completion.choices[0].message.content), indent=4) + +print(dataExtracted) +``` + +## And Voila! + +You have now built a data extraction bot using Groq and Firecrawl. You can now use this bot to extract structured data from any website. + +If you have any questions or need help, feel free to reach out to us at [Firecrawl](https://firecrawl.dev). \ No newline at end of file diff --git a/tutorials/rag-llama3.mdx b/tutorials/rag-llama3.mdx new file mode 100644 index 0000000..ae9c48f --- /dev/null +++ b/tutorials/rag-llama3.mdx @@ -0,0 +1,91 @@ +--- +title: "Build a 'Chat with website' using Groq Llama 3" +description: "Learn how to use Firecrawl, Groq Llama 3, and Langchain to build a 'Chat with your website' bot." +--- + +## Setup + +Install our python dependencies, including langchain, groq, faiss, ollama, and firecrawl-py. + +```bash +pip install --upgrade --quiet langchain langchain-community groq faiss-cpu ollama firecrawl-py +``` + +We will be using Ollama for the embeddings, you can download Ollama [here](https://ollama.com/). But feel free to use any other embeddings you prefer. + +## Load website with Firecrawl + +To be able to get all the data from a website and make sure it is in the cleanest format, we will use FireCrawl. Firecrawl integrates very easily with Langchain as a document loader. + +Here is how you can load a website with FireCrawl: + +```python +from langchain_community.document_loaders import FireCrawlLoader # Importing the FireCrawlLoader + +url = "https://firecrawl.dev" +loader = FireCrawlLoader( + api_key="fc-YOUR_API_KEY", # Note: Replace 'YOUR_API_KEY' with your actual FireCrawl API key + url=url, # Target URL to crawl + mode="crawl" # Mode set to 'crawl' to crawl all accessible subpages +) +docs = loader.load() +``` + +## Setup the Vectorstore + +Next, we will setup the vectorstore. The vectorstore is a data structure that allows us to store and query embeddings. We will use the Ollama embeddings and the FAISS vectorstore. +We split the documents into chunks of 1000 characters each, with a 200 character overlap. This is to ensure that the chunks are not too small and not too big - and that it can fit into the LLM model when we query it. + +```python +from langchain_community.embeddings import OllamaEmbeddings +from langchain_text_splitters import RecursiveCharacterTextSplitter +from langchain_community.vectorstores import FAISS + +text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200) +splits = text_splitter.split_documents(docs) +vectorstore = FAISS.from_documents(documents=splits, embedding=OllamaEmbeddings()) +``` + +## Retrieval and Generation + +Now that our documents are loaded and the vectorstore is setup, we can, based on user's question, do a similarity search to retrieve the most relevant documents. That way we can use these documents to be fed to the LLM model. + + +```python +question = "What is firecrawl?" +docs = vectorstore.similarity_search(query=question) +``` + +## Generation +Last but not least, you can use the Groq to generate a response to a question based on the documents we have loaded. + +```python +from groq import Groq + +client = Groq( + api_key="YOUR_GROQ_API_KEY", +) + +completion = client.chat.completions.create( + model="llama3-8b-8192", + messages=[ + { + "role": "user", + "content": f"You are a friendly assistant. Your job is to answer the users question based on the documentation provided below:\nDocs:\n\n{docs}\n\nQuestion: {question}" + } + ], + temperature=1, + max_tokens=1024, + top_p=1, + stream=False, + stop=None, +) + +print(completion.choices[0].message) +``` + +## And Voila! + +You have now built a 'Chat with your website' bot using Llama 3, Groq Llama 3, Langchain, and Firecrawl. You can now use this bot to answer questions based on the documentation of your website. + +If you have any questions or need help, feel free to reach out to us at [Firecrawl](https://firecrawl.dev). \ No newline at end of file From b33133f80bf8a25decf21832b4cde5d26166abd5 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 22 Apr 2024 12:45:44 -0700 Subject: [PATCH 083/102] Update data-extraction-using-llms.mdx --- tutorials/data-extraction-using-llms.mdx | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/tutorials/data-extraction-using-llms.mdx b/tutorials/data-extraction-using-llms.mdx index 554e787..879c1e7 100644 --- a/tutorials/data-extraction-using-llms.mdx +++ b/tutorials/data-extraction-using-llms.mdx @@ -1,9 +1,6 @@ ---- -title: "Extract website data using LLMs" -description: "Learn how to use Firecrawl and Groq to extract structured data from a web page in a few lines of code." -'og:image': "/images/og.png" -'twitter:image': "/images/og.png" ---- +# Extract website data using LLMs + +Learn how to use Firecrawl and Groq to extract structured data from a web page in a few lines of code. With Groq fast inference speeds and firecrawl parellization, you can extract data from web pages *super* fast. ## Setup @@ -92,4 +89,4 @@ print(dataExtracted) You have now built a data extraction bot using Groq and Firecrawl. You can now use this bot to extract structured data from any website. -If you have any questions or need help, feel free to reach out to us at [Firecrawl](https://firecrawl.dev). \ No newline at end of file +If you have any questions or need help, feel free to reach out to us at [Firecrawl](https://firecrawl.dev). From bf2df7a8535c02bc457bd0d4cbdcde6ea3a2d8be Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 23 Apr 2024 10:55:40 -0700 Subject: [PATCH 084/102] Nick: fix js-sdk --- apps/js-sdk/firecrawl/build/index.js | 2 +- apps/js-sdk/firecrawl/package.json | 4 +++- apps/js-sdk/firecrawl/src/index.ts | 4 ++-- apps/js-sdk/firecrawl/types/index.d.ts | 4 ++-- 4 files changed, 8 insertions(+), 6 deletions(-) diff --git a/apps/js-sdk/firecrawl/build/index.js b/apps/js-sdk/firecrawl/build/index.js index 25ae999..1b23bb5 100644 --- a/apps/js-sdk/firecrawl/build/index.js +++ b/apps/js-sdk/firecrawl/build/index.js @@ -67,7 +67,7 @@ export default class FirecrawlApp { * @param {Params | null} params - Additional parameters for the crawl request. * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete. * @param {number} timeout - Timeout in seconds for job status checks. - * @returns {Promise} The response from the crawl operation. + * @returns {Promise} The response from the crawl operation. */ crawlUrl(url_1) { return __awaiter(this, arguments, void 0, function* (url, params = null, waitUntilDone = true, timeout = 2) { diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 811f87f..566fdde 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,11 +1,13 @@ { "name": "@mendable/firecrawl-js", - "version": "0.0.11", + "version": "0.0.13", "description": "JavaScript SDK for Firecrawl API", "main": "build/index.js", "types": "types/index.d.ts", "type": "module", "scripts": { + "build": "tsc", + "publish":"npm run build && npm publish --access public", "test": "echo \"Error: no test specified\" && exit 1" }, "repository": { diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index be55066..6545600 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -102,9 +102,9 @@ export default class FirecrawlApp { * @param {Params | null} params - Additional parameters for the crawl request. * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete. * @param {number} timeout - Timeout in seconds for job status checks. - * @returns {Promise} The response from the crawl operation. + * @returns {Promise} The response from the crawl operation. */ - async crawlUrl(url: string, params: Params | null = null, waitUntilDone: boolean = true, timeout: number = 2): Promise { + async crawlUrl(url: string, params: Params | null = null, waitUntilDone: boolean = true, timeout: number = 2): Promise { const headers = this.prepareHeaders(); let jsonData: Params = { url }; if (params) { diff --git a/apps/js-sdk/firecrawl/types/index.d.ts b/apps/js-sdk/firecrawl/types/index.d.ts index a9d04ba..be960f7 100644 --- a/apps/js-sdk/firecrawl/types/index.d.ts +++ b/apps/js-sdk/firecrawl/types/index.d.ts @@ -61,9 +61,9 @@ export default class FirecrawlApp { * @param {Params | null} params - Additional parameters for the crawl request. * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete. * @param {number} timeout - Timeout in seconds for job status checks. - * @returns {Promise} The response from the crawl operation. + * @returns {Promise} The response from the crawl operation. */ - crawlUrl(url: string, params?: Params | null, waitUntilDone?: boolean, timeout?: number): Promise; + crawlUrl(url: string, params?: Params | null, waitUntilDone?: boolean, timeout?: number): Promise; /** * Checks the status of a crawl job using the Firecrawl API. * @param {string} jobId - The job ID of the crawl operation. From 306cfe4ce1d6b13b574be02315a0b2b80cdc4344 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 23 Apr 2024 11:15:11 -0700 Subject: [PATCH 085/102] Nick: --- apps/api/package.json | 5 +++-- apps/api/pnpm-lock.yaml | 7 +++++++ apps/api/src/lib/html-to-markdown.ts | 4 +++- apps/api/src/scraper/WebScraper/single_url.ts | 1 + 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/apps/api/package.json b/apps/api/package.json index 8ae1609..07e3b7a 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -11,8 +11,8 @@ "start:dev": "nodemon --exec ts-node src/index.ts", "build": "tsc", "test": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_noAuth/*'", - "test:local-no-auth":"npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_withAuth/*'", - "test:prod":"npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_noAuth/*'", + "test:local-no-auth": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_withAuth/*'", + "test:prod": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_noAuth/*'", "workers": "nodemon --exec ts-node src/services/queue-worker.ts", "worker:production": "node dist/src/services/queue-worker.js", "mongo-docker": "docker run -d -p 2717:27017 -v ./mongo-data:/data/db --name mongodb mongo:latest", @@ -66,6 +66,7 @@ "glob": "^10.3.12", "gpt3-tokenizer": "^1.1.5", "ioredis": "^5.3.2", + "joplin-turndown-plugin-gfm": "^1.0.12", "keyword-extractor": "^0.0.25", "langchain": "^0.1.25", "languagedetect": "^2.0.0", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index df669d5..5298d2b 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -80,6 +80,9 @@ dependencies: ioredis: specifier: ^5.3.2 version: 5.3.2 + joplin-turndown-plugin-gfm: + specifier: ^1.0.12 + version: 1.0.12 keyword-extractor: specifier: ^0.0.25 version: 0.0.25 @@ -3923,6 +3926,10 @@ packages: - ts-node dev: true + /joplin-turndown-plugin-gfm@1.0.12: + resolution: {integrity: sha512-qL4+1iycQjZ1fs8zk3jSRk7cg3ROBUHk7GKtiLAQLFzLPKErnILUvz5DLszSQvz3s1sTjPbywLDISVUtBY6HaA==} + dev: false + /js-tiktoken@1.0.10: resolution: {integrity: sha512-ZoSxbGjvGyMT13x6ACo9ebhDha/0FHdKA+OsQcMOWcm1Zs7r90Rhk5lhERLzji+3rA7EKpXCgwXcM5fF3DMpdA==} dependencies: diff --git a/apps/api/src/lib/html-to-markdown.ts b/apps/api/src/lib/html-to-markdown.ts index 0fd8c93..e084f5e 100644 --- a/apps/api/src/lib/html-to-markdown.ts +++ b/apps/api/src/lib/html-to-markdown.ts @@ -1,6 +1,8 @@ + export function parseMarkdown(html: string) { var TurndownService = require("turndown"); - var turndownPluginGfm = require("turndown-plugin-gfm"); + var turndownPluginGfm = require('joplin-turndown-plugin-gfm') + const turndownService = new TurndownService(); turndownService.addRule("inlineLink", { diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index fbcd923..0f3cc38 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -142,6 +142,7 @@ export async function scrapSingleUrl( break; } let cleanedHtml = removeUnwantedElements(text, pageOptions); + return [await parseMarkdown(cleanedHtml), text]; }; From a680c7ce84985863607d1c10eacae481c28bd29a Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 23 Apr 2024 15:46:29 -0300 Subject: [PATCH 086/102] [Feat] Server health check + slack message --- apps/api/.env.example | 3 +- apps/api/requests.http | 11 ++++++- apps/api/src/index.ts | 70 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 82 insertions(+), 2 deletions(-) diff --git a/apps/api/.env.example b/apps/api/.env.example index 34e24b1..3cd40c1 100644 --- a/apps/api/.env.example +++ b/apps/api/.env.example @@ -21,4 +21,5 @@ OPENAI_API_KEY= # add for LLM dependednt features (image alt generation, etc.) BULL_AUTH_KEY= # LOGTAIL_KEY= # Use if you're configuring basic logging with logtail PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback -LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs \ No newline at end of file +LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs +SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages \ No newline at end of file diff --git a/apps/api/requests.http b/apps/api/requests.http index 2350136..751ba5e 100644 --- a/apps/api/requests.http +++ b/apps/api/requests.http @@ -49,4 +49,13 @@ content-type: application/json ### Check Job Status GET https://api.firecrawl.dev/v0/crawl/status/cfcb71ac-23a3-4da5-bd85-d4e58b871d66 -Authorization: Bearer \ No newline at end of file +Authorization: Bearer + +### Get Active Jobs Count +GET http://localhost:3002/serverHealthCheck +content-type: application/json + +### Notify Server Health Check +GET http://localhost:3002/serverHealthCheck/notify +content-type: application/json + diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index a2e5c51..6417f36 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -87,6 +87,76 @@ app.get(`/admin/${process.env.BULL_AUTH_KEY}/queues`, async (req, res) => { } }); +app.get(`/serverHealthCheck`, async (req, res) => { + try { + const webScraperQueue = getWebScraperQueue(); + const [activeJobs] = await Promise.all([ + webScraperQueue.getActiveCount(), + ]); + + const noActiveJobs = activeJobs === 0; + // 200 if no active jobs, 503 if there are active jobs + return res.status(noActiveJobs ? 200 : 500).json({ + activeJobs, + }); + } catch (error) { + console.error(error); + return res.status(500).json({ error: error.message }); + } +}); + +app.get('/serverHealthCheck/notify', async (req, res) => { + if (process.env.SLACK_WEBHOOK_URL) { + const treshold = 5; // The treshold value for the active jobs + const timeout = 60000; // 1 minute // The timeout value for the check in milliseconds + + const getActiveJobs = async () => { + const webScraperQueue = getWebScraperQueue(); + const [activeJobs] = await Promise.all([ + webScraperQueue.getActiveCount(), + ]); + + return activeJobs; + }; + + res.status(200).json({ message: "Check initiated" }); + + const checkActiveJobs = async () => { + try { + let activeJobs = await getActiveJobs(); + if (activeJobs >= treshold) { + setTimeout(async () => { + activeJobs = await getActiveJobs(); // Re-check the active jobs count + if (activeJobs >= treshold) { + const slackWebhookUrl = process.env.SLACK_WEBHOOK_URL; + const message = { + text: `⚠️ Warning: The number of active jobs (${activeJobs}) has exceeded the threshold (${treshold}) for more than ${timeout/60000} minute(s).`, + }; + + const response = await fetch(slackWebhookUrl, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + }, + body: JSON.stringify(message), + }) + + if (!response.ok) { + console.error('Failed to send Slack notification') + } + } + }, timeout); + } + } catch (error) { + console.error(error); + } + }; + + checkActiveJobs(); + } +}); + + app.get("/is-production", (req, res) => { res.send({ isProduction: global.isProduction }); }); From 9b01dc62817dca9488d890f0f58a5c4e654e7fa1 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 23 Apr 2024 16:07:22 -0300 Subject: [PATCH 087/102] Changed from active to waiting jobs --- apps/api/src/index.ts | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 6417f36..27e8713 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -90,14 +90,14 @@ app.get(`/admin/${process.env.BULL_AUTH_KEY}/queues`, async (req, res) => { app.get(`/serverHealthCheck`, async (req, res) => { try { const webScraperQueue = getWebScraperQueue(); - const [activeJobs] = await Promise.all([ - webScraperQueue.getActiveCount(), + const [waitingJobs] = await Promise.all([ + webScraperQueue.getWaitingCount(), ]); - const noActiveJobs = activeJobs === 0; + const noWaitingJobs = waitingJobs === 0; // 200 if no active jobs, 503 if there are active jobs - return res.status(noActiveJobs ? 200 : 500).json({ - activeJobs, + return res.status(noWaitingJobs ? 200 : 500).json({ + waitingJobs, }); } catch (error) { console.error(error); @@ -107,30 +107,31 @@ app.get(`/serverHealthCheck`, async (req, res) => { app.get('/serverHealthCheck/notify', async (req, res) => { if (process.env.SLACK_WEBHOOK_URL) { - const treshold = 5; // The treshold value for the active jobs + const treshold = 1; // The treshold value for the active jobs const timeout = 60000; // 1 minute // The timeout value for the check in milliseconds - const getActiveJobs = async () => { + const getWaitingJobsCount = async () => { const webScraperQueue = getWebScraperQueue(); - const [activeJobs] = await Promise.all([ - webScraperQueue.getActiveCount(), + const [waitingJobsCount] = await Promise.all([ + webScraperQueue.getWaitingCount(), ]); - return activeJobs; + return waitingJobsCount; }; res.status(200).json({ message: "Check initiated" }); - const checkActiveJobs = async () => { + const checkWaitingJobs = async () => { try { - let activeJobs = await getActiveJobs(); - if (activeJobs >= treshold) { + let waitingJobsCount = await getWaitingJobsCount(); + if (waitingJobsCount >= treshold) { setTimeout(async () => { - activeJobs = await getActiveJobs(); // Re-check the active jobs count - if (activeJobs >= treshold) { + // Re-check the waiting jobs count after the timeout + waitingJobsCount = await getWaitingJobsCount(); + if (waitingJobsCount >= treshold) { const slackWebhookUrl = process.env.SLACK_WEBHOOK_URL; const message = { - text: `⚠️ Warning: The number of active jobs (${activeJobs}) has exceeded the threshold (${treshold}) for more than ${timeout/60000} minute(s).`, + text: `⚠️ Warning: The number of active jobs (${waitingJobsCount}) has exceeded the threshold (${treshold}) for more than ${timeout/60000} minute(s).`, }; const response = await fetch(slackWebhookUrl, { @@ -152,7 +153,7 @@ app.get('/serverHealthCheck/notify', async (req, res) => { } }; - checkActiveJobs(); + checkWaitingJobs(); } }); From 849c0b6ebfcc0c7d0e202330c7df0d6260c4b1a0 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 23 Apr 2024 18:50:35 -0300 Subject: [PATCH 088/102] [Feat] Added blocklist for social media urls --- .../src/__tests__/e2e_noAuth/index.test.ts | 30 ++++++++++++++++ .../src/__tests__/e2e_withAuth/index.test.ts | 35 +++++++++++++++++++ apps/api/src/controllers/crawl.ts | 6 ++++ apps/api/src/controllers/crawlPreview.ts | 6 ++++ apps/api/src/controllers/scrape.ts | 5 +++ .../src/scraper/WebScraper/utils/blocklist.ts | 19 ++++++++++ 6 files changed, 101 insertions(+) create mode 100644 apps/api/src/scraper/WebScraper/utils/blocklist.ts diff --git a/apps/api/src/__tests__/e2e_noAuth/index.test.ts b/apps/api/src/__tests__/e2e_noAuth/index.test.ts index e0aca36..f76a8dc 100644 --- a/apps/api/src/__tests__/e2e_noAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_noAuth/index.test.ts @@ -55,6 +55,16 @@ describe("E2E Tests for API Routes with No Authentication", () => { expect(response.statusCode).not.toBe(401); }); + it("should return an error for a blocklisted URL without requiring authorization", async () => { + const blocklistedUrl = "https://facebook.com/fake-test"; + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Content-Type", "application/json") + .send({ url: blocklistedUrl }); + expect(response.statusCode).toBe(403); + expect(response.body.error).toContain("URL is blocked due to policy restrictions"); + }); + it("should return a successful response", async () => { const response = await request(TEST_URL) .post("/v0/scrape") @@ -70,6 +80,16 @@ describe("E2E Tests for API Routes with No Authentication", () => { expect(response.statusCode).not.toBe(401); }); + it("should return an error for a blocklisted URL", async () => { + const blocklistedUrl = "https://twitter.com/fake-test"; + const response = await request(TEST_URL) + .post("/v0/crawl") + .set("Content-Type", "application/json") + .send({ url: blocklistedUrl }); + expect(response.statusCode).toBe(403); + expect(response.body.error).toContain("URL is blocked due to policy restrictions"); + }); + it("should return a successful response", async () => { const response = await request(TEST_URL) .post("/v0/crawl") @@ -89,6 +109,16 @@ describe("E2E Tests for API Routes with No Authentication", () => { expect(response.statusCode).not.toBe(401); }); + it("should return an error for a blocklisted URL", async () => { + const blocklistedUrl = "https://instagram.com/fake-test"; + const response = await request(TEST_URL) + .post("/v0/crawlWebsitePreview") + .set("Content-Type", "application/json") + .send({ url: blocklistedUrl }); + expect(response.statusCode).toBe(403); + expect(response.body.error).toContain("URL is blocked due to policy restrictions"); + }); + it("should return a successful response", async () => { const response = await request(TEST_URL) .post("/v0/crawlWebsitePreview") diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index ba01a7c..578a033 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -47,6 +47,18 @@ const TEST_URL = "http://127.0.0.1:3002"; .send({ url: "https://firecrawl.dev" }); expect(response.statusCode).toBe(401); }); + + it("should return an error for a blocklisted URL", async () => { + const blocklistedUrl = "https://facebook.com/fake-test"; + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: blocklistedUrl }); + expect(response.statusCode).toBe(403); + expect(response.body.error).toContain("URL is blocked due to policy restrictions"); + }); + it("should return a successful response with a valid preview token", async () => { const response = await request(TEST_URL) .post("/v0/scrape") @@ -86,6 +98,17 @@ const TEST_URL = "http://127.0.0.1:3002"; expect(response.statusCode).toBe(401); }); + it("should return an error for a blocklisted URL", async () => { + const blocklistedUrl = "https://twitter.com/fake-test"; + const response = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: blocklistedUrl }); + expect(response.statusCode).toBe(403); + expect(response.body.error).toContain("URL is blocked due to policy restrictions"); + }); + it("should return a successful response with a valid API key", async () => { const response = await request(TEST_URL) .post("/v0/crawl") @@ -99,6 +122,7 @@ const TEST_URL = "http://127.0.0.1:3002"; ); }); + // Additional tests for insufficient credits? }); @@ -119,6 +143,17 @@ const TEST_URL = "http://127.0.0.1:3002"; expect(response.statusCode).toBe(401); }); + it("should return an error for a blocklisted URL", async () => { + const blocklistedUrl = "https://instagram.com/fake-test"; + const response = await request(TEST_URL) + .post("/v0/crawlWebsitePreview") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: blocklistedUrl }); + expect(response.statusCode).toBe(403); + expect(response.body.error).toContain("URL is blocked due to policy restrictions"); + }); + it("should return a successful response with a valid API key", async () => { const response = await request(TEST_URL) .post("/v0/crawlWebsitePreview") diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts index bd3feca..9301c4d 100644 --- a/apps/api/src/controllers/crawl.ts +++ b/apps/api/src/controllers/crawl.ts @@ -5,6 +5,7 @@ import { checkTeamCredits } from "../../src/services/billing/credit_billing"; import { authenticateUser } from "./auth"; import { RateLimiterMode } from "../../src/types"; import { addWebScraperJob } from "../../src/services/queue-jobs"; +import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist"; export async function crawlController(req: Request, res: Response) { try { @@ -27,6 +28,11 @@ export async function crawlController(req: Request, res: Response) { if (!url) { return res.status(400).json({ error: "Url is required" }); } + + if (isUrlBlocked(url)) { + return res.status(403).json({ error: "URL is blocked due to policy restrictions" }); + } + const mode = req.body.mode ?? "crawl"; const crawlerOptions = req.body.crawlerOptions ?? {}; const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; diff --git a/apps/api/src/controllers/crawlPreview.ts b/apps/api/src/controllers/crawlPreview.ts index 3f28ef6..4c40197 100644 --- a/apps/api/src/controllers/crawlPreview.ts +++ b/apps/api/src/controllers/crawlPreview.ts @@ -2,6 +2,7 @@ import { Request, Response } from "express"; import { authenticateUser } from "./auth"; import { RateLimiterMode } from "../../src/types"; import { addWebScraperJob } from "../../src/services/queue-jobs"; +import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist"; export async function crawlPreviewController(req: Request, res: Response) { try { @@ -18,6 +19,11 @@ export async function crawlPreviewController(req: Request, res: Response) { if (!url) { return res.status(400).json({ error: "Url is required" }); } + + if (isUrlBlocked(url)) { + return res.status(403).json({ error: "URL is blocked due to policy restrictions" }); + } + const mode = req.body.mode ?? "crawl"; const crawlerOptions = req.body.crawlerOptions ?? {}; const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index be70800..d24c882 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -5,6 +5,7 @@ import { authenticateUser } from "./auth"; import { RateLimiterMode } from "../types"; import { logJob } from "../services/logging/log_job"; import { Document } from "../lib/entities"; +import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function export async function scrapeHelper( req: Request, @@ -22,6 +23,10 @@ export async function scrapeHelper( return { success: false, error: "Url is required", returnCode: 400 }; } + if (isUrlBlocked(url)) { + return { success: false, error: "URL is blocked due to policy restrictions", returnCode: 403 }; + } + const a = new WebScraperDataProvider(); await a.setOptions({ mode: "single_urls", diff --git a/apps/api/src/scraper/WebScraper/utils/blocklist.ts b/apps/api/src/scraper/WebScraper/utils/blocklist.ts new file mode 100644 index 0000000..0eef332 --- /dev/null +++ b/apps/api/src/scraper/WebScraper/utils/blocklist.ts @@ -0,0 +1,19 @@ +const socialMediaBlocklist = [ + 'facebook.com', + 'twitter.com', + 'instagram.com', + 'linkedin.com', + 'pinterest.com', + 'snapchat.com', + 'tiktok.com', + 'reddit.com', + 'tumblr.com', + 'flickr.com', + 'whatsapp.com', + 'wechat.com', + 'telegram.org', +]; + +export function isUrlBlocked(url: string): boolean { + return socialMediaBlocklist.some(domain => url.includes(domain)); +} From 0146157876b0f59690bde22df8b38a8730ce2742 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 23 Apr 2024 15:28:32 -0700 Subject: [PATCH 089/102] Nick: mvp --- apps/api/src/controllers/search.ts | 136 ++++++++++++++++++ apps/api/src/lib/entities.ts | 2 + apps/api/src/routes/v0.ts | 5 + apps/api/src/scraper/WebScraper/single_url.ts | 11 +- .../src/scraper/WebScraper/utils/metadata.ts | 37 ++++- apps/api/src/search/googlesearch.ts | 134 +++++++++++++++++ apps/api/src/types.ts | 2 + 7 files changed, 320 insertions(+), 7 deletions(-) create mode 100644 apps/api/src/controllers/search.ts create mode 100644 apps/api/src/search/googlesearch.ts diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts new file mode 100644 index 0000000..7cd5209 --- /dev/null +++ b/apps/api/src/controllers/search.ts @@ -0,0 +1,136 @@ +import { Request, Response } from "express"; +import { WebScraperDataProvider } from "../scraper/WebScraper"; +import { billTeam, checkTeamCredits } from "../services/billing/credit_billing"; +import { authenticateUser } from "./auth"; +import { RateLimiterMode } from "../types"; +import { logJob } from "../services/logging/log_job"; +import { PageOptions } from "../lib/entities"; +import { search } from "../search/googlesearch"; + +export async function searchHelper( + req: Request, + team_id: string, + crawlerOptions: any, + pageOptions: PageOptions +): Promise<{ + success: boolean; + error?: string; + data?: any; + returnCode: number; +}> { + const query = req.body.query; + if (!query) { + return { success: false, error: "Query is required", returnCode: 400 }; + } + + const res = await search(query, true, 7); + + let justSearch = pageOptions.fetchPageContent === false; + + if(justSearch){ + return { success: true, data: res, returnCode: 200 }; + } + + if (res.results.length === 0) { + return { success: true, error: "No search results found", returnCode: 200 }; + } + + const a = new WebScraperDataProvider(); + await a.setOptions({ + mode: "single_urls", + urls: res.results.map((r) => r.url), + crawlerOptions: { + ...crawlerOptions, + }, + pageOptions: {...pageOptions, onlyMainContent: pageOptions?.onlyMainContent ?? true, fetchPageContent: pageOptions?.fetchPageContent ?? true, fallback:false}, + }); + + const docs = await a.getDocuments(true); + if (docs.length === 0) + { + return { success: true, error: "No search results found", returnCode: 200 }; + } + + + // make sure doc.content is not empty + const filteredDocs = docs.filter( + (doc: { content?: string }) => doc.content && doc.content.trim().length > 0 + ); + + if (filteredDocs.length === 0) { + return { success: true, error: "No page found", returnCode: 200 }; + } + + const { success, credit_usage } = await billTeam( + team_id, + filteredDocs.length + ); + if (!success) { + return { + success: false, + error: + "Failed to bill team. Insufficient credits or subscription not found.", + returnCode: 402, + }; + } + + return { + success: true, + data: filteredDocs, + returnCode: 200, + }; +} + +export async function searchController(req: Request, res: Response) { + try { + // make sure to authenticate user first, Bearer + const { success, team_id, error, status } = await authenticateUser( + req, + res, + RateLimiterMode.Search + ); + if (!success) { + return res.status(status).json({ error }); + } + const crawlerOptions = req.body.crawlerOptions ?? {}; + const pageOptions = req.body.pageOptions ?? { onlyMainContent: true, fetchPageContent: true, fallback: false}; + const origin = req.body.origin ?? "api"; + + try { + const { success: creditsCheckSuccess, message: creditsCheckMessage } = + await checkTeamCredits(team_id, 1); + if (!creditsCheckSuccess) { + return res.status(402).json({ error: "Insufficient credits" }); + } + } catch (error) { + console.error(error); + return res.status(500).json({ error: "Internal server error" }); + } + const startTime = new Date().getTime(); + const result = await searchHelper( + req, + team_id, + crawlerOptions, + pageOptions + ); + const endTime = new Date().getTime(); + const timeTakenInSeconds = (endTime - startTime) / 1000; + logJob({ + success: result.success, + message: result.error, + num_docs: 1, + docs: [result.data], + time_taken: timeTakenInSeconds, + team_id: team_id, + mode: "search", + url: req.body.url, + crawlerOptions: crawlerOptions, + pageOptions: pageOptions, + origin: origin, + }); + return res.status(result.returnCode).json(result); + } catch (error) { + console.error(error); + return res.status(500).json({ error: error.message }); + } +} diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index e261dd4..07f07e4 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -11,6 +11,8 @@ export interface Progress { export type PageOptions = { onlyMainContent?: boolean; + fallback?: boolean; + fetchPageContent?: boolean; }; export type WebScraperOptions = { urls: string[]; diff --git a/apps/api/src/routes/v0.ts b/apps/api/src/routes/v0.ts index 023282a..f84b974 100644 --- a/apps/api/src/routes/v0.ts +++ b/apps/api/src/routes/v0.ts @@ -4,6 +4,7 @@ import { crawlStatusController } from "../../src/controllers/crawl-status"; import { scrapeController } from "../../src/controllers/scrape"; import { crawlPreviewController } from "../../src/controllers/crawlPreview"; import { crawlJobStatusPreviewController } from "../../src/controllers/status"; +import { searchController } from "../../src/controllers/search"; export const v0Router = express.Router(); @@ -12,3 +13,7 @@ v0Router.post("/v0/crawl", crawlController); v0Router.post("/v0/crawlWebsitePreview", crawlPreviewController); v0Router.get("/v0/crawl/status/:jobId", crawlStatusController); v0Router.get("/v0/checkJobStatus/:jobId", crawlJobStatusPreviewController); + +// Search routes +v0Router.post("/v0/search", searchController); + diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 0f3cc38..fcbb688 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -4,9 +4,7 @@ import { extractMetadata } from "./utils/metadata"; import dotenv from "dotenv"; import { Document, PageOptions } from "../../lib/entities"; import { parseMarkdown } from "../../lib/html-to-markdown"; -import { parseTablesToMarkdown } from "./utils/parseTable"; import { excludeNonMainTags } from "./utils/excludeTags"; -// import puppeteer from "puppeteer"; dotenv.config(); @@ -155,6 +153,15 @@ export async function scrapSingleUrl( // } let [text, html] = await attemptScraping(urlToScrap, "scrapingBee"); + if(pageOptions.fallback === false){ + const soup = cheerio.load(html); + const metadata = extractMetadata(soup, urlToScrap); + return { + content: text, + markdown: text, + metadata: { ...metadata, sourceURL: urlToScrap }, + } as Document; + } if (!text || text.length < 100) { console.log("Falling back to playwright"); [text, html] = await attemptScraping(urlToScrap, "playwright"); diff --git a/apps/api/src/scraper/WebScraper/utils/metadata.ts b/apps/api/src/scraper/WebScraper/utils/metadata.ts index ef883c3..ddaf1e8 100644 --- a/apps/api/src/scraper/WebScraper/utils/metadata.ts +++ b/apps/api/src/scraper/WebScraper/utils/metadata.ts @@ -1,4 +1,3 @@ -// import * as cheerio from 'cheerio'; import { CheerioAPI } from "cheerio"; interface Metadata { title?: string; @@ -8,6 +7,14 @@ interface Metadata { robots?: string; ogTitle?: string; ogDescription?: string; + ogUrl?: string; + ogImage?: string; + ogAudio?: string; + ogDeterminer?: string; + ogLocale?: string; + ogLocaleAlternate?: string[]; + ogSiteName?: string; + ogVideo?: string; dctermsCreated?: string; dcDateCreated?: string; dcDate?: string; @@ -17,7 +24,6 @@ interface Metadata { dctermsSubject?: string; dcSubject?: string; dcDescription?: string; - ogImage?: string; dctermsKeywords?: string; modifiedTime?: string; publishedTime?: string; @@ -33,6 +39,14 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata { let robots: string | null = null; let ogTitle: string | null = null; let ogDescription: string | null = null; + let ogUrl: string | null = null; + let ogImage: string | null = null; + let ogAudio: string | null = null; + let ogDeterminer: string | null = null; + let ogLocale: string | null = null; + let ogLocaleAlternate: string[] | null = null; + let ogSiteName: string | null = null; + let ogVideo: string | null = null; let dctermsCreated: string | null = null; let dcDateCreated: string | null = null; let dcDate: string | null = null; @@ -42,7 +56,6 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata { let dctermsSubject: string | null = null; let dcSubject: string | null = null; let dcDescription: string | null = null; - let ogImage: string | null = null; let dctermsKeywords: string | null = null; let modifiedTime: string | null = null; let publishedTime: string | null = null; @@ -62,11 +75,18 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata { robots = soup('meta[name="robots"]').attr("content") || null; ogTitle = soup('meta[property="og:title"]').attr("content") || null; ogDescription = soup('meta[property="og:description"]').attr("content") || null; + ogUrl = soup('meta[property="og:url"]').attr("content") || null; + ogImage = soup('meta[property="og:image"]').attr("content") || null; + ogAudio = soup('meta[property="og:audio"]').attr("content") || null; + ogDeterminer = soup('meta[property="og:determiner"]').attr("content") || null; + ogLocale = soup('meta[property="og:locale"]').attr("content") || null; + ogLocaleAlternate = soup('meta[property="og:locale:alternate"]').map((i, el) => soup(el).attr("content")).get() || null; + ogSiteName = soup('meta[property="og:site_name"]').attr("content") || null; + ogVideo = soup('meta[property="og:video"]').attr("content") || null; articleSection = soup('meta[name="article:section"]').attr("content") || null; articleTag = soup('meta[name="article:tag"]').attr("content") || null; publishedTime = soup('meta[property="article:published_time"]').attr("content") || null; modifiedTime = soup('meta[property="article:modified_time"]').attr("content") || null; - ogImage = soup('meta[property="og:image"]').attr("content") || null; dctermsKeywords = soup('meta[name="dcterms.keywords"]').attr("content") || null; dcDescription = soup('meta[name="dc.description"]').attr("content") || null; dcSubject = soup('meta[name="dc.subject"]').attr("content") || null; @@ -90,6 +110,14 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata { ...(robots ? { robots } : {}), ...(ogTitle ? { ogTitle } : {}), ...(ogDescription ? { ogDescription } : {}), + ...(ogUrl ? { ogUrl } : {}), + ...(ogImage ? { ogImage } : {}), + ...(ogAudio ? { ogAudio } : {}), + ...(ogDeterminer ? { ogDeterminer } : {}), + ...(ogLocale ? { ogLocale } : {}), + ...(ogLocaleAlternate ? { ogLocaleAlternate } : {}), + ...(ogSiteName ? { ogSiteName } : {}), + ...(ogVideo ? { ogVideo } : {}), ...(dctermsCreated ? { dctermsCreated } : {}), ...(dcDateCreated ? { dcDateCreated } : {}), ...(dcDate ? { dcDate } : {}), @@ -99,7 +127,6 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata { ...(dctermsSubject ? { dctermsSubject } : {}), ...(dcSubject ? { dcSubject } : {}), ...(dcDescription ? { dcDescription } : {}), - ...(ogImage ? { ogImage } : {}), ...(dctermsKeywords ? { dctermsKeywords } : {}), ...(modifiedTime ? { modifiedTime } : {}), ...(publishedTime ? { publishedTime } : {}), diff --git a/apps/api/src/search/googlesearch.ts b/apps/api/src/search/googlesearch.ts new file mode 100644 index 0000000..fd3b645 --- /dev/null +++ b/apps/api/src/search/googlesearch.ts @@ -0,0 +1,134 @@ +import axios from 'axios'; +import * as cheerio from 'cheerio'; +import * as querystring from 'querystring'; +import { ScrapingBeeClient } from 'scrapingbee'; + +const _useragent_list = [ + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36', + 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0' +]; + +function get_useragent(): string { + return _useragent_list[Math.floor(Math.random() * _useragent_list.length)]; +} + +async function _req(term: string, results: number, lang: string, start: number, proxies: any, timeout: number) { + const resp = await axios.get("https://www.google.com/search", { + headers: { + "User-Agent": get_useragent() + }, + params: { + "q": term, + "num": results + 2, // Prevents multiple requests + "hl": lang, + }, + proxy: proxies, + timeout: timeout, + }); + return resp; +} + +class SearchResult { + url: string; + title: string; + description: string; + + constructor(url: string, title: string, description: string) { + this.url = url; + this.title = title; + this.description = description; + } + + toString(): string { + return `SearchResult(url=${this.url}, title=${this.title}, description=${this.description})`; + } +} + +export async function search(term: string, advanced = false, num_results = 7, lang = "en", proxy = null, sleep_interval = 0, timeout = 5000) { + const escaped_term = querystring.escape(term); + + let proxies = null; + if (proxy) { + if (proxy.startsWith("https")) { + proxies = {"https": proxy}; + } else { + proxies = {"http": proxy}; + } + } + + // const response = await _req_scraping_bee(escaped_term, num_results, lang); + // const $ = cheerio.load(response); + + // const knowledgeGraphElement = $("div.kno-rdesc"); + // console.log(knowledgeGraphElement); + // console.log(knowledgeGraphElement.html()); + + // let knowledgeGraph = null; + // if (knowledgeGraphElement.length > 0) { + // console.log("Knowledge Graph found"); + // const title = knowledgeGraphElement.find("h2").text(); + // const type = knowledgeGraphElement.find("div[data-attrid='subtitle']").text(); + // const website = knowledgeGraphElement.find("a[data-ved]").attr("href"); + // const imageUrl = knowledgeGraphElement.find("g-img img").attr("src"); + // const description = knowledgeGraphElement.find("div[data-attrid='description'] span").text(); + // const descriptionSource = knowledgeGraphElement.find("div[data-attrid='description'] a").text(); + // const descriptionLink = knowledgeGraphElement.find("div[data-attrid='description'] a").attr("href"); + // const attributes = {}; + // knowledgeGraphElement.find("div[data-attrid='kc:/common:sideways']").each((index, element) => { + // const attributeKey = $(element).find("span[data-attrid]").text(); + // const attributeValue = $(element).find("span[data-log-string]").text(); + // attributes[attributeKey] = attributeValue; + // }); + // knowledgeGraph = { + // "title": title, + // "type": type, + // "website": website, + // "imageUrl": imageUrl, + // "description": description, + // "descriptionSource": descriptionSource, + // "descriptionLink": descriptionLink, + // "attributes": attributes + // }; + // } + + let start = 0; + let results = []; + while (start < num_results) { + const resp = await _req(escaped_term, num_results - start, lang, start, proxies, timeout); + const $ = cheerio.load(resp.data); + const result_block = $("div.g"); + if (result_block.length === 0) { + start += 1; + } + result_block.each((index, element) => { + const linkElement = $(element).find("a"); + const link = linkElement && linkElement.attr("href") ? linkElement.attr("href") : null; + const title = $(element).find("h3"); + const ogImage = $(element).find("img").eq(1).attr("src"); + const description_box = $(element).find("div[style='-webkit-line-clamp:2']"); + const answerBox = $(element).find(".mod").text(); + if (description_box) { + const description = description_box.text(); + if (link && title && description) { + start += 1; + if (advanced) { + results.push(new SearchResult(link, title.text(), description)); + } else { + results.push(link); + } + } + } + }); + await new Promise(resolve => setTimeout(resolve, sleep_interval * 1000)); + + if (start === 0) { + return {results: []}; + } + } + return {results: results}; +} diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index 5d778a2..c65140c 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -44,6 +44,8 @@ export enum RateLimiterMode { CrawlStatus = "crawl-status", Scrape = "scrape", Preview = "preview", + Search = "search", + } export interface AuthResponse { From 5e3e2ec966e4c28120f52c037a9df8e93c58ff9b Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 23 Apr 2024 15:44:11 -0700 Subject: [PATCH 090/102] Nick: --- apps/api/src/controllers/search.ts | 59 ++++++++++++++++++------------ apps/api/src/lib/entities.ts | 5 +++ 2 files changed, 41 insertions(+), 23 deletions(-) diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts index 7cd5209..bc6659b 100644 --- a/apps/api/src/controllers/search.ts +++ b/apps/api/src/controllers/search.ts @@ -4,14 +4,15 @@ import { billTeam, checkTeamCredits } from "../services/billing/credit_billing"; import { authenticateUser } from "./auth"; import { RateLimiterMode } from "../types"; import { logJob } from "../services/logging/log_job"; -import { PageOptions } from "../lib/entities"; +import { PageOptions, SearchOptions } from "../lib/entities"; import { search } from "../search/googlesearch"; export async function searchHelper( req: Request, team_id: string, crawlerOptions: any, - pageOptions: PageOptions + pageOptions: PageOptions, + searchOptions: SearchOptions ): Promise<{ success: boolean; error?: string; @@ -19,39 +20,44 @@ export async function searchHelper( returnCode: number; }> { const query = req.body.query; + const advanced = false; if (!query) { return { success: false, error: "Query is required", returnCode: 400 }; } - const res = await search(query, true, 7); + const res = await search(query, advanced, searchOptions.limit ?? 7); let justSearch = pageOptions.fetchPageContent === false; - if(justSearch){ + if (justSearch) { return { success: true, data: res, returnCode: 200 }; } if (res.results.length === 0) { return { success: true, error: "No search results found", returnCode: 200 }; } + console.log(res.results); const a = new WebScraperDataProvider(); await a.setOptions({ mode: "single_urls", - urls: res.results.map((r) => r.url), + urls: res.results.map((r) => (!advanced ? r : r.url)), crawlerOptions: { ...crawlerOptions, }, - pageOptions: {...pageOptions, onlyMainContent: pageOptions?.onlyMainContent ?? true, fetchPageContent: pageOptions?.fetchPageContent ?? true, fallback:false}, + pageOptions: { + ...pageOptions, + onlyMainContent: pageOptions?.onlyMainContent ?? true, + fetchPageContent: pageOptions?.fetchPageContent ?? true, + fallback: false, + }, }); const docs = await a.getDocuments(true); - if (docs.length === 0) - { + if (docs.length === 0) { return { success: true, error: "No search results found", returnCode: 200 }; } - // make sure doc.content is not empty const filteredDocs = docs.filter( (doc: { content?: string }) => doc.content && doc.content.trim().length > 0 @@ -61,18 +67,18 @@ export async function searchHelper( return { success: true, error: "No page found", returnCode: 200 }; } - const { success, credit_usage } = await billTeam( - team_id, - filteredDocs.length - ); - if (!success) { - return { - success: false, - error: - "Failed to bill team. Insufficient credits or subscription not found.", - returnCode: 402, - }; - } + const { success, credit_usage } = await billTeam( + team_id, + filteredDocs.length + ); + if (!success) { + return { + success: false, + error: + "Failed to bill team. Insufficient credits or subscription not found.", + returnCode: 402, + }; + } return { success: true, @@ -93,9 +99,15 @@ export async function searchController(req: Request, res: Response) { return res.status(status).json({ error }); } const crawlerOptions = req.body.crawlerOptions ?? {}; - const pageOptions = req.body.pageOptions ?? { onlyMainContent: true, fetchPageContent: true, fallback: false}; + const pageOptions = req.body.pageOptions ?? { + onlyMainContent: true, + fetchPageContent: true, + fallback: false, + }; const origin = req.body.origin ?? "api"; + const searchOptions = req.body.searchOptions ?? { limit: 7 }; + try { const { success: creditsCheckSuccess, message: creditsCheckMessage } = await checkTeamCredits(team_id, 1); @@ -111,7 +123,8 @@ export async function searchController(req: Request, res: Response) { req, team_id, crawlerOptions, - pageOptions + pageOptions, + searchOptions ); const endTime = new Date().getTime(); const timeTakenInSeconds = (endTime - startTime) / 1000; diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 07f07e4..b4b5193 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -14,6 +14,11 @@ export type PageOptions = { fallback?: boolean; fetchPageContent?: boolean; }; + +export type SearchOptions = { + limit?: number; +}; + export type WebScraperOptions = { urls: string[]; mode: "single_urls" | "sitemap" | "crawl"; From 495adc9a3f3b056b84abe101bb5633bb783d410d Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 23 Apr 2024 15:48:37 -0700 Subject: [PATCH 091/102] Update googlesearch.ts --- apps/api/src/search/googlesearch.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/search/googlesearch.ts b/apps/api/src/search/googlesearch.ts index fd3b645..c63c907 100644 --- a/apps/api/src/search/googlesearch.ts +++ b/apps/api/src/search/googlesearch.ts @@ -24,7 +24,7 @@ async function _req(term: string, results: number, lang: string, start: number, }, params: { "q": term, - "num": results + 2, // Prevents multiple requests + "num": results, // Number of results to return "hl": lang, }, proxy: proxies, From 841279c74d96b87aac989b795d062eb83e9cdda9 Mon Sep 17 00:00:00 2001 From: Caleb Peffer <44934913+calebpeffer@users.noreply.github.com> Date: Tue, 23 Apr 2024 15:49:00 -0700 Subject: [PATCH 092/102] Update README.md Added a reminder to star the repo with a graphic. --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index f6b67b7..290ed9b 100644 --- a/README.md +++ b/README.md @@ -2,13 +2,17 @@ Crawl and convert any website into LLM-ready markdown. Build by [Mendable.ai](https://mendable.ai?ref=gfirecrawl) - *This repository is currently in its early stages of development. We are in the process of merging custom modules into this mono repository. The primary objective is to enhance the accuracy of LLM responses by utilizing clean data. It is not ready for full self-host yet - we're working on it* ## What is Firecrawl? [Firecrawl](https://firecrawl.dev?ref=github) is an API service that takes a URL, crawls it, and converts it into clean markdown. We crawl all accessible subpages and give you clean markdown for each. No sitemap required. +_Pst. hey, you, join our stargazers :)_ + + + + ## How to use it? We provide an easy to use API with our hosted version. You can find the playground and documentation [here](https://firecrawl.dev/playground). You can also self host the backend if you'd like. From 8cb5d7955a36aec3f87ea91791cbfac51f4b6070 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 23 Apr 2024 15:49:05 -0700 Subject: [PATCH 093/102] Update googlesearch.ts --- apps/api/src/search/googlesearch.ts | 71 +++++++++++++++-------------- 1 file changed, 37 insertions(+), 34 deletions(-) diff --git a/apps/api/src/search/googlesearch.ts b/apps/api/src/search/googlesearch.ts index c63c907..c835d08 100644 --- a/apps/api/src/search/googlesearch.ts +++ b/apps/api/src/search/googlesearch.ts @@ -61,40 +61,7 @@ export async function search(term: string, advanced = false, num_results = 7, la } } - // const response = await _req_scraping_bee(escaped_term, num_results, lang); - // const $ = cheerio.load(response); - - // const knowledgeGraphElement = $("div.kno-rdesc"); - // console.log(knowledgeGraphElement); - // console.log(knowledgeGraphElement.html()); - - // let knowledgeGraph = null; - // if (knowledgeGraphElement.length > 0) { - // console.log("Knowledge Graph found"); - // const title = knowledgeGraphElement.find("h2").text(); - // const type = knowledgeGraphElement.find("div[data-attrid='subtitle']").text(); - // const website = knowledgeGraphElement.find("a[data-ved]").attr("href"); - // const imageUrl = knowledgeGraphElement.find("g-img img").attr("src"); - // const description = knowledgeGraphElement.find("div[data-attrid='description'] span").text(); - // const descriptionSource = knowledgeGraphElement.find("div[data-attrid='description'] a").text(); - // const descriptionLink = knowledgeGraphElement.find("div[data-attrid='description'] a").attr("href"); - // const attributes = {}; - // knowledgeGraphElement.find("div[data-attrid='kc:/common:sideways']").each((index, element) => { - // const attributeKey = $(element).find("span[data-attrid]").text(); - // const attributeValue = $(element).find("span[data-log-string]").text(); - // attributes[attributeKey] = attributeValue; - // }); - // knowledgeGraph = { - // "title": title, - // "type": type, - // "website": website, - // "imageUrl": imageUrl, - // "description": description, - // "descriptionSource": descriptionSource, - // "descriptionLink": descriptionLink, - // "attributes": attributes - // }; - // } + // TODO: knowledge graph, answer box, etc. let start = 0; let results = []; @@ -132,3 +99,39 @@ export async function search(term: string, advanced = false, num_results = 7, la } return {results: results}; } + + +// const response = await _req_scraping_bee(escaped_term, num_results, lang); + // const $ = cheerio.load(response); + + // const knowledgeGraphElement = $("div.kno-rdesc"); + // console.log(knowledgeGraphElement); + // console.log(knowledgeGraphElement.html()); + + // let knowledgeGraph = null; + // if (knowledgeGraphElement.length > 0) { + // console.log("Knowledge Graph found"); + // const title = knowledgeGraphElement.find("h2").text(); + // const type = knowledgeGraphElement.find("div[data-attrid='subtitle']").text(); + // const website = knowledgeGraphElement.find("a[data-ved]").attr("href"); + // const imageUrl = knowledgeGraphElement.find("g-img img").attr("src"); + // const description = knowledgeGraphElement.find("div[data-attrid='description'] span").text(); + // const descriptionSource = knowledgeGraphElement.find("div[data-attrid='description'] a").text(); + // const descriptionLink = knowledgeGraphElement.find("div[data-attrid='description'] a").attr("href"); + // const attributes = {}; + // knowledgeGraphElement.find("div[data-attrid='kc:/common:sideways']").each((index, element) => { + // const attributeKey = $(element).find("span[data-attrid]").text(); + // const attributeValue = $(element).find("span[data-log-string]").text(); + // attributes[attributeKey] = attributeValue; + // }); + // knowledgeGraph = { + // "title": title, + // "type": type, + // "website": website, + // "imageUrl": imageUrl, + // "description": description, + // "descriptionSource": descriptionSource, + // "descriptionLink": descriptionLink, + // "attributes": attributes + // }; + // } \ No newline at end of file From 41263bb4b6deb17042d64ea34cab72159e1340dc Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 23 Apr 2024 16:45:06 -0700 Subject: [PATCH 094/102] Nick: serper support --- apps/api/.env.example | 3 +- apps/api/src/controllers/search.ts | 12 ++- apps/api/src/lib/entities.ts | 3 + apps/api/src/search/googlesearch.ts | 152 +++++++++++++--------------- apps/api/src/search/index.ts | 45 ++++++++ apps/api/src/search/serper.ts | 27 +++++ 6 files changed, 157 insertions(+), 85 deletions(-) create mode 100644 apps/api/src/search/index.ts create mode 100644 apps/api/src/search/serper.ts diff --git a/apps/api/.env.example b/apps/api/.env.example index 34e24b1..3bd06cd 100644 --- a/apps/api/.env.example +++ b/apps/api/.env.example @@ -21,4 +21,5 @@ OPENAI_API_KEY= # add for LLM dependednt features (image alt generation, etc.) BULL_AUTH_KEY= # LOGTAIL_KEY= # Use if you're configuring basic logging with logtail PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback -LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs \ No newline at end of file +LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs +SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api \ No newline at end of file diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts index bc6659b..6a1c7b4 100644 --- a/apps/api/src/controllers/search.ts +++ b/apps/api/src/controllers/search.ts @@ -5,7 +5,7 @@ import { authenticateUser } from "./auth"; import { RateLimiterMode } from "../types"; import { logJob } from "../services/logging/log_job"; import { PageOptions, SearchOptions } from "../lib/entities"; -import { search } from "../search/googlesearch"; +import { search } from "../search"; export async function searchHelper( req: Request, @@ -25,7 +25,10 @@ export async function searchHelper( return { success: false, error: "Query is required", returnCode: 400 }; } - const res = await search(query, advanced, searchOptions.limit ?? 7); + const tbs = searchOptions.tbs ?? null; + const filter = searchOptions.filter ?? null; + + const res = await search({query: query, advanced: advanced, num_results: searchOptions.limit ?? 7, tbs: tbs, filter: filter}); let justSearch = pageOptions.fetchPageContent === false; @@ -33,15 +36,14 @@ export async function searchHelper( return { success: true, data: res, returnCode: 200 }; } - if (res.results.length === 0) { + if (res.length === 0) { return { success: true, error: "No search results found", returnCode: 200 }; } - console.log(res.results); const a = new WebScraperDataProvider(); await a.setOptions({ mode: "single_urls", - urls: res.results.map((r) => (!advanced ? r : r.url)), + urls: res.map((r) => r), crawlerOptions: { ...crawlerOptions, }, diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index b4b5193..062212b 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -13,10 +13,13 @@ export type PageOptions = { onlyMainContent?: boolean; fallback?: boolean; fetchPageContent?: boolean; + }; export type SearchOptions = { limit?: number; + tbs?: string; + filter?: string; }; export type WebScraperOptions = { diff --git a/apps/api/src/search/googlesearch.ts b/apps/api/src/search/googlesearch.ts index c835d08..53227e6 100644 --- a/apps/api/src/search/googlesearch.ts +++ b/apps/api/src/search/googlesearch.ts @@ -1,7 +1,6 @@ import axios from 'axios'; import * as cheerio from 'cheerio'; import * as querystring from 'querystring'; -import { ScrapingBeeClient } from 'scrapingbee'; const _useragent_list = [ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0', @@ -17,20 +16,35 @@ function get_useragent(): string { return _useragent_list[Math.floor(Math.random() * _useragent_list.length)]; } -async function _req(term: string, results: number, lang: string, start: number, proxies: any, timeout: number) { - const resp = await axios.get("https://www.google.com/search", { - headers: { - "User-Agent": get_useragent() - }, - params: { - "q": term, - "num": results, // Number of results to return - "hl": lang, - }, - proxy: proxies, - timeout: timeout, - }); - return resp; +async function _req(term: string, results: number, lang: string, start: number, proxies: any, timeout: number, tbs: string = null, filter: string = null) { + const params = { + "q": term, + "num": results, // Number of results to return + "hl": lang, + "start": start, + }; + if (tbs) { + params["tbs"] = tbs; + } + if (filter) { + params["filter"] = filter; + } + try { + const resp = await axios.get("https://www.google.com/search", { + headers: { + "User-Agent": get_useragent() + }, + params: params, + proxy: proxies, + timeout: timeout, + }); + return resp; + } catch (error) { + if (error.response && error.response.status === 429) { + throw new Error('Google Search: Too many requests, try again later.'); + } + throw error; + } } class SearchResult { @@ -49,7 +63,7 @@ class SearchResult { } } -export async function search(term: string, advanced = false, num_results = 7, lang = "en", proxy = null, sleep_interval = 0, timeout = 5000) { +export async function google_search(term: string, advanced = false, num_results = 7, tbs = null, filter = null, lang = "en", proxy = null, sleep_interval = 0, timeout = 5000, ) :Promise { const escaped_term = querystring.escape(term); let proxies = null; @@ -64,74 +78,54 @@ export async function search(term: string, advanced = false, num_results = 7, la // TODO: knowledge graph, answer box, etc. let start = 0; - let results = []; - while (start < num_results) { - const resp = await _req(escaped_term, num_results - start, lang, start, proxies, timeout); - const $ = cheerio.load(resp.data); - const result_block = $("div.g"); - if (result_block.length === 0) { - start += 1; - } - result_block.each((index, element) => { - const linkElement = $(element).find("a"); - const link = linkElement && linkElement.attr("href") ? linkElement.attr("href") : null; - const title = $(element).find("h3"); - const ogImage = $(element).find("img").eq(1).attr("src"); - const description_box = $(element).find("div[style='-webkit-line-clamp:2']"); - const answerBox = $(element).find(".mod").text(); - if (description_box) { - const description = description_box.text(); - if (link && title && description) { - start += 1; - if (advanced) { - results.push(new SearchResult(link, title.text(), description)); - } else { - results.push(link); + let results : string[] = []; + let attempts = 0; + const maxAttempts = 20; // Define a maximum number of attempts to prevent infinite loop + while (start < num_results && attempts < maxAttempts) { + try { + const resp = await _req(escaped_term, num_results - start, lang, start, proxies, timeout, tbs, filter); + const $ = cheerio.load(resp.data); + const result_block = $("div.g"); + if (result_block.length === 0) { + start += 1; + attempts += 1; + } else { + attempts = 0; // Reset attempts if we have results + } + result_block.each((index, element) => { + const linkElement = $(element).find("a"); + const link = linkElement && linkElement.attr("href") ? linkElement.attr("href") : null; + const title = $(element).find("h3"); + const ogImage = $(element).find("img").eq(1).attr("src"); + const description_box = $(element).find("div[style='-webkit-line-clamp:2']"); + const answerBox = $(element).find(".mod").text(); + if (description_box) { + const description = description_box.text(); + if (link && title && description) { + start += 1; + if (advanced) { + // results.push(new SearchResult(link, title.text(), description)); + } else { + results.push(link); + } } } + }); + await new Promise(resolve => setTimeout(resolve, sleep_interval * 1000)); + } catch (error) { + if (error.message === 'Too many requests') { + console.warn('Too many requests, breaking the loop'); + break; } - }); - await new Promise(resolve => setTimeout(resolve, sleep_interval * 1000)); + throw error; + } if (start === 0) { - return {results: []}; + return results; } } - return {results: results}; + if (attempts >= maxAttempts) { + console.warn('Max attempts reached, breaking the loop'); + } + return results } - - -// const response = await _req_scraping_bee(escaped_term, num_results, lang); - // const $ = cheerio.load(response); - - // const knowledgeGraphElement = $("div.kno-rdesc"); - // console.log(knowledgeGraphElement); - // console.log(knowledgeGraphElement.html()); - - // let knowledgeGraph = null; - // if (knowledgeGraphElement.length > 0) { - // console.log("Knowledge Graph found"); - // const title = knowledgeGraphElement.find("h2").text(); - // const type = knowledgeGraphElement.find("div[data-attrid='subtitle']").text(); - // const website = knowledgeGraphElement.find("a[data-ved]").attr("href"); - // const imageUrl = knowledgeGraphElement.find("g-img img").attr("src"); - // const description = knowledgeGraphElement.find("div[data-attrid='description'] span").text(); - // const descriptionSource = knowledgeGraphElement.find("div[data-attrid='description'] a").text(); - // const descriptionLink = knowledgeGraphElement.find("div[data-attrid='description'] a").attr("href"); - // const attributes = {}; - // knowledgeGraphElement.find("div[data-attrid='kc:/common:sideways']").each((index, element) => { - // const attributeKey = $(element).find("span[data-attrid]").text(); - // const attributeValue = $(element).find("span[data-log-string]").text(); - // attributes[attributeKey] = attributeValue; - // }); - // knowledgeGraph = { - // "title": title, - // "type": type, - // "website": website, - // "imageUrl": imageUrl, - // "description": description, - // "descriptionSource": descriptionSource, - // "descriptionLink": descriptionLink, - // "attributes": attributes - // }; - // } \ No newline at end of file diff --git a/apps/api/src/search/index.ts b/apps/api/src/search/index.ts new file mode 100644 index 0000000..0f3a596 --- /dev/null +++ b/apps/api/src/search/index.ts @@ -0,0 +1,45 @@ +import { google_search } from "./googlesearch"; +import { serper_search } from "./serper"; + +export async function search({ + query, + advanced = false, + num_results = 7, + tbs = null, + filter = null, + lang = "en", + proxy = null, + sleep_interval = 0, + timeout = 5000, +}: { + query: string; + advanced?: boolean; + num_results?: number; + tbs?: string; + filter?: string; + lang?: string; + proxy?: string; + sleep_interval?: number; + timeout?: number; +}) { + try { + if (process.env.SERPER_API_KEY) { + return await serper_search(query, num_results); + } + return await google_search( + query, + advanced, + num_results, + tbs, + filter, + lang, + proxy, + sleep_interval, + timeout + ); + } catch (error) { + console.error("Error in search function: ", error); + return [] + } + // if process.env.SERPER_API_KEY is set, use serper +} diff --git a/apps/api/src/search/serper.ts b/apps/api/src/search/serper.ts new file mode 100644 index 0000000..f92f2fc --- /dev/null +++ b/apps/api/src/search/serper.ts @@ -0,0 +1,27 @@ +import axios from "axios"; +import dotenv from "dotenv"; + +dotenv.config(); + +export async function serper_search(q, num_results) : Promise { + let data = JSON.stringify({ + q: q, + "num": num_results + }); + + let config = { + method: "POST", + url: "https://google.serper.dev/search", + headers: { + "X-API-KEY": process.env.SERPER_API_KEY, + "Content-Type": "application/json", + }, + data: data, + }; + const response = await axios(config); + if (response && response.data && Array.isArray(response.data.organic)) { + return response.data.organic.map((a) => a.link); + } else { + return []; + } +} From f3c190c21ced7b87989abbbb4e7180653c820aad Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 23 Apr 2024 16:47:24 -0700 Subject: [PATCH 095/102] Nick: --- apps/api/src/__tests__/e2e_noAuth/index.test.ts | 6 +++--- apps/api/src/__tests__/e2e_withAuth/index.test.ts | 6 +++--- apps/api/src/controllers/crawl.ts | 2 +- apps/api/src/controllers/crawlPreview.ts | 2 +- apps/api/src/controllers/scrape.ts | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/apps/api/src/__tests__/e2e_noAuth/index.test.ts b/apps/api/src/__tests__/e2e_noAuth/index.test.ts index f76a8dc..b2b2938 100644 --- a/apps/api/src/__tests__/e2e_noAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_noAuth/index.test.ts @@ -62,7 +62,7 @@ describe("E2E Tests for API Routes with No Authentication", () => { .set("Content-Type", "application/json") .send({ url: blocklistedUrl }); expect(response.statusCode).toBe(403); - expect(response.body.error).toContain("URL is blocked due to policy restrictions"); + expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); }); it("should return a successful response", async () => { @@ -87,7 +87,7 @@ describe("E2E Tests for API Routes with No Authentication", () => { .set("Content-Type", "application/json") .send({ url: blocklistedUrl }); expect(response.statusCode).toBe(403); - expect(response.body.error).toContain("URL is blocked due to policy restrictions"); + expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); }); it("should return a successful response", async () => { @@ -116,7 +116,7 @@ describe("E2E Tests for API Routes with No Authentication", () => { .set("Content-Type", "application/json") .send({ url: blocklistedUrl }); expect(response.statusCode).toBe(403); - expect(response.body.error).toContain("URL is blocked due to policy restrictions"); + expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); }); it("should return a successful response", async () => { diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 578a033..a165ae2 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -56,7 +56,7 @@ const TEST_URL = "http://127.0.0.1:3002"; .set("Content-Type", "application/json") .send({ url: blocklistedUrl }); expect(response.statusCode).toBe(403); - expect(response.body.error).toContain("URL is blocked due to policy restrictions"); + expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); }); it("should return a successful response with a valid preview token", async () => { @@ -106,7 +106,7 @@ const TEST_URL = "http://127.0.0.1:3002"; .set("Content-Type", "application/json") .send({ url: blocklistedUrl }); expect(response.statusCode).toBe(403); - expect(response.body.error).toContain("URL is blocked due to policy restrictions"); + expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); }); it("should return a successful response with a valid API key", async () => { @@ -151,7 +151,7 @@ const TEST_URL = "http://127.0.0.1:3002"; .set("Content-Type", "application/json") .send({ url: blocklistedUrl }); expect(response.statusCode).toBe(403); - expect(response.body.error).toContain("URL is blocked due to policy restrictions"); + expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); }); it("should return a successful response with a valid API key", async () => { diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts index 9301c4d..3d64f7f 100644 --- a/apps/api/src/controllers/crawl.ts +++ b/apps/api/src/controllers/crawl.ts @@ -30,7 +30,7 @@ export async function crawlController(req: Request, res: Response) { } if (isUrlBlocked(url)) { - return res.status(403).json({ error: "URL is blocked due to policy restrictions" }); + return res.status(403).json({ error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." }); } const mode = req.body.mode ?? "crawl"; diff --git a/apps/api/src/controllers/crawlPreview.ts b/apps/api/src/controllers/crawlPreview.ts index 4c40197..569be33 100644 --- a/apps/api/src/controllers/crawlPreview.ts +++ b/apps/api/src/controllers/crawlPreview.ts @@ -21,7 +21,7 @@ export async function crawlPreviewController(req: Request, res: Response) { } if (isUrlBlocked(url)) { - return res.status(403).json({ error: "URL is blocked due to policy restrictions" }); + return res.status(403).json({ error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." }); } const mode = req.body.mode ?? "crawl"; diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index d24c882..cfe35b5 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -24,7 +24,7 @@ export async function scrapeHelper( } if (isUrlBlocked(url)) { - return { success: false, error: "URL is blocked due to policy restrictions", returnCode: 403 }; + return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 }; } const a = new WebScraperDataProvider(); From e6779aff6824282c2cfdeaaa016a0f3512202216 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 23 Apr 2024 16:56:09 -0700 Subject: [PATCH 096/102] Nick: tests --- .../src/__tests__/e2e_noAuth/index.test.ts | 27 ++++++++++++++++++ .../src/__tests__/e2e_withAuth/index.test.ts | 28 +++++++++++++++++++ 2 files changed, 55 insertions(+) diff --git a/apps/api/src/__tests__/e2e_noAuth/index.test.ts b/apps/api/src/__tests__/e2e_noAuth/index.test.ts index e0aca36..dfe6aeb 100644 --- a/apps/api/src/__tests__/e2e_noAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_noAuth/index.test.ts @@ -102,6 +102,33 @@ describe("E2E Tests for API Routes with No Authentication", () => { }); }); + describe("POST /v0/search", () => { + it("should require not authorization", async () => { + const response = await request(TEST_URL).post("/v0/search"); + expect(response.statusCode).not.toBe(401); + }); + + it("should return no error response with an invalid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/search") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ query: "test" }); + expect(response.statusCode).not.toBe(401); + }); + + it("should return a successful response with a valid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/search") + .set("Content-Type", "application/json") + .send({ query: "test" }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("success"); + expect(response.body.success).toBe(true); + expect(response.body).toHaveProperty("data"); + }); + }); + describe("GET /v0/crawl/status/:jobId", () => { it("should not require authorization", async () => { const response = await request(TEST_URL).get("/v0/crawl/status/123"); diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index ba01a7c..f0887eb 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -133,6 +133,34 @@ const TEST_URL = "http://127.0.0.1:3002"; }); }); + describe("POST /v0/search", () => { + it("should require authorization", async () => { + const response = await request(TEST_URL).post("/v0/search"); + expect(response.statusCode).toBe(401); + }); + + it("should return an error response with an invalid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/search") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ query: "test" }); + expect(response.statusCode).toBe(401); + }); + + it("should return a successful response with a valid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/search") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ query: "test" }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("success"); + expect(response.body.success).toBe(true); + expect(response.body).toHaveProperty("data"); + }, 20000); + }); + describe("GET /v0/crawl/status/:jobId", () => { it("should require authorization", async () => { const response = await request(TEST_URL).get("/v0/crawl/status/123"); From 4328a68ec19049caba40ffdb3d442ba915483454 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 23 Apr 2024 16:57:53 -0700 Subject: [PATCH 097/102] Nick: --- apps/api/src/__tests__/e2e_noAuth/index.test.ts | 4 ++-- apps/api/src/__tests__/e2e_withAuth/index.test.ts | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/api/src/__tests__/e2e_noAuth/index.test.ts b/apps/api/src/__tests__/e2e_noAuth/index.test.ts index dfe6aeb..37eeb0e 100644 --- a/apps/api/src/__tests__/e2e_noAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_noAuth/index.test.ts @@ -117,7 +117,7 @@ describe("E2E Tests for API Routes with No Authentication", () => { expect(response.statusCode).not.toBe(401); }); - it("should return a successful response with a valid API key", async () => { + it("should return a successful response without a valid API key", async () => { const response = await request(TEST_URL) .post("/v0/search") .set("Content-Type", "application/json") @@ -126,7 +126,7 @@ describe("E2E Tests for API Routes with No Authentication", () => { expect(response.body).toHaveProperty("success"); expect(response.body.success).toBe(true); expect(response.body).toHaveProperty("data"); - }); + }, 20000); }); describe("GET /v0/crawl/status/:jobId", () => { diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index f0887eb..59dfde2 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -158,7 +158,7 @@ const TEST_URL = "http://127.0.0.1:3002"; expect(response.body).toHaveProperty("success"); expect(response.body.success).toBe(true); expect(response.body).toHaveProperty("data"); - }, 20000); + }, 20000); }); describe("GET /v0/crawl/status/:jobId", () => { From f0695c712307b06bde55e251f799373882b6a7ad Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 23 Apr 2024 17:04:10 -0700 Subject: [PATCH 098/102] Update single_url.ts --- apps/api/src/scraper/WebScraper/single_url.ts | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index fcbb688..e110b0e 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -23,13 +23,14 @@ export async function scrapWithCustomFirecrawl( export async function scrapWithScrapingBee( url: string, - wait_browser: string = "domcontentloaded" + wait_browser: string = "domcontentloaded", + timeout: number = 15000 ): Promise { try { const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY); const response = await client.get({ url: url, - params: { timeout: 15000, wait_browser: wait_browser }, + params: { timeout: timeout, wait_browser: wait_browser }, headers: { "ScrapingService-Request": "TRUE" }, }); @@ -106,11 +107,11 @@ export async function scrapSingleUrl( let text = ""; switch (method) { case "firecrawl-scraper": - text = await scrapWithCustomFirecrawl(url); + text = await scrapWithCustomFirecrawl(url,); break; case "scrapingBee": if (process.env.SCRAPING_BEE_API_KEY) { - text = await scrapWithScrapingBee(url); + text = await scrapWithScrapingBee(url,"domcontentloaded", pageOptions.fallback === false? 7000 : 15000); } break; case "playwright": From 53cc4c396fea229ac87004e822f2228a090feb5c Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 23 Apr 2024 17:05:58 -0700 Subject: [PATCH 099/102] Update search.ts --- apps/api/src/controllers/search.ts | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts index 6a1c7b4..4c03644 100644 --- a/apps/api/src/controllers/search.ts +++ b/apps/api/src/controllers/search.ts @@ -6,6 +6,7 @@ import { RateLimiterMode } from "../types"; import { logJob } from "../services/logging/log_job"; import { PageOptions, SearchOptions } from "../lib/entities"; import { search } from "../search"; +import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; export async function searchHelper( req: Request, @@ -28,7 +29,7 @@ export async function searchHelper( const tbs = searchOptions.tbs ?? null; const filter = searchOptions.filter ?? null; - const res = await search({query: query, advanced: advanced, num_results: searchOptions.limit ?? 7, tbs: tbs, filter: filter}); + let res = await search({query: query, advanced: advanced, num_results: searchOptions.limit ?? 7, tbs: tbs, filter: filter}); let justSearch = pageOptions.fetchPageContent === false; @@ -40,6 +41,9 @@ export async function searchHelper( return { success: true, error: "No search results found", returnCode: 200 }; } + // filter out social media links + res = res.filter((r) => !isUrlBlocked(r)); + const a = new WebScraperDataProvider(); await a.setOptions({ mode: "single_urls", From 3abfd6b4c19d9ce14c6a5b8dea47dda16f6383d0 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 23 Apr 2024 17:06:48 -0700 Subject: [PATCH 100/102] Update search.ts --- apps/api/src/controllers/search.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts index 4c03644..f18f1c5 100644 --- a/apps/api/src/controllers/search.ts +++ b/apps/api/src/controllers/search.ts @@ -37,12 +37,13 @@ export async function searchHelper( return { success: true, data: res, returnCode: 200 }; } + res = res.filter((r) => !isUrlBlocked(r)); + if (res.length === 0) { return { success: true, error: "No search results found", returnCode: 200 }; } // filter out social media links - res = res.filter((r) => !isUrlBlocked(r)); const a = new WebScraperDataProvider(); await a.setOptions({ From fdb2789eaa302b2f90bed7f1dad6dcc95613cb1f Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 23 Apr 2024 17:14:34 -0700 Subject: [PATCH 101/102] Nick: added url as return param --- apps/api/src/lib/entities.ts | 1 + apps/api/src/scraper/WebScraper/single_url.ts | 2 ++ 2 files changed, 3 insertions(+) diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 062212b..fdc1c61 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -40,6 +40,7 @@ export type WebScraperOptions = { export class Document { id?: string; + url?: string; // Used only in /search for now content: string; markdown?: string; createdAt?: Date; diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index e110b0e..6ab3003 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -154,10 +154,12 @@ export async function scrapSingleUrl( // } let [text, html] = await attemptScraping(urlToScrap, "scrapingBee"); + // Basically means that it is using /search endpoint if(pageOptions.fallback === false){ const soup = cheerio.load(html); const metadata = extractMetadata(soup, urlToScrap); return { + url: urlToScrap, content: text, markdown: text, metadata: { ...metadata, sourceURL: urlToScrap }, From 479fa2f7f8862e6e69b8a2f47a928ddc1cf0808c Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 23 Apr 2024 17:46:32 -0700 Subject: [PATCH 102/102] Nick: --- apps/api/src/search/index.ts | 2 +- apps/api/src/search/serper.ts | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/apps/api/src/search/index.ts b/apps/api/src/search/index.ts index 0f3a596..ae62451 100644 --- a/apps/api/src/search/index.ts +++ b/apps/api/src/search/index.ts @@ -23,7 +23,7 @@ export async function search({ timeout?: number; }) { try { - if (process.env.SERPER_API_KEY) { + if (process.env.SERPER_API_KEY && !tbs) { return await serper_search(query, num_results); } return await google_search( diff --git a/apps/api/src/search/serper.ts b/apps/api/src/search/serper.ts index f92f2fc..2b4ba02 100644 --- a/apps/api/src/search/serper.ts +++ b/apps/api/src/search/serper.ts @@ -6,7 +6,8 @@ dotenv.config(); export async function serper_search(q, num_results) : Promise { let data = JSON.stringify({ q: q, - "num": num_results + "num": num_results, + }); let config = {