diff --git a/.github/workflows/js-sdk.yml b/.github/archive/js-sdk.yml similarity index 100% rename from .github/workflows/js-sdk.yml rename to .github/archive/js-sdk.yml diff --git a/.github/workflows/publish-js-sdk.yml b/.github/archive/publish-js-sdk.yml similarity index 100% rename from .github/workflows/publish-js-sdk.yml rename to .github/archive/publish-js-sdk.yml diff --git a/.github/workflows/publish-python-sdk.yml b/.github/archive/publish-python-sdk.yml similarity index 100% rename from .github/workflows/publish-python-sdk.yml rename to .github/archive/publish-python-sdk.yml diff --git a/.github/workflows/python-sdk.yml b/.github/archive/python-sdk.yml similarity index 100% rename from .github/workflows/python-sdk.yml rename to .github/archive/python-sdk.yml diff --git a/.github/workflows/clean-before-24h-complete-jobs.yml b/.github/workflows/clean-before-24h-complete-jobs.yml new file mode 100644 index 0000000..2ced537 --- /dev/null +++ b/.github/workflows/clean-before-24h-complete-jobs.yml @@ -0,0 +1,20 @@ +name: Clean Before 24h Completed Jobs +on: + schedule: + - cron: '0 0 * * *' + +env: + BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }} + +jobs: + clean-jobs: + runs-on: ubuntu-latest + steps: + - name: Send GET request to clean jobs + run: | + response=$(curl --write-out '%{http_code}' --silent --output /dev/null https://api.firecrawl.dev/admin/${{ secrets.BULL_AUTH_KEY }}/clean-before-24h-complete-jobs) + if [ "$response" -ne 200 ]; then + echo "Failed to clean jobs. Response: $response" + exit 1 + fi + echo "Successfully cleaned jobs. Response: $response" diff --git a/.github/workflows/fly.yml b/.github/workflows/fly.yml index 77b3fd3..84017b1 100644 --- a/.github/workflows/fly.yml +++ b/.github/workflows/fly.yml @@ -183,6 +183,7 @@ jobs: FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} build-and-publish-python-sdk: + name: Build and publish Python SDK runs-on: ubuntu-latest needs: deploy @@ -213,7 +214,7 @@ jobs: working-directory: ./apps/python-sdk - name: Publish to PyPI - if: ${{ env.VERSION_INCREMENTED == 'true' }} + if: ${{ env.PYTHON_SDK_VERSION_INCREMENTED == 'true' }} env: TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} @@ -222,6 +223,7 @@ jobs: working-directory: ./apps/python-sdk build-and-publish-js-sdk: + name: Build and publish JavaScript SDK runs-on: ubuntu-latest needs: deploy diff --git a/apps/api/fly.toml b/apps/api/fly.toml index 6bc8266..468695d 100644 --- a/apps/api/fly.toml +++ b/apps/api/fly.toml @@ -54,7 +54,7 @@ kill_timeout = '5s' soft_limit = 12 [[vm]] - size = 'performance-8x' + size = 'performance-4x' processes = ['app'] diff --git a/apps/api/openapi.json b/apps/api/openapi.json index ab452ff..b07e43f 100644 --- a/apps/api/openapi.json +++ b/apps/api/openapi.json @@ -51,10 +51,26 @@ "description": "Include the raw HTML content of the page. Will output a html key in the response.", "default": false }, + "screenshot": { + "type": "boolean", + "description": "Include a screenshot of the top of the page that you are scraping.", + "default": false + }, "waitFor": { "type": "integer", "description": "Wait x amount of milliseconds for the page to load to fetch content", "default": 0 + }, + "removeTags": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'" + }, + "headers": { + "type": "object", + "description": "Headers to send with the request. 
Can be used to send cookies, user-agent, etc." } } }, @@ -176,10 +192,20 @@ "description": "The crawling mode to use. Fast mode crawls 4x faster websites without sitemap, but may not be as accurate and shouldn't be used in heavy js-rendered websites.", "default": "default" }, + "ignoreSitemap": { + "type": "boolean", + "description": "Ignore the website sitemap when crawling", + "default": false + }, "limit": { "type": "integer", "description": "Maximum number of pages to crawl", "default": 10000 + }, + "allowBackwardCrawling": { + "type": "boolean", + "description": "Allow backward crawling (crawl from the base URL to the previous URLs)", + "default": false } } }, @@ -195,6 +221,27 @@ "type": "boolean", "description": "Include the raw HTML content of the page. Will output a html key in the response.", "default": false + }, + "screenshot": { + "type": "boolean", + "description": "Include a screenshot of the top of the page that you are scraping.", + "default": false + }, + "headers": { + "type": "object", + "description": "Headers to send with the request when scraping. Can be used to send cookies, user-agent, etc." + }, + "removeTags": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'" + }, + "replaceAllPathsWithAbsolutePaths": { + "type": "boolean", + "description": "Replace all relative paths with absolute paths for images and links", + "default": false } } } @@ -368,7 +415,7 @@ "items": { "$ref": "#/components/schemas/CrawlStatusResponseObj" }, - "description": "Partial documents returned as it is being crawls (streaming). When a page is ready it will append to the parial_data array - so no need to wait for all the website to be crawled." + "description": "Partial documents returned as it is being crawled (streaming). **This feature is currently in alpha - expect breaking changes** When a page is ready, it will append to the partial_data array, so there is no need to wait for the entire website to be crawled. There is a max of 50 items in the array response. The oldest item (top of the array) will be removed when the new item is added to the array." } } } @@ -513,6 +560,10 @@ "nullable": true, "description": "Raw HTML content of the page if `includeHtml` is true" }, + "index": { + "type": "integer", + "description": "The number of the page that was crawled. This is useful for `partial_data` so you know which page the data is from." 
+ }, "metadata": { "type": "object", "properties": { diff --git a/apps/api/src/__tests__/e2e_noAuth/index.test.ts b/apps/api/src/__tests__/e2e_noAuth/index.test.ts index c443e71..acb2278 100644 --- a/apps/api/src/__tests__/e2e_noAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_noAuth/index.test.ts @@ -1,5 +1,4 @@ import request from "supertest"; -import { app } from "../../index"; import dotenv from "dotenv"; const fs = require("fs"); const path = require("path"); diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index f619254..d6b2b53 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -1,5 +1,4 @@ import request from "supertest"; -import { app } from "../../index"; import dotenv from "dotenv"; import { v4 as uuidv4 } from "uuid"; @@ -35,7 +34,7 @@ describe("E2E Tests for API Routes", () => { describe("POST /v0/scrape", () => { it.concurrent("should require authorization", async () => { - const response = await request(app).post("/v0/scrape"); + const response = await request(TEST_URL).post("/v0/scrape"); expect(response.statusCode).toBe(401); }); @@ -136,6 +135,40 @@ describe("E2E Tests for API Routes", () => { expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy'); }, 60000); // 60 seconds + it.concurrent("should return a successful response with a valid API key with removeTags option", async () => { + const responseWithoutRemoveTags = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://www.scrapethissite.com/" }); + expect(responseWithoutRemoveTags.statusCode).toBe(200); + expect(responseWithoutRemoveTags.body).toHaveProperty("data"); + expect(responseWithoutRemoveTags.body.data).toHaveProperty("content"); + expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown"); + expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata"); + expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html"); + expect(responseWithoutRemoveTags.body.data.content).toContain("Scrape This Site"); + expect(responseWithoutRemoveTags.body.data.content).toContain("Lessons and Videos"); // #footer + expect(responseWithoutRemoveTags.body.data.content).toContain("[Sandbox]("); // .nav + expect(responseWithoutRemoveTags.body.data.content).toContain("web scraping"); // strong + + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://www.scrapethissite.com/", pageOptions: { removeTags: ['.nav', '#footer', 'strong'] } }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data).not.toHaveProperty("html"); + expect(response.body.data.content).toContain("Scrape This Site"); + expect(response.body.data.content).not.toContain("Lessons and Videos"); // #footer + expect(response.body.data.content).not.toContain("[Sandbox]("); // .nav + expect(response.body.data.content).not.toContain("web scraping"); // strong + }, 30000); // 30 seconds timeout + // TODO: add this test back once we nail the waitFor option to 
be more deterministic // it.concurrent("should return a successful response with a valid API key and waitFor option", async () => { // const startTime = Date.now(); @@ -596,7 +629,7 @@ describe("E2E Tests for API Routes", () => { .post("/v0/crawl") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") - .send({ url: "https://roastmywebsite.ai" }); + .send({ url: "https://mendable.ai/blog" }); expect(crawlResponse.statusCode).toBe(200); let isCompleted = false; @@ -622,7 +655,13 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.data[0]).toHaveProperty("content"); expect(completedResponse.body.data[0]).toHaveProperty("markdown"); expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.data[0].content).toContain("_Roast_"); + expect(completedResponse.body.data[0].content).toContain("Mendable"); + + const childrenLinks = completedResponse.body.data.filter(doc => + doc.metadata && doc.metadata.sourceURL && doc.metadata.sourceURL.includes("mendable.ai/blog") + ); + + expect(childrenLinks.length).toBe(completedResponse.body.data.length); }, 120000); // 120 seconds it.concurrent('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension', async () => { @@ -757,34 +796,82 @@ describe("E2E Tests for API Routes", () => { }, 60000); }); // 60 seconds + it.concurrent("should return a successful response for a valid crawl job with allowBackwardCrawling set to true option", async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://mendable.ai/blog", + pageOptions: { includeHtml: true }, + crawlerOptions: { allowBackwardCrawling: true }, + }); + expect(crawlResponse.statusCode).toBe(200); + + let isFinished = false; + let completedResponse; + + while (!isFinished) { + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + + if (response.body.status === "completed") { + isFinished = true; + completedResponse = response; + } else { + await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again + } + } + + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0]).toHaveProperty("html"); + expect(completedResponse.body.data[0].content).toContain("Mendable"); + expect(completedResponse.body.data[0].markdown).toContain("Mendable"); + + const onlyChildrenLinks = completedResponse.body.data.filter(doc => { + return doc.metadata && doc.metadata.sourceURL && doc.metadata.sourceURL.includes("mendable.ai/blog") + }); + + expect(completedResponse.body.data.length).toBeGreaterThan(onlyChildrenLinks.length); + }, 60000); + it.concurrent("If someone cancels a crawl job, it should turn into failed status", async () => { const crawlResponse = await request(TEST_URL) 
.post("/v0/crawl") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") .send({ url: "https://jestjs.io" }); + expect(crawlResponse.statusCode).toBe(200); - // wait for 30 seconds await new Promise((r) => setTimeout(r, 20000)); - const response = await request(TEST_URL) + const responseCancel = await request(TEST_URL) .delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - expect(response.body.status).toBe("cancelled"); + expect(responseCancel.statusCode).toBe(200); + expect(responseCancel.body).toHaveProperty("status"); + expect(responseCancel.body.status).toBe("cancelled"); await new Promise((r) => setTimeout(r, 10000)); - const completedResponse = await request(TEST_URL) .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(completedResponse.statusCode).toBe(200); expect(completedResponse.body).toHaveProperty("status"); expect(completedResponse.body.status).toBe("failed"); expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data).toEqual(null); + expect(completedResponse.body.data).toBeNull(); expect(completedResponse.body).toHaveProperty("partial_data"); expect(completedResponse.body.partial_data[0]).toHaveProperty("content"); expect(completedResponse.body.partial_data[0]).toHaveProperty("markdown"); diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index 2c4b0c7..ea789fe 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -143,7 +143,7 @@ export async function supaAuthenticateUser( const startDate = new Date(); const endDate = new Date(); endDate.setDate(endDate.getDate() + 7); - await sendNotification(team_id, NotificationType.RATE_LIMIT_REACHED, startDate.toISOString(), endDate.toISOString()); + // await sendNotification(team_id, NotificationType.RATE_LIMIT_REACHED, startDate.toISOString(), endDate.toISOString()); return { success: false, error: `Rate limit exceeded. Consumed points: ${rateLimiterRes.consumedPoints}, Remaining points: ${rateLimiterRes.remainingPoints}. Upgrade your plan at https://firecrawl.dev/pricing for increased rate limits or please retry after ${secs}s, resets at ${retryDate}`, diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts index 5345b4f..7eab78f 100644 --- a/apps/api/src/controllers/crawl.ts +++ b/apps/api/src/controllers/crawl.ts @@ -55,8 +55,14 @@ export async function crawlController(req: Request, res: Response) { } const mode = req.body.mode ?? "crawl"; - const crawlerOptions = req.body.crawlerOptions ?? {}; - const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false }; + const crawlerOptions = req.body.crawlerOptions ?? { + allowBackwardCrawling: false + }; + const pageOptions = req.body.pageOptions ?? 
{ + onlyMainContent: false, + includeHtml: false, + removeTags: [] + }; if (mode === "single_urls" && !url.includes(",")) { try { @@ -64,9 +70,7 @@ export async function crawlController(req: Request, res: Response) { await a.setOptions({ mode: "single_urls", urls: [url], - crawlerOptions: { - returnOnlyUrls: true, - }, + crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true }, pageOptions: pageOptions, }); @@ -91,7 +95,7 @@ export async function crawlController(req: Request, res: Response) { const job = await addWebScraperJob({ url: url, mode: mode ?? "crawl", // fix for single urls not working - crawlerOptions: { ...crawlerOptions }, + crawlerOptions: crawlerOptions, team_id: team_id, pageOptions: pageOptions, origin: req.body.origin ?? "api", diff --git a/apps/api/src/controllers/crawlPreview.ts b/apps/api/src/controllers/crawlPreview.ts index d3e9afe..2c3dc4e 100644 --- a/apps/api/src/controllers/crawlPreview.ts +++ b/apps/api/src/controllers/crawlPreview.ts @@ -26,7 +26,7 @@ export async function crawlPreviewController(req: Request, res: Response) { const mode = req.body.mode ?? "crawl"; const crawlerOptions = req.body.crawlerOptions ?? {}; - const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false }; + const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, removeTags: [] }; const job = await addWebScraperJob({ url: url, diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts index 7474aae..abbc357 100644 --- a/apps/api/src/controllers/search.ts +++ b/apps/api/src/controllers/search.ts @@ -85,6 +85,7 @@ export async function searchHelper( onlyMainContent: pageOptions?.onlyMainContent ?? true, fetchPageContent: pageOptions?.fetchPageContent ?? true, includeHtml: pageOptions?.includeHtml ?? false, + removeTags: pageOptions?.removeTags ?? [], fallback: false, }, }); @@ -139,6 +140,7 @@ export async function searchController(req: Request, res: Response) { includeHtml: false, onlyMainContent: true, fetchPageContent: true, + removeTags: [], fallback: false, }; const origin = req.body.origin ?? "api"; diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 0246a1e..494b4d5 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -5,169 +5,215 @@ import "dotenv/config"; import { getWebScraperQueue } from "./services/queue-service"; import { redisClient } from "./services/rate-limiter"; import { v0Router } from "./routes/v0"; -import { initSDK } from '@hyperdx/node-opentelemetry'; +import { initSDK } from "@hyperdx/node-opentelemetry"; +import cluster from "cluster"; +import os from "os"; const { createBullBoard } = require("@bull-board/api"); const { BullAdapter } = require("@bull-board/api/bullAdapter"); const { ExpressAdapter } = require("@bull-board/express"); -export const app = express(); +const numCPUs = process.env.ENV === "local" ? 2 : os.cpus().length; +console.log(`Number of CPUs: ${numCPUs} available`); -global.isProduction = process.env.IS_PRODUCTION === "true"; +if (cluster.isMaster) { + console.log(`Master ${process.pid} is running`); -app.use(bodyParser.urlencoded({ extended: true })); -app.use(bodyParser.json({ limit: "10mb" })); + // Fork workers. 
+ for (let i = 0; i < numCPUs; i++) { + cluster.fork(); + } -app.use(cors()); // Add this line to enable CORS - -const serverAdapter = new ExpressAdapter(); -serverAdapter.setBasePath(`/admin/${process.env.BULL_AUTH_KEY}/queues`); - -const { addQueue, removeQueue, setQueues, replaceQueues } = createBullBoard({ - queues: [new BullAdapter(getWebScraperQueue())], - serverAdapter: serverAdapter, -}); - -app.use( - `/admin/${process.env.BULL_AUTH_KEY}/queues`, - serverAdapter.getRouter() -); - -app.get("/", (req, res) => { - res.send("SCRAPERS-JS: Hello, world! Fly.io"); -}); - -//write a simple test function -app.get("/test", async (req, res) => { - res.send("Hello, world!"); -}); - -// register router -app.use(v0Router); - -const DEFAULT_PORT = process.env.PORT ?? 3002; -const HOST = process.env.HOST ?? "localhost"; -redisClient.connect(); - -// HyperDX OpenTelemetry -if(process.env.ENV === 'production') { - initSDK({ consoleCapture: true, additionalInstrumentations: []}); -} - - -export function startServer(port = DEFAULT_PORT) { - const server = app.listen(Number(port), HOST, () => { - console.log(`Server listening on port ${port}`); - console.log( - `For the UI, open http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues` - ); - console.log(""); - console.log("1. Make sure Redis is running on port 6379 by default"); - console.log( - "2. If you want to run nango, make sure you do port forwarding in 3002 using ngrok http 3002 " - ); + cluster.on("exit", (worker, code, signal) => { + console.log(`Worker ${worker.process.pid} exited`); + console.log("Starting a new worker"); + cluster.fork(); }); - return server; -} +} else { + const app = express(); -if (require.main === module) { - startServer(); -} + global.isProduction = process.env.IS_PRODUCTION === "true"; -// Use this as a "health check" that way we dont destroy the server -app.get(`/admin/${process.env.BULL_AUTH_KEY}/queues`, async (req, res) => { - try { - const webScraperQueue = getWebScraperQueue(); - const [webScraperActive] = await Promise.all([ - webScraperQueue.getActiveCount(), - ]); + app.use(bodyParser.urlencoded({ extended: true })); + app.use(bodyParser.json({ limit: "10mb" })); - const noActiveJobs = webScraperActive === 0; - // 200 if no active jobs, 503 if there are active jobs - return res.status(noActiveJobs ? 200 : 500).json({ - webScraperActive, - noActiveJobs, - }); - } catch (error) { - console.error(error); - return res.status(500).json({ error: error.message }); + app.use(cors()); // Add this line to enable CORS + + const serverAdapter = new ExpressAdapter(); + serverAdapter.setBasePath(`/admin/${process.env.BULL_AUTH_KEY}/queues`); + + const { addQueue, removeQueue, setQueues, replaceQueues } = createBullBoard({ + queues: [new BullAdapter(getWebScraperQueue())], + serverAdapter: serverAdapter, + }); + + app.use( + `/admin/${process.env.BULL_AUTH_KEY}/queues`, + serverAdapter.getRouter() + ); + + app.get("/", (req, res) => { + res.send("SCRAPERS-JS: Hello, world! Fly.io"); + }); + + //write a simple test function + app.get("/test", async (req, res) => { + res.send("Hello, world!"); + }); + + // register router + app.use(v0Router); + + const DEFAULT_PORT = process.env.PORT ?? 3002; + const HOST = process.env.HOST ?? 
"localhost"; + redisClient.connect(); + + // HyperDX OpenTelemetry + if (process.env.ENV === "production") { + initSDK({ consoleCapture: true, additionalInstrumentations: [] }); } -}); -app.get(`/serverHealthCheck`, async (req, res) => { - try { - const webScraperQueue = getWebScraperQueue(); - const [waitingJobs] = await Promise.all([ - webScraperQueue.getWaitingCount(), - ]); - - const noWaitingJobs = waitingJobs === 0; - // 200 if no active jobs, 503 if there are active jobs - return res.status(noWaitingJobs ? 200 : 500).json({ - waitingJobs, + function startServer(port = DEFAULT_PORT) { + const server = app.listen(Number(port), HOST, () => { + console.log(`Worker ${process.pid} listening on port ${port}`); + console.log( + `For the UI, open http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues` + ); + console.log(""); + console.log("1. Make sure Redis is running on port 6379 by default"); + console.log( + "2. If you want to run nango, make sure you do port forwarding in 3002 using ngrok http 3002 " + ); }); - } catch (error) { - console.error(error); - return res.status(500).json({ error: error.message }); + return server; } -}); -app.get('/serverHealthCheck/notify', async (req, res) => { - if (process.env.SLACK_WEBHOOK_URL) { - const treshold = 1; // The treshold value for the active jobs - const timeout = 60000; // 1 minute // The timeout value for the check in milliseconds + if (require.main === module) { + startServer(); + } - const getWaitingJobsCount = async () => { + // Use this as a "health check" that way we dont destroy the server + app.get(`/admin/${process.env.BULL_AUTH_KEY}/queues`, async (req, res) => { + try { const webScraperQueue = getWebScraperQueue(); - const [waitingJobsCount] = await Promise.all([ + const [webScraperActive] = await Promise.all([ + webScraperQueue.getActiveCount(), + ]); + + const noActiveJobs = webScraperActive === 0; + // 200 if no active jobs, 503 if there are active jobs + return res.status(noActiveJobs ? 200 : 500).json({ + webScraperActive, + noActiveJobs, + }); + } catch (error) { + console.error(error); + return res.status(500).json({ error: error.message }); + } + }); + + app.get(`/serverHealthCheck`, async (req, res) => { + try { + const webScraperQueue = getWebScraperQueue(); + const [waitingJobs] = await Promise.all([ webScraperQueue.getWaitingCount(), ]); - return waitingJobsCount; - }; + const noWaitingJobs = waitingJobs === 0; + // 200 if no active jobs, 503 if there are active jobs + return res.status(noWaitingJobs ? 
200 : 500).json({ + waitingJobs, + }); + } catch (error) { + console.error(error); + return res.status(500).json({ error: error.message }); + } + }); - res.status(200).json({ message: "Check initiated" }); + app.get("/serverHealthCheck/notify", async (req, res) => { + if (process.env.SLACK_WEBHOOK_URL) { + const treshold = 1; // The treshold value for the active jobs + const timeout = 60000; // 1 minute // The timeout value for the check in milliseconds - const checkWaitingJobs = async () => { - try { - let waitingJobsCount = await getWaitingJobsCount(); - if (waitingJobsCount >= treshold) { - setTimeout(async () => { - // Re-check the waiting jobs count after the timeout - waitingJobsCount = await getWaitingJobsCount(); - if (waitingJobsCount >= treshold) { - const slackWebhookUrl = process.env.SLACK_WEBHOOK_URL; - const message = { - text: `⚠️ Warning: The number of active jobs (${waitingJobsCount}) has exceeded the threshold (${treshold}) for more than ${timeout/60000} minute(s).`, - }; + const getWaitingJobsCount = async () => { + const webScraperQueue = getWebScraperQueue(); + const [waitingJobsCount] = await Promise.all([ + webScraperQueue.getWaitingCount(), + ]); - const response = await fetch(slackWebhookUrl, { - method: 'POST', - headers: { - 'Content-Type': 'application/json', - }, - body: JSON.stringify(message), - }) - - if (!response.ok) { - console.error('Failed to send Slack notification') + return waitingJobsCount; + }; + + res.status(200).json({ message: "Check initiated" }); + + const checkWaitingJobs = async () => { + try { + let waitingJobsCount = await getWaitingJobsCount(); + if (waitingJobsCount >= treshold) { + setTimeout(async () => { + // Re-check the waiting jobs count after the timeout + waitingJobsCount = await getWaitingJobsCount(); + if (waitingJobsCount >= treshold) { + const slackWebhookUrl = process.env.SLACK_WEBHOOK_URL; + const message = { + text: `⚠️ Warning: The number of active jobs (${waitingJobsCount}) has exceeded the threshold (${treshold}) for more than ${ + timeout / 60000 + } minute(s).`, + }; + + const response = await fetch(slackWebhookUrl, { + method: "POST", + headers: { + "Content-Type": "application/json", + }, + body: JSON.stringify(message), + }); + + if (!response.ok) { + console.error("Failed to send Slack notification"); + } } - } - }, timeout); + }, timeout); + } + } catch (error) { + console.error(error); } + }; + + checkWaitingJobs(); + } + }); + + app.get( + `/admin/${process.env.BULL_AUTH_KEY}/clean-before-24h-complete-jobs`, + async (req, res) => { + try { + const webScraperQueue = getWebScraperQueue(); + const completedJobs = await webScraperQueue.getJobs(["completed"]); + const before24hJobs = completedJobs.filter( + (job) => job.finishedOn < Date.now() - 24 * 60 * 60 * 1000 + ); + const jobIds = before24hJobs.map((job) => job.id) as string[]; + let count = 0; + for (const jobId of jobIds) { + try { + await webScraperQueue.removeJobs(jobId); + count++; + } catch (jobError) { + console.error(`Failed to remove job with ID ${jobId}:`, jobError); + } + } + res.status(200).send(`Removed ${count} completed jobs.`); } catch (error) { - console.error(error); + console.error("Failed to clean last 24h complete jobs:", error); + res.status(500).send("Failed to clean jobs"); } - }; + } + ); - checkWaitingJobs(); - } -}); + app.get("/is-production", (req, res) => { + res.send({ isProduction: global.isProduction }); + }); - -app.get("/is-production", (req, res) => { - res.send({ isProduction: global.isProduction }); -}); - - -// /workers 
health check, cant act as load balancer, just has to be a pre deploy thing \ No newline at end of file + console.log(`Worker ${process.pid} started`); +} diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 5511623..92170c1 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -18,6 +18,8 @@ export type PageOptions = { waitFor?: number; screenshot?: boolean; headers?: Record; + replaceAllPathsWithAbsolutePaths?: boolean; + removeTags?: string | string[]; }; export type ExtractorOptions = { @@ -35,20 +37,24 @@ export type SearchOptions = { location?: string; }; +export type CrawlerOptions = { + returnOnlyUrls?: boolean; + includes?: string[]; + excludes?: string[]; + maxCrawledLinks?: number; + maxDepth?: number; + limit?: number; + generateImgAltText?: boolean; + replaceAllPathsWithAbsolutePaths?: boolean; + ignoreSitemap?: boolean; + mode?: "default" | "fast"; // have a mode of some sort + allowBackwardCrawling?: boolean; +} + export type WebScraperOptions = { urls: string[]; mode: "single_urls" | "sitemap" | "crawl"; - crawlerOptions?: { - returnOnlyUrls?: boolean; - includes?: string[]; - excludes?: string[]; - maxCrawledLinks?: number; - maxDepth?: number; - limit?: number; - generateImgAltText?: boolean; - replaceAllPathsWithAbsolutePaths?: boolean; - mode?: "default" | "fast"; // have a mode of some sort - }; + crawlerOptions?: CrawlerOptions; pageOptions?: PageOptions; extractorOptions?: ExtractorOptions; concurrentRequests?: number; diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 9340aa8..7720991 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -3,7 +3,7 @@ import cheerio, { load } from "cheerio"; import { URL } from "url"; import { getLinksFromSitemap } from "./sitemap"; import async from "async"; -import { Progress } from "../../lib/entities"; +import { CrawlerOptions, PageOptions, Progress } from "../../lib/entities"; import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url"; import robotsParser from "robots-parser"; @@ -20,6 +20,7 @@ export class WebCrawler { private robotsTxtUrl: string; private robots: any; private generateImgAltText: boolean; + private allowBackwardCrawling: boolean; constructor({ initialUrl, @@ -29,6 +30,7 @@ export class WebCrawler { limit = 10000, generateImgAltText = false, maxCrawledDepth = 10, + allowBackwardCrawling = false }: { initialUrl: string; includes?: string[]; @@ -37,6 +39,7 @@ export class WebCrawler { limit?: number; generateImgAltText?: boolean; maxCrawledDepth?: number; + allowBackwardCrawling?: boolean; }) { this.initialUrl = initialUrl; this.baseUrl = new URL(initialUrl).origin; @@ -49,6 +52,7 @@ export class WebCrawler { this.maxCrawledLinks = maxCrawledLinks ?? limit; this.maxCrawledDepth = maxCrawledDepth ?? 10; this.generateImgAltText = generateImgAltText ?? false; + this.allowBackwardCrawling = allowBackwardCrawling ?? 
false; } private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] { @@ -90,10 +94,16 @@ export class WebCrawler { const linkHostname = normalizedLink.hostname.replace(/^www\./, ''); // Ensure the protocol and hostname match, and the path starts with the initial URL's path - if (linkHostname !== initialHostname || !normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)) { + if (linkHostname !== initialHostname) { return false; } + if (!this.allowBackwardCrawling) { + if (!normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)) { + return false; + } + } + const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? true; // Check if the link is disallowed by robots.txt if (!isAllowed) { @@ -108,6 +118,8 @@ export class WebCrawler { public async start( inProgress?: (progress: Progress) => void, + pageOptions?: PageOptions, + crawlerOptions?: CrawlerOptions, concurrencyLimit: number = 5, limit: number = 10000, maxDepth: number = 10 @@ -122,17 +134,21 @@ export class WebCrawler { } - const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl); - if (sitemapLinks.length > 0) { - let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth); - return filteredLinks.map(link => ({ url: link, html: "" })); + if(!crawlerOptions?.ignoreSitemap){ + const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl); + if (sitemapLinks.length > 0) { + let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth); + return filteredLinks.map(link => ({ url: link, html: "" })); + } } const urls = await this.crawlUrls( [this.initialUrl], + pageOptions, concurrencyLimit, inProgress ); + if ( urls.length === 0 && this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0 @@ -140,14 +156,15 @@ export class WebCrawler { return [{ url: this.initialUrl, html: "" }]; } - // make sure to run include exclude here again const filteredUrls = this.filterLinks(urls.map(urlObj => urlObj.url), limit, this.maxCrawledDepth); + return filteredUrls.map(url => ({ url, html: urls.find(urlObj => urlObj.url === url)?.html || "" })); } private async crawlUrls( urls: string[], + pageOptions: PageOptions, concurrencyLimit: number, inProgress?: (progress: Progress) => void, ): Promise<{ url: string, html: string }[]> { @@ -158,7 +175,7 @@ export class WebCrawler { } return; } - const newUrls = await this.crawl(task); + const newUrls = await this.crawl(task, pageOptions); // add the initial url if not already added // if (this.visited.size === 1) { // let normalizedInitial = this.initialUrl; @@ -188,7 +205,7 @@ export class WebCrawler { currentDocumentUrl: task, }); } - await this.crawlUrls(newUrls.map((p) => p.url), concurrencyLimit, inProgress); + await this.crawlUrls(newUrls.map((p) => p.url), pageOptions, concurrencyLimit, inProgress); if (callback && typeof callback === "function") { callback(); } @@ -207,46 +224,42 @@ export class WebCrawler { return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html })); } - async crawl(url: string): Promise<{url: string, html: string}[]> { - if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")){ + async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string}[]> { + const normalizedUrl = this.normalizeCrawlUrl(url); + if (this.visited.has(normalizedUrl) || !this.robots.isAllowed(url, "FireCrawlAgent")) { return []; } - this.visited.add(url); - + this.visited.add(normalizedUrl); if (!url.startsWith("http")) { url = "https://" + 
url; - } if (url.endsWith("/")) { url = url.slice(0, -1); - } - + if (this.isFile(url) || this.isSocialMediaOrEmail(url)) { return []; } try { - let content : string = ""; + let content: string = ""; // If it is the first link, fetch with single url if (this.visited.size === 1) { - const page = await scrapSingleUrl(url, {includeHtml: true}); - content = page.html ?? "" + const page = await scrapSingleUrl(url, { ...pageOptions, includeHtml: true }); + content = page.html ?? ""; } else { const response = await axios.get(url); content = response.data ?? ""; } const $ = load(content); - let links: {url: string, html: string}[] = []; + let links: { url: string, html: string }[] = []; // Add the initial URL to the list of links - if(this.visited.size === 1) - { - links.push({url, html: content}); + if (this.visited.size === 1) { + links.push({ url, html: content }); } - $("a").each((_, element) => { const href = $(element).attr("href"); if (href) { @@ -254,32 +267,43 @@ export class WebCrawler { if (!href.startsWith("http")) { fullUrl = new URL(href, this.baseUrl).toString(); } - const url = new URL(fullUrl); - const path = url.pathname; + const urlObj = new URL(fullUrl); + const path = urlObj.pathname; if ( this.isInternalLink(fullUrl) && this.matchesPattern(fullUrl) && this.noSections(fullUrl) && - this.matchesIncludes(path) && + // The idea here to comment this out is to allow wider website coverage as we filter this anyway afterwards + // this.matchesIncludes(path) && !this.matchesExcludes(path) && this.robots.isAllowed(fullUrl, "FireCrawlAgent") ) { - links.push({url: fullUrl, html: content}); + links.push({ url: fullUrl, html: content }); } } }); - if(this.visited.size === 1){ + if (this.visited.size === 1) { return links; } // Create a new list to return to avoid modifying the visited list - return links.filter((link) => !this.visited.has(link.url)); + return links.filter((link) => !this.visited.has(this.normalizeCrawlUrl(link.url))); } catch (error) { return []; } } + private normalizeCrawlUrl(url: string): string { + try{ + const urlObj = new URL(url); + urlObj.searchParams.sort(); // Sort query parameters to normalize + return urlObj.toString(); + } catch (error) { + return url; + } + } + private matchesIncludes(url: string): boolean { if (this.includes.length === 0 || this.includes[0] == "") return true; return this.includes.some((pattern) => new RegExp(pattern).test(url)); @@ -388,7 +412,6 @@ export class WebCrawler { // Normalize and check if the URL is present in any of the sitemaps const normalizedUrl = normalizeUrl(url); - const normalizedSitemapLinks = sitemapLinks.map(link => normalizeUrl(link)); // has to be greater than 0 to avoid adding the initial URL to the sitemap links, and preventing crawler to crawl diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index e3a3cc6..1a6ffd0 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -31,12 +31,14 @@ export class WebScraperDataProvider { private limit: number = 10000; private concurrentRequests: number = 20; private generateImgAltText: boolean = false; + private ignoreSitemap: boolean = false; private pageOptions?: PageOptions; private extractorOptions?: ExtractorOptions; private replaceAllPathsWithAbsolutePaths?: boolean = false; private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" = "gpt-4-turbo"; private crawlerMode: string = "default"; + private allowBackwardCrawling: boolean = false; authorize(): void { throw new 
Error("Method not implemented."); @@ -169,10 +171,15 @@ export class WebScraperDataProvider { maxCrawledDepth: this.maxCrawledDepth, limit: this.limit, generateImgAltText: this.generateImgAltText, + allowBackwardCrawling: this.allowBackwardCrawling, }); let links = await crawler.start( inProgress, + this.pageOptions, + { + ignoreSitemap: this.ignoreSitemap, + }, 5, this.limit, this.maxCrawledDepth @@ -296,9 +303,10 @@ export class WebScraperDataProvider { } private applyPathReplacements(documents: Document[]): Document[] { - return this.replaceAllPathsWithAbsolutePaths - ? replacePathsWithAbsolutePaths(documents) - : replaceImgPathsWithAbsolutePaths(documents); + if (this.replaceAllPathsWithAbsolutePaths) { + documents = replacePathsWithAbsolutePaths(documents); + } + return replaceImgPathsWithAbsolutePaths(documents); } private async applyImgAltText(documents: Document[]): Promise { @@ -467,12 +475,19 @@ export class WebScraperDataProvider { this.limit = options.crawlerOptions?.limit ?? 10000; this.generateImgAltText = options.crawlerOptions?.generateImgAltText ?? false; - this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false }; + this.pageOptions = options.pageOptions ?? { + onlyMainContent: false, + includeHtml: false, + replaceAllPathsWithAbsolutePaths: false, + removeTags: [] + }; this.extractorOptions = options.extractorOptions ?? {mode: "markdown"} - this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false; + this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false; //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check this.excludes = this.excludes.filter((item) => item !== ""); this.crawlerMode = options.crawlerOptions?.mode ?? "default"; + this.ignoreSitemap = options.crawlerOptions?.ignoreSitemap ?? false; + this.allowBackwardCrawling = options.crawlerOptions?.allowBackwardCrawling ?? 
false; // make sure all urls start with https:// this.urls = this.urls.map((url) => { diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 689d5e7..a16f6f0 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -8,6 +8,7 @@ import { excludeNonMainTags } from "./utils/excludeTags"; import { urlSpecificParams } from "./utils/custom/website_params"; import { fetchAndProcessPdf } from "./utils/pdfProcessor"; import { handleCustomScraping } from "./custom/handleCustomScraping"; +import axios from "axios"; dotenv.config(); @@ -19,6 +20,8 @@ const baseScrapers = [ "fetch", ] as const; +const universalTimeout = 15000; + export async function generateRequestParams( url: string, wait_browser: string = "domcontentloaded", @@ -59,21 +62,24 @@ export async function scrapWithFireEngine( `[Fire-Engine] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam}` ); - const response = await fetch(process.env.FIRE_ENGINE_BETA_URL + "/scrape", { - method: "POST", - headers: { - "Content-Type": "application/json", - }, - body: JSON.stringify({ + const response = await axios.post( + process.env.FIRE_ENGINE_BETA_URL + "/scrape", + { url: url, wait: waitParam, screenshot: screenshotParam, headers: headers, - pageOptions: pageOptions - }), - }); + pageOptions: pageOptions, + }, + { + headers: { + "Content-Type": "application/json", + }, + timeout: universalTimeout + waitParam + } + ); - if (!response.ok) { + if (response.status !== 200) { console.error( `[Fire-Engine] Error fetching url: ${url} with status: ${response.status}` ); @@ -84,13 +90,17 @@ export async function scrapWithFireEngine( if (contentType && contentType.includes("application/pdf")) { return { html: await fetchAndProcessPdf(url), screenshot: "" }; } else { - const data = await response.json(); + const data = response.data; const html = data.content; const screenshot = data.screenshot; return { html: html ?? "", screenshot: screenshot ?? "" }; } } catch (error) { - console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`); + if (error.code === 'ECONNABORTED') { + console.log(`[Fire-Engine] Request timed out for ${url}`); + } else { + console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`); + } return { html: "", screenshot: "" }; } } @@ -98,7 +108,7 @@ export async function scrapWithFireEngine( export async function scrapWithScrapingBee( url: string, wait_browser: string = "domcontentloaded", - timeout: number = 15000 + timeout: number = universalTimeout ): Promise { try { const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY); @@ -141,15 +151,19 @@ export async function scrapWithPlaywright( // If the user has passed a wait parameter in the request, use that const waitParam = reqParams["params"]?.wait ?? 
waitFor; - const response = await fetch(process.env.PLAYWRIGHT_MICROSERVICE_URL, { - method: "POST", + const response = await axios.post(process.env.PLAYWRIGHT_MICROSERVICE_URL, { + url: url, + wait_after_load: waitParam, + headers: headers, + }, { headers: { "Content-Type": "application/json", }, - body: JSON.stringify({ url: url, wait_after_load: waitParam, headers: headers }), + timeout: universalTimeout + waitParam, // Add waitParam to timeout to account for the wait time + transformResponse: [(data) => data] // Prevent axios from parsing JSON automatically }); - if (!response.ok) { + if (response.status !== 200) { console.error( `[Playwright] Error fetching url: ${url} with status: ${response.status}` ); @@ -160,7 +174,7 @@ export async function scrapWithPlaywright( if (contentType && contentType.includes("application/pdf")) { return fetchAndProcessPdf(url); } else { - const textData = await response.text(); + const textData = response.data; try { const data = JSON.parse(textData); const html = data.content; @@ -171,17 +185,28 @@ export async function scrapWithPlaywright( } } } catch (error) { - console.error(`[Playwright] Error fetching url: ${url} -> ${error}`); + if (error.code === 'ECONNABORTED') { + console.log(`[Playwright] Request timed out for ${url}`); + } else { + console.error(`[Playwright] Error fetching url: ${url} -> ${error}`); + } return ""; } } export async function scrapWithFetch(url: string): Promise { try { - const response = await fetch(url); - if (!response.ok) { + const response = await axios.get(url, { + headers: { + "Content-Type": "application/json", + }, + timeout: universalTimeout, + transformResponse: [(data) => data] // Prevent axios from parsing JSON automatically + }); + + if (response.status !== 200) { console.error( - `[Fetch] Error fetching url: ${url} with status: ${response.status}` + `[Axios] Error fetching url: ${url} with status: ${response.status}` ); return ""; } @@ -190,11 +215,15 @@ export async function scrapWithFetch(url: string): Promise { if (contentType && contentType.includes("application/pdf")) { return fetchAndProcessPdf(url); } else { - const text = await response.text(); + const text = response.data; return text; } } catch (error) { - console.error(`[Fetch][c] Error fetching url: ${url} -> ${error}`); + if (error.code === 'ECONNABORTED') { + console.log(`[Axios] Request timed out for ${url}`); + } else { + console.error(`[Axios] Error fetching url: ${url} -> ${error}`); + } return ""; } } @@ -275,6 +304,19 @@ export async function scrapSingleUrl( const removeUnwantedElements = (html: string, pageOptions: PageOptions) => { const soup = cheerio.load(html); soup("script, style, iframe, noscript, meta, head").remove(); + + if (pageOptions.removeTags) { + if (typeof pageOptions.removeTags === 'string') { + pageOptions.removeTags.split(',').forEach((tag) => { + soup(tag.trim()).remove(); + }); + } else if (Array.isArray(pageOptions.removeTags)) { + pageOptions.removeTags.forEach((tag) => { + soup(tag).remove(); + }); + } + } + if (pageOptions.onlyMainContent) { // remove any other tags that are not in the main content excludeNonMainTags.forEach((tag) => { diff --git a/apps/api/src/scraper/WebScraper/sitemap.ts b/apps/api/src/scraper/WebScraper/sitemap.ts index 0ac4338..c6dbf11 100644 --- a/apps/api/src/scraper/WebScraper/sitemap.ts +++ b/apps/api/src/scraper/WebScraper/sitemap.ts @@ -12,6 +12,7 @@ export async function getLinksFromSitemap( content = response.data; } catch (error) { console.error(`Request failed for ${sitemapUrl}: 
${error}`); + return allUrls; } diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts index aae567c..e201926 100644 --- a/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts +++ b/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts @@ -6,12 +6,14 @@ describe('replacePaths', () => { it('should replace relative paths with absolute paths', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'This is a [link](/path/to/resource) and an image ![alt text](/path/to/image.jpg).' + content: 'This is a [link](/path/to/resource).', + markdown: 'This is a [link](/path/to/resource).' }]; const expectedDocuments: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'This is a [link](https://example.com/path/to/resource) and an image ![alt text](https://example.com/path/to/image.jpg).' + content: 'This is a [link](https://example.com/path/to/resource).', + markdown: 'This is a [link](https://example.com/path/to/resource).' }]; const result = replacePathsWithAbsolutePaths(documents); @@ -21,7 +23,8 @@ describe('replacePaths', () => { it('should not alter absolute URLs', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'This is an [external link](https://external.com/path) and an image ![alt text](https://example.com/path/to/image.jpg).' + content: 'This is an [external link](https://external.com/path).', + markdown: 'This is an [external link](https://external.com/path).' }]; const result = replacePathsWithAbsolutePaths(documents); @@ -31,7 +34,8 @@ describe('replacePaths', () => { it('should not alter data URLs for images', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'This is an image: ![alt text](data:image/png;base64,ABC123==).' + content: 'This is an image: ![alt text](data:image/png;base64,ABC123==).', + markdown: 'This is an image: ![alt text](data:image/png;base64,ABC123==).' }]; const result = replacePathsWithAbsolutePaths(documents); @@ -41,12 +45,14 @@ describe('replacePaths', () => { it('should handle multiple links and images correctly', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Here are two links: [link1](/path1) and [link2](/path2), and two images: ![img1](/img1.jpg) ![img2](/img2.jpg).' + content: 'Here are two links: [link1](/path1) and [link2](/path2).', + markdown: 'Here are two links: [link1](/path1) and [link2](/path2).' }]; const expectedDocuments: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2), and two images: ![img1](https://example.com/img1.jpg) ![img2](https://example.com/img2.jpg).' + content: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2).', + markdown: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2).' }]; const result = replacePathsWithAbsolutePaths(documents); @@ -56,12 +62,14 @@ describe('replacePaths', () => { it('should correctly handle a mix of absolute and relative paths', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Mixed paths: [relative](/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).' 
+ content: 'Mixed paths: [relative](/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).', + markdown: 'Mixed paths: [relative](/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).' }]; const expectedDocuments: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Mixed paths: [relative](https://example.com/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).' + content: 'Mixed paths: [relative](https://example.com/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).', + markdown: 'Mixed paths: [relative](https://example.com/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).' }]; const result = replacePathsWithAbsolutePaths(documents); @@ -74,12 +82,14 @@ describe('replacePaths', () => { it('should replace relative image paths with absolute paths', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Here is an image: ![alt text](/path/to/image.jpg).' + content: 'Here is an image: ![alt text](/path/to/image.jpg).', + markdown: 'Here is an image: ![alt text](/path/to/image.jpg).' }]; const expectedDocuments: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Here is an image: ![alt text](https://example.com/path/to/image.jpg).' + content: 'Here is an image: ![alt text](https://example.com/path/to/image.jpg).', + markdown: 'Here is an image: ![alt text](https://example.com/path/to/image.jpg).' }]; const result = replaceImgPathsWithAbsolutePaths(documents); @@ -89,7 +99,8 @@ describe('replacePaths', () => { it('should not alter data:image URLs', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'An image with a data URL: ![alt text](data:image/png;base64,ABC123==).' + content: 'An image with a data URL: ![alt text](data:image/png;base64,ABC123==).', + markdown: 'An image with a data URL: ![alt text](data:image/png;base4,ABC123==).' }]; const result = replaceImgPathsWithAbsolutePaths(documents); @@ -99,12 +110,14 @@ describe('replacePaths', () => { it('should handle multiple images with a mix of data and relative URLs', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Multiple images: ![img1](/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](/img3.jpg).' + content: 'Multiple images: ![img1](/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](/img3.jpg).', + markdown: 'Multiple images: ![img1](/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](/img3.jpg).' }]; const expectedDocuments: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Multiple images: ![img1](https://example.com/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](https://example.com/img3.jpg).' + content: 'Multiple images: ![img1](https://example.com/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](https://example.com/img3.jpg).', + markdown: 'Multiple images: ![img1](https://example.com/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](https://example.com/img3.jpg).' 
}]; const result = replaceImgPathsWithAbsolutePaths(documents); diff --git a/apps/api/src/scraper/WebScraper/utils/replacePaths.ts b/apps/api/src/scraper/WebScraper/utils/replacePaths.ts index d652611..788916c 100644 --- a/apps/api/src/scraper/WebScraper/utils/replacePaths.ts +++ b/apps/api/src/scraper/WebScraper/utils/replacePaths.ts @@ -10,7 +10,8 @@ export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[] ) || []; paths.forEach((path: string) => { - const isImage = path.startsWith("!"); + try { + const isImage = path.startsWith("!"); let matchedUrl = path.match(/\(([^)]+)\)/) || path.match(/href="([^"]+)"/); let url = matchedUrl[1]; @@ -22,18 +23,18 @@ export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[] } const markdownLinkOrImageText = path.match(/(!?\[.*?\])/)[0]; - if (isImage) { - document.content = document.content.replace( - path, - `${markdownLinkOrImageText}(${url})` - ); - } else { + // Image is handled afterwards + if (!isImage) { document.content = document.content.replace( path, `${markdownLinkOrImageText}(${url})` ); + } + } catch (error) { + } }); + document.markdown = document.content; }); return documents; @@ -60,8 +61,10 @@ export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Documen if (!imageUrl.startsWith("http")) { if (imageUrl.startsWith("/")) { imageUrl = imageUrl.substring(1); + imageUrl = new URL(imageUrl, baseUrl).toString(); + } else { + imageUrl = new URL(imageUrl, document.metadata.sourceURL).toString(); } - imageUrl = new URL(imageUrl, baseUrl).toString(); } } @@ -70,6 +73,7 @@ export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Documen `![${altText}](${imageUrl})` ); }); + document.markdown = document.content; }); return documents; diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 6772c57..a42b3e8 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -38,7 +38,7 @@ getWebScraperQueue().process( error: message /* etc... */, }; - await callWebhook(job.data.team_id, data); + await callWebhook(job.data.team_id, job.id as string, data); await logJob({ success: success, @@ -78,7 +78,7 @@ getWebScraperQueue().process( error: "Something went wrong... Contact help@mendable.ai or try again." /* etc... */, }; - await callWebhook(job.data.team_id, data); + await callWebhook(job.data.team_id, job.id as string, data); await logJob({ success: false, message: typeof error === 'string' ? error : (error.message ?? "Something went wrong... 
Contact help@mendable.ai"), diff --git a/apps/api/src/services/redis.ts b/apps/api/src/services/redis.ts index f2cedd1..491eeb1 100644 --- a/apps/api/src/services/redis.ts +++ b/apps/api/src/services/redis.ts @@ -1,8 +1,35 @@ -import Redis from 'ioredis'; +import Redis from "ioredis"; // Initialize Redis client const redis = new Redis(process.env.REDIS_URL); +// Listen to 'error' events to the Redis connection +redis.on("error", (error) => { + try { + if (error.message === "ECONNRESET") { + console.log("Connection to Redis Session Store timed out."); + } else if (error.message === "ECONNREFUSED") { + console.log("Connection to Redis Session Store refused!"); + } else console.log(error); + } catch (error) {} +}); + +// Listen to 'reconnecting' event to Redis +redis.on("reconnecting", (err) => { + try { + if (redis.status === "reconnecting") + console.log("Reconnecting to Redis Session Store..."); + else console.log("Error reconnecting to Redis Session Store."); + } catch (error) {} +}); + +// Listen to the 'connect' event to Redis +redis.on("connect", (err) => { + try { + if (!err) console.log("Connected to Redis Session Store!"); + } catch (error) {} +}); + /** * Set a value in Redis with an optional expiration time. * @param {string} key The key under which to store the value. @@ -11,7 +38,7 @@ const redis = new Redis(process.env.REDIS_URL); */ const setValue = async (key: string, value: string, expire?: number) => { if (expire) { - await redis.set(key, value, 'EX', expire); + await redis.set(key, value, "EX", expire); } else { await redis.set(key, value); } diff --git a/apps/api/src/services/webhook.ts b/apps/api/src/services/webhook.ts index eca7d09..fc5962b 100644 --- a/apps/api/src/services/webhook.ts +++ b/apps/api/src/services/webhook.ts @@ -1,17 +1,19 @@ import { supabase_service } from "./supabase"; -export const callWebhook = async (teamId: string, data: any) => { +export const callWebhook = async (teamId: string, jobId: string,data: any) => { try { const selfHostedUrl = process.env.SELF_HOSTED_WEBHOOK_URL; + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; let webhookUrl = selfHostedUrl; - if (!selfHostedUrl) { + // Only fetch the webhook URL from the database if the self-hosted webhook URL is not set + // and the USE_DB_AUTHENTICATION environment variable is set to true + if (!selfHostedUrl && useDbAuthentication) { const { data: webhooksData, error } = await supabase_service .from("webhooks") .select("url") .eq("team_id", teamId) .limit(1); - if (error) { console.error( `Error fetching webhook URL for team ID: ${teamId}`, @@ -45,6 +47,7 @@ export const callWebhook = async (teamId: string, data: any) => { }, body: JSON.stringify({ success: data.success, + jobId: jobId, data: dataToSend, error: data.error || undefined, }), diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index ecb017f..2fe16ba 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -1,3 +1,57 @@ +""" +This is the Firecrawl package. + +This package provides a Python SDK for interacting with the Firecrawl API. +It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs, +and check the status of these jobs. 
diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py
index ecb017f..2fe16ba 100644
--- a/apps/python-sdk/firecrawl/__init__.py
+++ b/apps/python-sdk/firecrawl/__init__.py
@@ -1,3 +1,57 @@
+"""
+This is the Firecrawl package.
+
+This package provides a Python SDK for interacting with the Firecrawl API.
+It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
+and check the status of these jobs.
+
+For more information visit https://github.com/firecrawl/
+"""
+
+import logging
+import os
+
 from .firecrawl import FirecrawlApp
 
-__version__ = "0.0.14"
+__version__ = "0.0.15"
+
+# Define the logger for the Firecrawl project
+logger: logging.Logger = logging.getLogger("firecrawl")
+
+
+def _basic_config() -> None:
+    """Set up basic configuration for logging with a specific format and date format."""
+    try:
+        logging.basicConfig(
+            format="[%(asctime)s - %(name)s:%(lineno)d - %(levelname)s] %(message)s",
+            datefmt="%Y-%m-%d %H:%M:%S",
+        )
+    except Exception as e:
+        logger.error("Failed to configure logging: %s", e)
+
+
+def setup_logging() -> None:
+    """Set up logging based on the FIRECRAWL_LOGGING_LEVEL environment variable."""
+    env = os.environ.get(
+        "FIRECRAWL_LOGGING_LEVEL", "INFO"
+    ).upper()  # Default to 'INFO' level
+    _basic_config()
+
+    if env == "DEBUG":
+        logger.setLevel(logging.DEBUG)
+    elif env == "INFO":
+        logger.setLevel(logging.INFO)
+    elif env == "WARNING":
+        logger.setLevel(logging.WARNING)
+    elif env == "ERROR":
+        logger.setLevel(logging.ERROR)
+    elif env == "CRITICAL":
+        logger.setLevel(logging.CRITICAL)
+    else:
+        logger.setLevel(logging.INFO)
+        logger.warning("Unknown logging level: %s, defaulting to INFO", env)
+
+
+# Initialize logging configuration when the module is imported
+setup_logging()
+logger.debug("Debugging logger setup")
diff --git a/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/__pycache__/test.cpython-311-pytest-8.2.1.pyc b/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/__pycache__/test.cpython-311-pytest-8.2.1.pyc
new file mode 100644
index 0000000..5ba1f13
Binary files /dev/null and b/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/__pycache__/test.cpython-311-pytest-8.2.1.pyc differ
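Because setup_logging() runs when the package is imported, the FIRECRAWL_LOGGING_LEVEL environment variable has to be set before the import. A short usage sketch; the API key value is a placeholder:

```python
import os

# Must be set before importing firecrawl, since setup_logging() runs on import.
os.environ["FIRECRAWL_LOGGING_LEVEL"] = "DEBUG"

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR_API_KEY")  # placeholder key
result = app.scrape_url("https://example.com")
print(result)
```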
""" - +import logging import os import time from typing import Any, Dict, Optional import requests +logger : logging.Logger = logging.getLogger("firecrawl") class FirecrawlApp: """ @@ -28,8 +29,14 @@ class FirecrawlApp: def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None: self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY') if self.api_key is None: + logger.warning("No API key provided") raise ValueError('No API key provided') + else: + logger.debug("Initialized FirecrawlApp with API key: %s", self.api_key) + self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev') + if self.api_url != 'https://api.firecrawl.dev': + logger.debug("Initialized FirecrawlApp with API URL: %s", self.api_url) def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any: """ diff --git a/examples/k8n/README.md b/examples/kubernetes-cluster-install/README.md similarity index 100% rename from examples/k8n/README.md rename to examples/kubernetes-cluster-install/README.md diff --git a/examples/k8n/api.yaml b/examples/kubernetes-cluster-install/api.yaml similarity index 100% rename from examples/k8n/api.yaml rename to examples/kubernetes-cluster-install/api.yaml diff --git a/examples/k8n/configmap.yaml b/examples/kubernetes-cluster-install/configmap.yaml similarity index 100% rename from examples/k8n/configmap.yaml rename to examples/kubernetes-cluster-install/configmap.yaml diff --git a/examples/k8n/playwright-service.yaml b/examples/kubernetes-cluster-install/playwright-service.yaml similarity index 100% rename from examples/k8n/playwright-service.yaml rename to examples/kubernetes-cluster-install/playwright-service.yaml diff --git a/examples/k8n/redis.yaml b/examples/kubernetes-cluster-install/redis.yaml similarity index 100% rename from examples/k8n/redis.yaml rename to examples/kubernetes-cluster-install/redis.yaml diff --git a/examples/k8n/secret.yaml b/examples/kubernetes-cluster-install/secret.yaml similarity index 100% rename from examples/k8n/secret.yaml rename to examples/kubernetes-cluster-install/secret.yaml diff --git a/examples/k8n/worker.yaml b/examples/kubernetes-cluster-install/worker.yaml similarity index 100% rename from examples/k8n/worker.yaml rename to examples/kubernetes-cluster-install/worker.yaml diff --git a/examples/roastmywebsite/.eslintrc.json b/examples/roastmywebsite-example-app/.eslintrc.json similarity index 100% rename from examples/roastmywebsite/.eslintrc.json rename to examples/roastmywebsite-example-app/.eslintrc.json diff --git a/examples/roastmywebsite/.gitignore b/examples/roastmywebsite-example-app/.gitignore similarity index 100% rename from examples/roastmywebsite/.gitignore rename to examples/roastmywebsite-example-app/.gitignore diff --git a/examples/roastmywebsite/README.md b/examples/roastmywebsite-example-app/README.md similarity index 100% rename from examples/roastmywebsite/README.md rename to examples/roastmywebsite-example-app/README.md diff --git a/examples/roastmywebsite/components.json b/examples/roastmywebsite-example-app/components.json similarity index 100% rename from examples/roastmywebsite/components.json rename to examples/roastmywebsite-example-app/components.json diff --git a/examples/roastmywebsite/next.config.mjs b/examples/roastmywebsite-example-app/next.config.mjs similarity index 100% rename from examples/roastmywebsite/next.config.mjs rename to examples/roastmywebsite-example-app/next.config.mjs diff --git a/examples/roastmywebsite/package-lock.json 
diff --git a/examples/k8n/README.md b/examples/kubernetes-cluster-install/README.md
similarity index 100%
rename from examples/k8n/README.md
rename to examples/kubernetes-cluster-install/README.md
diff --git a/examples/k8n/api.yaml b/examples/kubernetes-cluster-install/api.yaml
similarity index 100%
rename from examples/k8n/api.yaml
rename to examples/kubernetes-cluster-install/api.yaml
diff --git a/examples/k8n/configmap.yaml b/examples/kubernetes-cluster-install/configmap.yaml
similarity index 100%
rename from examples/k8n/configmap.yaml
rename to examples/kubernetes-cluster-install/configmap.yaml
diff --git a/examples/k8n/playwright-service.yaml b/examples/kubernetes-cluster-install/playwright-service.yaml
similarity index 100%
rename from examples/k8n/playwright-service.yaml
rename to examples/kubernetes-cluster-install/playwright-service.yaml
diff --git a/examples/k8n/redis.yaml b/examples/kubernetes-cluster-install/redis.yaml
similarity index 100%
rename from examples/k8n/redis.yaml
rename to examples/kubernetes-cluster-install/redis.yaml
diff --git a/examples/k8n/secret.yaml b/examples/kubernetes-cluster-install/secret.yaml
similarity index 100%
rename from examples/k8n/secret.yaml
rename to examples/kubernetes-cluster-install/secret.yaml
diff --git a/examples/k8n/worker.yaml b/examples/kubernetes-cluster-install/worker.yaml
similarity index 100%
rename from examples/k8n/worker.yaml
rename to examples/kubernetes-cluster-install/worker.yaml
diff --git a/examples/roastmywebsite/.eslintrc.json b/examples/roastmywebsite-example-app/.eslintrc.json
similarity index 100%
rename from examples/roastmywebsite/.eslintrc.json
rename to examples/roastmywebsite-example-app/.eslintrc.json
diff --git a/examples/roastmywebsite/.gitignore b/examples/roastmywebsite-example-app/.gitignore
similarity index 100%
rename from examples/roastmywebsite/.gitignore
rename to examples/roastmywebsite-example-app/.gitignore
diff --git a/examples/roastmywebsite/README.md b/examples/roastmywebsite-example-app/README.md
similarity index 100%
rename from examples/roastmywebsite/README.md
rename to examples/roastmywebsite-example-app/README.md
diff --git a/examples/roastmywebsite/components.json b/examples/roastmywebsite-example-app/components.json
similarity index 100%
rename from examples/roastmywebsite/components.json
rename to examples/roastmywebsite-example-app/components.json
diff --git a/examples/roastmywebsite/next.config.mjs b/examples/roastmywebsite-example-app/next.config.mjs
similarity index 100%
rename from examples/roastmywebsite/next.config.mjs
rename to examples/roastmywebsite-example-app/next.config.mjs
diff --git a/examples/roastmywebsite/package-lock.json b/examples/roastmywebsite-example-app/package-lock.json
similarity index 100%
rename from examples/roastmywebsite/package-lock.json
rename to examples/roastmywebsite-example-app/package-lock.json
diff --git a/examples/roastmywebsite/package.json b/examples/roastmywebsite-example-app/package.json
similarity index 100%
rename from examples/roastmywebsite/package.json
rename to examples/roastmywebsite-example-app/package.json
diff --git a/examples/roastmywebsite/postcss.config.mjs b/examples/roastmywebsite-example-app/postcss.config.mjs
similarity index 100%
rename from examples/roastmywebsite/postcss.config.mjs
rename to examples/roastmywebsite-example-app/postcss.config.mjs
diff --git a/examples/roastmywebsite/public/android-chrome-192x192.png b/examples/roastmywebsite-example-app/public/android-chrome-192x192.png
similarity index 100%
rename from examples/roastmywebsite/public/android-chrome-192x192.png
rename to examples/roastmywebsite-example-app/public/android-chrome-192x192.png
diff --git a/examples/roastmywebsite/public/android-chrome-512x512.png b/examples/roastmywebsite-example-app/public/android-chrome-512x512.png
similarity index 100%
rename from examples/roastmywebsite/public/android-chrome-512x512.png
rename to examples/roastmywebsite-example-app/public/android-chrome-512x512.png
diff --git a/examples/roastmywebsite/public/apple-touch-icon.png b/examples/roastmywebsite-example-app/public/apple-touch-icon.png
similarity index 100%
rename from examples/roastmywebsite/public/apple-touch-icon.png
rename to examples/roastmywebsite-example-app/public/apple-touch-icon.png
diff --git a/examples/roastmywebsite/public/bgd.png b/examples/roastmywebsite-example-app/public/bgd.png
similarity index 100%
rename from examples/roastmywebsite/public/bgd.png
rename to examples/roastmywebsite-example-app/public/bgd.png
diff --git a/examples/roastmywebsite/public/favicon-16x16.png b/examples/roastmywebsite-example-app/public/favicon-16x16.png
similarity index 100%
rename from examples/roastmywebsite/public/favicon-16x16.png
rename to examples/roastmywebsite-example-app/public/favicon-16x16.png
diff --git a/examples/roastmywebsite/public/favicon-32x32.png b/examples/roastmywebsite-example-app/public/favicon-32x32.png
similarity index 100%
rename from examples/roastmywebsite/public/favicon-32x32.png
rename to examples/roastmywebsite-example-app/public/favicon-32x32.png
diff --git a/examples/roastmywebsite/public/favicon.ico b/examples/roastmywebsite-example-app/public/favicon.ico
similarity index 100%
rename from examples/roastmywebsite/public/favicon.ico
rename to examples/roastmywebsite-example-app/public/favicon.ico
diff --git a/examples/roastmywebsite/public/next.svg b/examples/roastmywebsite-example-app/public/next.svg
similarity index 100%
rename from examples/roastmywebsite/public/next.svg
rename to examples/roastmywebsite-example-app/public/next.svg
diff --git a/examples/roastmywebsite/public/og.png b/examples/roastmywebsite-example-app/public/og.png
similarity index 100%
rename from examples/roastmywebsite/public/og.png
rename to examples/roastmywebsite-example-app/public/og.png
diff --git a/examples/roastmywebsite/public/site.webmanifest b/examples/roastmywebsite-example-app/public/site.webmanifest
similarity index 100%
rename from examples/roastmywebsite/public/site.webmanifest
rename to examples/roastmywebsite-example-app/public/site.webmanifest
diff --git a/examples/roastmywebsite/public/vercel.svg b/examples/roastmywebsite-example-app/public/vercel.svg
similarity index 100%
rename from examples/roastmywebsite/public/vercel.svg
rename to examples/roastmywebsite-example-app/public/vercel.svg
diff --git a/examples/roastmywebsite/src/app/favicon.ico b/examples/roastmywebsite-example-app/src/app/favicon.ico
similarity index 100%
rename from examples/roastmywebsite/src/app/favicon.ico
rename to examples/roastmywebsite-example-app/src/app/favicon.ico
diff --git a/examples/roastmywebsite/src/app/globals.css b/examples/roastmywebsite-example-app/src/app/globals.css
similarity index 100%
rename from examples/roastmywebsite/src/app/globals.css
rename to examples/roastmywebsite-example-app/src/app/globals.css
diff --git a/examples/roastmywebsite/src/app/hooks/useGithubStars.ts b/examples/roastmywebsite-example-app/src/app/hooks/useGithubStars.ts
similarity index 100%
rename from examples/roastmywebsite/src/app/hooks/useGithubStars.ts
rename to examples/roastmywebsite-example-app/src/app/hooks/useGithubStars.ts
diff --git a/examples/roastmywebsite/src/app/layout.tsx b/examples/roastmywebsite-example-app/src/app/layout.tsx
similarity index 100%
rename from examples/roastmywebsite/src/app/layout.tsx
rename to examples/roastmywebsite-example-app/src/app/layout.tsx
diff --git a/examples/roastmywebsite/src/app/page.tsx b/examples/roastmywebsite-example-app/src/app/page.tsx
similarity index 100%
rename from examples/roastmywebsite/src/app/page.tsx
rename to examples/roastmywebsite-example-app/src/app/page.tsx
diff --git a/examples/roastmywebsite/src/components/github-button.tsx b/examples/roastmywebsite-example-app/src/components/github-button.tsx
similarity index 100%
rename from examples/roastmywebsite/src/components/github-button.tsx
rename to examples/roastmywebsite-example-app/src/components/github-button.tsx
diff --git a/examples/roastmywebsite/src/components/main.tsx b/examples/roastmywebsite-example-app/src/components/main.tsx
similarity index 100%
rename from examples/roastmywebsite/src/components/main.tsx
rename to examples/roastmywebsite-example-app/src/components/main.tsx
diff --git a/examples/roastmywebsite/src/components/ui/button.tsx b/examples/roastmywebsite-example-app/src/components/ui/button.tsx
similarity index 100%
rename from examples/roastmywebsite/src/components/ui/button.tsx
rename to examples/roastmywebsite-example-app/src/components/ui/button.tsx
diff --git a/examples/roastmywebsite/src/components/ui/dialog.tsx b/examples/roastmywebsite-example-app/src/components/ui/dialog.tsx
similarity index 100%
rename from examples/roastmywebsite/src/components/ui/dialog.tsx
rename to examples/roastmywebsite-example-app/src/components/ui/dialog.tsx
diff --git a/examples/roastmywebsite/src/components/ui/dropdown-menu.tsx b/examples/roastmywebsite-example-app/src/components/ui/dropdown-menu.tsx
similarity index 100%
rename from examples/roastmywebsite/src/components/ui/dropdown-menu.tsx
rename to examples/roastmywebsite-example-app/src/components/ui/dropdown-menu.tsx
diff --git a/examples/roastmywebsite/src/components/ui/input.tsx b/examples/roastmywebsite-example-app/src/components/ui/input.tsx
similarity index 100%
rename from examples/roastmywebsite/src/components/ui/input.tsx
rename to examples/roastmywebsite-example-app/src/components/ui/input.tsx
diff --git a/examples/roastmywebsite/src/components/ui/select.tsx b/examples/roastmywebsite-example-app/src/components/ui/select.tsx
similarity index 100%
rename from examples/roastmywebsite/src/components/ui/select.tsx
rename to examples/roastmywebsite-example-app/src/components/ui/select.tsx
diff --git a/examples/roastmywebsite/src/components/ui/sonner.tsx b/examples/roastmywebsite-example-app/src/components/ui/sonner.tsx
similarity index 100%
rename from examples/roastmywebsite/src/components/ui/sonner.tsx
rename to examples/roastmywebsite-example-app/src/components/ui/sonner.tsx
diff --git a/examples/roastmywebsite/src/components/ui/switch.tsx b/examples/roastmywebsite-example-app/src/components/ui/switch.tsx
similarity index 100%
rename from examples/roastmywebsite/src/components/ui/switch.tsx
rename to examples/roastmywebsite-example-app/src/components/ui/switch.tsx
diff --git a/examples/roastmywebsite/src/components/ui/textarea.tsx b/examples/roastmywebsite-example-app/src/components/ui/textarea.tsx
similarity index 100%
rename from examples/roastmywebsite/src/components/ui/textarea.tsx
rename to examples/roastmywebsite-example-app/src/components/ui/textarea.tsx
diff --git a/examples/roastmywebsite/src/lib/LLM/llm.ts b/examples/roastmywebsite-example-app/src/lib/LLM/llm.ts
similarity index 98%
rename from examples/roastmywebsite/src/lib/LLM/llm.ts
rename to examples/roastmywebsite-example-app/src/lib/LLM/llm.ts
index 39dcf10..1d290a3 100644
--- a/examples/roastmywebsite/src/lib/LLM/llm.ts
+++ b/examples/roastmywebsite-example-app/src/lib/LLM/llm.ts
@@ -1,4 +1,4 @@
-import OpenAI from "openai";
+import OpenAI from "openai/index.mjs";
 import { encoding_for_model } from "@dqbd/tiktoken";
 
 /**
diff --git a/examples/roastmywebsite/src/lib/LLM/testing_constants.ts b/examples/roastmywebsite-example-app/src/lib/LLM/testing_constants.ts
similarity index 100%
rename from examples/roastmywebsite/src/lib/LLM/testing_constants.ts
rename to examples/roastmywebsite-example-app/src/lib/LLM/testing_constants.ts
diff --git a/examples/roastmywebsite/src/lib/utils.ts b/examples/roastmywebsite-example-app/src/lib/utils.ts
similarity index 100%
rename from examples/roastmywebsite/src/lib/utils.ts
rename to examples/roastmywebsite-example-app/src/lib/utils.ts
diff --git a/examples/roastmywebsite/src/pages/api/roastWebsite.ts b/examples/roastmywebsite-example-app/src/pages/api/roastWebsite.ts
similarity index 100%
rename from examples/roastmywebsite/src/pages/api/roastWebsite.ts
rename to examples/roastmywebsite-example-app/src/pages/api/roastWebsite.ts
diff --git a/examples/roastmywebsite/tailwind.config.ts b/examples/roastmywebsite-example-app/tailwind.config.ts
similarity index 100%
rename from examples/roastmywebsite/tailwind.config.ts
rename to examples/roastmywebsite-example-app/tailwind.config.ts
diff --git a/examples/roastmywebsite/tsconfig.json b/examples/roastmywebsite-example-app/tsconfig.json
similarity index 100%
rename from examples/roastmywebsite/tsconfig.json
rename to examples/roastmywebsite-example-app/tsconfig.json
diff --git a/examples/contradiction-testing-using-llms.mdx b/examples/web-data-contradiction-testing-using-llms.mdx
similarity index 100%
rename from examples/contradiction-testing-using-llms.mdx
rename to examples/web-data-contradiction-testing-using-llms.mdx
diff --git a/examples/data-extraction-using-llms.mdx b/examples/web-data-extraction-using-llms.mdx
similarity index 100%
rename from examples/data-extraction-using-llms.mdx
rename to examples/web-data-extraction-using-llms.mdx
diff --git a/examples/rag-llama3.mdx b/examples/web-data-rag--with-llama3.mdx
similarity index 100%
rename from examples/rag-llama3.mdx
rename to examples/web-data-rag--with-llama3.mdx