Merge branch 'main' into feat/issue-205
commit 3e2e76311c
@@ -54,7 +54,7 @@ kill_timeout = '5s'
   soft_limit = 12

 [[vm]]
-  size = 'performance-8x'
+  size = 'performance-4x'
   processes = ['app']

@@ -61,6 +61,13 @@
         "description": "Wait x amount of milliseconds for the page to load to fetch content",
         "default": 0
       },
+      "removeTags": {
+        "type": "array",
+        "items": {
+          "type": "string"
+        },
+        "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
+      },
       "headers": {
         "type": "object",
         "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
@@ -194,6 +201,11 @@
         "type": "integer",
         "description": "Maximum number of pages to crawl",
         "default": 10000
+      },
+      "allowBackwardCrawling": {
+        "type": "boolean",
+        "description": "Allow backward crawling (crawl from the base URL to the previous URLs)",
+        "default": false
       }
     }
   },
@@ -219,6 +231,13 @@
       "type": "object",
       "description": "Headers to send with the request when scraping. Can be used to send cookies, user-agent, etc."
     },
+    "removeTags": {
+      "type": "array",
+      "items": {
+        "type": "string"
+      },
+      "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
+    },
     "replaceAllPathsWithAbsolutePaths": {
       "type": "boolean",
       "description": "Replace all relative paths with absolute paths for images and links",
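The removeTags and allowBackwardCrawling options added to the OpenAPI spec above are plain request-body fields. A minimal client-side sketch (TypeScript; assumes Node 18+ global fetch, a placeholder API key, the default self-hosted port 3002 seen later in this diff, and a /v0/crawl path that is implied but not shown in this section):

// Sketch only: FIRECRAWL_API_URL and FIRECRAWL_API_KEY are placeholder env vars.
const API_URL = process.env.FIRECRAWL_API_URL ?? "http://localhost:3002";
const API_KEY = process.env.FIRECRAWL_API_KEY ?? "test-key";

async function scrapeWithoutFooter(url: string) {
  // POST /v0/scrape using the new pageOptions.removeTags field.
  const res = await fetch(`${API_URL}/v0/scrape`, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${API_KEY}`,
    },
    body: JSON.stringify({
      url,
      pageOptions: { removeTags: [".nav", "#footer", "script"] },
    }),
  });
  if (!res.ok) throw new Error(`Scrape failed with status ${res.status}`);
  return res.json();
}

async function crawlBackwards(url: string) {
  // POST to the crawl endpoint using the new crawlerOptions.allowBackwardCrawling flag.
  const res = await fetch(`${API_URL}/v0/crawl`, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${API_KEY}`,
    },
    body: JSON.stringify({
      url,
      crawlerOptions: { allowBackwardCrawling: true },
    }),
  });
  if (!res.ok) throw new Error(`Crawl failed with status ${res.status}`);
  return res.json();
}

scrapeWithoutFooter("https://www.scrapethissite.com/")
  .then(() => crawlBackwards("https://firecrawl.dev"))
  .catch(console.error);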
@@ -1,5 +1,4 @@
 import request from "supertest";
-import { app } from "../../index";
 import dotenv from "dotenv";
 const fs = require("fs");
 const path = require("path");
@@ -1,5 +1,4 @@
 import request from "supertest";
-import { app } from "../../index";
 import dotenv from "dotenv";
 import { v4 as uuidv4 } from "uuid";

@@ -35,7 +34,7 @@ describe("E2E Tests for API Routes", () => {

   describe("POST /v0/scrape", () => {
     it.concurrent("should require authorization", async () => {
-      const response = await request(app).post("/v0/scrape");
+      const response = await request(TEST_URL).post("/v0/scrape");
       expect(response.statusCode).toBe(401);
     });

@@ -151,6 +150,40 @@ describe("E2E Tests for API Routes", () => {
       expect(response.body.data.content).toContain('/Title(arXiv:astro-ph/9301001v1 7 Jan 1993)>>endobj');
     }, 60000); // 60 seconds

+    it.concurrent("should return a successful response with a valid API key with removeTags option", async () => {
+      const responseWithoutRemoveTags = await request(TEST_URL)
+        .post("/v0/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({ url: "https://www.scrapethissite.com/" });
+      expect(responseWithoutRemoveTags.statusCode).toBe(200);
+      expect(responseWithoutRemoveTags.body).toHaveProperty("data");
+      expect(responseWithoutRemoveTags.body.data).toHaveProperty("content");
+      expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown");
+      expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata");
+      expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html");
+      expect(responseWithoutRemoveTags.body.data.content).toContain("Scrape This Site");
+      expect(responseWithoutRemoveTags.body.data.content).toContain("Lessons and Videos"); // #footer
+      expect(responseWithoutRemoveTags.body.data.content).toContain("[Sandbox]("); // .nav
+      expect(responseWithoutRemoveTags.body.data.content).toContain("web scraping"); // strong
+
+      const response = await request(TEST_URL)
+        .post("/v0/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({ url: "https://www.scrapethissite.com/", pageOptions: { removeTags: ['.nav', '#footer', 'strong'] } });
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("data");
+      expect(response.body.data).toHaveProperty("content");
+      expect(response.body.data).toHaveProperty("markdown");
+      expect(response.body.data).toHaveProperty("metadata");
+      expect(response.body.data).not.toHaveProperty("html");
+      expect(response.body.data.content).toContain("Scrape This Site");
+      expect(response.body.data.content).not.toContain("Lessons and Videos"); // #footer
+      expect(response.body.data.content).not.toContain("[Sandbox]("); // .nav
+      expect(response.body.data.content).not.toContain("web scraping"); // strong
+    }, 30000); // 30 seconds timeout
+
     // TODO: add this test back once we nail the waitFor option to be more deterministic
     // it.concurrent("should return a successful response with a valid API key and waitFor option", async () => {
     //   const startTime = Date.now();
@@ -55,10 +55,14 @@ export async function crawlController(req: Request, res: Response) {
     }

     const mode = req.body.mode ?? "crawl";
-    const crawlerOptions = req.body.crawlerOptions ?? { allowBackwardCrawling: false };
+
+    const crawlerOptions = req.body.crawlerOptions ?? {
+      allowBackwardCrawling: false
+    };
     const pageOptions = req.body.pageOptions ?? {
       onlyMainContent: false,
       includeHtml: false,
+      removeTags: [],
       parsePDF: true
     };

@@ -26,7 +26,7 @@ export async function crawlPreviewController(req: Request, res: Response) {

   const mode = req.body.mode ?? "crawl";
   const crawlerOptions = req.body.crawlerOptions ?? {};
-  const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false };
+  const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, removeTags: [] };

   const job = await addWebScraperJob({
     url: url,
@@ -85,6 +85,7 @@ export async function searchHelper(
       onlyMainContent: pageOptions?.onlyMainContent ?? true,
       fetchPageContent: pageOptions?.fetchPageContent ?? true,
       includeHtml: pageOptions?.includeHtml ?? false,
+      removeTags: pageOptions?.removeTags ?? [],
       fallback: false,
     },
   });
@@ -139,6 +140,7 @@ export async function searchController(req: Request, res: Response) {
     includeHtml: false,
     onlyMainContent: true,
     fetchPageContent: true,
+    removeTags: [],
     fallback: false,
   };
   const origin = req.body.origin ?? "api";
@@ -5,190 +5,215 @@ import "dotenv/config";
 import { getWebScraperQueue } from "./services/queue-service";
 import { redisClient } from "./services/rate-limiter";
 import { v0Router } from "./routes/v0";
-import { initSDK } from '@hyperdx/node-opentelemetry';
+import { initSDK } from "@hyperdx/node-opentelemetry";
+import cluster from "cluster";
+import os from "os";

 const { createBullBoard } = require("@bull-board/api");
 const { BullAdapter } = require("@bull-board/api/bullAdapter");
 const { ExpressAdapter } = require("@bull-board/express");

-export const app = express();
-
-global.isProduction = process.env.IS_PRODUCTION === "true";
-
-app.use(bodyParser.urlencoded({ extended: true }));
-app.use(bodyParser.json({ limit: "10mb" }));
-
-app.use(cors()); // Add this line to enable CORS
-
-const serverAdapter = new ExpressAdapter();
-serverAdapter.setBasePath(`/admin/${process.env.BULL_AUTH_KEY}/queues`);
-
-const { addQueue, removeQueue, setQueues, replaceQueues } = createBullBoard({
-  queues: [new BullAdapter(getWebScraperQueue())],
-  serverAdapter: serverAdapter,
-});
-
-app.use(
-  `/admin/${process.env.BULL_AUTH_KEY}/queues`,
-  serverAdapter.getRouter()
-);
-
-app.get("/", (req, res) => {
-  res.send("SCRAPERS-JS: Hello, world! Fly.io");
-});
-
-//write a simple test function
-app.get("/test", async (req, res) => {
-  res.send("Hello, world!");
-});
-
-// register router
-app.use(v0Router);
-
-const DEFAULT_PORT = process.env.PORT ?? 3002;
-const HOST = process.env.HOST ?? "localhost";
-redisClient.connect();
-
-// HyperDX OpenTelemetry
-if(process.env.ENV === 'production') {
-  initSDK({ consoleCapture: true, additionalInstrumentations: []});
-}
-
-export function startServer(port = DEFAULT_PORT) {
-  const server = app.listen(Number(port), HOST, () => {
-    console.log(`Server listening on port ${port}`);
-    console.log(
-      `For the UI, open http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues`
-    );
-    console.log("");
-    console.log("1. Make sure Redis is running on port 6379 by default");
-    console.log(
-      "2. If you want to run nango, make sure you do port forwarding in 3002 using ngrok http 3002 "
-    );
-  });
-  return server;
-}
-
-if (require.main === module) {
-  startServer();
-}
-
-// Use this as a "health check" that way we dont destroy the server
-app.get(`/admin/${process.env.BULL_AUTH_KEY}/queues`, async (req, res) => {
-  try {
-    const webScraperQueue = getWebScraperQueue();
-    const [webScraperActive] = await Promise.all([
-      webScraperQueue.getActiveCount(),
-    ]);
-
-    const noActiveJobs = webScraperActive === 0;
-    // 200 if no active jobs, 503 if there are active jobs
-    return res.status(noActiveJobs ? 200 : 500).json({
-      webScraperActive,
-      noActiveJobs,
-    });
-  } catch (error) {
-    console.error(error);
-    return res.status(500).json({ error: error.message });
-  }
-});
-
-app.get(`/serverHealthCheck`, async (req, res) => {
-  try {
-    const webScraperQueue = getWebScraperQueue();
-    const [waitingJobs] = await Promise.all([
-      webScraperQueue.getWaitingCount(),
-    ]);
-
-    const noWaitingJobs = waitingJobs === 0;
-    // 200 if no active jobs, 503 if there are active jobs
-    return res.status(noWaitingJobs ? 200 : 500).json({
-      waitingJobs,
-    });
-  } catch (error) {
-    console.error(error);
-    return res.status(500).json({ error: error.message });
-  }
-});
-
-app.get('/serverHealthCheck/notify', async (req, res) => {
-  if (process.env.SLACK_WEBHOOK_URL) {
-    const treshold = 1; // The treshold value for the active jobs
-    const timeout = 60000; // 1 minute // The timeout value for the check in milliseconds
-
-    const getWaitingJobsCount = async () => {
-      const webScraperQueue = getWebScraperQueue();
-      const [waitingJobsCount] = await Promise.all([
-        webScraperQueue.getWaitingCount(),
-      ]);
-
-      return waitingJobsCount;
-    };
-
-    res.status(200).json({ message: "Check initiated" });
-
-    const checkWaitingJobs = async () => {
-      try {
-        let waitingJobsCount = await getWaitingJobsCount();
-        if (waitingJobsCount >= treshold) {
-          setTimeout(async () => {
-            // Re-check the waiting jobs count after the timeout
-            waitingJobsCount = await getWaitingJobsCount();
-            if (waitingJobsCount >= treshold) {
-              const slackWebhookUrl = process.env.SLACK_WEBHOOK_URL;
-              const message = {
-                text: `⚠️ Warning: The number of active jobs (${waitingJobsCount}) has exceeded the threshold (${treshold}) for more than ${timeout/60000} minute(s).`,
-              };
-
-              const response = await fetch(slackWebhookUrl, {
-                method: 'POST',
-                headers: {
-                  'Content-Type': 'application/json',
-                },
-                body: JSON.stringify(message),
-              })
-
-              if (!response.ok) {
-                console.error('Failed to send Slack notification')
-              }
-            }
-          }, timeout);
-        }
-      } catch (error) {
-        console.error(error);
-      }
-    };
-
-    checkWaitingJobs();
-  }
-});
-
-app.get(`/admin/${process.env.BULL_AUTH_KEY}/clean-before-24h-complete-jobs`, async (req, res) => {
-  try {
-    const webScraperQueue = getWebScraperQueue();
-    const completedJobs = await webScraperQueue.getJobs(['completed']);
-    const before24hJobs = completedJobs.filter(job => job.finishedOn < Date.now() - 24 * 60 * 60 * 1000);
-    const jobIds = before24hJobs.map(job => job.id) as string[];
-    let count = 0;
-    for (const jobId of jobIds) {
-      try {
-        await webScraperQueue.removeJobs(jobId);
-        count++;
-      } catch (jobError) {
-        console.error(`Failed to remove job with ID ${jobId}:`, jobError);
-      }
-    }
-    res.status(200).send(`Removed ${count} completed jobs.`);
-  } catch (error) {
-    console.error('Failed to clean last 24h complete jobs:', error);
-    res.status(500).send('Failed to clean jobs');
-  }
-});
-
-app.get("/is-production", (req, res) => {
-  res.send({ isProduction: global.isProduction });
-});
-
-// /workers health check, cant act as load balancer, just has to be a pre deploy thing
+const numCPUs = process.env.ENV === "local" ? 2 : os.cpus().length;
+console.log(`Number of CPUs: ${numCPUs} available`);
+
+if (cluster.isMaster) {
+  console.log(`Master ${process.pid} is running`);
+
+  // Fork workers.
+  for (let i = 0; i < numCPUs; i++) {
+    cluster.fork();
+  }
+
+  cluster.on("exit", (worker, code, signal) => {
+    console.log(`Worker ${worker.process.pid} exited`);
+    console.log("Starting a new worker");
+    cluster.fork();
+  });
+} else {
+  const app = express();
+
+  global.isProduction = process.env.IS_PRODUCTION === "true";
+
+  app.use(bodyParser.urlencoded({ extended: true }));
+  app.use(bodyParser.json({ limit: "10mb" }));
+
+  app.use(cors()); // Add this line to enable CORS
+
+  const serverAdapter = new ExpressAdapter();
+  serverAdapter.setBasePath(`/admin/${process.env.BULL_AUTH_KEY}/queues`);
+
+  const { addQueue, removeQueue, setQueues, replaceQueues } = createBullBoard({
+    queues: [new BullAdapter(getWebScraperQueue())],
+    serverAdapter: serverAdapter,
+  });
+
+  app.use(
+    `/admin/${process.env.BULL_AUTH_KEY}/queues`,
+    serverAdapter.getRouter()
+  );
+
+  app.get("/", (req, res) => {
+    res.send("SCRAPERS-JS: Hello, world! Fly.io");
+  });
+
+  //write a simple test function
+  app.get("/test", async (req, res) => {
+    res.send("Hello, world!");
+  });
+
+  // register router
+  app.use(v0Router);
+
+  const DEFAULT_PORT = process.env.PORT ?? 3002;
+  const HOST = process.env.HOST ?? "localhost";
+  redisClient.connect();
+
+  // HyperDX OpenTelemetry
+  if (process.env.ENV === "production") {
+    initSDK({ consoleCapture: true, additionalInstrumentations: [] });
+  }
+
+  function startServer(port = DEFAULT_PORT) {
+    const server = app.listen(Number(port), HOST, () => {
+      console.log(`Worker ${process.pid} listening on port ${port}`);
+      console.log(
+        `For the UI, open http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues`
+      );
+      console.log("");
+      console.log("1. Make sure Redis is running on port 6379 by default");
+      console.log(
+        "2. If you want to run nango, make sure you do port forwarding in 3002 using ngrok http 3002 "
+      );
+    });
+    return server;
+  }
+
+  if (require.main === module) {
+    startServer();
+  }
+
+  // Use this as a "health check" that way we dont destroy the server
+  app.get(`/admin/${process.env.BULL_AUTH_KEY}/queues`, async (req, res) => {
+    try {
+      const webScraperQueue = getWebScraperQueue();
+      const [webScraperActive] = await Promise.all([
+        webScraperQueue.getActiveCount(),
+      ]);
+
+      const noActiveJobs = webScraperActive === 0;
+      // 200 if no active jobs, 503 if there are active jobs
+      return res.status(noActiveJobs ? 200 : 500).json({
+        webScraperActive,
+        noActiveJobs,
+      });
+    } catch (error) {
+      console.error(error);
+      return res.status(500).json({ error: error.message });
+    }
+  });
+
+  app.get(`/serverHealthCheck`, async (req, res) => {
+    try {
+      const webScraperQueue = getWebScraperQueue();
+      const [waitingJobs] = await Promise.all([
+        webScraperQueue.getWaitingCount(),
+      ]);
+
+      const noWaitingJobs = waitingJobs === 0;
+      // 200 if no active jobs, 503 if there are active jobs
+      return res.status(noWaitingJobs ? 200 : 500).json({
+        waitingJobs,
+      });
+    } catch (error) {
+      console.error(error);
+      return res.status(500).json({ error: error.message });
+    }
+  });
+
+  app.get("/serverHealthCheck/notify", async (req, res) => {
+    if (process.env.SLACK_WEBHOOK_URL) {
+      const treshold = 1; // The treshold value for the active jobs
+      const timeout = 60000; // 1 minute // The timeout value for the check in milliseconds
+
+      const getWaitingJobsCount = async () => {
+        const webScraperQueue = getWebScraperQueue();
+        const [waitingJobsCount] = await Promise.all([
+          webScraperQueue.getWaitingCount(),
+        ]);
+
+        return waitingJobsCount;
+      };
+
+      res.status(200).json({ message: "Check initiated" });
+
+      const checkWaitingJobs = async () => {
+        try {
+          let waitingJobsCount = await getWaitingJobsCount();
+          if (waitingJobsCount >= treshold) {
+            setTimeout(async () => {
+              // Re-check the waiting jobs count after the timeout
+              waitingJobsCount = await getWaitingJobsCount();
+              if (waitingJobsCount >= treshold) {
+                const slackWebhookUrl = process.env.SLACK_WEBHOOK_URL;
+                const message = {
+                  text: `⚠️ Warning: The number of active jobs (${waitingJobsCount}) has exceeded the threshold (${treshold}) for more than ${
+                    timeout / 60000
+                  } minute(s).`,
+                };
+
+                const response = await fetch(slackWebhookUrl, {
+                  method: "POST",
+                  headers: {
+                    "Content-Type": "application/json",
+                  },
+                  body: JSON.stringify(message),
+                });
+
+                if (!response.ok) {
+                  console.error("Failed to send Slack notification");
+                }
+              }
+            }, timeout);
+          }
+        } catch (error) {
+          console.error(error);
+        }
+      };
+
+      checkWaitingJobs();
+    }
+  });
+
+  app.get(
+    `/admin/${process.env.BULL_AUTH_KEY}/clean-before-24h-complete-jobs`,
+    async (req, res) => {
+      try {
+        const webScraperQueue = getWebScraperQueue();
+        const completedJobs = await webScraperQueue.getJobs(["completed"]);
+        const before24hJobs = completedJobs.filter(
+          (job) => job.finishedOn < Date.now() - 24 * 60 * 60 * 1000
+        );
+        const jobIds = before24hJobs.map((job) => job.id) as string[];
+        let count = 0;
+        for (const jobId of jobIds) {
+          try {
+            await webScraperQueue.removeJobs(jobId);
+            count++;
+          } catch (jobError) {
+            console.error(`Failed to remove job with ID ${jobId}:`, jobError);
+          }
+        }
+        res.status(200).send(`Removed ${count} completed jobs.`);
+      } catch (error) {
+        console.error("Failed to clean last 24h complete jobs:", error);
+        res.status(500).send("Failed to clean jobs");
+      }
+    }
+  );
+
+  app.get("/is-production", (req, res) => {
+    res.send({ isProduction: global.isProduction });
+  });
+
+  console.log(`Worker ${process.pid} started`);
+}
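The rewritten index.ts above wraps the Express app in Node's cluster module: the primary process forks one worker per CPU (two when ENV is "local") and re-forks whenever a worker exits, while each worker builds its own app and listens on the same port. A stripped-down sketch of that pattern, independent of the Firecrawl codebase (TypeScript, Node built-ins only; the port is a placeholder, and cluster.isPrimary is used here where the commit uses the older isMaster alias):

import cluster from "node:cluster";
import os from "node:os";
import http from "node:http";

const numCPUs = process.env.ENV === "local" ? 2 : os.cpus().length;

if (cluster.isPrimary) {
  // Primary: fork one worker per CPU and replace any worker that dies.
  console.log(`Primary ${process.pid} is running, forking ${numCPUs} workers`);
  for (let i = 0; i < numCPUs; i++) {
    cluster.fork();
  }
  cluster.on("exit", (worker) => {
    console.log(`Worker ${worker.process.pid} exited, starting a new one`);
    cluster.fork();
  });
} else {
  // Worker: each runs its own HTTP server; incoming connections are shared.
  http
    .createServer((req, res) => {
      res.end(`Handled by worker ${process.pid}\n`);
    })
    .listen(3002, () => {
      console.log(`Worker ${process.pid} listening on port 3002`);
    });
}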
@@ -19,7 +19,8 @@ export type PageOptions = {
   screenshot?: boolean;
   headers?: Record<string, string>;
   replaceAllPathsWithAbsolutePaths?: boolean;
-  parsePDF?: boolean
+  parsePDF?: boolean;
+  removeTags?: string | string[];
 };

 export type ExtractorOptions = {
@@ -479,7 +479,8 @@ export class WebScraperDataProvider {
       onlyMainContent: false,
       includeHtml: false,
       replaceAllPathsWithAbsolutePaths: false,
-      parsePDF: true
+      parsePDF: true,
+      removeTags: []
     };
     this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
     this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false;
@@ -309,6 +309,19 @@ export async function scrapSingleUrl(
   const removeUnwantedElements = (html: string, pageOptions: PageOptions) => {
     const soup = cheerio.load(html);
     soup("script, style, iframe, noscript, meta, head").remove();
+
+    if (pageOptions.removeTags) {
+      if (typeof pageOptions.removeTags === 'string') {
+        pageOptions.removeTags.split(',').forEach((tag) => {
+          soup(tag.trim()).remove();
+        });
+      } else if (Array.isArray(pageOptions.removeTags)) {
+        pageOptions.removeTags.forEach((tag) => {
+          soup(tag).remove();
+        });
+      }
+    }
+
     if (pageOptions.onlyMainContent) {
       // remove any other tags that are not in the main content
       excludeNonMainTags.forEach((tag) => {
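The removeTags handling added above accepts either a comma-separated selector string or an array of selectors and drops the matching nodes with cheerio before any further processing. A self-contained sketch of the same idea (TypeScript; the sample HTML and selector list are made up for illustration):

import * as cheerio from "cheerio";

function removeUnwantedElements(html: string, removeTags?: string | string[]): string {
  const $ = cheerio.load(html);
  // Elements that are always dropped, as in the scraper.
  $("script, style, iframe, noscript, meta, head").remove();

  if (typeof removeTags === "string") {
    // Comma-separated list of selectors, e.g. "script, .ad, #footer".
    removeTags.split(",").forEach((tag) => $(tag.trim()).remove());
  } else if (Array.isArray(removeTags)) {
    removeTags.forEach((tag) => $(tag).remove());
  }
  return $.html();
}

const sample = `<html><body><nav class="nav">menu</nav><p>keep me</p><div id="footer">bye</div></body></html>`;
// Keeps the <p> element, drops the .nav and #footer nodes.
console.log(removeUnwantedElements(sample, [".nav", "#footer"]));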
@@ -38,7 +38,7 @@ getWebScraperQueue().process(
       error: message /* etc... */,
     };

-    await callWebhook(job.data.team_id, data);
+    await callWebhook(job.data.team_id, job.id as string, data);

     await logJob({
       success: success,
@@ -78,7 +78,7 @@ getWebScraperQueue().process(
       error:
         "Something went wrong... Contact help@mendable.ai or try again." /* etc... */,
     };
-    await callWebhook(job.data.team_id, data);
+    await callWebhook(job.data.team_id, job.id as string, data);
     await logJob({
       success: false,
       message: typeof error === 'string' ? error : (error.message ?? "Something went wrong... Contact help@mendable.ai"),
@@ -1,8 +1,35 @@
-import Redis from 'ioredis';
+import Redis from "ioredis";

 // Initialize Redis client
 const redis = new Redis(process.env.REDIS_URL);

+// Listen to 'error' events to the Redis connection
+redis.on("error", (error) => {
+  try {
+    if (error.message === "ECONNRESET") {
+      console.log("Connection to Redis Session Store timed out.");
+    } else if (error.message === "ECONNREFUSED") {
+      console.log("Connection to Redis Session Store refused!");
+    } else console.log(error);
+  } catch (error) {}
+});
+
+// Listen to 'reconnecting' event to Redis
+redis.on("reconnecting", (err) => {
+  try {
+    if (redis.status === "reconnecting")
+      console.log("Reconnecting to Redis Session Store...");
+    else console.log("Error reconnecting to Redis Session Store.");
+  } catch (error) {}
+});
+
+// Listen to the 'connect' event to Redis
+redis.on("connect", (err) => {
+  try {
+    if (!err) console.log("Connected to Redis Session Store!");
+  } catch (error) {}
+});
+
 /**
  * Set a value in Redis with an optional expiration time.
  * @param {string} key The key under which to store the value.
@@ -11,7 +38,7 @@ const redis = new Redis(process.env.REDIS_URL);
  */
 const setValue = async (key: string, value: string, expire?: number) => {
   if (expire) {
-    await redis.set(key, value, 'EX', expire);
+    await redis.set(key, value, "EX", expire);
   } else {
     await redis.set(key, value);
   }
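The redis.ts wiring above is plain ioredis: one client, listeners for the 'error', 'reconnecting' and 'connect' events, and a setValue helper that uses the EX flag for expiring keys. A minimal usage sketch (TypeScript; the Redis URL fallback and key names are placeholders):

import Redis from "ioredis";

const redis = new Redis(process.env.REDIS_URL ?? "redis://localhost:6379");

redis.on("connect", () => console.log("Connected to Redis"));
redis.on("error", (err) => console.error("Redis error:", err.message));

// Store a value that Redis expires automatically after `expire` seconds.
async function setValue(key: string, value: string, expire?: number) {
  if (expire) {
    await redis.set(key, value, "EX", expire);
  } else {
    await redis.set(key, value);
  }
}

async function main() {
  await setValue("example:session", "some-payload", 60); // expires in 60 seconds
  console.log(await redis.get("example:session"));
  await redis.quit();
}

main().catch(console.error);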
@@ -1,6 +1,6 @@
 import { supabase_service } from "./supabase";

-export const callWebhook = async (teamId: string, data: any) => {
+export const callWebhook = async (teamId: string, jobId: string,data: any) => {
   try {
     const selfHostedUrl = process.env.SELF_HOSTED_WEBHOOK_URL;
     const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
@@ -47,6 +47,7 @@ export const callWebhook = async (teamId: string, data: any) => {
       },
       body: JSON.stringify({
         success: data.success,
+        jobId: jobId,
        data: dataToSend,
        error: data.error || undefined,
      }),
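callWebhook now receives the Bull job id and forwards it in the payload, so webhook consumers can correlate a notification with the crawl job that produced it. A hedged sketch of a caller and the matching payload shape (TypeScript; SELF_HOSTED_WEBHOOK_URL and the payload fields come from the diff, the WebhookData type and sample values are illustrative):

type WebhookData = {
  success: boolean;
  docs?: unknown[];
  error?: string;
};

// Mirrors the new (teamId, jobId, data) signature and the body fields in the diff.
async function callWebhook(teamId: string, jobId: string, data: WebhookData) {
  const webhookUrl = process.env.SELF_HOSTED_WEBHOOK_URL;
  if (!webhookUrl) return; // real code also resolves per-team webhook URLs

  await fetch(webhookUrl, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      success: data.success,
      jobId: jobId, // new field added by this commit
      data: data.docs,
      error: data.error || undefined,
    }),
  });
}

// e.g. from the queue processor: await callWebhook(job.data.team_id, job.id as string, data);
callWebhook("team_123", "job_456", { success: true, docs: [] }).catch(console.error);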
@@ -1,3 +1,57 @@
+"""
+This is the Firecrawl package.
+
+This package provides a Python SDK for interacting with the Firecrawl API.
+It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
+and check the status of these jobs.
+
+For more information visit https://github.com/firecrawl/
+"""
+
+import logging
+import os
+
 from .firecrawl import FirecrawlApp

-__version__ = "0.0.14"
+__version__ = "0.0.16"
+
+# Define the logger for the Firecrawl project
+logger: logging.Logger = logging.getLogger("firecrawl")
+
+
+def _basic_config() -> None:
+    """Set up basic configuration for logging with a specific format and date format."""
+    try:
+        logging.basicConfig(
+            format="[%(asctime)s - %(name)s:%(lineno)d - %(levelname)s] %(message)s",
+            datefmt="%Y-%m-%d %H:%M:%S",
+        )
+    except Exception as e:
+        logger.error("Failed to configure logging: %s", e)
+
+
+def setup_logging() -> None:
+    """Set up logging based on the FIRECRAWL_LOGGING_LEVEL environment variable."""
+    env = os.environ.get(
+        "FIRECRAWL_LOGGING_LEVEL", "INFO"
+    ).upper()  # Default to 'INFO' level
+    _basic_config()
+
+    if env == "DEBUG":
+        logger.setLevel(logging.DEBUG)
+    elif env == "INFO":
+        logger.setLevel(logging.INFO)
+    elif env == "WARNING":
+        logger.setLevel(logging.WARNING)
+    elif env == "ERROR":
+        logger.setLevel(logging.ERROR)
+    elif env == "CRITICAL":
+        logger.setLevel(logging.CRITICAL)
+    else:
+        logger.setLevel(logging.INFO)
+        logger.warning("Unknown logging level: %s, defaulting to INFO", env)
+
+
+# Initialize logging configuration when the module is imported
+setup_logging()
+logger.debug("Debugging logger setup")
Binary file not shown.
@@ -27,14 +27,14 @@ def test_scrape_url_invalid_api_key():
    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
    with pytest.raises(Exception) as excinfo:
        invalid_app.scrape_url('https://firecrawl.dev')
-    assert "Failed to scrape URL. Status code: 401" in str(excinfo.value)
+    assert "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)

 def test_blocklisted_url():
    blocklisted_url = "https://facebook.com/fake-test"
    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    with pytest.raises(Exception) as excinfo:
        app.scrape_url(blocklisted_url)
-    assert "Failed to scrape URL. Status code: 403" in str(excinfo.value)
+    assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)

 def test_successful_response_with_valid_preview_token():
    app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token")
@@ -86,14 +86,14 @@ def test_crawl_url_invalid_api_key():
    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
    with pytest.raises(Exception) as excinfo:
        invalid_app.crawl_url('https://firecrawl.dev')
-    assert "Unexpected error occurred while trying to start crawl job. Status code: 401" in str(excinfo.value)
+    assert "Unexpected error during start crawl job: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)

 def test_should_return_error_for_blocklisted_url():
    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    blocklisted_url = "https://twitter.com/fake-test"
    with pytest.raises(Exception) as excinfo:
        app.crawl_url(blocklisted_url)
-    assert "Unexpected error occurred while trying to start crawl job. Status code: 403" in str(excinfo.value)
+    assert "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)

 def test_crawl_url_wait_for_completion_e2e():
    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
@@ -114,7 +114,7 @@ def test_crawl_url_with_idempotency_key_e2e():

    with pytest.raises(Exception) as excinfo:
        app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
-    assert "Failed to start crawl job. Status code: 409. Error: Idempotency key already used" in str(excinfo.value)
+    assert "Conflict: Failed to start crawl job due to a conflict. Idempotency key already used" in str(excinfo.value)

 def test_check_crawl_status_e2e():
    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
@@ -141,7 +141,7 @@ def test_search_invalid_api_key():
    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
    with pytest.raises(Exception) as excinfo:
        invalid_app.search("test query")
-    assert "Failed to search. Status code: 401" in str(excinfo.value)
+    assert "Unexpected error during search: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)

 def test_llm_extraction():
    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
@@ -9,13 +9,14 @@ and handles retries for certain HTTP status codes.
 Classes:
     - FirecrawlApp: Main class for interacting with the Firecrawl API.
 """
+import logging
 import os
 import time
 from typing import Any, Dict, Optional

 import requests

+logger : logging.Logger = logging.getLogger("firecrawl")

 class FirecrawlApp:
     """
@@ -28,8 +29,15 @@ class FirecrawlApp:
     def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
         self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
         if self.api_key is None:
+            logger.warning("No API key provided")
             raise ValueError('No API key provided')
+        else:
+            logger.debug("Initialized FirecrawlApp with API key: %s", self.api_key)
+
         self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
+        if self.api_url != 'https://api.firecrawl.dev':
+            logger.debug("Initialized FirecrawlApp with API URL: %s", self.api_url)
+
     def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
         """
         Scrape the specified URL using the Firecrawl API.
@@ -45,10 +53,8 @@ class FirecrawlApp:
             Exception: If the scrape request fails.
         """
-
-        headers = {
-            'Content-Type': 'application/json',
-            'Authorization': f'Bearer {self.api_key}'
-        }
+        headers = self._prepare_headers()
         # Prepare the base scrape parameters with the URL
         scrape_params = {'url': url}

@@ -81,13 +87,10 @@ class FirecrawlApp:
                 return response['data']
             else:
                 raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
-        elif response.status_code in [402, 408, 409, 500]:
-            error_message = response.json().get('error', 'Unknown error occurred')
-            raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}')
         else:
-            raise Exception(f'Failed to scrape URL. Status code: {response.status_code}')
+            self._handle_error(response, 'scrape URL')

-    def search(self, query, params=None):
+    def search(self, query: str, params: Optional[Dict[str, Any]] = None) -> Any:
         """
         Perform a search using the Firecrawl API.

@@ -101,10 +104,7 @@ class FirecrawlApp:
         Raises:
             Exception: If the search request fails.
         """
-        headers = {
-            'Content-Type': 'application/json',
-            'Authorization': f'Bearer {self.api_key}'
-        }
+        headers = self._prepare_headers()
         json_data = {'query': query}
         if params:
             json_data.update(params)
@@ -121,13 +121,14 @@ class FirecrawlApp:
             else:
                 raise Exception(f'Failed to search. Error: {response["error"]}')

-        elif response.status_code in [402, 409, 500]:
-            error_message = response.json().get('error', 'Unknown error occurred')
-            raise Exception(f'Failed to search. Status code: {response.status_code}. Error: {error_message}')
         else:
-            raise Exception(f'Failed to search. Status code: {response.status_code}')
+            self._handle_error(response, 'search')

-    def crawl_url(self, url, params=None, wait_until_done=True, poll_interval=2, idempotency_key=None):
+    def crawl_url(self, url: str,
+                  params: Optional[Dict[str, Any]] = None,
+                  wait_until_done: bool = True,
+                  poll_interval: int = 2,
+                  idempotency_key: Optional[str] = None) -> Any:
         """
         Initiate a crawl job for the specified URL using the Firecrawl API.

@@ -158,7 +159,7 @@ class FirecrawlApp:
         else:
             self._handle_error(response, 'start crawl job')

-    def check_crawl_status(self, job_id):
+    def check_crawl_status(self, job_id: str) -> Any:
         """
         Check the status of a crawl job using the Firecrawl API.

@@ -178,7 +179,7 @@ class FirecrawlApp:
         else:
             self._handle_error(response, 'check crawl status')

-    def _prepare_headers(self, idempotency_key=None):
+    def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
         """
         Prepare the headers for API requests.

@@ -200,7 +201,11 @@ class FirecrawlApp:
             'Authorization': f'Bearer {self.api_key}',
         }

-    def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5):
+    def _post_request(self, url: str,
+                      data: Dict[str, Any],
+                      headers: Dict[str, str],
+                      retries: int = 3,
+                      backoff_factor: float = 0.5) -> requests.Response:
         """
         Make a POST request with retries.

@@ -225,7 +230,10 @@ class FirecrawlApp:
                 return response
         return response

-    def _get_request(self, url, headers, retries=3, backoff_factor=0.5):
+    def _get_request(self, url: str,
+                     headers: Dict[str, str],
+                     retries: int = 3,
+                     backoff_factor: float = 0.5) -> requests.Response:
         """
         Make a GET request with retries.

@@ -249,7 +257,7 @@ class FirecrawlApp:
                 return response
         return response

-    def _monitor_job_status(self, job_id, headers, poll_interval):
+    def _monitor_job_status(self, job_id: str, headers: Dict[str, str], poll_interval: int) -> Any:
         """
         Monitor the status of a crawl job until completion.

@@ -281,7 +289,7 @@ class FirecrawlApp:
             else:
                 self._handle_error(status_response, 'check crawl status')

-    def _handle_error(self, response, action):
+    def _handle_error(self, response: requests.Response, action: str) -> None:
         """
         Handle errors from API responses.

@@ -292,8 +300,19 @@ class FirecrawlApp:
         Raises:
             Exception: An exception with a message containing the status code and error details from the response.
         """
-        if response.status_code in [402, 408, 409, 500]:
-            error_message = response.json().get('error', 'Unknown error occurred')
-            raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}')
+        error_message = response.json().get('error', 'No additional error details provided.')
+
+        if response.status_code == 402:
+            message = f"Payment Required: Failed to {action}. {error_message}"
+        elif response.status_code == 408:
+            message = f"Request Timeout: Failed to {action} as the request timed out. {error_message}"
+        elif response.status_code == 409:
+            message = f"Conflict: Failed to {action} due to a conflict. {error_message}"
+        elif response.status_code == 500:
+            message = f"Internal Server Error: Failed to {action}. {error_message}"
         else:
-            raise Exception(f'Unexpected error occurred while trying to {action}. Status code: {response.status_code}')
+            message = f"Unexpected error during {action}: Status code {response.status_code}. {error_message}"
+
+        # Raise an HTTPError with the custom message and attach the response
+        raise requests.exceptions.HTTPError(message, response=response)
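The Python SDK change above replaces the single catch-all branch with per-status messages (402 Payment Required, 408 Request Timeout, 409 Conflict, 500 Internal Server Error) and raises requests.exceptions.HTTPError with the response attached. The same status-to-message mapping idea, sketched in TypeScript for readers of the Node side of this diff (illustrative only, not part of this commit):

// Map an HTTP status plus server-supplied detail onto a descriptive error message.
function buildErrorMessage(action: string, status: number, detail: string): string {
  switch (status) {
    case 402:
      return `Payment Required: Failed to ${action}. ${detail}`;
    case 408:
      return `Request Timeout: Failed to ${action} as the request timed out. ${detail}`;
    case 409:
      return `Conflict: Failed to ${action} due to a conflict. ${detail}`;
    case 500:
      return `Internal Server Error: Failed to ${action}. ${detail}`;
    default:
      return `Unexpected error during ${action}: Status code ${status}. ${detail}`;
  }
}

console.log(buildErrorMessage("start crawl job", 409, "Idempotency key already used"));
// -> "Conflict: Failed to start crawl job due to a conflict. Idempotency key already used"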