Merge branch 'main' into feat/test-suite
This commit is contained in:
commit
056b0ec24d
@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
Crawl and convert any website into LLM-ready markdown. Built by [Mendable.ai](https://mendable.ai?ref=gfirecrawl) and the firecrawl community.
|
Crawl and convert any website into LLM-ready markdown. Built by [Mendable.ai](https://mendable.ai?ref=gfirecrawl) and the firecrawl community.
|
||||||
|
|
||||||
_This repository is currently in its early stages of development. We are in the process of merging custom modules into this mono repository. The primary objective is to enhance the accuracy of LLM responses by utilizing clean data. It is not completely ready for full self-host deployment yet, but you can already run it locally! - we're working on it_
|
_This repository is in its early development stages. We are still merging custom modules into the mono repo. It's not yet completely ready for full self-host deployment, but you can already run it locally._
|
||||||
|
|
||||||
## What is Firecrawl?
|
## What is Firecrawl?
|
||||||
|
|
||||||
@ -261,5 +261,4 @@ search_result = app.search(query)
|
|||||||
|
|
||||||
We love contributions! Please read our [contributing guide](CONTRIBUTING.md) before submitting a pull request.
|
We love contributions! Please read our [contributing guide](CONTRIBUTING.md) before submitting a pull request.
|
||||||
|
|
||||||
|
|
||||||
*It is the sole responsibility of the end users to respect websites' policies when scraping, searching and crawling with Firecrawl. Users are advised to adhere to the applicable privacy policies and terms of use of the websites prior to initiating any scraping activities. By default, Firecrawl respects the directives specified in the websites' robots.txt files when crawling. By utilizing Firecrawl, you expressly agree to comply with these conditions.*
|
*It is the sole responsibility of the end users to respect websites' policies when scraping, searching and crawling with Firecrawl. Users are advised to adhere to the applicable privacy policies and terms of use of the websites prior to initiating any scraping activities. By default, Firecrawl respects the directives specified in the websites' robots.txt files when crawling. By utilizing Firecrawl, you expressly agree to comply with these conditions.*
|
||||||
|
@ -17,11 +17,16 @@ kill_timeout = '5s'
|
|||||||
[http_service]
|
[http_service]
|
||||||
internal_port = 8080
|
internal_port = 8080
|
||||||
force_https = true
|
force_https = true
|
||||||
auto_stop_machines = true
|
auto_stop_machines = false
|
||||||
auto_start_machines = true
|
auto_start_machines = true
|
||||||
min_machines_running = 0
|
min_machines_running = 2
|
||||||
processes = ['app']
|
processes = ['app']
|
||||||
|
|
||||||
|
[http_service.concurrency]
|
||||||
|
type = "requests"
|
||||||
|
hard_limit = 200
|
||||||
|
soft_limit = 100
|
||||||
|
|
||||||
[[services]]
|
[[services]]
|
||||||
protocol = 'tcp'
|
protocol = 'tcp'
|
||||||
internal_port = 8080
|
internal_port = 8080
|
||||||
@ -38,10 +43,14 @@ kill_timeout = '5s'
|
|||||||
|
|
||||||
[services.concurrency]
|
[services.concurrency]
|
||||||
type = 'connections'
|
type = 'connections'
|
||||||
hard_limit = 45
|
hard_limit = 75
|
||||||
soft_limit = 20
|
soft_limit = 30
|
||||||
|
|
||||||
[[vm]]
|
[[vm]]
|
||||||
size = 'performance-1x'
|
size = 'performance-4x'
|
||||||
|
processes = ['app']
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -79,8 +79,26 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
expect(response.body.data).toHaveProperty("content");
|
expect(response.body.data).toHaveProperty("content");
|
||||||
expect(response.body.data).toHaveProperty("markdown");
|
expect(response.body.data).toHaveProperty("markdown");
|
||||||
expect(response.body.data).toHaveProperty("metadata");
|
expect(response.body.data).toHaveProperty("metadata");
|
||||||
|
expect(response.body.data).not.toHaveProperty("html");
|
||||||
expect(response.body.data.content).toContain("🔥 FireCrawl");
|
expect(response.body.data.content).toContain("🔥 FireCrawl");
|
||||||
}, 30000); // 30 seconds timeout
|
}, 30000); // 30 seconds timeout
|
||||||
|
|
||||||
|
it("should return a successful response with a valid API key and includeHtml set to true", async () => {
|
||||||
|
const response = await request(TEST_URL)
|
||||||
|
.post("/v0/scrape")
|
||||||
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
|
.set("Content-Type", "application/json")
|
||||||
|
.send({ url: "https://firecrawl.dev", pageOptions: { includeHtml: true }});
|
||||||
|
expect(response.statusCode).toBe(200);
|
||||||
|
expect(response.body).toHaveProperty("data");
|
||||||
|
expect(response.body.data).toHaveProperty("content");
|
||||||
|
expect(response.body.data).toHaveProperty("markdown");
|
||||||
|
expect(response.body.data).toHaveProperty("html");
|
||||||
|
expect(response.body.data).toHaveProperty("metadata");
|
||||||
|
expect(response.body.data.content).toContain("🔥 FireCrawl");
|
||||||
|
expect(response.body.data.markdown).toContain("🔥 FireCrawl");
|
||||||
|
expect(response.body.data.html).toContain("<h1");
|
||||||
|
}, 30000); // 30 seconds timeout
|
||||||
});
|
});
|
||||||
|
|
||||||
describe("POST /v0/crawl", () => {
|
describe("POST /v0/crawl", () => {
|
||||||
@ -143,16 +161,17 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
expect(response.statusCode).toBe(401);
|
expect(response.statusCode).toBe(401);
|
||||||
});
|
});
|
||||||
|
|
||||||
it("should return an error for a blocklisted URL", async () => {
|
// it("should return an error for a blocklisted URL", async () => {
|
||||||
const blocklistedUrl = "https://instagram.com/fake-test";
|
// const blocklistedUrl = "https://instagram.com/fake-test";
|
||||||
const response = await request(TEST_URL)
|
// const response = await request(TEST_URL)
|
||||||
.post("/v0/crawlWebsitePreview")
|
// .post("/v0/crawlWebsitePreview")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
.set("Content-Type", "application/json")
|
// .set("Content-Type", "application/json")
|
||||||
.send({ url: blocklistedUrl });
|
// .send({ url: blocklistedUrl });
|
||||||
expect(response.statusCode).toBe(403);
|
// // is returning 429 instead of 403
|
||||||
expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
|
// expect(response.statusCode).toBe(403);
|
||||||
});
|
// expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
|
||||||
|
// });
|
||||||
|
|
||||||
it("should return a successful response with a valid API key", async () => {
|
it("should return a successful response with a valid API key", async () => {
|
||||||
const response = await request(TEST_URL)
|
const response = await request(TEST_URL)
|
||||||
@ -250,8 +269,87 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
"🔥 FireCrawl"
|
"🔥 FireCrawl"
|
||||||
);
|
);
|
||||||
}, 60000); // 60 seconds
|
}, 60000); // 60 seconds
|
||||||
|
|
||||||
|
it("should return a successful response for a valid crawl job with includeHtml set to true option", async () => {
|
||||||
|
const crawlResponse = await request(TEST_URL)
|
||||||
|
.post("/v0/crawl")
|
||||||
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
|
.set("Content-Type", "application/json")
|
||||||
|
.send({ url: "https://firecrawl.dev", pageOptions: { includeHtml: true } });
|
||||||
|
expect(crawlResponse.statusCode).toBe(200);
|
||||||
|
|
||||||
|
const response = await request(TEST_URL)
|
||||||
|
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
|
||||||
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||||
|
expect(response.statusCode).toBe(200);
|
||||||
|
expect(response.body).toHaveProperty("status");
|
||||||
|
expect(response.body.status).toBe("active");
|
||||||
|
|
||||||
|
// wait for 30 seconds
|
||||||
|
await new Promise((r) => setTimeout(r, 30000));
|
||||||
|
|
||||||
|
const completedResponse = await request(TEST_URL)
|
||||||
|
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
|
||||||
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||||
|
expect(completedResponse.statusCode).toBe(200);
|
||||||
|
expect(completedResponse.body).toHaveProperty("status");
|
||||||
|
expect(completedResponse.body.status).toBe("completed");
|
||||||
|
expect(completedResponse.body).toHaveProperty("data");
|
||||||
|
expect(completedResponse.body.data[0]).toHaveProperty("content");
|
||||||
|
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
||||||
|
expect(completedResponse.body.data[0]).toHaveProperty("html");
|
||||||
|
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||||
|
expect(completedResponse.body.data[0].content).toContain(
|
||||||
|
"🔥 FireCrawl"
|
||||||
|
);
|
||||||
|
expect(completedResponse.body.data[0].markdown).toContain(
|
||||||
|
"FireCrawl"
|
||||||
|
);
|
||||||
|
expect(completedResponse.body.data[0].html).toContain(
|
||||||
|
"<h1"
|
||||||
|
);
|
||||||
|
}, 60000); // 60 seconds
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("If someone cancels a crawl job, it should turn into failed status", async () => {
|
||||||
|
const crawlResponse = await request(TEST_URL)
|
||||||
|
.post("/v0/crawl")
|
||||||
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
|
.set("Content-Type", "application/json")
|
||||||
|
.send({ url: "https://jestjs.io" });
|
||||||
|
expect(crawlResponse.statusCode).toBe(200);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// wait for 10 seconds
|
||||||
|
await new Promise((r) => setTimeout(r, 10000));
|
||||||
|
|
||||||
|
const response = await request(TEST_URL)
|
||||||
|
.delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`)
|
||||||
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||||
|
expect(response.statusCode).toBe(200);
|
||||||
|
expect(response.body).toHaveProperty("status");
|
||||||
|
expect(response.body.status).toBe("cancelled");
|
||||||
|
|
||||||
|
await new Promise((r) => setTimeout(r, 20000));
|
||||||
|
|
||||||
|
const completedResponse = await request(TEST_URL)
|
||||||
|
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
|
||||||
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||||
|
expect(completedResponse.statusCode).toBe(200);
|
||||||
|
expect(completedResponse.body).toHaveProperty("status");
|
||||||
|
expect(completedResponse.body.status).toBe("failed");
|
||||||
|
expect(completedResponse.body).toHaveProperty("data");
|
||||||
|
expect(completedResponse.body.data).toEqual(null);
|
||||||
|
expect(completedResponse.body).toHaveProperty("partial_data");
|
||||||
|
expect(completedResponse.body.partial_data[0]).toHaveProperty("content");
|
||||||
|
expect(completedResponse.body.partial_data[0]).toHaveProperty("markdown");
|
||||||
|
expect(completedResponse.body.partial_data[0]).toHaveProperty("metadata");
|
||||||
|
|
||||||
|
}, 60000); // 60 seconds
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
describe("POST /v0/scrape with LLM Extraction", () => {
|
describe("POST /v0/scrape with LLM Extraction", () => {
|
||||||
it("should extract data using LLM extraction mode", async () => {
|
it("should extract data using LLM extraction mode", async () => {
|
||||||
const response = await request(TEST_URL)
|
const response = await request(TEST_URL)
|
||||||
|
62
apps/api/src/controllers/crawl-cancel.ts
Normal file
62
apps/api/src/controllers/crawl-cancel.ts
Normal file
@ -0,0 +1,62 @@
|
|||||||
|
import { Request, Response } from "express";
|
||||||
|
import { authenticateUser } from "./auth";
|
||||||
|
import { RateLimiterMode } from "../../src/types";
|
||||||
|
import { addWebScraperJob } from "../../src/services/queue-jobs";
|
||||||
|
import { getWebScraperQueue } from "../../src/services/queue-service";
|
||||||
|
import { supabase_service } from "../../src/services/supabase";
|
||||||
|
import { billTeam } from "../../src/services/billing/credit_billing";
|
||||||
|
|
||||||
|
/**
 * Express handler that cancels a crawl job on behalf of the authenticated team.
 *
 * Flow: authenticate the caller, load the job named by `req.params.jobId` from
 * the web-scraper queue, confirm through the `bulljobs_teams` table that the job
 * belongs to the caller's team, bill for any partial documents when the job
 * state is "active", then move the job to the failed state.
 *
 * Responses:
 * - auth failure: the status and error returned by `authenticateUser`
 * - 404 `{ error: "Job not found" }` when the queue has no such job
 * - 500 when the ownership query returns an error or anything in the handler throws
 * - 403 `{ error: "Unauthorized" }` when no row links the job to the team
 * - otherwise `{ status: "cancelled" }` if the job reports "failed" after the
 *   move, or `{ status: "Cancelling..." }` if it does not
 */
export async function crawlCancelController(req: Request, res: Response) {
|
||||||
|
try {
|
||||||
|
// This endpoint is authenticated under the CrawlStatus rate-limiter mode.
const { success, team_id, error, status } = await authenticateUser(
|
||||||
|
req,
|
||||||
|
res,
|
||||||
|
RateLimiterMode.CrawlStatus
|
||||||
|
);
|
||||||
|
if (!success) {
|
||||||
|
return res.status(status).json({ error });
|
||||||
|
}
|
||||||
|
const job = await getWebScraperQueue().getJob(req.params.jobId);
|
||||||
|
if (!job) {
|
||||||
|
return res.status(404).json({ error: "Job not found" });
|
||||||
|
}
|
||||||
|
|
||||||
|
// check if the job belongs to the team
|
||||||
|
const { data, error: supaError } = await supabase_service
|
||||||
|
.from("bulljobs_teams")
|
||||||
|
.select("*")
|
||||||
|
.eq("job_id", req.params.jobId)
|
||||||
|
.eq("team_id", team_id);
|
||||||
|
if (supaError) {
|
||||||
|
return res.status(500).json({ error: supaError.message });
|
||||||
|
}
|
||||||
|
|
||||||
|
// No matching (job_id, team_id) row means the caller's team does not own this job.
if (data.length === 0) {
|
||||||
|
return res.status(403).json({ error: "Unauthorized" });
|
||||||
|
}
|
||||||
|
const jobState = await job.getState();
|
||||||
|
// NOTE(review): assumes job.progress() resolves to a value that can be
// destructured and that `partialDocs`, when present, is an array of the
// documents scraped so far - confirm against the code that reports progress.
const { partialDocs } = await job.progress();
|
||||||
|
|
||||||
|
// Bill only when there is partial output and the job state is "active".
if (partialDocs && partialDocs.length > 0 && jobState === "active") {
|
||||||
|
console.log("Billing team for partial docs...");
|
||||||
|
// Note: the credits that we will bill them here might be lower than the actual
|
||||||
|
// due to promises that are not yet resolved
|
||||||
|
await billTeam(team_id, partialDocs.length);
|
||||||
|
}
|
||||||
|
|
||||||
|
// A failure to move the job is logged and otherwise ignored; the response
// below reports whatever state the job is in afterwards.
// NOTE(review): the second argument `true` is presumably Bull's ignoreLock
// flag - confirm against the queue library's documentation.
try {
|
||||||
|
await job.moveToFailed(Error("Job cancelled by user"), true);
|
||||||
|
} catch (error) {
|
||||||
|
console.error(error);
|
||||||
|
}
|
||||||
|
|
||||||
|
const newJobState = await job.getState();
|
||||||
|
|
||||||
|
// "cancelled" is reported only once the job's state reads back as "failed".
res.json({
|
||||||
|
status: newJobState === "failed" ? "cancelled" : "Cancelling...",
|
||||||
|
});
|
||||||
|
} catch (error) {
|
||||||
|
console.error(error);
|
||||||
|
// NOTE(review): assumes the thrown value has a `message` property.
return res.status(500).json({ error: error.message });
|
||||||
|
}
|
||||||
|
}
|
@ -19,7 +19,7 @@ export async function crawlStatusController(req: Request, res: Response) {
|
|||||||
return res.status(404).json({ error: "Job not found" });
|
return res.status(404).json({ error: "Job not found" });
|
||||||
}
|
}
|
||||||
|
|
||||||
const { current, current_url, total, current_step } = await job.progress();
|
const { current, current_url, total, current_step, partialDocs } = await job.progress();
|
||||||
res.json({
|
res.json({
|
||||||
status: await job.getState(),
|
status: await job.getState(),
|
||||||
// progress: job.progress(),
|
// progress: job.progress(),
|
||||||
@ -28,6 +28,7 @@ export async function crawlStatusController(req: Request, res: Response) {
|
|||||||
current_step: current_step,
|
current_step: current_step,
|
||||||
total: total,
|
total: total,
|
||||||
data: job.returnvalue,
|
data: job.returnvalue,
|
||||||
|
partial_data: partialDocs ?? [],
|
||||||
});
|
});
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error(error);
|
console.error(error);
|
||||||
|
@ -6,6 +6,7 @@ import { authenticateUser } from "./auth";
|
|||||||
import { RateLimiterMode } from "../../src/types";
|
import { RateLimiterMode } from "../../src/types";
|
||||||
import { addWebScraperJob } from "../../src/services/queue-jobs";
|
import { addWebScraperJob } from "../../src/services/queue-jobs";
|
||||||
import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";
|
import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";
|
||||||
|
import { logCrawl } from "../../src/services/logging/crawl_log";
|
||||||
|
|
||||||
export async function crawlController(req: Request, res: Response) {
|
export async function crawlController(req: Request, res: Response) {
|
||||||
try {
|
try {
|
||||||
@ -30,12 +31,17 @@ export async function crawlController(req: Request, res: Response) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (isUrlBlocked(url)) {
|
if (isUrlBlocked(url)) {
|
||||||
return res.status(403).json({ error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." });
|
return res
|
||||||
|
.status(403)
|
||||||
|
.json({
|
||||||
|
error:
|
||||||
|
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
const mode = req.body.mode ?? "crawl";
|
const mode = req.body.mode ?? "crawl";
|
||||||
const crawlerOptions = req.body.crawlerOptions ?? {};
|
const crawlerOptions = req.body.crawlerOptions ?? {};
|
||||||
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
|
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false };
|
||||||
|
|
||||||
if (mode === "single_urls" && !url.includes(",")) {
|
if (mode === "single_urls" && !url.includes(",")) {
|
||||||
try {
|
try {
|
||||||
@ -66,6 +72,7 @@ export async function crawlController(req: Request, res: Response) {
|
|||||||
return res.status(500).json({ error: error.message });
|
return res.status(500).json({ error: error.message });
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const job = await addWebScraperJob({
|
const job = await addWebScraperJob({
|
||||||
url: url,
|
url: url,
|
||||||
mode: mode ?? "crawl", // fix for single urls not working
|
mode: mode ?? "crawl", // fix for single urls not working
|
||||||
@ -75,6 +82,8 @@ export async function crawlController(req: Request, res: Response) {
|
|||||||
origin: req.body.origin ?? "api",
|
origin: req.body.origin ?? "api",
|
||||||
});
|
});
|
||||||
|
|
||||||
|
await logCrawl(job.id.toString(), team_id);
|
||||||
|
|
||||||
res.json({ jobId: job.id });
|
res.json({ jobId: job.id });
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error(error);
|
console.error(error);
|
||||||
|
@ -26,7 +26,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
|
|||||||
|
|
||||||
const mode = req.body.mode ?? "crawl";
|
const mode = req.body.mode ?? "crawl";
|
||||||
const crawlerOptions = req.body.crawlerOptions ?? {};
|
const crawlerOptions = req.body.crawlerOptions ?? {};
|
||||||
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
|
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false };
|
||||||
|
|
||||||
const job = await addWebScraperJob({
|
const job = await addWebScraperJob({
|
||||||
url: url,
|
url: url,
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
import { ExtractorOptions } from './../lib/entities';
|
import { ExtractorOptions, PageOptions } from './../lib/entities';
|
||||||
import { Request, Response } from "express";
|
import { Request, Response } from "express";
|
||||||
import { WebScraperDataProvider } from "../scraper/WebScraper";
|
import { WebScraperDataProvider } from "../scraper/WebScraper";
|
||||||
import { billTeam, checkTeamCredits } from "../services/billing/credit_billing";
|
import { billTeam, checkTeamCredits } from "../services/billing/credit_billing";
|
||||||
@ -13,8 +13,8 @@ export async function scrapeHelper(
|
|||||||
req: Request,
|
req: Request,
|
||||||
team_id: string,
|
team_id: string,
|
||||||
crawlerOptions: any,
|
crawlerOptions: any,
|
||||||
pageOptions: any,
|
pageOptions: PageOptions,
|
||||||
extractorOptions: ExtractorOptions
|
extractorOptions: ExtractorOptions,
|
||||||
): Promise<{
|
): Promise<{
|
||||||
success: boolean;
|
success: boolean;
|
||||||
error?: string;
|
error?: string;
|
||||||
@ -39,7 +39,7 @@ export async function scrapeHelper(
|
|||||||
...crawlerOptions,
|
...crawlerOptions,
|
||||||
},
|
},
|
||||||
pageOptions: pageOptions,
|
pageOptions: pageOptions,
|
||||||
extractorOptions: extractorOptions
|
extractorOptions: extractorOptions,
|
||||||
});
|
});
|
||||||
|
|
||||||
const docs = await a.getDocuments(false);
|
const docs = await a.getDocuments(false);
|
||||||
@ -91,7 +91,7 @@ export async function scrapeController(req: Request, res: Response) {
|
|||||||
return res.status(status).json({ error });
|
return res.status(status).json({ error });
|
||||||
}
|
}
|
||||||
const crawlerOptions = req.body.crawlerOptions ?? {};
|
const crawlerOptions = req.body.crawlerOptions ?? {};
|
||||||
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
|
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false };
|
||||||
const extractorOptions = req.body.extractorOptions ?? {
|
const extractorOptions = req.body.extractorOptions ?? {
|
||||||
mode: "markdown"
|
mode: "markdown"
|
||||||
}
|
}
|
||||||
@ -113,7 +113,7 @@ export async function scrapeController(req: Request, res: Response) {
|
|||||||
team_id,
|
team_id,
|
||||||
crawlerOptions,
|
crawlerOptions,
|
||||||
pageOptions,
|
pageOptions,
|
||||||
extractorOptions
|
extractorOptions,
|
||||||
);
|
);
|
||||||
const endTime = new Date().getTime();
|
const endTime = new Date().getTime();
|
||||||
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
||||||
@ -132,7 +132,7 @@ export async function scrapeController(req: Request, res: Response) {
|
|||||||
pageOptions: pageOptions,
|
pageOptions: pageOptions,
|
||||||
origin: origin,
|
origin: origin,
|
||||||
extractor_options: extractorOptions,
|
extractor_options: extractorOptions,
|
||||||
num_tokens: numTokens
|
num_tokens: numTokens,
|
||||||
});
|
});
|
||||||
return res.status(result.returnCode).json(result);
|
return res.status(result.returnCode).json(result);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
|
@ -13,7 +13,7 @@ export async function searchHelper(
|
|||||||
team_id: string,
|
team_id: string,
|
||||||
crawlerOptions: any,
|
crawlerOptions: any,
|
||||||
pageOptions: PageOptions,
|
pageOptions: PageOptions,
|
||||||
searchOptions: SearchOptions
|
searchOptions: SearchOptions,
|
||||||
): Promise<{
|
): Promise<{
|
||||||
success: boolean;
|
success: boolean;
|
||||||
error?: string;
|
error?: string;
|
||||||
@ -54,10 +54,11 @@ export async function searchHelper(
|
|||||||
|
|
||||||
// filter out social media links
|
// filter out social media links
|
||||||
|
|
||||||
|
|
||||||
const a = new WebScraperDataProvider();
|
const a = new WebScraperDataProvider();
|
||||||
await a.setOptions({
|
await a.setOptions({
|
||||||
mode: "single_urls",
|
mode: "single_urls",
|
||||||
urls: res.map((r) => r.url),
|
urls: res.map((r) => r.url).slice(0, searchOptions.limit ?? 7),
|
||||||
crawlerOptions: {
|
crawlerOptions: {
|
||||||
...crawlerOptions,
|
...crawlerOptions,
|
||||||
},
|
},
|
||||||
@ -65,11 +66,12 @@ export async function searchHelper(
|
|||||||
...pageOptions,
|
...pageOptions,
|
||||||
onlyMainContent: pageOptions?.onlyMainContent ?? true,
|
onlyMainContent: pageOptions?.onlyMainContent ?? true,
|
||||||
fetchPageContent: pageOptions?.fetchPageContent ?? true,
|
fetchPageContent: pageOptions?.fetchPageContent ?? true,
|
||||||
|
includeHtml: pageOptions?.includeHtml ?? false,
|
||||||
fallback: false,
|
fallback: false,
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
const docs = await a.getDocuments(true);
|
const docs = await a.getDocuments(false);
|
||||||
if (docs.length === 0) {
|
if (docs.length === 0) {
|
||||||
return { success: true, error: "No search results found", returnCode: 200 };
|
return { success: true, error: "No search results found", returnCode: 200 };
|
||||||
}
|
}
|
||||||
@ -116,6 +118,7 @@ export async function searchController(req: Request, res: Response) {
|
|||||||
}
|
}
|
||||||
const crawlerOptions = req.body.crawlerOptions ?? {};
|
const crawlerOptions = req.body.crawlerOptions ?? {};
|
||||||
const pageOptions = req.body.pageOptions ?? {
|
const pageOptions = req.body.pageOptions ?? {
|
||||||
|
includeHtml: false,
|
||||||
onlyMainContent: true,
|
onlyMainContent: true,
|
||||||
fetchPageContent: true,
|
fetchPageContent: true,
|
||||||
fallback: false,
|
fallback: false,
|
||||||
@ -140,14 +143,14 @@ export async function searchController(req: Request, res: Response) {
|
|||||||
team_id,
|
team_id,
|
||||||
crawlerOptions,
|
crawlerOptions,
|
||||||
pageOptions,
|
pageOptions,
|
||||||
searchOptions
|
searchOptions,
|
||||||
);
|
);
|
||||||
const endTime = new Date().getTime();
|
const endTime = new Date().getTime();
|
||||||
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
||||||
logJob({
|
logJob({
|
||||||
success: result.success,
|
success: result.success,
|
||||||
message: result.error,
|
message: result.error,
|
||||||
num_docs: result.data.length,
|
num_docs: result.data ? result.data.length : 0,
|
||||||
docs: result.data,
|
docs: result.data,
|
||||||
time_taken: timeTakenInSeconds,
|
time_taken: timeTakenInSeconds,
|
||||||
team_id: team_id,
|
team_id: team_id,
|
||||||
|
@ -8,7 +8,7 @@ export async function crawlJobStatusPreviewController(req: Request, res: Respons
|
|||||||
return res.status(404).json({ error: "Job not found" });
|
return res.status(404).json({ error: "Job not found" });
|
||||||
}
|
}
|
||||||
|
|
||||||
const { current, current_url, total, current_step } = await job.progress();
|
const { current, current_url, total, current_step, partialDocs } = await job.progress();
|
||||||
res.json({
|
res.json({
|
||||||
status: await job.getState(),
|
status: await job.getState(),
|
||||||
// progress: job.progress(),
|
// progress: job.progress(),
|
||||||
@ -17,6 +17,7 @@ export async function crawlJobStatusPreviewController(req: Request, res: Respons
|
|||||||
current_step: current_step,
|
current_step: current_step,
|
||||||
total: total,
|
total: total,
|
||||||
data: job.returnvalue,
|
data: job.returnvalue,
|
||||||
|
partial_data: partialDocs ?? [],
|
||||||
});
|
});
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error(error);
|
console.error(error);
|
||||||
|
@ -7,13 +7,14 @@ export interface Progress {
|
|||||||
[key: string]: any;
|
[key: string]: any;
|
||||||
};
|
};
|
||||||
currentDocumentUrl?: string;
|
currentDocumentUrl?: string;
|
||||||
|
currentDocument?: Document;
|
||||||
}
|
}
|
||||||
|
|
||||||
export type PageOptions = {
|
export type PageOptions = {
|
||||||
onlyMainContent?: boolean;
|
onlyMainContent?: boolean;
|
||||||
|
includeHtml?: boolean;
|
||||||
fallback?: boolean;
|
fallback?: boolean;
|
||||||
fetchPageContent?: boolean;
|
fetchPageContent?: boolean;
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
export type ExtractorOptions = {
|
export type ExtractorOptions = {
|
||||||
@ -46,6 +47,7 @@ export type WebScraperOptions = {
|
|||||||
pageOptions?: PageOptions;
|
pageOptions?: PageOptions;
|
||||||
extractorOptions?: ExtractorOptions;
|
extractorOptions?: ExtractorOptions;
|
||||||
concurrentRequests?: number;
|
concurrentRequests?: number;
|
||||||
|
bullJobId?: string;
|
||||||
};
|
};
|
||||||
|
|
||||||
export interface DocumentUrl {
|
export interface DocumentUrl {
|
||||||
|
@ -10,13 +10,15 @@ export async function startWebScraperPipeline({
|
|||||||
}: {
|
}: {
|
||||||
job: Job<WebScraperOptions>;
|
job: Job<WebScraperOptions>;
|
||||||
}) {
|
}) {
|
||||||
|
let partialDocs: Document[] = [];
|
||||||
return (await runWebScraper({
|
return (await runWebScraper({
|
||||||
url: job.data.url,
|
url: job.data.url,
|
||||||
mode: job.data.mode,
|
mode: job.data.mode,
|
||||||
crawlerOptions: job.data.crawlerOptions,
|
crawlerOptions: job.data.crawlerOptions,
|
||||||
pageOptions: job.data.pageOptions,
|
pageOptions: job.data.pageOptions,
|
||||||
inProgress: (progress) => {
|
inProgress: (progress) => {
|
||||||
job.progress(progress);
|
partialDocs.push(progress.currentDocument);
|
||||||
|
job.progress({...progress, partialDocs: partialDocs});
|
||||||
},
|
},
|
||||||
onSuccess: (result) => {
|
onSuccess: (result) => {
|
||||||
job.moveToCompleted(result);
|
job.moveToCompleted(result);
|
||||||
@ -25,6 +27,7 @@ export async function startWebScraperPipeline({
|
|||||||
job.moveToFailed(error);
|
job.moveToFailed(error);
|
||||||
},
|
},
|
||||||
team_id: job.data.team_id,
|
team_id: job.data.team_id,
|
||||||
|
bull_job_id: job.id.toString()
|
||||||
})) as { success: boolean; message: string; docs: Document[] };
|
})) as { success: boolean; message: string; docs: Document[] };
|
||||||
}
|
}
|
||||||
export async function runWebScraper({
|
export async function runWebScraper({
|
||||||
@ -36,6 +39,7 @@ export async function runWebScraper({
|
|||||||
onSuccess,
|
onSuccess,
|
||||||
onError,
|
onError,
|
||||||
team_id,
|
team_id,
|
||||||
|
bull_job_id,
|
||||||
}: {
|
}: {
|
||||||
url: string;
|
url: string;
|
||||||
mode: "crawl" | "single_urls" | "sitemap";
|
mode: "crawl" | "single_urls" | "sitemap";
|
||||||
@ -45,6 +49,7 @@ export async function runWebScraper({
|
|||||||
onSuccess: (result: any) => void;
|
onSuccess: (result: any) => void;
|
||||||
onError: (error: any) => void;
|
onError: (error: any) => void;
|
||||||
team_id: string;
|
team_id: string;
|
||||||
|
bull_job_id: string;
|
||||||
}): Promise<{
|
}): Promise<{
|
||||||
success: boolean;
|
success: boolean;
|
||||||
message: string;
|
message: string;
|
||||||
@ -58,17 +63,19 @@ export async function runWebScraper({
|
|||||||
urls: [url],
|
urls: [url],
|
||||||
crawlerOptions: crawlerOptions,
|
crawlerOptions: crawlerOptions,
|
||||||
pageOptions: pageOptions,
|
pageOptions: pageOptions,
|
||||||
|
bullJobId: bull_job_id
|
||||||
});
|
});
|
||||||
} else {
|
} else {
|
||||||
await provider.setOptions({
|
await provider.setOptions({
|
||||||
mode: mode,
|
mode: mode,
|
||||||
urls: url.split(","),
|
urls: url.split(","),
|
||||||
crawlerOptions: crawlerOptions,
|
crawlerOptions: crawlerOptions,
|
||||||
pageOptions: pageOptions,
|
pageOptions: pageOptions
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
const docs = (await provider.getDocuments(false, (progress: Progress) => {
|
const docs = (await provider.getDocuments(false, (progress: Progress) => {
|
||||||
inProgress(progress);
|
inProgress(progress);
|
||||||
|
|
||||||
})) as Document[];
|
})) as Document[];
|
||||||
|
|
||||||
if (docs.length === 0) {
|
if (docs.length === 0) {
|
||||||
|
@ -5,6 +5,7 @@ import { scrapeController } from "../../src/controllers/scrape";
|
|||||||
import { crawlPreviewController } from "../../src/controllers/crawlPreview";
|
import { crawlPreviewController } from "../../src/controllers/crawlPreview";
|
||||||
import { crawlJobStatusPreviewController } from "../../src/controllers/status";
|
import { crawlJobStatusPreviewController } from "../../src/controllers/status";
|
||||||
import { searchController } from "../../src/controllers/search";
|
import { searchController } from "../../src/controllers/search";
|
||||||
|
import { crawlCancelController } from "../../src/controllers/crawl-cancel";
|
||||||
|
|
||||||
export const v0Router = express.Router();
|
export const v0Router = express.Router();
|
||||||
|
|
||||||
@ -12,6 +13,7 @@ v0Router.post("/v0/scrape", scrapeController);
|
|||||||
v0Router.post("/v0/crawl", crawlController);
|
v0Router.post("/v0/crawl", crawlController);
|
||||||
v0Router.post("/v0/crawlWebsitePreview", crawlPreviewController);
|
v0Router.post("/v0/crawlWebsitePreview", crawlPreviewController);
|
||||||
v0Router.get("/v0/crawl/status/:jobId", crawlStatusController);
|
v0Router.get("/v0/crawl/status/:jobId", crawlStatusController);
|
||||||
|
v0Router.delete("/v0/crawl/cancel/:jobId", crawlCancelController);
|
||||||
v0Router.get("/v0/checkJobStatus/:jobId", crawlJobStatusPreviewController);
|
v0Router.get("/v0/checkJobStatus/:jobId", crawlJobStatusPreviewController);
|
||||||
|
|
||||||
// Search routes
|
// Search routes
|
||||||
|
@ -1,4 +1,9 @@
|
|||||||
import { Document, ExtractorOptions, PageOptions, WebScraperOptions } from "../../lib/entities";
|
import {
|
||||||
|
Document,
|
||||||
|
ExtractorOptions,
|
||||||
|
PageOptions,
|
||||||
|
WebScraperOptions,
|
||||||
|
} from "../../lib/entities";
|
||||||
import { Progress } from "../../lib/entities";
|
import { Progress } from "../../lib/entities";
|
||||||
import { scrapSingleUrl } from "./single_url";
|
import { scrapSingleUrl } from "./single_url";
|
||||||
import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";
|
import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";
|
||||||
@ -6,12 +11,15 @@ import { WebCrawler } from "./crawler";
|
|||||||
import { getValue, setValue } from "../../services/redis";
|
import { getValue, setValue } from "../../services/redis";
|
||||||
import { getImageDescription } from "./utils/imageDescription";
|
import { getImageDescription } from "./utils/imageDescription";
|
||||||
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
|
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
|
||||||
import { replaceImgPathsWithAbsolutePaths, replacePathsWithAbsolutePaths } from "./utils/replacePaths";
|
import {
|
||||||
import OpenAI from 'openai'
|
replaceImgPathsWithAbsolutePaths,
|
||||||
|
replacePathsWithAbsolutePaths,
|
||||||
|
} from "./utils/replacePaths";
|
||||||
import { generateCompletions } from "../../lib/LLM-extraction";
|
import { generateCompletions } from "../../lib/LLM-extraction";
|
||||||
|
import { getWebScraperQueue } from "../../../src/services/queue-service";
|
||||||
|
|
||||||
export class WebScraperDataProvider {
|
export class WebScraperDataProvider {
|
||||||
|
private bullJobId: string;
|
||||||
private urls: string[] = [""];
|
private urls: string[] = [""];
|
||||||
private mode: "single_urls" | "sitemap" | "crawl" = "single_urls";
|
private mode: "single_urls" | "sitemap" | "crawl" = "single_urls";
|
||||||
private includes: string[];
|
private includes: string[];
|
||||||
@ -24,7 +32,8 @@ export class WebScraperDataProvider {
|
|||||||
private pageOptions?: PageOptions;
|
private pageOptions?: PageOptions;
|
||||||
private extractorOptions?: ExtractorOptions;
|
private extractorOptions?: ExtractorOptions;
|
||||||
private replaceAllPathsWithAbsolutePaths?: boolean = false;
|
private replaceAllPathsWithAbsolutePaths?: boolean = false;
|
||||||
private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" = "gpt-4-turbo";
|
private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" =
|
||||||
|
"gpt-4-turbo";
|
||||||
|
|
||||||
authorize(): void {
|
authorize(): void {
|
||||||
throw new Error("Method not implemented.");
|
throw new Error("Method not implemented.");
|
||||||
@ -46,7 +55,7 @@ export class WebScraperDataProvider {
|
|||||||
const batchUrls = urls.slice(i, i + this.concurrentRequests);
|
const batchUrls = urls.slice(i, i + this.concurrentRequests);
|
||||||
await Promise.all(
|
await Promise.all(
|
||||||
batchUrls.map(async (url, index) => {
|
batchUrls.map(async (url, index) => {
|
||||||
const result = await scrapSingleUrl(url, true, this.pageOptions);
|
const result = await scrapSingleUrl(url, this.pageOptions);
|
||||||
processedUrls++;
|
processedUrls++;
|
||||||
if (inProgress) {
|
if (inProgress) {
|
||||||
inProgress({
|
inProgress({
|
||||||
@ -54,11 +63,26 @@ export class WebScraperDataProvider {
|
|||||||
total: totalUrls,
|
total: totalUrls,
|
||||||
status: "SCRAPING",
|
status: "SCRAPING",
|
||||||
currentDocumentUrl: url,
|
currentDocumentUrl: url,
|
||||||
|
currentDocument: result,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
results[i + index] = result;
|
results[i + index] = result;
|
||||||
})
|
})
|
||||||
);
|
);
|
||||||
|
try {
|
||||||
|
if (this.mode === "crawl" && this.bullJobId) {
|
||||||
|
const job = await getWebScraperQueue().getJob(this.bullJobId);
|
||||||
|
const jobStatus = await job.getState();
|
||||||
|
if (jobStatus === "failed") {
|
||||||
|
throw new Error(
|
||||||
|
"Job has failed or has been cancelled by the user. Stopping the job..."
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
console.error(error);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return results.filter((result) => result !== null) as Document[];
|
return results.filter((result) => result !== null) as Document[];
|
||||||
}
|
}
|
||||||
@ -67,13 +91,44 @@ export class WebScraperDataProvider {
|
|||||||
useCaching: boolean = false,
|
useCaching: boolean = false,
|
||||||
inProgress?: (progress: Progress) => void
|
inProgress?: (progress: Progress) => void
|
||||||
): Promise<Document[]> {
|
): Promise<Document[]> {
|
||||||
|
this.validateInitialUrl();
|
||||||
|
|
||||||
|
if (!useCaching) {
|
||||||
|
return this.processDocumentsWithoutCache(inProgress);
|
||||||
|
}
|
||||||
|
|
||||||
|
return this.processDocumentsWithCache(inProgress);
|
||||||
|
}
|
||||||
|
|
||||||
|
private validateInitialUrl(): void {
|
||||||
if (this.urls[0].trim() === "") {
|
if (this.urls[0].trim() === "") {
|
||||||
throw new Error("Url is required");
|
throw new Error("Url is required");
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (!useCaching) {
|
/**
|
||||||
if (this.mode === "crawl") {
|
* Process documents without cache handling each mode
|
||||||
|
* @param inProgress inProgress
|
||||||
|
* @returns documents
|
||||||
|
*/
|
||||||
|
private async processDocumentsWithoutCache(
  inProgress?: (progress: Progress) => void
): Promise<Document[]> {
  // Dispatch to the handler that matches the configured mode.
  if (this.mode === "crawl") {
    return this.handleCrawlMode(inProgress);
  }
  if (this.mode === "single_urls") {
    return this.handleSingleUrlsMode(inProgress);
  }
  if (this.mode === "sitemap") {
    return this.handleSitemapMode(inProgress);
  }
  // Any other mode value produces no documents.
  return [];
}
|
||||||
|
|
||||||
|
private async handleCrawlMode(
|
||||||
|
inProgress?: (progress: Progress) => void
|
||||||
|
): Promise<Document[]> {
|
||||||
const crawler = new WebCrawler({
|
const crawler = new WebCrawler({
|
||||||
initialUrl: this.urls[0],
|
initialUrl: this.urls[0],
|
||||||
includes: this.includes,
|
includes: this.includes,
|
||||||
@ -84,7 +139,37 @@ export class WebScraperDataProvider {
|
|||||||
});
|
});
|
||||||
let links = await crawler.start(inProgress, 5, this.limit);
|
let links = await crawler.start(inProgress, 5, this.limit);
|
||||||
if (this.returnOnlyUrls) {
|
if (this.returnOnlyUrls) {
|
||||||
inProgress({
|
return this.returnOnlyUrlsResponse(links, inProgress);
|
||||||
|
}
|
||||||
|
|
||||||
|
let documents = await this.processLinks(links, inProgress);
|
||||||
|
return this.cacheAndFinalizeDocuments(documents, links);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Handles "single_urls" mode: runs the configured URLs through processLinks
 * and returns the resulting documents directly.
 * @param inProgress optional progress callback forwarded to processLinks
 * @returns the documents produced by processLinks
 */
private async handleSingleUrlsMode(
  inProgress?: (progress: Progress) => void
): Promise<Document[]> {
  return await this.processLinks(this.urls, inProgress);
}
|
||||||
|
|
||||||
|
/**
 * Handles "sitemap" mode: reads the links from the sitemap at the first
 * configured URL, then either returns URL-only placeholders or processes the
 * links and passes the result through cacheAndFinalizeDocuments.
 * @param inProgress optional progress callback forwarded to the helpers
 * @returns the documents for the sitemap links
 */
private async handleSitemapMode(
  inProgress?: (progress: Progress) => void
): Promise<Document[]> {
  const sitemapLinks = await getLinksFromSitemap(this.urls[0]);

  if (!this.returnOnlyUrls) {
    const scrapedDocuments = await this.processLinks(sitemapLinks, inProgress);
    return this.cacheAndFinalizeDocuments(scrapedDocuments, sitemapLinks);
  }

  // Caller only asked for the URLs, so no scraping is done.
  return this.returnOnlyUrlsResponse(sitemapLinks, inProgress);
}
|
||||||
|
|
||||||
|
private async returnOnlyUrlsResponse(
|
||||||
|
links: string[],
|
||||||
|
inProgress?: (progress: Progress) => void
|
||||||
|
): Promise<Document[]> {
|
||||||
|
inProgress?.({
|
||||||
current: links.length,
|
current: links.length,
|
||||||
total: links.length,
|
total: links.length,
|
||||||
status: "COMPLETED",
|
status: "COMPLETED",
|
||||||
@ -92,162 +177,71 @@ export class WebScraperDataProvider {
|
|||||||
});
|
});
|
||||||
return links.map((url) => ({
|
return links.map((url) => ({
|
||||||
content: "",
|
content: "",
|
||||||
|
html: this.pageOptions?.includeHtml ? "" : undefined,
|
||||||
markdown: "",
|
markdown: "",
|
||||||
metadata: { sourceURL: url },
|
metadata: { sourceURL: url },
|
||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private async processLinks(
|
||||||
|
links: string[],
|
||||||
|
inProgress?: (progress: Progress) => void
|
||||||
|
): Promise<Document[]> {
|
||||||
let pdfLinks = links.filter((link) => link.endsWith(".pdf"));
|
let pdfLinks = links.filter((link) => link.endsWith(".pdf"));
|
||||||
let pdfDocuments: Document[] = [];
|
let pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
|
||||||
for (let pdfLink of pdfLinks) {
|
|
||||||
const pdfContent = await fetchAndProcessPdf(pdfLink);
|
|
||||||
pdfDocuments.push({
|
|
||||||
content: pdfContent,
|
|
||||||
metadata: { sourceURL: pdfLink },
|
|
||||||
provider: "web-scraper"
|
|
||||||
});
|
|
||||||
}
|
|
||||||
links = links.filter((link) => !link.endsWith(".pdf"));
|
links = links.filter((link) => !link.endsWith(".pdf"));
|
||||||
|
|
||||||
let documents = await this.convertUrlsToDocuments(links, inProgress);
|
let documents = await this.convertUrlsToDocuments(links, inProgress);
|
||||||
documents = await this.getSitemapData(this.urls[0], documents);
|
documents = await this.getSitemapData(this.urls[0], documents);
|
||||||
|
documents = this.applyPathReplacements(documents);
|
||||||
|
documents = await this.applyImgAltText(documents);
|
||||||
|
|
||||||
if (this.replaceAllPathsWithAbsolutePaths) {
|
|
||||||
documents = replacePathsWithAbsolutePaths(documents);
|
|
||||||
} else {
|
|
||||||
documents = replaceImgPathsWithAbsolutePaths(documents);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (this.generateImgAltText) {
|
|
||||||
documents = await this.generatesImgAltText(documents);
|
|
||||||
}
|
|
||||||
documents = documents.concat(pdfDocuments);
|
|
||||||
|
|
||||||
// CACHING DOCUMENTS
|
|
||||||
// - parent document
|
|
||||||
const cachedParentDocumentString = await getValue(
|
|
||||||
"web-scraper-cache:" + this.normalizeUrl(this.urls[0])
|
|
||||||
);
|
|
||||||
if (cachedParentDocumentString != null) {
|
|
||||||
let cachedParentDocument = JSON.parse(cachedParentDocumentString);
|
|
||||||
if (
|
if (
|
||||||
!cachedParentDocument.childrenLinks ||
|
this.extractorOptions.mode === "llm-extraction" &&
|
||||||
cachedParentDocument.childrenLinks.length < links.length - 1
|
this.mode === "single_urls"
|
||||||
) {
|
) {
|
||||||
cachedParentDocument.childrenLinks = links.filter(
|
documents = await generateCompletions(documents, this.extractorOptions);
|
||||||
(link) => link !== this.urls[0]
|
|
||||||
);
|
|
||||||
await setValue(
|
|
||||||
"web-scraper-cache:" + this.normalizeUrl(this.urls[0]),
|
|
||||||
JSON.stringify(cachedParentDocument),
|
|
||||||
60 * 60 * 24 * 10
|
|
||||||
); // 10 days
|
|
||||||
}
|
}
|
||||||
} else {
|
return documents.concat(pdfDocuments);
|
||||||
let parentDocument = documents.filter(
|
|
||||||
(document) =>
|
|
||||||
this.normalizeUrl(document.metadata.sourceURL) ===
|
|
||||||
this.normalizeUrl(this.urls[0])
|
|
||||||
);
|
|
||||||
await this.setCachedDocuments(parentDocument, links);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
await this.setCachedDocuments(
|
private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
|
||||||
documents.filter(
|
return Promise.all(
|
||||||
(document) =>
|
pdfLinks.map(async (pdfLink) => {
|
||||||
this.normalizeUrl(document.metadata.sourceURL) !==
|
|
||||||
this.normalizeUrl(this.urls[0])
|
|
||||||
),
|
|
||||||
[]
|
|
||||||
);
|
|
||||||
documents = this.removeChildLinks(documents);
|
|
||||||
documents = documents.splice(0, this.limit);
|
|
||||||
return documents;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (this.mode === "single_urls") {
|
|
||||||
let pdfLinks = this.urls.filter((link) => link.endsWith(".pdf"));
|
|
||||||
let pdfDocuments: Document[] = [];
|
|
||||||
for (let pdfLink of pdfLinks) {
|
|
||||||
const pdfContent = await fetchAndProcessPdf(pdfLink);
|
const pdfContent = await fetchAndProcessPdf(pdfLink);
|
||||||
pdfDocuments.push({
|
return {
|
||||||
content: pdfContent,
|
content: pdfContent,
|
||||||
metadata: { sourceURL: pdfLink },
|
metadata: { sourceURL: pdfLink },
|
||||||
provider: "web-scraper"
|
provider: "web-scraper",
|
||||||
});
|
};
|
||||||
}
|
})
|
||||||
|
|
||||||
let documents = await this.convertUrlsToDocuments(
|
|
||||||
this.urls.filter((link) => !link.endsWith(".pdf")),
|
|
||||||
inProgress
|
|
||||||
);
|
);
|
||||||
|
|
||||||
if (this.replaceAllPathsWithAbsolutePaths) {
|
|
||||||
documents = replacePathsWithAbsolutePaths(documents);
|
|
||||||
} else {
|
|
||||||
documents = replaceImgPathsWithAbsolutePaths(documents);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (this.generateImgAltText) {
|
private applyPathReplacements(documents: Document[]): Document[] {
|
||||||
documents = await this.generatesImgAltText(documents);
|
return this.replaceAllPathsWithAbsolutePaths
|
||||||
}
|
? replacePathsWithAbsolutePaths(documents)
|
||||||
const baseUrl = new URL(this.urls[0]).origin;
|
: replaceImgPathsWithAbsolutePaths(documents);
|
||||||
documents = await this.getSitemapData(baseUrl, documents);
|
|
||||||
documents = documents.concat(pdfDocuments);
|
|
||||||
|
|
||||||
if(this.extractorOptions.mode === "llm-extraction") {
|
|
||||||
documents = await generateCompletions(
|
|
||||||
documents,
|
|
||||||
this.extractorOptions
|
|
||||||
)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
await this.setCachedDocuments(documents);
|
/**
 * Runs generatesImgAltText over the documents when the generateImgAltText
 * flag is set; otherwise hands the input back untouched.
 * @param documents documents to post-process
 * @returns the (possibly alt-text-enriched) documents
 */
private async applyImgAltText(documents: Document[]): Promise<Document[]> {
  if (!this.generateImgAltText) {
    return documents;
  }
  return this.generatesImgAltText(documents);
}
|
||||||
|
|
||||||
|
private async cacheAndFinalizeDocuments(
|
||||||
|
documents: Document[],
|
||||||
|
links: string[]
|
||||||
|
): Promise<Document[]> {
|
||||||
|
await this.setCachedDocuments(documents, links);
|
||||||
documents = this.removeChildLinks(documents);
|
documents = this.removeChildLinks(documents);
|
||||||
documents = documents.splice(0, this.limit);
|
return documents.splice(0, this.limit);
|
||||||
return documents;
|
|
||||||
}
|
|
||||||
if (this.mode === "sitemap") {
|
|
||||||
let links = await getLinksFromSitemap(this.urls[0]);
|
|
||||||
let pdfLinks = links.filter((link) => link.endsWith(".pdf"));
|
|
||||||
let pdfDocuments: Document[] = [];
|
|
||||||
for (let pdfLink of pdfLinks) {
|
|
||||||
const pdfContent = await fetchAndProcessPdf(pdfLink);
|
|
||||||
pdfDocuments.push({
|
|
||||||
content: pdfContent,
|
|
||||||
metadata: { sourceURL: pdfLink },
|
|
||||||
provider: "web-scraper"
|
|
||||||
});
|
|
||||||
}
|
|
||||||
links = links.filter((link) => !link.endsWith(".pdf"));
|
|
||||||
|
|
||||||
let documents = await this.convertUrlsToDocuments(
|
|
||||||
links.slice(0, this.limit),
|
|
||||||
inProgress
|
|
||||||
);
|
|
||||||
|
|
||||||
documents = await this.getSitemapData(this.urls[0], documents);
|
|
||||||
|
|
||||||
if (this.replaceAllPathsWithAbsolutePaths) {
|
|
||||||
documents = replacePathsWithAbsolutePaths(documents);
|
|
||||||
} else {
|
|
||||||
documents = replaceImgPathsWithAbsolutePaths(documents);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (this.generateImgAltText) {
|
|
||||||
documents = await this.generatesImgAltText(documents);
|
|
||||||
}
|
|
||||||
documents = documents.concat(pdfDocuments);
|
|
||||||
|
|
||||||
await this.setCachedDocuments(documents);
|
|
||||||
documents = this.removeChildLinks(documents);
|
|
||||||
documents = documents.splice(0, this.limit);
|
|
||||||
return documents;
|
|
||||||
}
|
|
||||||
|
|
||||||
return [];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private async processDocumentsWithCache(
|
||||||
|
inProgress?: (progress: Progress) => void
|
||||||
|
): Promise<Document[]> {
|
||||||
let documents = await this.getCachedDocuments(
|
let documents = await this.getCachedDocuments(
|
||||||
this.urls.slice(0, this.limit)
|
this.urls.slice(0, this.limit)
|
||||||
);
|
);
|
||||||
@ -256,22 +250,29 @@ export class WebScraperDataProvider {
|
|||||||
false,
|
false,
|
||||||
inProgress
|
inProgress
|
||||||
);
|
);
|
||||||
|
documents = this.mergeNewDocuments(documents, newDocuments);
|
||||||
|
}
|
||||||
|
documents = this.filterDocsExcludeInclude(documents);
|
||||||
|
documents = this.removeChildLinks(documents);
|
||||||
|
return documents.splice(0, this.limit);
|
||||||
|
}
|
||||||
|
|
||||||
|
private mergeNewDocuments(
|
||||||
|
existingDocuments: Document[],
|
||||||
|
newDocuments: Document[]
|
||||||
|
): Document[] {
|
||||||
newDocuments.forEach((doc) => {
|
newDocuments.forEach((doc) => {
|
||||||
if (
|
if (
|
||||||
!documents.some(
|
!existingDocuments.some(
|
||||||
(d) =>
|
(d) =>
|
||||||
this.normalizeUrl(d.metadata.sourceURL) ===
|
this.normalizeUrl(d.metadata.sourceURL) ===
|
||||||
this.normalizeUrl(doc.metadata?.sourceURL)
|
this.normalizeUrl(doc.metadata?.sourceURL)
|
||||||
)
|
)
|
||||||
) {
|
) {
|
||||||
documents.push(doc);
|
existingDocuments.push(doc);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
return existingDocuments;
|
||||||
documents = this.filterDocsExcludeInclude(documents);
|
|
||||||
documents = this.removeChildLinks(documents);
|
|
||||||
documents = documents.splice(0, this.limit);
|
|
||||||
return documents;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private filterDocsExcludeInclude(documents: Document[]): Document[] {
|
private filterDocsExcludeInclude(documents: Document[]): Document[] {
|
||||||
@ -348,7 +349,7 @@ export class WebScraperDataProvider {
|
|||||||
documents.push(cachedDocument);
|
documents.push(cachedDocument);
|
||||||
|
|
||||||
// get children documents
|
// get children documents
|
||||||
for (const childUrl of cachedDocument.childrenLinks) {
|
for (const childUrl of cachedDocument.childrenLinks || []) {
|
||||||
const normalizedChildUrl = this.normalizeUrl(childUrl);
|
const normalizedChildUrl = this.normalizeUrl(childUrl);
|
||||||
const childCachedDocumentString = await getValue(
|
const childCachedDocumentString = await getValue(
|
||||||
"web-scraper-cache:" + normalizedChildUrl
|
"web-scraper-cache:" + normalizedChildUrl
|
||||||
@ -376,6 +377,7 @@ export class WebScraperDataProvider {
|
|||||||
throw new Error("Urls are required");
|
throw new Error("Urls are required");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
this.bullJobId = options.bullJobId;
|
||||||
this.urls = options.urls;
|
this.urls = options.urls;
|
||||||
this.mode = options.mode;
|
this.mode = options.mode;
|
||||||
this.concurrentRequests = options.concurrentRequests ?? 20;
|
this.concurrentRequests = options.concurrentRequests ?? 20;
|
||||||
@ -386,10 +388,9 @@ export class WebScraperDataProvider {
|
|||||||
this.limit = options.crawlerOptions?.limit ?? 10000;
|
this.limit = options.crawlerOptions?.limit ?? 10000;
|
||||||
this.generateImgAltText =
|
this.generateImgAltText =
|
||||||
options.crawlerOptions?.generateImgAltText ?? false;
|
options.crawlerOptions?.generateImgAltText ?? false;
|
||||||
this.pageOptions = options.pageOptions ?? {onlyMainContent: false};
|
this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false };
|
||||||
this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
|
this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
|
||||||
this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
|
this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
|
||||||
|
|
||||||
//! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
|
//! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
|
||||||
this.excludes = this.excludes.filter((item) => item !== "");
|
this.excludes = this.excludes.filter((item) => item !== "");
|
||||||
|
|
||||||
@ -459,8 +460,9 @@ export class WebScraperDataProvider {
|
|||||||
altText = await getImageDescription(
|
altText = await getImageDescription(
|
||||||
imageUrl,
|
imageUrl,
|
||||||
backText,
|
backText,
|
||||||
frontText
|
frontText,
|
||||||
, this.generateImgAltTextModel);
|
this.generateImgAltTextModel
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
document.content = document.content.replace(
|
document.content = document.content.replace(
|
||||||
|
@ -103,8 +103,7 @@ export async function scrapWithPlaywright(url: string): Promise<string> {
|
|||||||
|
|
||||||
export async function scrapSingleUrl(
|
export async function scrapSingleUrl(
|
||||||
urlToScrap: string,
|
urlToScrap: string,
|
||||||
toMarkdown: boolean = true,
|
pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false },
|
||||||
pageOptions: PageOptions = { onlyMainContent: true }
|
|
||||||
): Promise<Document> {
|
): Promise<Document> {
|
||||||
urlToScrap = urlToScrap.trim();
|
urlToScrap = urlToScrap.trim();
|
||||||
|
|
||||||
@ -193,6 +192,7 @@ export async function scrapSingleUrl(
|
|||||||
url: urlToScrap,
|
url: urlToScrap,
|
||||||
content: text,
|
content: text,
|
||||||
markdown: text,
|
markdown: text,
|
||||||
|
html: pageOptions.includeHtml ? html : undefined,
|
||||||
metadata: { ...metadata, sourceURL: urlToScrap },
|
metadata: { ...metadata, sourceURL: urlToScrap },
|
||||||
} as Document;
|
} as Document;
|
||||||
}
|
}
|
||||||
@ -216,6 +216,7 @@ export async function scrapSingleUrl(
|
|||||||
return {
|
return {
|
||||||
content: text,
|
content: text,
|
||||||
markdown: text,
|
markdown: text,
|
||||||
|
html: pageOptions.includeHtml ? html : undefined,
|
||||||
metadata: { ...metadata, sourceURL: urlToScrap },
|
metadata: { ...metadata, sourceURL: urlToScrap },
|
||||||
} as Document;
|
} as Document;
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
@ -223,6 +224,7 @@ export async function scrapSingleUrl(
|
|||||||
return {
|
return {
|
||||||
content: "",
|
content: "",
|
||||||
markdown: "",
|
markdown: "",
|
||||||
|
html: "",
|
||||||
metadata: { sourceURL: urlToScrap },
|
metadata: { sourceURL: urlToScrap },
|
||||||
} as Document;
|
} as Document;
|
||||||
}
|
}
|
||||||
|
17
apps/api/src/services/logging/crawl_log.ts
Normal file
17
apps/api/src/services/logging/crawl_log.ts
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
import { supabase_service } from "../supabase";
|
||||||
|
import "dotenv/config";
|
||||||
|
|
||||||
|
/**
 * Records which team a crawl job belongs to by inserting a
 * `{ job_id, team_id }` row into the "bulljobs_teams" table.
 *
 * Logging is best-effort: any failure is written to the console and is never
 * propagated to the caller.
 *
 * @param job_id id of the crawl job
 * @param team_id id of the team that started the job
 */
export async function logCrawl(job_id: string, team_id: string) {
  try {
    const { error } = await supabase_service
      .from("bulljobs_teams")
      .insert([
        {
          job_id: job_id,
          team_id: team_id,
        },
      ]);

    // The Supabase client reports query failures through the returned `error`
    // field instead of throwing, so it has to be checked explicitly or a
    // failed insert goes unnoticed.
    if (error) {
      console.error("Error logging crawl job:\n", error);
    }
  } catch (error) {
    // Covers unexpected exceptions (e.g. client misconfiguration).
    console.error("Error logging crawl job:\n", error);
  }
}
|
@ -40,7 +40,7 @@ export interface FirecrawlJob {
|
|||||||
pageOptions?: any;
|
pageOptions?: any;
|
||||||
origin: string;
|
origin: string;
|
||||||
extractor_options?: ExtractorOptions,
|
extractor_options?: ExtractorOptions,
|
||||||
num_tokens?: number
|
num_tokens?: number,
|
||||||
}
|
}
|
||||||
|
|
||||||
export enum RateLimiterMode {
|
export enum RateLimiterMode {
|
||||||
|
Loading…
Reference in New Issue
Block a user