0

Merge pull request #34 from mendableai/nsc/returnOnlyUrls

Adds a `returnOnlyUrls` crawler option that makes the crawler return all the links it found without scraping their content.
This commit is contained in:
Rafael Miller 2024-04-24 10:34:42 -03:00 committed by GitHub
commit f189589da4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 35 additions and 9 deletions

View File

@ -13,15 +13,24 @@ GET http://localhost:3002/v0/jobs/active HTTP/1.1
### Scrape Website ### Scrape Website
POST https://api.firecrawl.dev/v0/scrape HTTP/1.1 POST http://localhost:3002/v0/crawl HTTP/1.1
Authorization: Bearer Authorization: Bearer
content-type: application/json content-type: application/json
{ {
"url":"https://www.mendable.ai" "url":"https://www.mendable.ai",
"crawlerOptions": {
"returnOnlyUrls": true
}
} }
### Scrape Website ### Scrape Website
POST http://localhost:3002/v0/scrape HTTP/1.1 POST http://localhost:3002/v0/scrape HTTP/1.1
Authorization: Bearer Authorization: Bearer
@ -34,7 +43,7 @@ content-type: application/json
### Check Job Status ### Check Job Status
GET http://localhost:3002/v0/crawl/status/333ab225-dc3e-418b-9d4b-8fb833cbaf89 HTTP/1.1 GET http://localhost:3002/v0/crawl/status/a6053912-d602-4709-841f-3d2cb46fea0a HTTP/1.1
Authorization: Bearer Authorization: Bearer
### Get Job Result ### Get Job Result

View File

@ -38,6 +38,10 @@ export type WebScraperOptions = {
concurrentRequests?: number; concurrentRequests?: number;
}; };
/**
 * Minimal document shape that carries nothing but a URL.
 *
 * NOTE(review): presumably the per-link result type when a crawl is asked to
 * return only URLs instead of scraped content — confirm against callers.
 */
export interface DocumentUrl {
/** The URL this entry refers to. */
url: string;
}
export class Document { export class Document {
id?: string; id?: string;
url?: string; // Used only in /search for now url?: string; // Used only in /search for now

View File

@ -1,9 +1,10 @@
import { Job } from "bull"; import { Job } from "bull";
import { CrawlResult, WebScraperOptions } from "../types"; import { CrawlResult, WebScraperOptions } from "../types";
import { WebScraperDataProvider } from "../scraper/WebScraper"; import { WebScraperDataProvider } from "../scraper/WebScraper";
import { Progress } from "../lib/entities"; import { DocumentUrl, Progress } from "../lib/entities";
import { billTeam } from "../services/billing/credit_billing"; import { billTeam } from "../services/billing/credit_billing";
import { Document } from "../lib/entities"; import { Document } from "../lib/entities";
export async function startWebScraperPipeline({ export async function startWebScraperPipeline({
job, job,
}: { }: {
@ -47,7 +48,7 @@ export async function runWebScraper({
}): Promise<{ }): Promise<{
success: boolean; success: boolean;
message: string; message: string;
docs: CrawlResult[]; docs: Document[] | DocumentUrl[];
}> { }> {
try { try {
const provider = new WebScraperDataProvider(); const provider = new WebScraperDataProvider();
@ -68,7 +69,7 @@ export async function runWebScraper({
} }
const docs = (await provider.getDocuments(false, (progress: Progress) => { const docs = (await provider.getDocuments(false, (progress: Progress) => {
inProgress(progress); inProgress(progress);
})) as CrawlResult[]; })) as Document[];
if (docs.length === 0) { if (docs.length === 0) {
return { return {
@ -79,7 +80,14 @@ export async function runWebScraper({
} }
// remove docs with empty content // remove docs with empty content
const filteredDocs = docs.filter((doc) => doc.content.trim().length > 0); const filteredDocs = crawlerOptions.returnOnlyUrls
? docs.map((doc) => {
if (doc.metadata.sourceURL) {
return { url: doc.metadata.sourceURL };
}
})
: docs.filter((doc) => doc.content.trim().length > 0);
const { success, credit_usage } = await billTeam( const { success, credit_usage } = await billTeam(
team_id, team_id,

View File

@ -80,11 +80,16 @@ export class WebScraperDataProvider {
}); });
let links = await crawler.start(inProgress, 5, this.limit); let links = await crawler.start(inProgress, 5, this.limit);
if (this.returnOnlyUrls) { if (this.returnOnlyUrls) {
inProgress({
current: links.length,
total: links.length,
status: "COMPLETED",
currentDocumentUrl: this.urls[0],
});
return links.map((url) => ({ return links.map((url) => ({
content: "", content: "",
markdown: "",
metadata: { sourceURL: url }, metadata: { sourceURL: url },
provider: "web",
type: "text",
})); }));
} }