Merge pull request #34 from mendableai/nsc/returnOnlyUrls
Implements the ability for the crawler to output all the links it found, without scraping
This commit is contained in:
commit
f189589da4
@ -13,13 +13,22 @@ GET http://localhost:3002/v0/jobs/active HTTP/1.1
|
|||||||
|
|
||||||
|
|
||||||
### Scrape Website
|
### Scrape Website
|
||||||
POST https://api.firecrawl.dev/v0/scrape HTTP/1.1
|
POST http://localhost:3002/v0/crawl HTTP/1.1
|
||||||
Authorization: Bearer
|
Authorization: Bearer
|
||||||
content-type: application/json
|
content-type: application/json
|
||||||
|
|
||||||
{
|
{
|
||||||
"url":"https://www.mendable.ai"
|
"url":"https://www.mendable.ai",
|
||||||
|
"crawlerOptions": {
|
||||||
|
"returnOnlyUrls": true
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
### Scrape Website
|
### Scrape Website
|
||||||
@ -34,7 +43,7 @@ content-type: application/json
|
|||||||
|
|
||||||
|
|
||||||
### Check Job Status
|
### Check Job Status
|
||||||
GET http://localhost:3002/v0/crawl/status/333ab225-dc3e-418b-9d4b-8fb833cbaf89 HTTP/1.1
|
GET http://localhost:3002/v0/crawl/status/a6053912-d602-4709-841f-3d2cb46fea0a HTTP/1.1
|
||||||
Authorization: Bearer
|
Authorization: Bearer
|
||||||
|
|
||||||
### Get Job Result
|
### Get Job Result
|
||||||
|
@ -38,6 +38,10 @@ export type WebScraperOptions = {
|
|||||||
concurrentRequests?: number;
|
concurrentRequests?: number;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
export interface DocumentUrl {
|
||||||
|
url: string;
|
||||||
|
}
|
||||||
|
|
||||||
export class Document {
|
export class Document {
|
||||||
id?: string;
|
id?: string;
|
||||||
url?: string; // Used only in /search for now
|
url?: string; // Used only in /search for now
|
||||||
|
@ -1,9 +1,10 @@
|
|||||||
import { Job } from "bull";
|
import { Job } from "bull";
|
||||||
import { CrawlResult, WebScraperOptions } from "../types";
|
import { CrawlResult, WebScraperOptions } from "../types";
|
||||||
import { WebScraperDataProvider } from "../scraper/WebScraper";
|
import { WebScraperDataProvider } from "../scraper/WebScraper";
|
||||||
import { Progress } from "../lib/entities";
|
import { DocumentUrl, Progress } from "../lib/entities";
|
||||||
import { billTeam } from "../services/billing/credit_billing";
|
import { billTeam } from "../services/billing/credit_billing";
|
||||||
import { Document } from "../lib/entities";
|
import { Document } from "../lib/entities";
|
||||||
|
|
||||||
export async function startWebScraperPipeline({
|
export async function startWebScraperPipeline({
|
||||||
job,
|
job,
|
||||||
}: {
|
}: {
|
||||||
@ -47,7 +48,7 @@ export async function runWebScraper({
|
|||||||
}): Promise<{
|
}): Promise<{
|
||||||
success: boolean;
|
success: boolean;
|
||||||
message: string;
|
message: string;
|
||||||
docs: CrawlResult[];
|
docs: Document[] | DocumentUrl[];
|
||||||
}> {
|
}> {
|
||||||
try {
|
try {
|
||||||
const provider = new WebScraperDataProvider();
|
const provider = new WebScraperDataProvider();
|
||||||
@ -68,7 +69,7 @@ export async function runWebScraper({
|
|||||||
}
|
}
|
||||||
const docs = (await provider.getDocuments(false, (progress: Progress) => {
|
const docs = (await provider.getDocuments(false, (progress: Progress) => {
|
||||||
inProgress(progress);
|
inProgress(progress);
|
||||||
})) as CrawlResult[];
|
})) as Document[];
|
||||||
|
|
||||||
if (docs.length === 0) {
|
if (docs.length === 0) {
|
||||||
return {
|
return {
|
||||||
@ -79,7 +80,14 @@ export async function runWebScraper({
|
|||||||
}
|
}
|
||||||
|
|
||||||
// remove docs with empty content
|
// remove docs with empty content
|
||||||
const filteredDocs = docs.filter((doc) => doc.content.trim().length > 0);
|
const filteredDocs = crawlerOptions.returnOnlyUrls
|
||||||
|
? docs.map((doc) => {
|
||||||
|
if (doc.metadata.sourceURL) {
|
||||||
|
return { url: doc.metadata.sourceURL };
|
||||||
|
}
|
||||||
|
})
|
||||||
|
: docs.filter((doc) => doc.content.trim().length > 0);
|
||||||
|
|
||||||
|
|
||||||
const { success, credit_usage } = await billTeam(
|
const { success, credit_usage } = await billTeam(
|
||||||
team_id,
|
team_id,
|
||||||
|
@ -80,11 +80,16 @@ export class WebScraperDataProvider {
|
|||||||
});
|
});
|
||||||
let links = await crawler.start(inProgress, 5, this.limit);
|
let links = await crawler.start(inProgress, 5, this.limit);
|
||||||
if (this.returnOnlyUrls) {
|
if (this.returnOnlyUrls) {
|
||||||
|
inProgress({
|
||||||
|
current: links.length,
|
||||||
|
total: links.length,
|
||||||
|
status: "COMPLETED",
|
||||||
|
currentDocumentUrl: this.urls[0],
|
||||||
|
});
|
||||||
return links.map((url) => ({
|
return links.map((url) => ({
|
||||||
content: "",
|
content: "",
|
||||||
|
markdown: "",
|
||||||
metadata: { sourceURL: url },
|
metadata: { sourceURL: url },
|
||||||
provider: "web",
|
|
||||||
type: "text",
|
|
||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user