
Nick: return only the list of urls

Nicolas 2024-04-20 11:59:42 -07:00
parent ddf9ff9c9a
commit 1a3aa2999d
2 changed files with 21 additions and 6 deletions


@@ -28,6 +28,10 @@ export type WebScraperOptions = {
   concurrentRequests?: number;
 };
 
+export interface DocumentUrl {
+  url: string;
+}
+
 export class Document {
   id?: string;
   content: string;
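The DocumentUrl shape is what the pipeline below emits per page when crawlerOptions.returnOnlyUrls is set. An illustrative value (the URL is made up for the example):

  // Illustrative only: one entry of the URL-only result list.
  const item: DocumentUrl = { url: "https://example.com/page" };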


@@ -1,8 +1,9 @@
 import { Job } from "bull";
 import { CrawlResult, WebScraperOptions } from "../types";
 import { WebScraperDataProvider } from "../scraper/WebScraper";
-import { Progress } from "../lib/entities";
+import { DocumentUrl, Progress } from "../lib/entities";
 import { billTeam } from "../services/billing/credit_billing";
+import { Document } from "../lib/entities";
 
 export async function startWebScraperPipeline({
   job,
@@ -44,7 +45,11 @@ export async function runWebScraper({
   onSuccess: (result: any) => void;
   onError: (error: any) => void;
   team_id: string;
-}): Promise<{ success: boolean; message: string; docs: CrawlResult[] }> {
+}): Promise<{
+  success: boolean;
+  message: string;
+  docs: Document[] | DocumentUrl[];
+}> {
   try {
     const provider = new WebScraperDataProvider();
     if (mode === "crawl") {
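Because the return type is now a union of two array types (not an array of a union), callers must narrow it before reading element fields. A minimal sketch of a type guard; this is not part of the commit, and it assumes full documents always carry a content field while bare URL objects never do:

  // Hypothetical helper: narrows the union returned by runWebScraper.
  function isUrlList(docs: Document[] | DocumentUrl[]): docs is DocumentUrl[] {
    return docs.length > 0 && !("content" in docs[0]);
  }

  // Sketch of a call site (arguments elided).
  const { docs } = await runWebScraper(/* ... */);
  if (isUrlList(docs)) {
    docs.forEach((d) => console.log(d.url));
  }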
@@ -64,8 +69,7 @@ export async function runWebScraper({
     }
 
     const docs = (await provider.getDocuments(false, (progress: Progress) => {
       inProgress(progress);
-    })) as CrawlResult[];
-
+    })) as Document[];
     if (docs.length === 0) {
       return {
@@ -76,7 +80,14 @@ export async function runWebScraper({
     }
 
     // remove docs with empty content
-    const filteredDocs = crawlerOptions.returnOnlyUrls ? docs : docs.filter((doc) => doc.content.trim().length > 0);
+    const filteredDocs = crawlerOptions.returnOnlyUrls
+      ? docs.map((doc) => {
+          if (doc.metadata.sourceURL) {
+            return { url: doc.metadata.sourceURL };
+          }
+        })
+      : docs.filter((doc) => doc.content.trim().length > 0);
 
     onSuccess(filteredDocs);
 
     const { success, credit_usage } = await billTeam(
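Note that in the returnOnlyUrls branch the map callback returns undefined for any document without metadata.sourceURL, so the resulting list can contain holes. A hedged sketch of a variant that drops such documents instead; this is an alternative to, not the behavior of, this commit:

  // flatMap skips documents that lack a source URL, yielding a clean DocumentUrl[].
  const urls: DocumentUrl[] = docs.flatMap((doc) =>
    doc.metadata.sourceURL ? [{ url: doc.metadata.sourceURL }] : []
  );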
@@ -92,7 +103,7 @@ export async function runWebScraper({
       };
     }
 
-    return { success: true, message: "", docs: filteredDocs as CrawlResult[] };
+    return { success: true, message: "", docs: filteredDocs };
   } catch (error) {
     console.error("Error running web scraper", error);
     onError(error);