
Nick: return only the list of urls

Nicolas 2024-04-20 11:59:42 -07:00
parent ddf9ff9c9a
commit 1a3aa2999d
2 changed files with 21 additions and 6 deletions

View File

@@ -28,6 +28,10 @@ export type WebScraperOptions = {
   concurrentRequests?: number;
 };
+
+export interface DocumentUrl {
+  url: string;
+}
 
 export class Document {
   id?: string;
   content: string;
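
Since `runWebScraper` (second file, below) now returns `Document[] | DocumentUrl[]`, callers have to narrow the union before reading fields like `content`. A minimal sketch of one way to do that, based only on the shapes shown in this diff; the `isDocuments` guard and the import path are hypothetical, not part of this commit:

    import { Document, DocumentUrl } from "./lib/entities";

    // Hypothetical guard (not in this commit): Document carries `content`
    // (see the class above), while DocumentUrl only carries `url`.
    function isDocuments(docs: Document[] | DocumentUrl[]): docs is Document[] {
      return docs.length > 0 && "content" in docs[0];
    }

    // Usage: only touch `content` on the Document branch.
    // if (isDocuments(result.docs)) { result.docs.forEach((d) => d.content); }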

View File

@ -1,8 +1,9 @@
import { Job } from "bull";
import { CrawlResult, WebScraperOptions } from "../types";
import { WebScraperDataProvider } from "../scraper/WebScraper";
import { Progress } from "../lib/entities";
import { DocumentUrl, Progress } from "../lib/entities";
import { billTeam } from "../services/billing/credit_billing";
import { Document } from "../lib/entities";
export async function startWebScraperPipeline({
job,
@@ -44,7 +45,11 @@ export async function runWebScraper({
   onSuccess: (result: any) => void;
   onError: (error: any) => void;
   team_id: string;
-}): Promise<{ success: boolean; message: string; docs: CrawlResult[] }> {
+}): Promise<{
+  success: boolean;
+  message: string;
+  docs: Document[] | DocumentUrl[];
+}> {
   try {
     const provider = new WebScraperDataProvider();
     if (mode === "crawl") {
@@ -64,8 +69,7 @@ export async function runWebScraper({
     }
 
     const docs = (await provider.getDocuments(false, (progress: Progress) => {
       inProgress(progress);
-    })) as CrawlResult[];
+    })) as Document[];
 
     if (docs.length === 0) {
       return {
@@ -76,7 +80,14 @@ export async function runWebScraper({
     }
 
     // remove docs with empty content
-    const filteredDocs = crawlerOptions.returnOnlyUrls ? docs : docs.filter((doc) => doc.content.trim().length > 0);
+    const filteredDocs = crawlerOptions.returnOnlyUrls
+      ? docs.map((doc) => {
+          if (doc.metadata.sourceURL) {
+            return { url: doc.metadata.sourceURL };
+          }
+        })
+      : docs.filter((doc) => doc.content.trim().length > 0);
 
     onSuccess(filteredDocs);
 
     const { success, credit_usage } = await billTeam(
@@ -92,7 +103,7 @@ export async function runWebScraper({
       };
     }
 
-    return { success: true, message: "", docs: filteredDocs as CrawlResult[] };
+    return { success: true, message: "", docs: filteredDocs };
   } catch (error) {
     console.error("Error running web scraper", error);
     onError(error);
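
One caveat in the `returnOnlyUrls` branch above: `Array.prototype.map` yields `undefined` for any document whose `metadata.sourceURL` is missing, so `filteredDocs` is effectively `(DocumentUrl | undefined)[]` there. A stricter variant would drop those holes; a sketch under the same shapes as this diff, not what the commit ships:

    // Keep only documents that actually have a source URL, then project
    // them into the DocumentUrl shape; no `undefined` entries survive.
    const urlDocs: DocumentUrl[] = docs
      .map((doc) => doc.metadata.sourceURL)
      .filter((url): url is string => typeof url === "string")
      .map((url) => ({ url }));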