Nick: return only the list of urls
This commit is contained in:
parent
ddf9ff9c9a
commit
1a3aa2999d
@ -28,6 +28,10 @@ export type WebScraperOptions = {
|
|||||||
concurrentRequests?: number;
|
concurrentRequests?: number;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
export interface DocumentUrl {
|
||||||
|
url: string;
|
||||||
|
}
|
||||||
|
|
||||||
export class Document {
|
export class Document {
|
||||||
id?: string;
|
id?: string;
|
||||||
content: string;
|
content: string;
|
||||||
|
@ -1,8 +1,9 @@
|
|||||||
import { Job } from "bull";
|
import { Job } from "bull";
|
||||||
import { CrawlResult, WebScraperOptions } from "../types";
|
import { CrawlResult, WebScraperOptions } from "../types";
|
||||||
import { WebScraperDataProvider } from "../scraper/WebScraper";
|
import { WebScraperDataProvider } from "../scraper/WebScraper";
|
||||||
import { Progress } from "../lib/entities";
|
import { DocumentUrl, Progress } from "../lib/entities";
|
||||||
import { billTeam } from "../services/billing/credit_billing";
|
import { billTeam } from "../services/billing/credit_billing";
|
||||||
|
import { Document } from "../lib/entities";
|
||||||
|
|
||||||
export async function startWebScraperPipeline({
|
export async function startWebScraperPipeline({
|
||||||
job,
|
job,
|
||||||
@ -44,7 +45,11 @@ export async function runWebScraper({
|
|||||||
onSuccess: (result: any) => void;
|
onSuccess: (result: any) => void;
|
||||||
onError: (error: any) => void;
|
onError: (error: any) => void;
|
||||||
team_id: string;
|
team_id: string;
|
||||||
}): Promise<{ success: boolean; message: string; docs: CrawlResult[] }> {
|
}): Promise<{
|
||||||
|
success: boolean;
|
||||||
|
message: string;
|
||||||
|
docs: Document[] | DocumentUrl[];
|
||||||
|
}> {
|
||||||
try {
|
try {
|
||||||
const provider = new WebScraperDataProvider();
|
const provider = new WebScraperDataProvider();
|
||||||
if (mode === "crawl") {
|
if (mode === "crawl") {
|
||||||
@ -64,8 +69,7 @@ export async function runWebScraper({
|
|||||||
}
|
}
|
||||||
const docs = (await provider.getDocuments(false, (progress: Progress) => {
|
const docs = (await provider.getDocuments(false, (progress: Progress) => {
|
||||||
inProgress(progress);
|
inProgress(progress);
|
||||||
})) as CrawlResult[];
|
})) as Document[];
|
||||||
|
|
||||||
|
|
||||||
if (docs.length === 0) {
|
if (docs.length === 0) {
|
||||||
return {
|
return {
|
||||||
@ -76,7 +80,14 @@ export async function runWebScraper({
|
|||||||
}
|
}
|
||||||
|
|
||||||
// remove docs with empty content
|
// remove docs with empty content
|
||||||
const filteredDocs = crawlerOptions.returnOnlyUrls ? docs : docs.filter((doc) => doc.content.trim().length > 0);
|
const filteredDocs = crawlerOptions.returnOnlyUrls
|
||||||
|
? docs.map((doc) => {
|
||||||
|
if (doc.metadata.sourceURL) {
|
||||||
|
return { url: doc.metadata.sourceURL };
|
||||||
|
}
|
||||||
|
})
|
||||||
|
: docs.filter((doc) => doc.content.trim().length > 0);
|
||||||
|
|
||||||
onSuccess(filteredDocs);
|
onSuccess(filteredDocs);
|
||||||
|
|
||||||
const { success, credit_usage } = await billTeam(
|
const { success, credit_usage } = await billTeam(
|
||||||
@ -92,7 +103,7 @@ export async function runWebScraper({
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
return { success: true, message: "", docs: filteredDocs as CrawlResult[] };
|
return { success: true, message: "", docs: filteredDocs };
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error("Error running web scraper", error);
|
console.error("Error running web scraper", error);
|
||||||
onError(error);
|
onError(error);
|
||||||
|
Loading…
Reference in New Issue
Block a user