Nick: 4x speed
This commit is contained in:
parent
e26008a833
commit
a96fc5b96d
@ -44,6 +44,7 @@ export type WebScraperOptions = {
|
||||
limit?: number;
|
||||
generateImgAltText?: boolean;
|
||||
replaceAllPathsWithAbsolutePaths?: boolean;
|
||||
fastMode?: boolean; // have a mode of some sort
|
||||
};
|
||||
pageOptions?: PageOptions;
|
||||
extractorOptions?: ExtractorOptions;
|
||||
|
@ -4,7 +4,7 @@ import { URL } from "url";
|
||||
import { getLinksFromSitemap } from "./sitemap";
|
||||
import async from "async";
|
||||
import { Progress } from "../../lib/entities";
|
||||
import { scrapWithScrapingBee } from "./single_url";
|
||||
import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url";
|
||||
import robotsParser from "robots-parser";
|
||||
|
||||
export class WebCrawler {
|
||||
@ -15,11 +15,12 @@ export class WebCrawler {
|
||||
private maxCrawledLinks: number;
|
||||
private maxCrawledDepth: number;
|
||||
private visited: Set<string> = new Set();
|
||||
private crawledUrls: Set<string> = new Set();
|
||||
private crawledUrls: { url: string, html: string }[] = [];
|
||||
private limit: number;
|
||||
private robotsTxtUrl: string;
|
||||
private robots: any;
|
||||
private generateImgAltText: boolean;
|
||||
private fastMode: boolean = false;
|
||||
|
||||
constructor({
|
||||
initialUrl,
|
||||
@ -49,9 +50,9 @@ export class WebCrawler {
|
||||
this.maxCrawledLinks = maxCrawledLinks ?? limit;
|
||||
this.maxCrawledDepth = maxCrawledDepth ?? 10;
|
||||
this.generateImgAltText = generateImgAltText ?? false;
|
||||
this.fastMode = false;
|
||||
}
|
||||
|
||||
|
||||
private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
|
||||
return sitemapLinks
|
||||
.filter((link) => {
|
||||
@ -99,7 +100,7 @@ export class WebCrawler {
|
||||
concurrencyLimit: number = 5,
|
||||
limit: number = 10000,
|
||||
maxDepth: number = 10
|
||||
): Promise<string[]> {
|
||||
): Promise<{ url: string, html: string }[]> {
|
||||
// Fetch and parse robots.txt
|
||||
try {
|
||||
const response = await axios.get(this.robotsTxtUrl);
|
||||
@ -111,7 +112,7 @@ export class WebCrawler {
|
||||
const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
|
||||
if (sitemapLinks.length > 0) {
|
||||
const filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
|
||||
return filteredLinks;
|
||||
return filteredLinks.map(link => ({ url: link, html: "" }));
|
||||
}
|
||||
|
||||
const urls = await this.crawlUrls(
|
||||
@ -123,43 +124,44 @@ export class WebCrawler {
|
||||
urls.length === 0 &&
|
||||
this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0
|
||||
) {
|
||||
return [this.initialUrl];
|
||||
return [{ url: this.initialUrl, html: "" }];
|
||||
}
|
||||
|
||||
// make sure to run include exclude here again
|
||||
return this.filterLinks(urls, limit, this.maxCrawledDepth);
|
||||
const filteredUrls = this.filterLinks(urls.map(urlObj => urlObj.url), limit, this.maxCrawledDepth);
|
||||
return filteredUrls.map(url => ({ url, html: urls.find(urlObj => urlObj.url === url)?.html || "" }));
|
||||
}
|
||||
|
||||
private async crawlUrls(
|
||||
urls: string[],
|
||||
concurrencyLimit: number,
|
||||
inProgress?: (progress: Progress) => void
|
||||
): Promise<string[]> {
|
||||
): Promise<{ url: string, html: string }[]> {
|
||||
const queue = async.queue(async (task: string, callback) => {
|
||||
if (this.crawledUrls.size >= this.maxCrawledLinks) {
|
||||
if (this.crawledUrls.length >= this.maxCrawledLinks) {
|
||||
if (callback && typeof callback === "function") {
|
||||
callback();
|
||||
}
|
||||
return;
|
||||
}
|
||||
const newUrls = await this.crawl(task);
|
||||
newUrls.forEach((url) => this.crawledUrls.add(url));
|
||||
newUrls.forEach((page) => this.crawledUrls.push(page));
|
||||
if (inProgress && newUrls.length > 0) {
|
||||
inProgress({
|
||||
current: this.crawledUrls.size,
|
||||
current: this.crawledUrls.length,
|
||||
total: this.maxCrawledLinks,
|
||||
status: "SCRAPING",
|
||||
currentDocumentUrl: newUrls[newUrls.length - 1],
|
||||
currentDocumentUrl: newUrls[newUrls.length - 1].url,
|
||||
});
|
||||
} else if (inProgress) {
|
||||
inProgress({
|
||||
current: this.crawledUrls.size,
|
||||
current: this.crawledUrls.length,
|
||||
total: this.maxCrawledLinks,
|
||||
status: "SCRAPING",
|
||||
currentDocumentUrl: task,
|
||||
});
|
||||
}
|
||||
await this.crawlUrls(newUrls, concurrencyLimit, inProgress);
|
||||
await this.crawlUrls(newUrls.map((p) => p.url), concurrencyLimit, inProgress);
|
||||
if (callback && typeof callback === "function") {
|
||||
callback();
|
||||
}
|
||||
@ -175,10 +177,10 @@ export class WebCrawler {
|
||||
}
|
||||
);
|
||||
await queue.drain();
|
||||
return Array.from(this.crawledUrls);
|
||||
return this.crawledUrls;
|
||||
}
|
||||
|
||||
async crawl(url: string): Promise<string[]> {
|
||||
async crawl(url: string): Promise<{url: string, html: string}[]> {
|
||||
if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent"))
|
||||
return [];
|
||||
this.visited.add(url);
|
||||
@ -193,16 +195,17 @@ export class WebCrawler {
|
||||
}
|
||||
|
||||
try {
|
||||
let content;
|
||||
// If it is the first link, fetch with scrapingbee
|
||||
let content : string = "";
|
||||
// If it is the first link, fetch with single url
|
||||
if (this.visited.size === 1) {
|
||||
content = await scrapWithScrapingBee(url, "load");
|
||||
const page = await scrapSingleUrl(url, {includeHtml: true});
|
||||
content = page.html ?? ""
|
||||
} else {
|
||||
const response = await axios.get(url);
|
||||
content = response.data;
|
||||
content = response.data ?? "";
|
||||
}
|
||||
const $ = load(content);
|
||||
let links: string[] = [];
|
||||
let links: {url: string, html: string}[] = [];
|
||||
|
||||
$("a").each((_, element) => {
|
||||
const href = $(element).attr("href");
|
||||
@ -215,7 +218,6 @@ export class WebCrawler {
|
||||
const path = url.pathname;
|
||||
|
||||
if (
|
||||
// fullUrl.startsWith(this.initialUrl) && // this condition makes it stop crawling back the url
|
||||
this.isInternalLink(fullUrl) &&
|
||||
this.matchesPattern(fullUrl) &&
|
||||
this.noSections(fullUrl) &&
|
||||
@ -223,12 +225,14 @@ export class WebCrawler {
|
||||
!this.matchesExcludes(path) &&
|
||||
this.robots.isAllowed(fullUrl, "FireCrawlAgent")
|
||||
) {
|
||||
links.push(fullUrl);
|
||||
links.push({url: fullUrl, html: content});
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
return links.filter((link) => !this.visited.has(link));
|
||||
// Create a new list to return to avoid modifying the visited list
|
||||
const filteredLinks = links.filter((link) => !this.visited.has(link.url));
|
||||
return filteredLinks;
|
||||
} catch (error) {
|
||||
return [];
|
||||
}
|
||||
@ -309,3 +313,4 @@ export class WebCrawler {
|
||||
return [];
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -17,7 +17,20 @@ import {
|
||||
} from "./utils/replacePaths";
|
||||
import { generateCompletions } from "../../lib/LLM-extraction";
|
||||
import { getWebScraperQueue } from "../../../src/services/queue-service";
|
||||
|
||||
import { parseMarkdown } from "../../lib/html-to-markdown";
|
||||
import cheerio from "cheerio";
|
||||
import { excludeNonMainTags } from "./utils/excludeTags";
|
||||
/**
 * Strips non-content markup from an HTML string and returns the result.
 *
 * The `script`, `style`, `iframe`, `noscript`, `meta` and `head` elements are
 * always removed. When `pageOptions.onlyMainContent` is truthy, every selector
 * listed in `excludeNonMainTags` is removed as well.
 *
 * @param html - Raw HTML markup to clean.
 * @param pageOptions - Page options; only `onlyMainContent` is read here.
 * @returns The remaining document serialized back to an HTML string.
 */
const removeUnwantedElements = (html: string, pageOptions: PageOptions) => {
  const $ = cheerio.load(html);

  // These elements are dropped regardless of the page options.
  $("script, style, iframe, noscript, meta, head").remove();

  if (!pageOptions.onlyMainContent) {
    return $.html();
  }

  // Main-content mode: also drop everything matched by the exclusion list.
  excludeNonMainTags.forEach((selector) => {
    $(selector).remove();
  });

  return $.html();
};
|
||||
export class WebScraperDataProvider {
|
||||
private bullJobId: string;
|
||||
private urls: string[] = [""];
|
||||
@ -35,6 +48,7 @@ export class WebScraperDataProvider {
|
||||
private replaceAllPathsWithAbsolutePaths?: boolean = false;
|
||||
private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" =
|
||||
"gpt-4-turbo";
|
||||
private fastMode: boolean = false;
|
||||
|
||||
authorize(): void {
|
||||
throw new Error("Method not implemented.");
|
||||
@ -46,7 +60,8 @@ export class WebScraperDataProvider {
|
||||
|
||||
private async convertUrlsToDocuments(
|
||||
urls: string[],
|
||||
inProgress?: (progress: Progress) => void
|
||||
inProgress?: (progress: Progress) => void,
|
||||
allHtmls?: string[]
|
||||
): Promise<Document[]> {
|
||||
const totalUrls = urls.length;
|
||||
let processedUrls = 0;
|
||||
@ -56,7 +71,8 @@ export class WebScraperDataProvider {
|
||||
const batchUrls = urls.slice(i, i + this.concurrentRequests);
|
||||
await Promise.all(
|
||||
batchUrls.map(async (url, index) => {
|
||||
const result = await scrapSingleUrl(url, this.pageOptions);
|
||||
const existingText = allHtmls ? allHtmls[i + index] : "";
|
||||
const result = await scrapSingleUrl(url, this.pageOptions, existingText);
|
||||
processedUrls++;
|
||||
if (inProgress) {
|
||||
inProgress({
|
||||
@ -139,13 +155,33 @@ export class WebScraperDataProvider {
|
||||
limit: this.limit,
|
||||
generateImgAltText: this.generateImgAltText,
|
||||
});
|
||||
let start = Date.now();
|
||||
let links = await crawler.start(inProgress, 5, this.limit, this.maxCrawledDepth);
|
||||
console.log(links.length)
|
||||
let end = Date.now();
|
||||
console.log("Crawl end in seconds ", (end - start) / 1000);
|
||||
const allLinks = links.map((e) => e.url);
|
||||
const allHtmls = links.map((e)=> e.html);
|
||||
console.log("All links", allLinks.length);
|
||||
console.log("All htmls", allHtmls.length);
|
||||
|
||||
if (this.returnOnlyUrls) {
|
||||
return this.returnOnlyUrlsResponse(links, inProgress);
|
||||
return this.returnOnlyUrlsResponse(allLinks , inProgress);
|
||||
}
|
||||
|
||||
|
||||
let fastDocs = []
|
||||
let documents = [];
|
||||
// check if fast mode is enabled and there is html inside the links
|
||||
if (this.fastMode && links.some((link) => link.html)) {
|
||||
console.log("Fast mode enabled");
|
||||
documents = await this.processLinks(allLinks, inProgress, allHtmls);
|
||||
|
||||
}else{
|
||||
documents = await this.convertUrlsToDocuments(allLinks, inProgress, allHtmls);
|
||||
}
|
||||
|
||||
let documents = await this.processLinks(links, inProgress);
|
||||
return this.cacheAndFinalizeDocuments(documents, links);
|
||||
return this.cacheAndFinalizeDocuments(documents, allLinks);
|
||||
}
|
||||
|
||||
private async handleSingleUrlsMode(
|
||||
@ -187,14 +223,17 @@ export class WebScraperDataProvider {
|
||||
|
||||
private async processLinks(
|
||||
links: string[],
|
||||
inProgress?: (progress: Progress) => void
|
||||
inProgress?: (progress: Progress) => void,
|
||||
allHtmls?: string[]
|
||||
): Promise<Document[]> {
|
||||
let pdfLinks = links.filter((link) => link.endsWith(".pdf"));
|
||||
let pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
|
||||
links = links.filter((link) => !link.endsWith(".pdf"));
|
||||
|
||||
let documents = await this.convertUrlsToDocuments(links, inProgress);
|
||||
|
||||
let documents = await this.convertUrlsToDocuments(links, inProgress, allHtmls);
|
||||
documents = await this.getSitemapData(this.urls[0], documents);
|
||||
|
||||
|
||||
documents = this.applyPathReplacements(documents);
|
||||
// documents = await this.applyImgAltText(documents);
|
||||
|
||||
@ -238,6 +277,8 @@ export class WebScraperDataProvider {
|
||||
): Promise<Document[]> {
|
||||
await this.setCachedDocuments(documents, links);
|
||||
documents = this.removeChildLinks(documents);
|
||||
documents = this.filterDocsExcludeInclude(documents);
|
||||
documents = this.filterDepth(documents);
|
||||
return documents.splice(0, this.limit);
|
||||
}
|
||||
|
||||
@ -397,6 +438,7 @@ export class WebScraperDataProvider {
|
||||
this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
|
||||
//! @nicolas, for some reason an empty string was being injected into excludes and breaking everything. Don't have time to find the source of the issue, so adding this check.
|
||||
this.excludes = this.excludes.filter((item) => item !== "");
|
||||
this.fastMode = options.crawlerOptions?.fastMode ?? false;
|
||||
|
||||
// make sure all urls start with https://
|
||||
this.urls = this.urls.map((url) => {
|
||||
|
@ -106,7 +106,8 @@ export async function scrapWithPlaywright(url: string): Promise<string> {
|
||||
|
||||
export async function scrapSingleUrl(
|
||||
urlToScrap: string,
|
||||
pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false }
|
||||
pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false },
|
||||
existingText: string = ""
|
||||
): Promise<Document> {
|
||||
urlToScrap = urlToScrap.trim();
|
||||
|
||||
@ -197,8 +198,13 @@ export async function scrapSingleUrl(
|
||||
: ["scrapingBee", "playwright", "scrapingBeeLoad", "fetch"];
|
||||
|
||||
for (const scraper of scrapersInOrder) {
|
||||
// If exists text coming from crawler, use it
|
||||
if (existingText && existingText.trim().length >= 100) {
|
||||
text = existingText;
|
||||
break;
|
||||
}
|
||||
[text, html] = await attemptScraping(urlToScrap, scraper);
|
||||
if (text && text.length >= 100) break;
|
||||
if (text && text.trim().length >= 100) break;
|
||||
console.log(`Falling back to ${scraper}`);
|
||||
}
|
||||
|
||||
|
@ -26,7 +26,7 @@ getWebScraperQueue().process(
|
||||
success: success,
|
||||
result: {
|
||||
links: docs.map((doc) => {
|
||||
return { content: doc, source: doc.metadata.sourceURL };
|
||||
return { content: doc, source: doc?.metadata?.sourceURL ?? doc?.url ?? "" };
|
||||
}),
|
||||
},
|
||||
project_id: job.data.project_id,
|
||||
|
Loading…
x
Reference in New Issue
Block a user