import { Document, PageOptions, WebScraperOptions } from "../../lib/entities";
import { Progress } from "../../lib/entities";
import { scrapSingleUrl } from "./single_url";
import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";
import { WebCrawler } from "./crawler";
import { getValue, setValue } from "../../services/redis";
import { getImageDescription } from "./utils/gptVision";
import { fetchAndProcessPdf, isUrlAPdf } from "./utils/pdfProcessor";
import { replaceImgPathsWithAbsolutePaths, replacePathsWithAbsolutePaths } from "./utils/replacePaths";
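// Example usage (illustrative sketch only; option shapes follow setOptions below):
//   const provider = new WebScraperDataProvider();
//   provider.setOptions({
//     mode: "crawl",
//     urls: ["https://example.com"],
//     crawlerOptions: { limit: 100 },
//   });
//   const documents = await provider.getDocuments(false, (progress) => console.log(progress));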
export class WebScraperDataProvider {
private urls: string[] = [""];
private mode: "single_urls" | "sitemap" | "crawl" = "single_urls";
private includes: string[];
private excludes: string[];
private maxCrawledLinks: number;
private returnOnlyUrls: boolean;
private limit: number = 10000;
private concurrentRequests: number = 20;
private generateImgAltText: boolean = false;
private pageOptions?: PageOptions;
private replaceAllPathsWithAbsolutePaths?: boolean = false;
authorize(): void {
throw new Error("Method not implemented.");
}
authorizeNango(): Promise<void> {
throw new Error("Method not implemented.");
}
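// Scrapes the given URLs in batches of `concurrentRequests`, reporting progress
// through the optional callback and dropping URLs that yielded no document.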
private async convertUrlsToDocuments(
urls: string[],
inProgress?: (progress: Progress) => void
): Promise<Document[]> {
const totalUrls = urls.length;
let processedUrls = 0;
console.log("Converting urls to documents");
console.log("Total urls", urls);
const results: (Document | null)[] = new Array(urls.length).fill(null);
for (let i = 0; i < urls.length; i += this.concurrentRequests) {
const batchUrls = urls.slice(i, i + this.concurrentRequests);
await Promise.all(
batchUrls.map(async (url, index) => {
const result = await scrapSingleUrl(url, true, this.pageOptions);
processedUrls++;
if (inProgress) {
inProgress({
current: processedUrls,
total: totalUrls,
status: "SCRAPING",
currentDocumentUrl: url,
});
}
results[i + index] = result;
})
);
}
return results.filter((result) => result !== null) as Document[];
}
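// Entry point: when caching is disabled, dispatches on the configured mode
// (crawl, single_urls, or sitemap); otherwise serves cached documents from Redis
// and scrapes only what is missing.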
async getDocuments(
useCaching: boolean = false,
inProgress?: (progress: Progress) => void
): Promise<Document[]> {
if (this.urls[0].trim() === "") {
throw new Error("Url is required");
}
if (!useCaching) {
if (this.mode === "crawl") {
const crawler = new WebCrawler({
initialUrl: this.urls[0],
includes: this.includes,
excludes: this.excludes,
maxCrawledLinks: this.maxCrawledLinks,
limit: this.limit,
generateImgAltText: this.generateImgAltText,
});
let links = await crawler.start(inProgress, 5, this.limit);
if (this.returnOnlyUrls) {
return links.map((url) => ({
content: "",
metadata: { sourceURL: url },
provider: "web",
type: "text",
}));
}
// Array.prototype.filter ignores async predicates (the returned promises are
// always truthy), so resolve the PDF checks first and partition on the flags.
const isPdfFlags = await Promise.all(
links.map((link) => isUrlAPdf({url: link, fastMode: true}))
);
const pdfLinks = links.filter((_, index) => isPdfFlags[index]);
let pdfDocuments: Document[] = [];
for (let pdfLink of pdfLinks) {
const pdfContent = await fetchAndProcessPdf(pdfLink);
pdfDocuments.push({
content: pdfContent,
metadata: { sourceURL: pdfLink },
provider: "web-scraper"
});
}
links = links.filter((_, index) => !isPdfFlags[index]);
let documents = await this.convertUrlsToDocuments(links, inProgress);
documents = await this.getSitemapData(this.urls[0], documents);
if (this.replaceAllPathsWithAbsolutePaths) {
documents = replacePathsWithAbsolutePaths(documents);
} else {
documents = replaceImgPathsWithAbsolutePaths(documents);
}
if (this.generateImgAltText) {
documents = await this.generatesImgAltText(documents);
}
documents = documents.concat(pdfDocuments);
// CACHING DOCUMENTS
// - parent document
const cachedParentDocumentString = await getValue(
"web-scraper-cache:" + this.normalizeUrl(this.urls[0])
);
if (cachedParentDocumentString != null) {
let cachedParentDocument = JSON.parse(cachedParentDocumentString);
if (
!cachedParentDocument.childrenLinks ||
cachedParentDocument.childrenLinks.length < links.length - 1
) {
cachedParentDocument.childrenLinks = links.filter(
(link) => link !== this.urls[0]
);
await setValue(
"web-scraper-cache:" + this.normalizeUrl(this.urls[0]),
JSON.stringify(cachedParentDocument),
60 * 60 * 24 * 10
); // 10 days
}
} else {
let parentDocument = documents.filter(
(document) =>
this.normalizeUrl(document.metadata.sourceURL) ===
this.normalizeUrl(this.urls[0])
);
await this.setCachedDocuments(parentDocument, links);
}
await this.setCachedDocuments(
documents.filter(
(document) =>
this.normalizeUrl(document.metadata.sourceURL) !==
this.normalizeUrl(this.urls[0])
),
[]
);
documents = this.removeChildLinks(documents);
documents = documents.splice(0, this.limit);
return documents;
}
if (this.mode === "single_urls") {
console.log("Single urls mode");
let pdfDocuments: Document[] = [];
let nonPdfUrls: string[] = [];
for (let url of this.urls) {
console.log("Checking if url is a pdf", url);
if (await isUrlAPdf({url: url, fastMode: false})) {
const pdfContent = await fetchAndProcessPdf(url);
pdfDocuments.push({
content: pdfContent,
metadata: { sourceURL: url },
provider: "web-scraper"
});
} else {
nonPdfUrls.push(url);
console.log("Fetching and processing url", url);
}
}
let documents = await this.convertUrlsToDocuments(
nonPdfUrls,
inProgress
);
if (this.replaceAllPathsWithAbsolutePaths) {
documents = replacePathsWithAbsolutePaths(documents);
} else {
documents = replaceImgPathsWithAbsolutePaths(documents);
}
if (this.generateImgAltText) {
documents = await this.generatesImgAltText(documents);
}
const baseUrl = new URL(this.urls[0]).origin;
documents = await this.getSitemapData(baseUrl, documents);
documents = documents.concat(pdfDocuments);
await this.setCachedDocuments(documents);
documents = this.removeChildLinks(documents);
documents = documents.splice(0, this.limit);
return documents;
}
if (this.mode === "sitemap") {
let links = await getLinksFromSitemap(this.urls[0]);
// As in crawl mode, resolve the PDF checks before partitioning the links;
// filter() would otherwise treat the pending promises as truthy.
const isPdfFlags = await Promise.all(
links.map((link) => isUrlAPdf({url: link, fastMode: true}))
);
const pdfLinks = links.filter((_, index) => isPdfFlags[index]);
let pdfDocuments: Document[] = [];
for (let pdfLink of pdfLinks) {
const pdfContent = await fetchAndProcessPdf(pdfLink);
pdfDocuments.push({
content: pdfContent,
metadata: { sourceURL: pdfLink },
provider: "web-scraper"
});
}
links = links.filter((_, index) => !isPdfFlags[index]);
let documents = await this.convertUrlsToDocuments(
links.slice(0, this.limit),
inProgress
);
documents = await this.getSitemapData(this.urls[0], documents);
if (this.replaceAllPathsWithAbsolutePaths) {
documents = replacePathsWithAbsolutePaths(documents);
} else {
documents = replaceImgPathsWithAbsolutePaths(documents);
}
if (this.generateImgAltText) {
documents = await this.generatesImgAltText(documents);
}
documents = documents.concat(pdfDocuments);
await this.setCachedDocuments(documents);
documents = this.removeChildLinks(documents);
documents = documents.splice(0, this.limit);
return documents;
}
return [];
}
let documents = await this.getCachedDocuments(
this.urls.slice(0, this.limit)
);
if (documents.length < this.limit) {
const newDocuments: Document[] = await this.getDocuments(
false,
inProgress
);
newDocuments.forEach((doc) => {
if (
!documents.some(
(d) =>
this.normalizeUrl(d.metadata.sourceURL) ===
this.normalizeUrl(doc.metadata?.sourceURL)
)
) {
documents.push(doc);
}
});
}
documents = this.filterDocsExcludeInclude(documents);
documents = this.removeChildLinks(documents);
documents = documents.splice(0, this.limit);
return documents;
}
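// Applies the include/exclude regex patterns to each document's URL path.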
private filterDocsExcludeInclude(documents: Document[]): Document[] {
return documents.filter((document) => {
const url = new URL(document.metadata.sourceURL);
const path = url.pathname;
if (this.excludes.length > 0 && this.excludes[0] !== "") {
// Check if the link should be excluded
if (
this.excludes.some((excludePattern) =>
new RegExp(excludePattern).test(path)
)
) {
return false;
}
}
if (this.includes.length > 0 && this.includes[0] !== "") {
// Check if the link matches the include patterns, if any are specified
return this.includes.some((includePattern) =>
new RegExp(includePattern).test(path)
);
}
return true;
});
}
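// Treats URLs with and without the "www." prefix as equivalent cache keys.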
private normalizeUrl(url: string): string {
if (url.includes("//www.")) {
return url.replace("//www.", "//");
}
return url;
}
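// Strips the internal childrenLinks field before documents are returned to callers.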
private removeChildLinks(documents: Document[]): Document[] {
for (let document of documents) {
if (document?.childrenLinks) delete document.childrenLinks;
}
return documents;
}
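// Caches each non-empty document in Redis for 10 days, keyed by its normalized source URL.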
async setCachedDocuments(documents: Document[], childrenLinks?: string[]) {
for (const document of documents) {
if (document.content.trim().length === 0) {
continue;
}
const normalizedUrl = this.normalizeUrl(document.metadata.sourceURL);
await setValue(
"web-scraper-cache:" + normalizedUrl,
JSON.stringify({
...document,
childrenLinks: childrenLinks || [],
}),
60 * 60 * 24 * 10
); // 10 days
}
}
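// Loads cached documents (and any cached child documents) from Redis for the given URLs.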
async getCachedDocuments(urls: string[]): Promise<Document[]> {
let documents: Document[] = [];
for (const url of urls) {
const normalizedUrl = this.normalizeUrl(url);
console.log(
"Getting cached document for web-scraper-cache:" + normalizedUrl
);
const cachedDocumentString = await getValue(
"web-scraper-cache:" + normalizedUrl
);
if (cachedDocumentString) {
const cachedDocument = JSON.parse(cachedDocumentString);
documents.push(cachedDocument);
// get children documents
for (const childUrl of cachedDocument.childrenLinks) {
const normalizedChildUrl = this.normalizeUrl(childUrl);
const childCachedDocumentString = await getValue(
"web-scraper-cache:" + normalizedChildUrl
);
if (childCachedDocumentString) {
const childCachedDocument = JSON.parse(childCachedDocumentString);
if (
!documents.find(
(doc) =>
doc.metadata.sourceURL ===
childCachedDocument.metadata.sourceURL
)
) {
documents.push(childCachedDocument);
}
}
}
}
}
return documents;
}
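// Applies WebScraperOptions, filling in defaults, dropping empty exclude patterns,
// and prefixing bare URLs with https://.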
setOptions(options: WebScraperOptions): void {
if (!options.urls) {
throw new Error("Urls are required");
}
this.urls = options.urls;
this.mode = options.mode;
this.concurrentRequests = options.concurrentRequests ?? 20;
this.includes = options.crawlerOptions?.includes ?? [];
this.excludes = options.crawlerOptions?.excludes ?? [];
this.maxCrawledLinks = options.crawlerOptions?.maxCrawledLinks ?? 1000;
this.returnOnlyUrls = options.crawlerOptions?.returnOnlyUrls ?? false;
this.limit = options.crawlerOptions?.limit ?? 10000;
this.generateImgAltText =
options.crawlerOptions?.generateImgAltText ?? false;
this.pageOptions = options.pageOptions ?? {onlyMainContent: false};
this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
//! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find the source of the issue, so adding this check
this.excludes = this.excludes.filter((item) => item !== "");
// make sure all urls have a protocol; bare urls default to https://
this.urls = this.urls.map((url) => {
if (!url.trim().startsWith("http")) {
return `https://${url}`;
}
return url;
});
}
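// Enriches document metadata with changefreq, priority, and lastmod from the
// site's sitemap when a matching entry exists.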
private async getSitemapData(baseUrl: string, documents: Document[]) {
const sitemapData = await fetchSitemapData(baseUrl);
if (sitemapData) {
for (let i = 0; i < documents.length; i++) {
const docInSitemapData = sitemapData.find(
(data) =>
this.normalizeUrl(data.loc) ===
this.normalizeUrl(documents[i].metadata.sourceURL)
);
if (docInSitemapData) {
let sitemapDocData: Partial<SitemapEntry> = {};
if (docInSitemapData.changefreq) {
sitemapDocData.changefreq = docInSitemapData.changefreq;
}
if (docInSitemapData.priority) {
sitemapDocData.priority = Number(docInSitemapData.priority);
}
if (docInSitemapData.lastmod) {
sitemapDocData.lastmod = docInSitemapData.lastmod;
}
if (Object.keys(sitemapDocData).length !== 0) {
documents[i].metadata.sitemap = sitemapDocData;
}
}
}
}
return documents;
}
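// Fills in missing alt text for markdown images by asking the vision model
// (getImageDescription) to describe each image, using surrounding text as context.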
generatesImgAltText = async (documents: Document[]): Promise<Document[]> => {
await Promise.all(
documents.map(async (document) => {
const images = document.content.match(/!\[.*?\]\((.*?)\)/g) || [];
await Promise.all(
images.map(async (image: string) => {
let imageUrl = image.match(/\(([^)]+)\)/)?.[1] ?? "";
let altText = image.match(/\[(.*?)\]/)?.[1] ?? "";
if (
!altText &&
!imageUrl.startsWith("data:image") &&
/\.(png|jpeg|gif|webp)$/.test(imageUrl)
) {
const imageIndex = document.content.indexOf(image);
const contentLength = document.content.length;
let backText = document.content.substring(
imageIndex + image.length,
Math.min(imageIndex + image.length + 1000, contentLength)
);
let frontTextStartIndex = Math.max(imageIndex - 1000, 0);
let frontText = document.content.substring(
frontTextStartIndex,
imageIndex
);
altText = await getImageDescription(
imageUrl,
backText,
frontText
);
}
document.content = document.content.replace(
image,
`![${altText}](${imageUrl})`
);
})
);
})
);
return documents;
};
}