0
This commit is contained in:
Nicolas 2024-04-16 12:49:14 -04:00
parent 15fd4e23d8
commit 4c4775e0b8
3 changed files with 86 additions and 2 deletions

View File

@ -18,6 +18,7 @@ export class WebCrawler {
private limit: number; private limit: number;
private robotsTxtUrl: string; private robotsTxtUrl: string;
private robots: any; private robots: any;
private generateImgAltText: boolean;
constructor({ constructor({
initialUrl, initialUrl,
@ -25,12 +26,14 @@ export class WebCrawler {
excludes, excludes,
maxCrawledLinks, maxCrawledLinks,
limit = 10000, limit = 10000,
generateImgAltText = false,
}: { }: {
initialUrl: string; initialUrl: string;
includes?: string[]; includes?: string[];
excludes?: string[]; excludes?: string[];
maxCrawledLinks?: number; maxCrawledLinks?: number;
limit?: number; limit?: number;
generateImgAltText?: boolean;
}) { }) {
this.initialUrl = initialUrl; this.initialUrl = initialUrl;
this.baseUrl = new URL(initialUrl).origin; this.baseUrl = new URL(initialUrl).origin;
@ -41,8 +44,10 @@ export class WebCrawler {
this.robots = robotsParser(this.robotsTxtUrl, ""); this.robots = robotsParser(this.robotsTxtUrl, "");
// Deprecated, use limit instead // Deprecated, use limit instead
this.maxCrawledLinks = maxCrawledLinks ?? limit; this.maxCrawledLinks = maxCrawledLinks ?? limit;
this.generateImgAltText = generateImgAltText ?? false;
} }
private filterLinks(sitemapLinks: string[], limit: number): string[] { private filterLinks(sitemapLinks: string[], limit: number): string[] {
return sitemapLinks return sitemapLinks
.filter((link) => { .filter((link) => {

View File

@ -4,6 +4,7 @@ import { scrapSingleUrl } from "./single_url";
import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap"; import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";
import { WebCrawler } from "./crawler"; import { WebCrawler } from "./crawler";
import { getValue, setValue } from "../../services/redis"; import { getValue, setValue } from "../../services/redis";
import { getImageDescription } from "./utils/gptVision";
export type WebScraperOptions = { export type WebScraperOptions = {
urls: string[]; urls: string[];
@ -14,7 +15,7 @@ export type WebScraperOptions = {
excludes?: string[]; excludes?: string[];
maxCrawledLinks?: number; maxCrawledLinks?: number;
limit?: number; limit?: number;
generateImgAltText?: boolean;
}; };
concurrentRequests?: number; concurrentRequests?: number;
}; };
@ -27,6 +28,7 @@ export class WebScraperDataProvider {
private returnOnlyUrls: boolean; private returnOnlyUrls: boolean;
private limit: number = 10000; private limit: number = 10000;
private concurrentRequests: number = 20; private concurrentRequests: number = 20;
private generateImgAltText: boolean = false;
authorize(): void { authorize(): void {
throw new Error("Method not implemented."); throw new Error("Method not implemented.");
@ -80,6 +82,7 @@ export class WebScraperDataProvider {
excludes: this.excludes, excludes: this.excludes,
maxCrawledLinks: this.maxCrawledLinks, maxCrawledLinks: this.maxCrawledLinks,
limit: this.limit, limit: this.limit,
generateImgAltText: this.generateImgAltText,
}); });
const links = await crawler.start(inProgress, 5, this.limit); const links = await crawler.start(inProgress, 5, this.limit);
if (this.returnOnlyUrls) { if (this.returnOnlyUrls) {
@ -93,6 +96,9 @@ export class WebScraperDataProvider {
let documents = await this.convertUrlsToDocuments(links, inProgress); let documents = await this.convertUrlsToDocuments(links, inProgress);
documents = await this.getSitemapData(this.urls[0], documents); documents = await this.getSitemapData(this.urls[0], documents);
console.log("documents", documents) console.log("documents", documents)
if (this.generateImgAltText) {
documents = await this.generatesImgAltText(documents);
}
// CACHING DOCUMENTS // CACHING DOCUMENTS
// - parent document // - parent document
@ -116,7 +122,9 @@ export class WebScraperDataProvider {
if (this.mode === "single_urls") { if (this.mode === "single_urls") {
let documents = await this.convertUrlsToDocuments(this.urls, inProgress); let documents = await this.convertUrlsToDocuments(this.urls, inProgress);
if (this.generateImgAltText) {
documents = await this.generatesImgAltText(documents);
}
const baseUrl = new URL(this.urls[0]).origin; const baseUrl = new URL(this.urls[0]).origin;
documents = await this.getSitemapData(baseUrl, documents); documents = await this.getSitemapData(baseUrl, documents);
@ -130,6 +138,9 @@ export class WebScraperDataProvider {
let documents = await this.convertUrlsToDocuments(links.slice(0, this.limit), inProgress); let documents = await this.convertUrlsToDocuments(links.slice(0, this.limit), inProgress);
documents = await this.getSitemapData(this.urls[0], documents); documents = await this.getSitemapData(this.urls[0], documents);
if (this.generateImgAltText) {
documents = await this.generatesImgAltText(documents);
}
await this.setCachedDocuments(documents); await this.setCachedDocuments(documents);
documents = this.removeChildLinks(documents); documents = this.removeChildLinks(documents);
@ -244,6 +255,7 @@ export class WebScraperDataProvider {
this.maxCrawledLinks = options.crawlerOptions?.maxCrawledLinks ?? 1000; this.maxCrawledLinks = options.crawlerOptions?.maxCrawledLinks ?? 1000;
this.returnOnlyUrls = options.crawlerOptions?.returnOnlyUrls ?? false; this.returnOnlyUrls = options.crawlerOptions?.returnOnlyUrls ?? false;
this.limit = options.crawlerOptions?.limit ?? 10000; this.limit = options.crawlerOptions?.limit ?? 10000;
this.generateImgAltText = options.crawlerOptions?.generateImgAltText ?? false;
//! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
@ -283,5 +295,31 @@ export class WebScraperDataProvider {
} }
return documents; return documents;
} }
generatesImgAltText = async (documents: Document[]): Promise<Document[]> => {
  // For every markdown image in every document that has no alt text,
  // ask the vision model for a description and splice it back in.
  // Images that already have alt text (or are data URIs / non-raster
  // formats) are left completely untouched.
  await Promise.all(
    documents.map(async (document) => {
      const baseUrl = new URL(document.metadata.sourceURL).origin;
      // Markdown image syntax ![alt](url), tolerating nested parentheses in the URL.
      const images =
        document.content.match(/!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g) || [];
      await Promise.all(
        images.map(async (image) => {
          const urlMatch = image.match(/\(([^)]+)\)/);
          const altMatch = image.match(/\[(.*?)\]/);
          // Guard against a null match instead of crashing on [1] access.
          if (!urlMatch) return;
          const imageUrl = urlMatch[1];
          const altText = altMatch ? altMatch[1] : "";
          // Only generate alt text for images that lack one, are not inline
          // data URIs, and look like common raster formats (case-insensitive,
          // jpg and jpeg both accepted).
          if (
            altText ||
            imageUrl.startsWith("data:image") ||
            !/\.(png|jpe?g|gif|webp)$/i.test(imageUrl)
          ) {
            // Bug fix: the previous version replaced the image markdown here
            // too, blanking the URL to `![alt]()`. Leave it untouched instead.
            return;
          }
          // Resolve relative paths against the page origin; keep URLs that
          // are already absolute as-is (previously the origin was always
          // prepended, corrupting absolute URLs).
          const newImageUrl = /^https?:\/\//i.test(imageUrl)
            ? imageUrl
            : baseUrl + imageUrl;
          const imageIndex = document.content.indexOf(image);
          const contentLength = document.content.length;
          // Up to 1000 chars of text on each side gives the model context.
          const backText = document.content.substring(
            imageIndex + image.length,
            Math.min(imageIndex + image.length + 1000, contentLength)
          );
          const frontTextStartIndex = Math.max(imageIndex - 1000, 0);
          const frontText = document.content.substring(frontTextStartIndex, imageIndex);
          const generatedAlt = await getImageDescription(newImageUrl, backText, frontText);
          document.content = document.content.replace(
            image,
            `![${generatedAlt}](${newImageUrl})`
          );
        })
      );
    })
  );
  return documents;
}
} }

View File

@ -0,0 +1,41 @@
/**
 * Generates a concise alt-text description for an image using the
 * OpenAI vision model, given the text that surrounds the image on the page.
 *
 * @param imageUrl  - Absolute URL of the image to describe.
 * @param backText  - Up to ~1000 chars of page text following the image.
 * @param frontText - Up to ~1000 chars of page text preceding the image.
 * @returns The generated alt text, or an empty string on any failure
 *          (API error, empty response) — callers treat "" as "no alt text".
 */
export async function getImageDescription(
  imageUrl: string,
  backText: string,
  frontText: string
): Promise<string> {
  // Lazily load the SDK so this module can be imported even when the
  // openai package is only needed for the alt-text feature.
  const { OpenAI } = require("openai");
  const openai = new OpenAI();
  try {
    const response = await openai.chat.completions.create({
      model: "gpt-4-turbo",
      messages: [
        {
          role: "user",
          content: [
            {
              type: "text",
              text:
                "What's in the image? You need to answer with the content for the alt tag of the image. To help you with the context, the image is in the following text: " +
                backText +
                " and the following text: " +
                frontText +
                ". Be super concise.",
            },
            {
              type: "image_url",
              image_url: {
                url: imageUrl,
              },
            },
          ],
        },
      ],
    });
    // Guard against an empty choices array or null message content.
    return response.choices?.[0]?.message?.content ?? "";
  } catch (error) {
    // `error` is `unknown` under strict catch variables — narrow before
    // reading `.message` instead of relying on optional chaining on any.
    const message = error instanceof Error ? error.message : String(error);
    console.error("Error generating image alt text:", message);
    return "";
  }
}