From 4c4775e0b8dff55068bbf229c0be35786c2ea4c0 Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Tue, 16 Apr 2024 12:49:14 -0400
Subject: [PATCH] Nick: add optional alt text generation for images

Adds a `generateImgAltText` crawler option. When it is enabled, markdown
images that have no alt text are described by `getImageDescription`
(OpenAI "gpt-4-turbo") and the description is written back as the alt text.

---
Review notes (everything between "---" and the first "diff --git" is
ignored when the patch is applied):

- Images that already have alt text, use a data: URL or have an unsupported
  extension are now left untouched. Previously their URL was replaced with
  an empty string.
- Image URLs are resolved with `new URL(imageUrl, sourceURL)` instead of
  `baseUrl + imageUrl`, so absolute and page-relative URLs work. `.jpg`,
  upper-case extensions and URLs with a query string are accepted.
- The generated text is inserted through a replacer function and stripped of
  square brackets and line breaks.
- getImageDescription builds the client inside its try block, never returns
  null, and lists the text preceding the image first in the prompt.
- Line breaks and indentation of this patch were reconstructed; the blob
  hashes on the "index" lines are those of the original revision.

 apps/api/src/scraper/WebScraper/crawler.ts    |  5 ++
 apps/api/src/scraper/WebScraper/index.ts      | 72 ++++++++++++++++++-
 .../src/scraper/WebScraper/utils/gptVision.ts | 54 ++++++++++++++
 3 files changed, 129 insertions(+), 2 deletions(-)
 create mode 100644 apps/api/src/scraper/WebScraper/utils/gptVision.ts

diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index eb7a2af..886efab 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -18,6 +18,7 @@ export class WebCrawler {
   private limit: number;
   private robotsTxtUrl: string;
   private robots: any;
+  private generateImgAltText: boolean;
 
   constructor({
     initialUrl,
@@ -25,12 +26,14 @@ export class WebCrawler {
     excludes,
     maxCrawledLinks,
     limit = 10000,
+    generateImgAltText = false,
   }: {
     initialUrl: string;
     includes?: string[];
     excludes?: string[];
     maxCrawledLinks?: number;
     limit?: number;
+    generateImgAltText?: boolean;
   }) {
     this.initialUrl = initialUrl;
     this.baseUrl = new URL(initialUrl).origin;
@@ -41,8 +44,10 @@ export class WebCrawler {
     this.robots = robotsParser(this.robotsTxtUrl, "");
     // Deprecated, use limit instead
     this.maxCrawledLinks = maxCrawledLinks ?? limit;
+    this.generateImgAltText = generateImgAltText ?? false;
   }
 
+
   private filterLinks(sitemapLinks: string[], limit: number): string[] {
     return sitemapLinks
       .filter((link) => {
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index 6810ea2..b54d9e6 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -4,6 +4,7 @@ import { scrapSingleUrl } from "./single_url";
 import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";
 import { WebCrawler } from "./crawler";
 import { getValue, setValue } from "../../services/redis";
+import { getImageDescription } from "./utils/gptVision";
 
 export type WebScraperOptions = {
   urls: string[];
@@ -14,7 +15,7 @@ export type WebScraperOptions = {
     excludes?: string[];
     maxCrawledLinks?: number;
     limit?: number;
-
+    generateImgAltText?: boolean;
   };
   concurrentRequests?: number;
 };
@@ -27,6 +28,7 @@ export class WebScraperDataProvider {
   private returnOnlyUrls: boolean;
   private limit: number = 10000;
   private concurrentRequests: number = 20;
+  private generateImgAltText: boolean = false;
 
   authorize(): void {
     throw new Error("Method not implemented.");
@@ -80,6 +82,7 @@ export class WebScraperDataProvider {
           excludes: this.excludes,
           maxCrawledLinks: this.maxCrawledLinks,
           limit: this.limit,
+          generateImgAltText: this.generateImgAltText,
         });
         const links = await crawler.start(inProgress, 5, this.limit);
         if (this.returnOnlyUrls) {
@@ -93,6 +96,9 @@ export class WebScraperDataProvider {
         let documents = await this.convertUrlsToDocuments(links, inProgress);
         documents = await this.getSitemapData(this.urls[0], documents);
         console.log("documents", documents)
+        if (this.generateImgAltText) {
+          documents = await this.generatesImgAltText(documents);
+        }
 
         // CACHING DOCUMENTS
         // - parent document
@@ -116,7 +122,9 @@ export class WebScraperDataProvider {
 
       if (this.mode === "single_urls") {
         let documents = await this.convertUrlsToDocuments(this.urls, inProgress);
-
+        if (this.generateImgAltText) {
+          documents = await this.generatesImgAltText(documents);
+        }
         const baseUrl = new URL(this.urls[0]).origin;
         documents = await this.getSitemapData(baseUrl, documents);
 
@@ -130,6 +138,9 @@ export class WebScraperDataProvider {
         let documents = await this.convertUrlsToDocuments(links.slice(0, this.limit), inProgress);
 
         documents = await this.getSitemapData(this.urls[0], documents);
+        if (this.generateImgAltText) {
+          documents = await this.generatesImgAltText(documents);
+        }
 
         await this.setCachedDocuments(documents);
         documents = this.removeChildLinks(documents);
@@ -244,6 +255,7 @@ export class WebScraperDataProvider {
     this.maxCrawledLinks = options.crawlerOptions?.maxCrawledLinks ?? 1000;
     this.returnOnlyUrls = options.crawlerOptions?.returnOnlyUrls ?? false;
     this.limit = options.crawlerOptions?.limit ?? 10000;
+    this.generateImgAltText = options.crawlerOptions?.generateImgAltText ?? false;
 
 
     //! @nicolas, for some reason this was being injected and breakign everything. Don't have time to find source of the issue so adding this check
@@ -283,5 +295,61 @@ export class WebScraperDataProvider {
     }
     return documents;
   }
+
+  /**
+   * Generates alt text for markdown images that have none.
+   *
+   * Only images with an empty alt text, a non-"data:image" URL and a
+   * png/jpg/jpeg/gif/webp extension are sent to getImageDescription, along
+   * with up to 1000 characters of document text on each side of the image.
+   * Every other image is left exactly as it was scraped.
+   *
+   * @param documents Documents to scan; their content is updated in place.
+   * @returns The same documents array.
+   */
+  generatesImgAltText = async (documents: Document[]): Promise<Document[]> => {
+    const imagePattern = /!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g;
+    const supportedExtension = /\.(png|jpe?g|gif|webp)$/i;
+    const contextLength = 1000;
+
+    await Promise.all(documents.map(async (document) => {
+      const images = document.content.match(imagePattern) || [];
+
+      await Promise.all(images.map(async (image) => {
+        // Split at the first "](" so parentheses in the alt text are not taken for the URL.
+        const parts = image.match(/^!\[(.*?)\]\(([\s\S]*)\)$/);
+        if (!parts) return;
+        const [, altText, imageUrl] = parts;
+        if (altText || imageUrl.startsWith("data:image")) return;
+
+        // Resolve against the page URL so absolute and relative image URLs both work.
+        let absoluteUrl: URL;
+        try {
+          absoluteUrl = new URL(imageUrl, document.metadata.sourceURL);
+        } catch {
+          return;
+        }
+        // Check the pathname so a query string does not hide the extension.
+        if (!supportedExtension.test(absoluteUrl.pathname)) return;
+
+        const imageIndex = document.content.indexOf(image);
+        if (imageIndex === -1) return;
+        const imageEnd = imageIndex + image.length;
+        const frontText = document.content.substring(Math.max(imageIndex - contextLength, 0), imageIndex);
+        const backText = document.content.substring(imageEnd, imageEnd + contextLength);
+
+        const description = await getImageDescription(absoluteUrl.href, backText, frontText);
+        // Square brackets and line breaks would break the markdown image syntax.
+        const newAltText = (description ?? "").replace(/[\[\]]/g, "").replace(/\s+/g, " ").trim();
+        if (!newAltText) return;
+
+        // A replacer function keeps "$" sequences in the generated text literal.
+        const replacement = `![${newAltText}](${absoluteUrl.href})`;
+        document.content = document.content.replace(image, () => replacement);
+      }));
+    }));
+
+    return documents;
+  };
 }
 
diff --git a/apps/api/src/scraper/WebScraper/utils/gptVision.ts b/apps/api/src/scraper/WebScraper/utils/gptVision.ts
new file mode 100644
index 0000000..7458a56
--- /dev/null
+++ b/apps/api/src/scraper/WebScraper/utils/gptVision.ts
@@ -0,0 +1,54 @@
+import { OpenAI } from "openai";
+
+/**
+ * Asks OpenAI's "gpt-4-turbo" model for a concise alt text describing an image.
+ *
+ * @param imageUrl URL of the image; sent to the model as an `image_url` part.
+ * @param backText Document text that follows the image.
+ * @param frontText Document text that precedes the image.
+ * @returns The model's answer, or "" when the request fails or has no content.
+ */
+export async function getImageDescription(
+  imageUrl: string,
+  backText: string,
+  frontText: string
+): Promise<string> {
+  try {
+    // Built inside the try block: the client constructor throws when no API key is configured.
+    const openai = new OpenAI();
+    const response = await openai.chat.completions.create({
+      model: "gpt-4-turbo",
+      messages: [
+        {
+          role: "user",
+          content: [
+            {
+              type: "text",
+              text:
+                "What's in the image? You need to answer with the content for the alt tag of the image. To help you with the context, the image appears right after the following text: " +
+                frontText +
+                " and right before the following text: " +
+                backText +
+                ". Be super concise.",
+            },
+            {
+              type: "image_url",
+              image_url: {
+                url: imageUrl,
+              },
+            },
+          ],
+        },
+      ],
+    });
+
+    return response.choices[0]?.message?.content ?? "";
+  } catch (error) {
+    // Log and return an empty description instead of rethrowing.
+    console.error(
+      "Error generating image alt text:",
+      error instanceof Error ? error.message : error
+    );
+    return "";
+  }
+}