Nick:

parent 15fd4e23d8
commit 4c4775e0b8
apps/api/src/scraper/WebScraper/crawler.ts

@@ -18,6 +18,7 @@ export class WebCrawler {
   private limit: number;
   private robotsTxtUrl: string;
   private robots: any;
+  private generateImgAltText: boolean;

   constructor({
     initialUrl,
@@ -25,12 +26,14 @@ export class WebCrawler {
     excludes,
     maxCrawledLinks,
     limit = 10000,
+    generateImgAltText = false,
   }: {
     initialUrl: string;
     includes?: string[];
     excludes?: string[];
     maxCrawledLinks?: number;
     limit?: number;
+    generateImgAltText?: boolean;
   }) {
     this.initialUrl = initialUrl;
     this.baseUrl = new URL(initialUrl).origin;
@@ -41,8 +44,10 @@ export class WebCrawler {
     this.robots = robotsParser(this.robotsTxtUrl, "");
     // Deprecated, use limit instead
     this.maxCrawledLinks = maxCrawledLinks ?? limit;
+    this.generateImgAltText = generateImgAltText ?? false;
   }
+

   private filterLinks(sitemapLinks: string[], limit: number): string[] {
     return sitemapLinks
       .filter((link) => {
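The new generateImgAltText option is threaded through the WebCrawler constructor and defaults to false, so existing call sites keep their behavior. A minimal sketch of opting in (the call site below is illustrative, not part of this commit):

import { WebCrawler } from "./crawler";

// Hypothetical call site: everything except initialUrl is optional,
// and generateImgAltText falls back to false when omitted.
const crawler = new WebCrawler({
  initialUrl: "https://example.com",
  maxCrawledLinks: 100,
  limit: 10000,
  generateImgAltText: true, // opt in to GPT-4-vision alt text
});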
@ -4,6 +4,7 @@ import { scrapSingleUrl } from "./single_url";
|
|||||||
import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";
|
import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";
|
||||||
import { WebCrawler } from "./crawler";
|
import { WebCrawler } from "./crawler";
|
||||||
import { getValue, setValue } from "../../services/redis";
|
import { getValue, setValue } from "../../services/redis";
|
||||||
|
import { getImageDescription } from "./utils/gptVision";
|
||||||
|
|
||||||
export type WebScraperOptions = {
|
export type WebScraperOptions = {
|
||||||
urls: string[];
|
urls: string[];
|
||||||
@ -14,7 +15,7 @@ export type WebScraperOptions = {
|
|||||||
excludes?: string[];
|
excludes?: string[];
|
||||||
maxCrawledLinks?: number;
|
maxCrawledLinks?: number;
|
||||||
limit?: number;
|
limit?: number;
|
||||||
|
generateImgAltText?: boolean;
|
||||||
};
|
};
|
||||||
concurrentRequests?: number;
|
concurrentRequests?: number;
|
||||||
};
|
};
|
||||||
@ -27,6 +28,7 @@ export class WebScraperDataProvider {
|
|||||||
private returnOnlyUrls: boolean;
|
private returnOnlyUrls: boolean;
|
||||||
private limit: number = 10000;
|
private limit: number = 10000;
|
||||||
private concurrentRequests: number = 20;
|
private concurrentRequests: number = 20;
|
||||||
|
private generateImgAltText: boolean = false;
|
||||||
|
|
||||||
authorize(): void {
|
authorize(): void {
|
||||||
throw new Error("Method not implemented.");
|
throw new Error("Method not implemented.");
|
||||||
@ -80,6 +82,7 @@ export class WebScraperDataProvider {
|
|||||||
excludes: this.excludes,
|
excludes: this.excludes,
|
||||||
maxCrawledLinks: this.maxCrawledLinks,
|
maxCrawledLinks: this.maxCrawledLinks,
|
||||||
limit: this.limit,
|
limit: this.limit,
|
||||||
|
generateImgAltText: this.generateImgAltText,
|
||||||
});
|
});
|
||||||
const links = await crawler.start(inProgress, 5, this.limit);
|
const links = await crawler.start(inProgress, 5, this.limit);
|
||||||
if (this.returnOnlyUrls) {
|
if (this.returnOnlyUrls) {
|
||||||
@ -93,6 +96,9 @@ export class WebScraperDataProvider {
|
|||||||
let documents = await this.convertUrlsToDocuments(links, inProgress);
|
let documents = await this.convertUrlsToDocuments(links, inProgress);
|
||||||
documents = await this.getSitemapData(this.urls[0], documents);
|
documents = await this.getSitemapData(this.urls[0], documents);
|
||||||
console.log("documents", documents)
|
console.log("documents", documents)
|
||||||
|
if (this.generateImgAltText) {
|
||||||
|
documents = await this.generatesImgAltText(documents);
|
||||||
|
}
|
||||||
|
|
||||||
// CACHING DOCUMENTS
|
// CACHING DOCUMENTS
|
||||||
// - parent document
|
// - parent document
|
||||||
@ -116,7 +122,9 @@ export class WebScraperDataProvider {
|
|||||||
|
|
||||||
if (this.mode === "single_urls") {
|
if (this.mode === "single_urls") {
|
||||||
let documents = await this.convertUrlsToDocuments(this.urls, inProgress);
|
let documents = await this.convertUrlsToDocuments(this.urls, inProgress);
|
||||||
|
if (this.generateImgAltText) {
|
||||||
|
documents = await this.generatesImgAltText(documents);
|
||||||
|
}
|
||||||
const baseUrl = new URL(this.urls[0]).origin;
|
const baseUrl = new URL(this.urls[0]).origin;
|
||||||
documents = await this.getSitemapData(baseUrl, documents);
|
documents = await this.getSitemapData(baseUrl, documents);
|
||||||
|
|
||||||
@ -130,6 +138,9 @@ export class WebScraperDataProvider {
|
|||||||
let documents = await this.convertUrlsToDocuments(links.slice(0, this.limit), inProgress);
|
let documents = await this.convertUrlsToDocuments(links.slice(0, this.limit), inProgress);
|
||||||
|
|
||||||
documents = await this.getSitemapData(this.urls[0], documents);
|
documents = await this.getSitemapData(this.urls[0], documents);
|
||||||
|
if (this.generateImgAltText) {
|
||||||
|
documents = await this.generatesImgAltText(documents);
|
||||||
|
}
|
||||||
|
|
||||||
await this.setCachedDocuments(documents);
|
await this.setCachedDocuments(documents);
|
||||||
documents = this.removeChildLinks(documents);
|
documents = this.removeChildLinks(documents);
|
||||||
@ -244,6 +255,7 @@ export class WebScraperDataProvider {
|
|||||||
this.maxCrawledLinks = options.crawlerOptions?.maxCrawledLinks ?? 1000;
|
this.maxCrawledLinks = options.crawlerOptions?.maxCrawledLinks ?? 1000;
|
||||||
this.returnOnlyUrls = options.crawlerOptions?.returnOnlyUrls ?? false;
|
this.returnOnlyUrls = options.crawlerOptions?.returnOnlyUrls ?? false;
|
||||||
this.limit = options.crawlerOptions?.limit ?? 10000;
|
this.limit = options.crawlerOptions?.limit ?? 10000;
|
||||||
|
this.generateImgAltText = options.crawlerOptions?.generateImgAltText ?? false;
|
||||||
|
|
||||||
|
|
||||||
//! @nicolas, for some reason this was being injected and breakign everything. Don't have time to find source of the issue so adding this check
|
//! @nicolas, for some reason this was being injected and breakign everything. Don't have time to find source of the issue so adding this check
|
||||||
@ -283,5 +295,31 @@ export class WebScraperDataProvider {
|
|||||||
}
|
}
|
||||||
return documents;
|
return documents;
|
||||||
}
|
}
|
||||||
|
generatesImgAltText = async (documents: Document[]): Promise<Document[]> => {
|
||||||
|
await Promise.all(documents.map(async (document) => {
|
||||||
|
const baseUrl = new URL(document.metadata.sourceURL).origin;
|
||||||
|
const images = document.content.match(/!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g) || [];
|
||||||
|
|
||||||
|
await Promise.all(images.map(async (image) => {
|
||||||
|
let imageUrl = image.match(/\(([^)]+)\)/)[1];
|
||||||
|
let altText = image.match(/\[(.*?)\]/)[1];
|
||||||
|
let newImageUrl = '';
|
||||||
|
|
||||||
|
if (!altText && !imageUrl.startsWith("data:image") && /\.(png|jpeg|gif|webp)$/.test(imageUrl)) {
|
||||||
|
newImageUrl = baseUrl + imageUrl;
|
||||||
|
const imageIndex = document.content.indexOf(image);
|
||||||
|
const contentLength = document.content.length;
|
||||||
|
let backText = document.content.substring(imageIndex + image.length, Math.min(imageIndex + image.length + 1000, contentLength));
|
||||||
|
let frontTextStartIndex = Math.max(imageIndex - 1000, 0);
|
||||||
|
let frontText = document.content.substring(frontTextStartIndex, imageIndex);
|
||||||
|
altText = await getImageDescription(newImageUrl, backText, frontText);
|
||||||
|
}
|
||||||
|
|
||||||
|
document.content = document.content.replace(image, `![${altText}](${newImageUrl})`);
|
||||||
|
}));
|
||||||
|
}));
|
||||||
|
|
||||||
|
return documents;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
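For reference, a small sketch of how the image-matching logic in generatesImgAltText behaves; the sample string and the variable needsAlt are invented for illustration, while the regular expressions are the ones from the diff above:

// The outer regex matches markdown images, tolerating parentheses
// nested up to two levels deep inside the URL.
const content = "Intro ![](/assets/diagram.png) then ![logo](/logo.svg)";

const images =
  content.match(/!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g) || [];
// images -> ["![](/assets/diagram.png)", "![logo](/logo.svg)"]

for (const image of images) {
  const imageUrl = image.match(/\(([^)]+)\)/)![1]; // e.g. "/assets/diagram.png"
  const altText = image.match(/\[(.*?)\]/)![1];    // "" for the first image
  // Only images with an empty alt, a non-data: URL, and a png/jpeg/gif/webp
  // extension are sent to the vision model, so "/logo.svg" is skipped.
  const needsAlt =
    !altText &&
    !imageUrl.startsWith("data:image") &&
    /\.(png|jpeg|gif|webp)$/.test(imageUrl);
  console.log(image, needsAlt);
}

One thing worth flagging: in the diff above, the replace at the end of the inner loop runs for every matched image, so a match that never enters the if block is rewritten with the still-empty newImageUrl and loses its URL.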
apps/api/src/scraper/WebScraper/utils/gptVision.ts (new file, 41 additions)

@@ -0,0 +1,41 @@
+export async function getImageDescription(
+  imageUrl: string,
+  backText: string,
+  frontText: string
+): Promise<string> {
+  const { OpenAI } = require("openai");
+  const openai = new OpenAI();
+
+  try {
+    const response = await openai.chat.completions.create({
+      model: "gpt-4-turbo",
+      messages: [
+        {
+          role: "user",
+          content: [
+            {
+              type: "text",
+              text:
+                "What's in the image? You need to answer with the content for the alt tag of the image. To help you with the context, the image is in the following text: " +
+                backText +
+                " and the following text: " +
+                frontText +
+                ". Be super concise.",
+            },
+            {
+              type: "image_url",
+              image_url: {
+                url: imageUrl,
+              },
+            },
+          ],
+        },
+      ],
+    });
+
+    return response.choices[0].message.content;
+  } catch (error) {
+    console.error("Error generating image alt text:", error?.message);
+    return "";
+  }
+}
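A usage sketch for the new helper. It assumes OPENAI_API_KEY is set in the environment (the OpenAI client reads it by default); the image URL and surrounding text below are placeholders:

import { getImageDescription } from "./utils/gptVision";

// backText/frontText are the ~1000 characters after/before the image,
// as sliced out in generatesImgAltText above.
const altText = await getImageDescription(
  "https://example.com/assets/diagram.png", // placeholder URL
  "text that appears after the image",
  "text that appears before the image"
);
console.log(altText); // "" if the request failed (the error is logged)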