From a04610302a7c0812183c240a2644d0c81de86597 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 16 Apr 2024 16:31:33 -0300 Subject: [PATCH 1/7] Spliting relative paths for images --- apps/api/src/index.ts | 4 +++ apps/api/src/scraper/WebScraper/index.ts | 37 ++++++++++++++++++------ 2 files changed, 32 insertions(+), 9 deletions(-) diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 7198988..26fb2a9 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -90,6 +90,7 @@ app.post("/v0/scrape", async (req, res) => { try { // make sure to authenticate user first, Bearer const team_id = await authenticateUser(req, res, "scrape"); + const crawlerOptions = req.body.crawlerOptions ?? {}; try { const { success: creditsCheckSuccess, message: creditsCheckMessage } = @@ -113,6 +114,9 @@ app.post("/v0/scrape", async (req, res) => { await a.setOptions({ mode: "single_urls", urls: [url], + crawlerOptions: { + ...crawlerOptions, + }, }); const docs = await a.getDocuments(false); diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index b54d9e6..8290762 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -74,7 +74,7 @@ export class WebScraperDataProvider { throw new Error("Url is required"); } - if (!useCaching) { + if (true) {//!useCaching) { if (this.mode === "crawl") { const crawler = new WebCrawler({ initialUrl: this.urls[0], @@ -95,7 +95,7 @@ export class WebScraperDataProvider { } let documents = await this.convertUrlsToDocuments(links, inProgress); documents = await this.getSitemapData(this.urls[0], documents); - console.log("documents", documents) + documents = this.replaceImgPathsWithAbsolutePaths(documents); if (this.generateImgAltText) { documents = await this.generatesImgAltText(documents); } @@ -122,6 +122,7 @@ export class WebScraperDataProvider { if (this.mode === "single_urls") { let documents = await this.convertUrlsToDocuments(this.urls, inProgress); + documents = this.replaceImgPathsWithAbsolutePaths(documents); if (this.generateImgAltText) { documents = await this.generatesImgAltText(documents); } @@ -138,6 +139,7 @@ export class WebScraperDataProvider { let documents = await this.convertUrlsToDocuments(links.slice(0, this.limit), inProgress); documents = await this.getSitemapData(this.urls[0], documents); + documents = this.replaceImgPathsWithAbsolutePaths(documents); if (this.generateImgAltText) { documents = await this.generatesImgAltText(documents); } @@ -297,29 +299,46 @@ export class WebScraperDataProvider { } generatesImgAltText = async (documents: Document[]): Promise => { await Promise.all(documents.map(async (document) => { - const baseUrl = new URL(document.metadata.sourceURL).origin; - const images = document.content.match(/!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g) || []; + const images = document.content.match(/!\[.*?\]\((.*?)\)/g) || []; - await Promise.all(images.map(async (image) => { + await Promise.all(images.map(async (image: string) => { let imageUrl = image.match(/\(([^)]+)\)/)[1]; let altText = image.match(/\[(.*?)\]/)[1]; - let newImageUrl = ''; if (!altText && !imageUrl.startsWith("data:image") && /\.(png|jpeg|gif|webp)$/.test(imageUrl)) { - newImageUrl = baseUrl + imageUrl; const imageIndex = document.content.indexOf(image); const contentLength = document.content.length; let backText = document.content.substring(imageIndex + image.length, Math.min(imageIndex + image.length + 1000, contentLength)); let frontTextStartIndex = Math.max(imageIndex - 1000, 0); let frontText = document.content.substring(frontTextStartIndex, imageIndex); - altText = await getImageDescription(newImageUrl, backText, frontText); + altText = await getImageDescription(imageUrl, backText, frontText); } - document.content = document.content.replace(image, `![${altText}](${newImageUrl})`); + document.content = document.content.replace(image, `![${altText}](${imageUrl})`); })); })); return documents; } + + replaceImgPathsWithAbsolutePaths = (documents: Document[]): Document[] => { + documents.forEach(document => { + const baseUrl = new URL(document.metadata.sourceURL).origin; + const images = document.content.match(/!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g) || []; + + images.forEach(image => { + let imageUrl = image.match(/\(([^)]+)\)/)[1]; + let altText = image.match(/\[(.*?)\]/)[1]; + + if (!imageUrl.startsWith("data:image")) { + imageUrl = baseUrl + imageUrl; + } + + document.content = document.content.replace(image, `![${altText}](${imageUrl})`); + }); + }); + + return documents; + } } From d23a7ae591fb21c28ec303bf160c6a51bede2635 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 16 Apr 2024 16:34:01 -0300 Subject: [PATCH 2/7] improving relative paths --- apps/api/src/scraper/WebScraper/index.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 8290762..6f368a1 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -74,7 +74,7 @@ export class WebScraperDataProvider { throw new Error("Url is required"); } - if (true) {//!useCaching) { + if (!useCaching) { if (this.mode === "crawl") { const crawler = new WebCrawler({ initialUrl: this.urls[0], From b375ce3e39df3ce0a44bf1778ca389b0fe04bdf2 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Wed, 17 Apr 2024 14:54:54 -0300 Subject: [PATCH 3/7] adding unit tests and bugfixing --- .../WebScraper/__tests__/index.test.ts | 97 +++++++++++++++++++ apps/api/src/scraper/WebScraper/index.ts | 15 ++- 2 files changed, 107 insertions(+), 5 deletions(-) create mode 100644 apps/api/src/scraper/WebScraper/__tests__/index.test.ts diff --git a/apps/api/src/scraper/WebScraper/__tests__/index.test.ts b/apps/api/src/scraper/WebScraper/__tests__/index.test.ts new file mode 100644 index 0000000..e060d16 --- /dev/null +++ b/apps/api/src/scraper/WebScraper/__tests__/index.test.ts @@ -0,0 +1,97 @@ +import { WebScraperDataProvider } from '../index'; + +describe('WebScraperDataProvider', () => { + describe('replaceImgPathsWithAbsolutePaths', () => { + it('should replace image paths with absolute paths', () => { + const webScraperDataProvider = new WebScraperDataProvider(); + const documents = [ + { + metadata: { sourceURL: 'https://example.com/page' }, + content: '![alt text](/image.png)', + }, + { + metadata: { sourceURL: 'https://example.com/another-page' }, + content: '![another alt text](./another-image.png)', + }, + { + metadata: { sourceURL: 'https://example.com/data-image' }, + content: '![data image](data:image/png;base64,...)', + } + ]; + + const expectedDocuments = [ + { + metadata: { sourceURL: 'https://example.com/page' }, + content: '![alt text](https://example.com/image.png)', + }, + { + metadata: { sourceURL: 'https://example.com/another-page' }, + content: '![another alt text](https://example.com/another-image.png)', + }, + { + metadata: { sourceURL: 'https://example.com/data-image' }, + content: '![data image](data:image/png;base64,...)', + } + ]; + + const result = webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents); + expect(result).toEqual(expectedDocuments); + }); + + it('should handle absolute URLs without modification', () => { + const webScraperDataProvider = new WebScraperDataProvider(); + const documents = [ + { + metadata: { sourceURL: 'https://example.com/page' }, + content: '![alt text](https://example.com/image.png)', + }, + { + metadata: { sourceURL: 'https://example.com/another-page' }, + content: '![another alt text](http://anotherexample.com/another-image.png)', + } + ]; + + const expectedDocuments = [ + { + metadata: { sourceURL: 'https://example.com/page' }, + content: '![alt text](https://example.com/image.png)', + }, + { + metadata: { sourceURL: 'https://example.com/another-page' }, + content: '![another alt text](http://anotherexample.com/another-image.png)', + } + ]; + + const result = webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents); + expect(result).toEqual(expectedDocuments); + }); + + it('should not replace non-image content within the documents', () => { + const webScraperDataProvider = new WebScraperDataProvider(); + const documents = [ + { + metadata: { sourceURL: 'https://example.com/page' }, + content: 'This is a test. ![alt text](/image.png) Here is a link: [Example](https://example.com).', + }, + { + metadata: { sourceURL: 'https://example.com/another-page' }, + content: 'Another test. ![another alt text](./another-image.png) Here is some **bold text**.', + } + ]; + + const expectedDocuments = [ + { + metadata: { sourceURL: 'https://example.com/page' }, + content: 'This is a test. ![alt text](https://example.com/image.png) Here is a link: [Example](https://example.com).', + }, + { + metadata: { sourceURL: 'https://example.com/another-page' }, + content: 'Another test. ![another alt text](https://example.com/another-image.png) Here is some **bold text**.', + } + ]; + + const result = webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents); + expect(result).toEqual(expectedDocuments); + }); + }); +}); \ No newline at end of file diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 6f368a1..727b597 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -325,19 +325,24 @@ export class WebScraperDataProvider { documents.forEach(document => { const baseUrl = new URL(document.metadata.sourceURL).origin; const images = document.content.match(/!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g) || []; - + images.forEach(image => { let imageUrl = image.match(/\(([^)]+)\)/)[1]; let altText = image.match(/\[(.*?)\]/)[1]; - + if (!imageUrl.startsWith("data:image")) { - imageUrl = baseUrl + imageUrl; + if (!imageUrl.startsWith("http")) { + if (imageUrl.startsWith("/")) { + imageUrl = imageUrl.substring(1); + } + imageUrl = new URL(imageUrl, baseUrl).toString(); + } } - + document.content = document.content.replace(image, `![${altText}](${imageUrl})`); }); }); - + return documents; } } From 2eb81545fa50e8aee61c855bc00fe3f1625c41e2 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 17 Apr 2024 11:04:03 -0700 Subject: [PATCH 4/7] Update index.test.ts --- .../WebScraper/__tests__/index.test.ts | 166 +++++++++++++----- 1 file changed, 120 insertions(+), 46 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/__tests__/index.test.ts b/apps/api/src/scraper/WebScraper/__tests__/index.test.ts index e060d16..49b3926 100644 --- a/apps/api/src/scraper/WebScraper/__tests__/index.test.ts +++ b/apps/api/src/scraper/WebScraper/__tests__/index.test.ts @@ -1,97 +1,171 @@ -import { WebScraperDataProvider } from '../index'; +import { WebScraperDataProvider } from "../index"; -describe('WebScraperDataProvider', () => { - describe('replaceImgPathsWithAbsolutePaths', () => { - it('should replace image paths with absolute paths', () => { +describe("WebScraperDataProvider", () => { + describe("replaceImgPathsWithAbsolutePaths", () => { + it("should replace image paths with absolute paths", () => { const webScraperDataProvider = new WebScraperDataProvider(); const documents = [ { - metadata: { sourceURL: 'https://example.com/page' }, - content: '![alt text](/image.png)', + metadata: { sourceURL: "https://example.com/page" }, + content: "![alt text](/image.png)", }, { - metadata: { sourceURL: 'https://example.com/another-page' }, - content: '![another alt text](./another-image.png)', + metadata: { sourceURL: "https://example.com/another-page" }, + content: "![another alt text](./another-image.png)", }, { - metadata: { sourceURL: 'https://example.com/data-image' }, - content: '![data image](data:image/png;base64,...)', - } + metadata: { sourceURL: "https://example.com/data-image" }, + content: "![data image](data:image/png;base64,...)", + }, ]; const expectedDocuments = [ { - metadata: { sourceURL: 'https://example.com/page' }, - content: '![alt text](https://example.com/image.png)', + metadata: { sourceURL: "https://example.com/page" }, + content: "![alt text](https://example.com/image.png)", }, { - metadata: { sourceURL: 'https://example.com/another-page' }, - content: '![another alt text](https://example.com/another-image.png)', + metadata: { sourceURL: "https://example.com/another-page" }, + content: "![another alt text](https://example.com/another-image.png)", }, { - metadata: { sourceURL: 'https://example.com/data-image' }, - content: '![data image](data:image/png;base64,...)', - } + metadata: { sourceURL: "https://example.com/data-image" }, + content: "![data image](data:image/png;base64,...)", + }, ]; - const result = webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents); + const result = + webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents); expect(result).toEqual(expectedDocuments); }); - it('should handle absolute URLs without modification', () => { + it("should handle absolute URLs without modification", () => { const webScraperDataProvider = new WebScraperDataProvider(); const documents = [ { - metadata: { sourceURL: 'https://example.com/page' }, - content: '![alt text](https://example.com/image.png)', + metadata: { sourceURL: "https://example.com/page" }, + content: "![alt text](https://example.com/image.png)", }, { - metadata: { sourceURL: 'https://example.com/another-page' }, - content: '![another alt text](http://anotherexample.com/another-image.png)', - } + metadata: { sourceURL: "https://example.com/another-page" }, + content: + "![another alt text](http://anotherexample.com/another-image.png)", + }, ]; const expectedDocuments = [ { - metadata: { sourceURL: 'https://example.com/page' }, - content: '![alt text](https://example.com/image.png)', + metadata: { sourceURL: "https://example.com/page" }, + content: "![alt text](https://example.com/image.png)", }, { - metadata: { sourceURL: 'https://example.com/another-page' }, - content: '![another alt text](http://anotherexample.com/another-image.png)', - } + metadata: { sourceURL: "https://example.com/another-page" }, + content: + "![another alt text](http://anotherexample.com/another-image.png)", + }, ]; - const result = webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents); + const result = + webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents); expect(result).toEqual(expectedDocuments); }); - it('should not replace non-image content within the documents', () => { + it("should not replace non-image content within the documents", () => { const webScraperDataProvider = new WebScraperDataProvider(); const documents = [ { - metadata: { sourceURL: 'https://example.com/page' }, - content: 'This is a test. ![alt text](/image.png) Here is a link: [Example](https://example.com).', + metadata: { sourceURL: "https://example.com/page" }, + content: + "This is a test. ![alt text](/image.png) Here is a link: [Example](https://example.com).", }, { - metadata: { sourceURL: 'https://example.com/another-page' }, - content: 'Another test. ![another alt text](./another-image.png) Here is some **bold text**.', - } + metadata: { sourceURL: "https://example.com/another-page" }, + content: + "Another test. ![another alt text](./another-image.png) Here is some **bold text**.", + }, ]; - + const expectedDocuments = [ { - metadata: { sourceURL: 'https://example.com/page' }, - content: 'This is a test. ![alt text](https://example.com/image.png) Here is a link: [Example](https://example.com).', + metadata: { sourceURL: "https://example.com/page" }, + content: + "This is a test. ![alt text](https://example.com/image.png) Here is a link: [Example](https://example.com).", }, { - metadata: { sourceURL: 'https://example.com/another-page' }, - content: 'Another test. ![another alt text](https://example.com/another-image.png) Here is some **bold text**.', - } + metadata: { sourceURL: "https://example.com/another-page" }, + content: + "Another test. ![another alt text](https://example.com/another-image.png) Here is some **bold text**.", + }, ]; - - const result = webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents); + + const result = + webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents); + expect(result).toEqual(expectedDocuments); + }); + it("should replace multiple image paths within the documents", () => { + const webScraperDataProvider = new WebScraperDataProvider(); + const documents = [ + { + metadata: { sourceURL: "https://example.com/page" }, + content: + "This is a test. ![alt text](/image1.png) Here is a link: [Example](https://example.com). ![alt text](/image2.png)", + }, + { + metadata: { sourceURL: "https://example.com/another-page" }, + content: + "Another test. ![another alt text](./another-image1.png) Here is some **bold text**. ![another alt text](./another-image2.png)", + }, + ]; + + const expectedDocuments = [ + { + metadata: { sourceURL: "https://example.com/page" }, + content: + "This is a test. ![alt text](https://example.com/image1.png) Here is a link: [Example](https://example.com). ![alt text](https://example.com/image2.png)", + }, + { + metadata: { sourceURL: "https://example.com/another-page" }, + content: + "Another test. ![another alt text](https://example.com/another-image1.png) Here is some **bold text**. ![another alt text](https://example.com/another-image2.png)", + }, + ]; + + const result = + webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents); + expect(result).toEqual(expectedDocuments); + }); + + it("should replace image paths within the documents with complex URLs", () => { + const webScraperDataProvider = new WebScraperDataProvider(); + const documents = [ + { + metadata: { sourceURL: "https://example.com/page/subpage" }, + content: + "This is a test. ![alt text](/image1.png) Here is a link: [Example](https://example.com). ![alt text](/sub/image2.png)", + }, + { + metadata: { sourceURL: "https://example.com/another-page/subpage" }, + content: + "Another test. ![another alt text](/another-page/another-image1.png) Here is some **bold text**. ![another alt text](/another-page/sub/another-image2.png)", + }, + ]; + + const expectedDocuments = [ + { + metadata: { sourceURL: "https://example.com/page/subpage" }, + content: + "This is a test. ![alt text](https://example.com/image1.png) Here is a link: [Example](https://example.com). ![alt text](https://example.com/sub/image2.png)", + }, + { + metadata: { sourceURL: "https://example.com/another-page/subpage" }, + content: + "Another test. ![another alt text](https://example.com/another-page/another-image1.png) Here is some **bold text**. ![another alt text](https://example.com/another-page/sub/another-image2.png)", + }, + ]; + + const result = + webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents); expect(result).toEqual(expectedDocuments); }); }); -}); \ No newline at end of file +}); From 871d5d91b0fa0d477425bbdc512edcbaa4a33f56 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 17 Apr 2024 12:51:12 -0700 Subject: [PATCH 5/7] Update index.ts --- apps/api/src/scraper/WebScraper/index.ts | 284 +++++++++++++++-------- 1 file changed, 190 insertions(+), 94 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 727b597..ecb2fff 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -49,19 +49,21 @@ export class WebScraperDataProvider { const results: (Document | null)[] = new Array(urls.length).fill(null); for (let i = 0; i < urls.length; i += this.concurrentRequests) { const batchUrls = urls.slice(i, i + this.concurrentRequests); - await Promise.all(batchUrls.map(async (url, index) => { - const result = await scrapSingleUrl(url, true); - processedUrls++; - if (inProgress) { - inProgress({ - current: processedUrls, - total: totalUrls, - status: "SCRAPING", - currentDocumentUrl: url, - }); - } - results[i + index] = result; - })); + await Promise.all( + batchUrls.map(async (url, index) => { + const result = await scrapSingleUrl(url, true); + processedUrls++; + if (inProgress) { + inProgress({ + current: processedUrls, + total: totalUrls, + status: "SCRAPING", + currentDocumentUrl: url, + }); + } + results[i + index] = result; + }) + ); } return results.filter((result) => result !== null) as Document[]; } @@ -102,33 +104,58 @@ export class WebScraperDataProvider { // CACHING DOCUMENTS // - parent document - const cachedParentDocumentString = await getValue('web-scraper-cache:' + this.normalizeUrl(this.urls[0])); + const cachedParentDocumentString = await getValue( + "web-scraper-cache:" + this.normalizeUrl(this.urls[0]) + ); if (cachedParentDocumentString != null) { let cachedParentDocument = JSON.parse(cachedParentDocumentString); - if (!cachedParentDocument.childrenLinks || cachedParentDocument.childrenLinks.length < links.length - 1) { - cachedParentDocument.childrenLinks = links.filter((link) => link !== this.urls[0]); - await setValue('web-scraper-cache:' + this.normalizeUrl(this.urls[0]), JSON.stringify(cachedParentDocument), 60 * 60 * 24 * 10); // 10 days + if ( + !cachedParentDocument.childrenLinks || + cachedParentDocument.childrenLinks.length < links.length - 1 + ) { + cachedParentDocument.childrenLinks = links.filter( + (link) => link !== this.urls[0] + ); + await setValue( + "web-scraper-cache:" + this.normalizeUrl(this.urls[0]), + JSON.stringify(cachedParentDocument), + 60 * 60 * 24 * 10 + ); // 10 days } } else { - let parentDocument = documents.filter((document) => this.normalizeUrl(document.metadata.sourceURL) === this.normalizeUrl(this.urls[0])) + let parentDocument = documents.filter( + (document) => + this.normalizeUrl(document.metadata.sourceURL) === + this.normalizeUrl(this.urls[0]) + ); await this.setCachedDocuments(parentDocument, links); } - await this.setCachedDocuments(documents.filter((document) => this.normalizeUrl(document.metadata.sourceURL) !== this.normalizeUrl(this.urls[0])), []); + await this.setCachedDocuments( + documents.filter( + (document) => + this.normalizeUrl(document.metadata.sourceURL) !== + this.normalizeUrl(this.urls[0]) + ), + [] + ); documents = this.removeChildLinks(documents); documents = documents.splice(0, this.limit); return documents; } if (this.mode === "single_urls") { - let documents = await this.convertUrlsToDocuments(this.urls, inProgress); + let documents = await this.convertUrlsToDocuments( + this.urls, + inProgress + ); documents = this.replaceImgPathsWithAbsolutePaths(documents); if (this.generateImgAltText) { documents = await this.generatesImgAltText(documents); } const baseUrl = new URL(this.urls[0]).origin; documents = await this.getSitemapData(baseUrl, documents); - + await this.setCachedDocuments(documents); documents = this.removeChildLinks(documents); documents = documents.splice(0, this.limit); @@ -136,14 +163,17 @@ export class WebScraperDataProvider { } if (this.mode === "sitemap") { const links = await getLinksFromSitemap(this.urls[0]); - let documents = await this.convertUrlsToDocuments(links.slice(0, this.limit), inProgress); + let documents = await this.convertUrlsToDocuments( + links.slice(0, this.limit), + inProgress + ); documents = await this.getSitemapData(this.urls[0], documents); documents = this.replaceImgPathsWithAbsolutePaths(documents); if (this.generateImgAltText) { documents = await this.generatesImgAltText(documents); } - + await this.setCachedDocuments(documents); documents = this.removeChildLinks(documents); documents = documents.splice(0, this.limit); @@ -153,11 +183,22 @@ export class WebScraperDataProvider { return []; } - let documents = await this.getCachedDocuments(this.urls.slice(0, this.limit)); + let documents = await this.getCachedDocuments( + this.urls.slice(0, this.limit) + ); if (documents.length < this.limit) { - const newDocuments: Document[] = await this.getDocuments(false, inProgress); - newDocuments.forEach(doc => { - if (!documents.some(d => this.normalizeUrl(d.metadata.sourceURL) === this.normalizeUrl(doc.metadata?.sourceURL))) { + const newDocuments: Document[] = await this.getDocuments( + false, + inProgress + ); + newDocuments.forEach((doc) => { + if ( + !documents.some( + (d) => + this.normalizeUrl(d.metadata.sourceURL) === + this.normalizeUrl(doc.metadata?.sourceURL) + ) + ) { documents.push(doc); } }); @@ -173,17 +214,23 @@ export class WebScraperDataProvider { const url = new URL(document.metadata.sourceURL); const path = url.pathname; - if (this.excludes.length > 0 && this.excludes[0] !== '') { + if (this.excludes.length > 0 && this.excludes[0] !== "") { // Check if the link should be excluded - if (this.excludes.some(excludePattern => new RegExp(excludePattern).test(path))) { + if ( + this.excludes.some((excludePattern) => + new RegExp(excludePattern).test(path) + ) + ) { return false; } } - - if (this.includes.length > 0 && this.includes[0] !== '') { + + if (this.includes.length > 0 && this.includes[0] !== "") { // Check if the link matches the include patterns, if any are specified if (this.includes.length > 0) { - return this.includes.some(includePattern => new RegExp(includePattern).test(path)); + return this.includes.some((includePattern) => + new RegExp(includePattern).test(path) + ); } } return true; @@ -200,7 +247,7 @@ export class WebScraperDataProvider { private removeChildLinks(documents: Document[]): Document[] { for (let document of documents) { if (document?.childrenLinks) delete document.childrenLinks; - }; + } return documents; } @@ -210,10 +257,14 @@ export class WebScraperDataProvider { continue; } const normalizedUrl = this.normalizeUrl(document.metadata.sourceURL); - await setValue('web-scraper-cache:' + normalizedUrl, JSON.stringify({ - ...document, - childrenLinks: childrenLinks || [] - }), 60 * 60 * 24 * 10); // 10 days + await setValue( + "web-scraper-cache:" + normalizedUrl, + JSON.stringify({ + ...document, + childrenLinks: childrenLinks || [], + }), + 60 * 60 * 24 * 10 + ); // 10 days } } @@ -221,8 +272,12 @@ export class WebScraperDataProvider { let documents: Document[] = []; for (const url of urls) { const normalizedUrl = this.normalizeUrl(url); - console.log("Getting cached document for web-scraper-cache:" + normalizedUrl) - const cachedDocumentString = await getValue('web-scraper-cache:' + normalizedUrl); + console.log( + "Getting cached document for web-scraper-cache:" + normalizedUrl + ); + const cachedDocumentString = await getValue( + "web-scraper-cache:" + normalizedUrl + ); if (cachedDocumentString) { const cachedDocument = JSON.parse(cachedDocumentString); documents.push(cachedDocument); @@ -230,10 +285,18 @@ export class WebScraperDataProvider { // get children documents for (const childUrl of cachedDocument.childrenLinks) { const normalizedChildUrl = this.normalizeUrl(childUrl); - const childCachedDocumentString = await getValue('web-scraper-cache:' + normalizedChildUrl); + const childCachedDocumentString = await getValue( + "web-scraper-cache:" + normalizedChildUrl + ); if (childCachedDocumentString) { const childCachedDocument = JSON.parse(childCachedDocumentString); - if (!documents.find((doc) => doc.metadata.sourceURL === childCachedDocument.metadata.sourceURL)) { + if ( + !documents.find( + (doc) => + doc.metadata.sourceURL === + childCachedDocument.metadata.sourceURL + ) + ) { documents.push(childCachedDocument); } } @@ -248,7 +311,7 @@ export class WebScraperDataProvider { throw new Error("Urls are required"); } - console.log("options", options.crawlerOptions?.excludes) + console.log("options", options.crawlerOptions?.excludes); this.urls = options.urls; this.mode = options.mode; this.concurrentRequests = options.concurrentRequests ?? 20; @@ -257,13 +320,12 @@ export class WebScraperDataProvider { this.maxCrawledLinks = options.crawlerOptions?.maxCrawledLinks ?? 1000; this.returnOnlyUrls = options.crawlerOptions?.returnOnlyUrls ?? false; this.limit = options.crawlerOptions?.limit ?? 10000; - this.generateImgAltText = options.crawlerOptions?.generateImgAltText ?? false; - + this.generateImgAltText = + options.crawlerOptions?.generateImgAltText ?? false; //! @nicolas, for some reason this was being injected and breakign everything. Don't have time to find source of the issue so adding this check - this.excludes = this.excludes.filter(item => item !== ''); - - + this.excludes = this.excludes.filter((item) => item !== ""); + // make sure all urls start with https:// this.urls = this.urls.map((url) => { if (!url.trim().startsWith("http")) { @@ -274,10 +336,14 @@ export class WebScraperDataProvider { } private async getSitemapData(baseUrl: string, documents: Document[]) { - const sitemapData = await fetchSitemapData(baseUrl) + const sitemapData = await fetchSitemapData(baseUrl); if (sitemapData) { for (let i = 0; i < documents.length; i++) { - const docInSitemapData = sitemapData.find((data) => this.normalizeUrl(data.loc) === this.normalizeUrl(documents[i].metadata.sourceURL)) + const docInSitemapData = sitemapData.find( + (data) => + this.normalizeUrl(data.loc) === + this.normalizeUrl(documents[i].metadata.sourceURL) + ); if (docInSitemapData) { let sitemapDocData: Partial = {}; if (docInSitemapData.changefreq) { @@ -298,52 +364,82 @@ export class WebScraperDataProvider { return documents; } generatesImgAltText = async (documents: Document[]): Promise => { - await Promise.all(documents.map(async (document) => { - const images = document.content.match(/!\[.*?\]\((.*?)\)/g) || []; + await Promise.all( + documents.map(async (document) => { + const images = document.content.match(/!\[.*?\]\((.*?)\)/g) || []; - await Promise.all(images.map(async (image: string) => { - let imageUrl = image.match(/\(([^)]+)\)/)[1]; - let altText = image.match(/\[(.*?)\]/)[1]; + await Promise.all( + images.map(async (image: string) => { + let imageUrl = image.match(/\(([^)]+)\)/)[1]; + let altText = image.match(/\[(.*?)\]/)[1]; - if (!altText && !imageUrl.startsWith("data:image") && /\.(png|jpeg|gif|webp)$/.test(imageUrl)) { - const imageIndex = document.content.indexOf(image); - const contentLength = document.content.length; - let backText = document.content.substring(imageIndex + image.length, Math.min(imageIndex + image.length + 1000, contentLength)); - let frontTextStartIndex = Math.max(imageIndex - 1000, 0); - let frontText = document.content.substring(frontTextStartIndex, imageIndex); - altText = await getImageDescription(imageUrl, backText, frontText); - } - - document.content = document.content.replace(image, `![${altText}](${imageUrl})`); - })); - })); - - return documents; - } - - replaceImgPathsWithAbsolutePaths = (documents: Document[]): Document[] => { - documents.forEach(document => { - const baseUrl = new URL(document.metadata.sourceURL).origin; - const images = document.content.match(/!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g) || []; - - images.forEach(image => { - let imageUrl = image.match(/\(([^)]+)\)/)[1]; - let altText = image.match(/\[(.*?)\]/)[1]; - - if (!imageUrl.startsWith("data:image")) { - if (!imageUrl.startsWith("http")) { - if (imageUrl.startsWith("/")) { - imageUrl = imageUrl.substring(1); + if ( + !altText && + !imageUrl.startsWith("data:image") && + /\.(png|jpeg|gif|webp)$/.test(imageUrl) + ) { + const imageIndex = document.content.indexOf(image); + const contentLength = document.content.length; + let backText = document.content.substring( + imageIndex + image.length, + Math.min(imageIndex + image.length + 1000, contentLength) + ); + let frontTextStartIndex = Math.max(imageIndex - 1000, 0); + let frontText = document.content.substring( + frontTextStartIndex, + imageIndex + ); + altText = await getImageDescription( + imageUrl, + backText, + frontText + ); } - imageUrl = new URL(imageUrl, baseUrl).toString(); - } - } - - document.content = document.content.replace(image, `![${altText}](${imageUrl})`); - }); - }); - - return documents; - } -} + document.content = document.content.replace( + image, + `![${altText}](${imageUrl})` + ); + }) + ); + }) + ); + + return documents; + }; + + replaceImgPathsWithAbsolutePaths = (documents: Document[]): Document[] => { + try { + documents.forEach((document) => { + const baseUrl = new URL(document.metadata.sourceURL).origin; + const images = + document.content.match( + /!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g + ) || []; + + images.forEach((image: string) => { + let imageUrl = image.match(/\(([^)]+)\)/)[1]; + let altText = image.match(/\[(.*?)\]/)[1]; + + if (!imageUrl.startsWith("data:image")) { + if (!imageUrl.startsWith("http")) { + if (imageUrl.startsWith("/")) { + imageUrl = imageUrl.substring(1); + } + imageUrl = new URL(imageUrl, baseUrl).toString(); + } + } + + document.content = document.content.replace( + image, + `![${altText}](${imageUrl})` + ); + }); + }); + + return documents; + } catch (error) { + return documents; + } + }; +} From de439f6529111b2f839dc8a8ef126310e1a0b31d Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 17 Apr 2024 12:51:29 -0700 Subject: [PATCH 6/7] Update index.ts --- apps/api/src/scraper/WebScraper/index.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index ecb2fff..501dde0 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -439,6 +439,7 @@ export class WebScraperDataProvider { return documents; } catch (error) { + console.error("Error replacing img paths with absolute paths", error); return documents; } }; From 52fb28bc1a943d6489f85fb93061f8c01bf6c0f1 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 17 Apr 2024 12:52:15 -0700 Subject: [PATCH 7/7] Update index.ts --- apps/api/src/scraper/WebScraper/index.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 501dde0..e1bd425 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -311,7 +311,6 @@ export class WebScraperDataProvider { throw new Error("Urls are required"); } - console.log("options", options.crawlerOptions?.excludes); this.urls = options.urls; this.mode = options.mode; this.concurrentRequests = options.concurrentRequests ?? 20;