0

Merge pull request #4 from mendableai/feat/improving-reative-paths

[Feat] improving relative paths
This commit is contained in:
Nicolas 2024-04-17 15:52:24 -04:00 committed by GitHub
commit d48027675c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 366 additions and 71 deletions

View File

@ -91,6 +91,7 @@ app.post("/v0/scrape", async (req, res) => {
if (!success) { if (!success) {
return res.status(status).json({ error }); return res.status(status).json({ error });
} }
const crawlerOptions = req.body.crawlerOptions ?? {};
try { try {
const { success: creditsCheckSuccess, message: creditsCheckMessage } = const { success: creditsCheckSuccess, message: creditsCheckMessage } =
@ -114,6 +115,9 @@ app.post("/v0/scrape", async (req, res) => {
await a.setOptions({ await a.setOptions({
mode: "single_urls", mode: "single_urls",
urls: [url], urls: [url],
crawlerOptions: {
...crawlerOptions,
},
}); });
const docs = await a.getDocuments(false); const docs = await a.getDocuments(false);

View File

@ -0,0 +1,171 @@
import { WebScraperDataProvider } from "../index";
// Table-driven suite for WebScraperDataProvider.replaceImgPathsWithAbsolutePaths.
// Every scenario lists its pages as [sourceURL, markdown before, markdown after];
// the input and expected documents are both derived from those rows.
describe("WebScraperDataProvider", () => {
  describe("replaceImgPathsWithAbsolutePaths", () => {
    type PageRow = [sourceURL: string, before: string, after: string];

    const scenarios: { title: string; pages: PageRow[] }[] = [
      {
        // Root-relative and "./" paths become absolute; data URIs are left alone.
        title: "should replace image paths with absolute paths",
        pages: [
          [
            "https://example.com/page",
            "![alt text](/image.png)",
            "![alt text](https://example.com/image.png)",
          ],
          [
            "https://example.com/another-page",
            "![another alt text](./another-image.png)",
            "![another alt text](https://example.com/another-image.png)",
          ],
          [
            "https://example.com/data-image",
            "![data image](data:image/png;base64,...)",
            "![data image](data:image/png;base64,...)",
          ],
        ],
      },
      {
        // Already-absolute http(s) URLs must come back unchanged.
        title: "should handle absolute URLs without modification",
        pages: [
          [
            "https://example.com/page",
            "![alt text](https://example.com/image.png)",
            "![alt text](https://example.com/image.png)",
          ],
          [
            "https://example.com/another-page",
            "![another alt text](http://anotherexample.com/another-image.png)",
            "![another alt text](http://anotherexample.com/another-image.png)",
          ],
        ],
      },
      {
        // Plain links and other markdown around the image must not be touched.
        title: "should not replace non-image content within the documents",
        pages: [
          [
            "https://example.com/page",
            "This is a test. ![alt text](/image.png) Here is a link: [Example](https://example.com).",
            "This is a test. ![alt text](https://example.com/image.png) Here is a link: [Example](https://example.com).",
          ],
          [
            "https://example.com/another-page",
            "Another test. ![another alt text](./another-image.png) Here is some **bold text**.",
            "Another test. ![another alt text](https://example.com/another-image.png) Here is some **bold text**.",
          ],
        ],
      },
      {
        // Several images in one document are all rewritten.
        title: "should replace multiple image paths within the documents",
        pages: [
          [
            "https://example.com/page",
            "This is a test. ![alt text](/image1.png) Here is a link: [Example](https://example.com). ![alt text](/image2.png)",
            "This is a test. ![alt text](https://example.com/image1.png) Here is a link: [Example](https://example.com). ![alt text](https://example.com/image2.png)",
          ],
          [
            "https://example.com/another-page",
            "Another test. ![another alt text](./another-image1.png) Here is some **bold text**. ![another alt text](./another-image2.png)",
            "Another test. ![another alt text](https://example.com/another-image1.png) Here is some **bold text**. ![another alt text](https://example.com/another-image2.png)",
          ],
        ],
      },
      {
        // Source pages that live in sub-paths, with root-relative image paths.
        title:
          "should replace image paths within the documents with complex URLs",
        pages: [
          [
            "https://example.com/page/subpage",
            "This is a test. ![alt text](/image1.png) Here is a link: [Example](https://example.com). ![alt text](/sub/image2.png)",
            "This is a test. ![alt text](https://example.com/image1.png) Here is a link: [Example](https://example.com). ![alt text](https://example.com/sub/image2.png)",
          ],
          [
            "https://example.com/another-page/subpage",
            "Another test. ![another alt text](/another-page/another-image1.png) Here is some **bold text**. ![another alt text](/another-page/sub/another-image2.png)",
            "Another test. ![another alt text](https://example.com/another-page/another-image1.png) Here is some **bold text**. ![another alt text](https://example.com/another-page/sub/another-image2.png)",
          ],
        ],
      },
    ];

    // Builds the minimal document shape the method reads: a source URL and markdown content.
    const toDocument = (sourceURL: string, content: string) => ({
      metadata: { sourceURL },
      content,
    });

    for (const { title, pages } of scenarios) {
      it(title, () => {
        // A fresh provider per test, as in the hand-written version of this suite.
        const provider = new WebScraperDataProvider();
        const input = pages.map(([sourceURL, before]) =>
          toDocument(sourceURL, before)
        );
        const expected = pages.map(([sourceURL, , after]) =>
          toDocument(sourceURL, after)
        );

        expect(provider.replaceImgPathsWithAbsolutePaths(input)).toEqual(
          expected
        );
      });
    }
  });
});

View File

@ -49,19 +49,21 @@ export class WebScraperDataProvider {
const results: (Document | null)[] = new Array(urls.length).fill(null); const results: (Document | null)[] = new Array(urls.length).fill(null);
for (let i = 0; i < urls.length; i += this.concurrentRequests) { for (let i = 0; i < urls.length; i += this.concurrentRequests) {
const batchUrls = urls.slice(i, i + this.concurrentRequests); const batchUrls = urls.slice(i, i + this.concurrentRequests);
await Promise.all(batchUrls.map(async (url, index) => { await Promise.all(
const result = await scrapSingleUrl(url, true); batchUrls.map(async (url, index) => {
processedUrls++; const result = await scrapSingleUrl(url, true);
if (inProgress) { processedUrls++;
inProgress({ if (inProgress) {
current: processedUrls, inProgress({
total: totalUrls, current: processedUrls,
status: "SCRAPING", total: totalUrls,
currentDocumentUrl: url, status: "SCRAPING",
}); currentDocumentUrl: url,
} });
results[i + index] = result; }
})); results[i + index] = result;
})
);
} }
return results.filter((result) => result !== null) as Document[]; return results.filter((result) => result !== null) as Document[];
} }
@ -95,33 +97,59 @@ export class WebScraperDataProvider {
} }
let documents = await this.convertUrlsToDocuments(links, inProgress); let documents = await this.convertUrlsToDocuments(links, inProgress);
documents = await this.getSitemapData(this.urls[0], documents); documents = await this.getSitemapData(this.urls[0], documents);
console.log("documents", documents) documents = this.replaceImgPathsWithAbsolutePaths(documents);
if (this.generateImgAltText) { if (this.generateImgAltText) {
documents = await this.generatesImgAltText(documents); documents = await this.generatesImgAltText(documents);
} }
// CACHING DOCUMENTS // CACHING DOCUMENTS
// - parent document // - parent document
const cachedParentDocumentString = await getValue('web-scraper-cache:' + this.normalizeUrl(this.urls[0])); const cachedParentDocumentString = await getValue(
"web-scraper-cache:" + this.normalizeUrl(this.urls[0])
);
if (cachedParentDocumentString != null) { if (cachedParentDocumentString != null) {
let cachedParentDocument = JSON.parse(cachedParentDocumentString); let cachedParentDocument = JSON.parse(cachedParentDocumentString);
if (!cachedParentDocument.childrenLinks || cachedParentDocument.childrenLinks.length < links.length - 1) { if (
cachedParentDocument.childrenLinks = links.filter((link) => link !== this.urls[0]); !cachedParentDocument.childrenLinks ||
await setValue('web-scraper-cache:' + this.normalizeUrl(this.urls[0]), JSON.stringify(cachedParentDocument), 60 * 60 * 24 * 10); // 10 days cachedParentDocument.childrenLinks.length < links.length - 1
) {
cachedParentDocument.childrenLinks = links.filter(
(link) => link !== this.urls[0]
);
await setValue(
"web-scraper-cache:" + this.normalizeUrl(this.urls[0]),
JSON.stringify(cachedParentDocument),
60 * 60 * 24 * 10
); // 10 days
} }
} else { } else {
let parentDocument = documents.filter((document) => this.normalizeUrl(document.metadata.sourceURL) === this.normalizeUrl(this.urls[0])) let parentDocument = documents.filter(
(document) =>
this.normalizeUrl(document.metadata.sourceURL) ===
this.normalizeUrl(this.urls[0])
);
await this.setCachedDocuments(parentDocument, links); await this.setCachedDocuments(parentDocument, links);
} }
await this.setCachedDocuments(documents.filter((document) => this.normalizeUrl(document.metadata.sourceURL) !== this.normalizeUrl(this.urls[0])), []); await this.setCachedDocuments(
documents.filter(
(document) =>
this.normalizeUrl(document.metadata.sourceURL) !==
this.normalizeUrl(this.urls[0])
),
[]
);
documents = this.removeChildLinks(documents); documents = this.removeChildLinks(documents);
documents = documents.splice(0, this.limit); documents = documents.splice(0, this.limit);
return documents; return documents;
} }
if (this.mode === "single_urls") { if (this.mode === "single_urls") {
let documents = await this.convertUrlsToDocuments(this.urls, inProgress); let documents = await this.convertUrlsToDocuments(
this.urls,
inProgress
);
documents = this.replaceImgPathsWithAbsolutePaths(documents);
if (this.generateImgAltText) { if (this.generateImgAltText) {
documents = await this.generatesImgAltText(documents); documents = await this.generatesImgAltText(documents);
} }
@ -135,9 +163,13 @@ export class WebScraperDataProvider {
} }
if (this.mode === "sitemap") { if (this.mode === "sitemap") {
const links = await getLinksFromSitemap(this.urls[0]); const links = await getLinksFromSitemap(this.urls[0]);
let documents = await this.convertUrlsToDocuments(links.slice(0, this.limit), inProgress); let documents = await this.convertUrlsToDocuments(
links.slice(0, this.limit),
inProgress
);
documents = await this.getSitemapData(this.urls[0], documents); documents = await this.getSitemapData(this.urls[0], documents);
documents = this.replaceImgPathsWithAbsolutePaths(documents);
if (this.generateImgAltText) { if (this.generateImgAltText) {
documents = await this.generatesImgAltText(documents); documents = await this.generatesImgAltText(documents);
} }
@ -151,11 +183,22 @@ export class WebScraperDataProvider {
return []; return [];
} }
let documents = await this.getCachedDocuments(this.urls.slice(0, this.limit)); let documents = await this.getCachedDocuments(
this.urls.slice(0, this.limit)
);
if (documents.length < this.limit) { if (documents.length < this.limit) {
const newDocuments: Document[] = await this.getDocuments(false, inProgress); const newDocuments: Document[] = await this.getDocuments(
newDocuments.forEach(doc => { false,
if (!documents.some(d => this.normalizeUrl(d.metadata.sourceURL) === this.normalizeUrl(doc.metadata?.sourceURL))) { inProgress
);
newDocuments.forEach((doc) => {
if (
!documents.some(
(d) =>
this.normalizeUrl(d.metadata.sourceURL) ===
this.normalizeUrl(doc.metadata?.sourceURL)
)
) {
documents.push(doc); documents.push(doc);
} }
}); });
@ -171,17 +214,23 @@ export class WebScraperDataProvider {
const url = new URL(document.metadata.sourceURL); const url = new URL(document.metadata.sourceURL);
const path = url.pathname; const path = url.pathname;
if (this.excludes.length > 0 && this.excludes[0] !== '') { if (this.excludes.length > 0 && this.excludes[0] !== "") {
// Check if the link should be excluded // Check if the link should be excluded
if (this.excludes.some(excludePattern => new RegExp(excludePattern).test(path))) { if (
this.excludes.some((excludePattern) =>
new RegExp(excludePattern).test(path)
)
) {
return false; return false;
} }
} }
if (this.includes.length > 0 && this.includes[0] !== '') { if (this.includes.length > 0 && this.includes[0] !== "") {
// Check if the link matches the include patterns, if any are specified // Check if the link matches the include patterns, if any are specified
if (this.includes.length > 0) { if (this.includes.length > 0) {
return this.includes.some(includePattern => new RegExp(includePattern).test(path)); return this.includes.some((includePattern) =>
new RegExp(includePattern).test(path)
);
} }
} }
return true; return true;
@ -198,7 +247,7 @@ export class WebScraperDataProvider {
private removeChildLinks(documents: Document[]): Document[] { private removeChildLinks(documents: Document[]): Document[] {
for (let document of documents) { for (let document of documents) {
if (document?.childrenLinks) delete document.childrenLinks; if (document?.childrenLinks) delete document.childrenLinks;
}; }
return documents; return documents;
} }
@ -208,10 +257,14 @@ export class WebScraperDataProvider {
continue; continue;
} }
const normalizedUrl = this.normalizeUrl(document.metadata.sourceURL); const normalizedUrl = this.normalizeUrl(document.metadata.sourceURL);
await setValue('web-scraper-cache:' + normalizedUrl, JSON.stringify({ await setValue(
...document, "web-scraper-cache:" + normalizedUrl,
childrenLinks: childrenLinks || [] JSON.stringify({
}), 60 * 60 * 24 * 10); // 10 days ...document,
childrenLinks: childrenLinks || [],
}),
60 * 60 * 24 * 10
); // 10 days
} }
} }
@ -219,8 +272,12 @@ export class WebScraperDataProvider {
let documents: Document[] = []; let documents: Document[] = [];
for (const url of urls) { for (const url of urls) {
const normalizedUrl = this.normalizeUrl(url); const normalizedUrl = this.normalizeUrl(url);
console.log("Getting cached document for web-scraper-cache:" + normalizedUrl) console.log(
const cachedDocumentString = await getValue('web-scraper-cache:' + normalizedUrl); "Getting cached document for web-scraper-cache:" + normalizedUrl
);
const cachedDocumentString = await getValue(
"web-scraper-cache:" + normalizedUrl
);
if (cachedDocumentString) { if (cachedDocumentString) {
const cachedDocument = JSON.parse(cachedDocumentString); const cachedDocument = JSON.parse(cachedDocumentString);
documents.push(cachedDocument); documents.push(cachedDocument);
@ -228,10 +285,18 @@ export class WebScraperDataProvider {
// get children documents // get children documents
for (const childUrl of cachedDocument.childrenLinks) { for (const childUrl of cachedDocument.childrenLinks) {
const normalizedChildUrl = this.normalizeUrl(childUrl); const normalizedChildUrl = this.normalizeUrl(childUrl);
const childCachedDocumentString = await getValue('web-scraper-cache:' + normalizedChildUrl); const childCachedDocumentString = await getValue(
"web-scraper-cache:" + normalizedChildUrl
);
if (childCachedDocumentString) { if (childCachedDocumentString) {
const childCachedDocument = JSON.parse(childCachedDocumentString); const childCachedDocument = JSON.parse(childCachedDocumentString);
if (!documents.find((doc) => doc.metadata.sourceURL === childCachedDocument.metadata.sourceURL)) { if (
!documents.find(
(doc) =>
doc.metadata.sourceURL ===
childCachedDocument.metadata.sourceURL
)
) {
documents.push(childCachedDocument); documents.push(childCachedDocument);
} }
} }
@ -246,7 +311,6 @@ export class WebScraperDataProvider {
throw new Error("Urls are required"); throw new Error("Urls are required");
} }
console.log("options", options.crawlerOptions?.excludes)
this.urls = options.urls; this.urls = options.urls;
this.mode = options.mode; this.mode = options.mode;
this.concurrentRequests = options.concurrentRequests ?? 20; this.concurrentRequests = options.concurrentRequests ?? 20;
@ -255,12 +319,11 @@ export class WebScraperDataProvider {
this.maxCrawledLinks = options.crawlerOptions?.maxCrawledLinks ?? 1000; this.maxCrawledLinks = options.crawlerOptions?.maxCrawledLinks ?? 1000;
this.returnOnlyUrls = options.crawlerOptions?.returnOnlyUrls ?? false; this.returnOnlyUrls = options.crawlerOptions?.returnOnlyUrls ?? false;
this.limit = options.crawlerOptions?.limit ?? 10000; this.limit = options.crawlerOptions?.limit ?? 10000;
this.generateImgAltText = options.crawlerOptions?.generateImgAltText ?? false; this.generateImgAltText =
options.crawlerOptions?.generateImgAltText ?? false;
//! @nicolas, for some reason this was being injected and breakign everything. Don't have time to find source of the issue so adding this check //! @nicolas, for some reason this was being injected and breakign everything. Don't have time to find source of the issue so adding this check
this.excludes = this.excludes.filter(item => item !== ''); this.excludes = this.excludes.filter((item) => item !== "");
// make sure all urls start with https:// // make sure all urls start with https://
this.urls = this.urls.map((url) => { this.urls = this.urls.map((url) => {
@ -272,10 +335,14 @@ export class WebScraperDataProvider {
} }
private async getSitemapData(baseUrl: string, documents: Document[]) { private async getSitemapData(baseUrl: string, documents: Document[]) {
const sitemapData = await fetchSitemapData(baseUrl) const sitemapData = await fetchSitemapData(baseUrl);
if (sitemapData) { if (sitemapData) {
for (let i = 0; i < documents.length; i++) { for (let i = 0; i < documents.length; i++) {
const docInSitemapData = sitemapData.find((data) => this.normalizeUrl(data.loc) === this.normalizeUrl(documents[i].metadata.sourceURL)) const docInSitemapData = sitemapData.find(
(data) =>
this.normalizeUrl(data.loc) ===
this.normalizeUrl(documents[i].metadata.sourceURL)
);
if (docInSitemapData) { if (docInSitemapData) {
let sitemapDocData: Partial<SitemapEntry> = {}; let sitemapDocData: Partial<SitemapEntry> = {};
if (docInSitemapData.changefreq) { if (docInSitemapData.changefreq) {
@ -296,30 +363,83 @@ export class WebScraperDataProvider {
return documents; return documents;
} }
generatesImgAltText = async (documents: Document[]): Promise<Document[]> => { generatesImgAltText = async (documents: Document[]): Promise<Document[]> => {
await Promise.all(documents.map(async (document) => { await Promise.all(
const baseUrl = new URL(document.metadata.sourceURL).origin; documents.map(async (document) => {
const images = document.content.match(/!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g) || []; const images = document.content.match(/!\[.*?\]\((.*?)\)/g) || [];
await Promise.all(images.map(async (image) => { await Promise.all(
let imageUrl = image.match(/\(([^)]+)\)/)[1]; images.map(async (image: string) => {
let altText = image.match(/\[(.*?)\]/)[1]; let imageUrl = image.match(/\(([^)]+)\)/)[1];
let newImageUrl = ''; let altText = image.match(/\[(.*?)\]/)[1];
if (!altText && !imageUrl.startsWith("data:image") && /\.(png|jpeg|gif|webp)$/.test(imageUrl)) { if (
newImageUrl = baseUrl + imageUrl; !altText &&
const imageIndex = document.content.indexOf(image); !imageUrl.startsWith("data:image") &&
const contentLength = document.content.length; /\.(png|jpeg|gif|webp)$/.test(imageUrl)
let backText = document.content.substring(imageIndex + image.length, Math.min(imageIndex + image.length + 1000, contentLength)); ) {
let frontTextStartIndex = Math.max(imageIndex - 1000, 0); const imageIndex = document.content.indexOf(image);
let frontText = document.content.substring(frontTextStartIndex, imageIndex); const contentLength = document.content.length;
altText = await getImageDescription(newImageUrl, backText, frontText); let backText = document.content.substring(
} imageIndex + image.length,
Math.min(imageIndex + image.length + 1000, contentLength)
);
let frontTextStartIndex = Math.max(imageIndex - 1000, 0);
let frontText = document.content.substring(
frontTextStartIndex,
imageIndex
);
altText = await getImageDescription(
imageUrl,
backText,
frontText
);
}
document.content = document.content.replace(image, `![${altText}](${newImageUrl})`); document.content = document.content.replace(
})); image,
})); `![${altText}](${imageUrl})`
);
})
);
})
);
return documents; return documents;
} };
}
/**
 * Rewrites the target of every markdown image (`![alt](target)`) found in each
 * document's `content` into an absolute URL, resolved against that document's
 * `metadata.sourceURL` using standard URL resolution.
 *
 * - Targets that already carry a scheme (`http:`, `https:`, `data:`, ...) are
 *   left exactly as written.
 * - Root-relative (`/a.png`), page-relative (`./a.png`, `a.png`) and
 *   protocol-relative (`//cdn.host/a.png`) targets are made absolute.
 * - A trailing quoted markdown title (`![alt](a.png "title")`) is preserved.
 * - Alt text is carried over verbatim, including any brackets or parentheses.
 *
 * Documents are mutated in place. A document whose source URL or content
 * cannot be processed is logged and skipped without affecting the others.
 *
 * @param documents Documents whose `content` holds markdown.
 * @returns The same array instance, with `content` rewritten where applicable.
 */
replaceImgPathsWithAbsolutePaths = (documents: Document[]): Document[] => {
  // Group 1 = alt text, group 2 = link target (tolerates two levels of nested parentheses).
  const markdownImage =
    /!\[(.*?)\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g;
  // Splits `url "title"` / `url 'title'` into the URL (1) and the title suffix (2).
  const titledTarget = /^(\S+)(\s+(?:"[^"]*"|'[^']*'))$/;
  // A leading URI scheme means the target is already absolute (or is a data URI).
  const hasScheme = /^[a-z][a-z0-9+.-]*:/i;

  documents.forEach((document) => {
    // Isolate failures per document so one bad source URL does not stop the rest.
    try {
      const pageUrl = new URL(document.metadata.sourceURL);

      // A replacer function is used on purpose: with a replacement *string*,
      // sequences such as `$&` inside a URL would be expanded by `replace`.
      document.content = document.content.replace(
        markdownImage,
        (image: string, altText: string, rawTarget: string) => {
          const target = rawTarget.trim();
          const titled = titledTarget.exec(target);
          const imageUrl = titled ? titled[1] : target;
          const title = titled ? titled[2] : "";

          if (imageUrl === "" || hasScheme.test(imageUrl)) {
            return image;
          }

          try {
            const absoluteUrl = new URL(imageUrl, pageUrl).toString();
            return `![${altText}](${absoluteUrl}${title})`;
          } catch {
            // Unresolvable target: keep the original markdown untouched.
            return image;
          }
        }
      );
    } catch (error) {
      console.error("Error replacing img paths with absolute paths", error);
    }
  });

  return documents;
};
}