Merge pull request #4 from mendableai/feat/improving-reative-paths
[Feat] improving relative paths
This commit is contained in:
commit
d48027675c
@ -91,6 +91,7 @@ app.post("/v0/scrape", async (req, res) => {
|
|||||||
if (!success) {
|
if (!success) {
|
||||||
return res.status(status).json({ error });
|
return res.status(status).json({ error });
|
||||||
}
|
}
|
||||||
|
const crawlerOptions = req.body.crawlerOptions ?? {};
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const { success: creditsCheckSuccess, message: creditsCheckMessage } =
|
const { success: creditsCheckSuccess, message: creditsCheckMessage } =
|
||||||
@ -114,6 +115,9 @@ app.post("/v0/scrape", async (req, res) => {
|
|||||||
await a.setOptions({
|
await a.setOptions({
|
||||||
mode: "single_urls",
|
mode: "single_urls",
|
||||||
urls: [url],
|
urls: [url],
|
||||||
|
crawlerOptions: {
|
||||||
|
...crawlerOptions,
|
||||||
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
const docs = await a.getDocuments(false);
|
const docs = await a.getDocuments(false);
|
||||||
|
171
apps/api/src/scraper/WebScraper/__tests__/index.test.ts
Normal file
171
apps/api/src/scraper/WebScraper/__tests__/index.test.ts
Normal file
@ -0,0 +1,171 @@
|
|||||||
|
import { WebScraperDataProvider } from "../index";
|
||||||
|
|
||||||
|
describe("WebScraperDataProvider", () => {
|
||||||
|
describe("replaceImgPathsWithAbsolutePaths", () => {
|
||||||
|
it("should replace image paths with absolute paths", () => {
|
||||||
|
const webScraperDataProvider = new WebScraperDataProvider();
|
||||||
|
const documents = [
|
||||||
|
{
|
||||||
|
metadata: { sourceURL: "https://example.com/page" },
|
||||||
|
content: "![alt text](/image.png)",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
metadata: { sourceURL: "https://example.com/another-page" },
|
||||||
|
content: "![another alt text](./another-image.png)",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
metadata: { sourceURL: "https://example.com/data-image" },
|
||||||
|
content: "![data image](data:image/png;base64,...)",
|
||||||
|
},
|
||||||
|
];
|
||||||
|
|
||||||
|
const expectedDocuments = [
|
||||||
|
{
|
||||||
|
metadata: { sourceURL: "https://example.com/page" },
|
||||||
|
content: "![alt text](https://example.com/image.png)",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
metadata: { sourceURL: "https://example.com/another-page" },
|
||||||
|
content: "![another alt text](https://example.com/another-image.png)",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
metadata: { sourceURL: "https://example.com/data-image" },
|
||||||
|
content: "![data image](data:image/png;base64,...)",
|
||||||
|
},
|
||||||
|
];
|
||||||
|
|
||||||
|
const result =
|
||||||
|
webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
|
||||||
|
expect(result).toEqual(expectedDocuments);
|
||||||
|
});
|
||||||
|
|
||||||
|
it("should handle absolute URLs without modification", () => {
|
||||||
|
const webScraperDataProvider = new WebScraperDataProvider();
|
||||||
|
const documents = [
|
||||||
|
{
|
||||||
|
metadata: { sourceURL: "https://example.com/page" },
|
||||||
|
content: "![alt text](https://example.com/image.png)",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
metadata: { sourceURL: "https://example.com/another-page" },
|
||||||
|
content:
|
||||||
|
"![another alt text](http://anotherexample.com/another-image.png)",
|
||||||
|
},
|
||||||
|
];
|
||||||
|
|
||||||
|
const expectedDocuments = [
|
||||||
|
{
|
||||||
|
metadata: { sourceURL: "https://example.com/page" },
|
||||||
|
content: "![alt text](https://example.com/image.png)",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
metadata: { sourceURL: "https://example.com/another-page" },
|
||||||
|
content:
|
||||||
|
"![another alt text](http://anotherexample.com/another-image.png)",
|
||||||
|
},
|
||||||
|
];
|
||||||
|
|
||||||
|
const result =
|
||||||
|
webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
|
||||||
|
expect(result).toEqual(expectedDocuments);
|
||||||
|
});
|
||||||
|
|
||||||
|
it("should not replace non-image content within the documents", () => {
|
||||||
|
const webScraperDataProvider = new WebScraperDataProvider();
|
||||||
|
const documents = [
|
||||||
|
{
|
||||||
|
metadata: { sourceURL: "https://example.com/page" },
|
||||||
|
content:
|
||||||
|
"This is a test. ![alt text](/image.png) Here is a link: [Example](https://example.com).",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
metadata: { sourceURL: "https://example.com/another-page" },
|
||||||
|
content:
|
||||||
|
"Another test. ![another alt text](./another-image.png) Here is some **bold text**.",
|
||||||
|
},
|
||||||
|
];
|
||||||
|
|
||||||
|
const expectedDocuments = [
|
||||||
|
{
|
||||||
|
metadata: { sourceURL: "https://example.com/page" },
|
||||||
|
content:
|
||||||
|
"This is a test. ![alt text](https://example.com/image.png) Here is a link: [Example](https://example.com).",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
metadata: { sourceURL: "https://example.com/another-page" },
|
||||||
|
content:
|
||||||
|
"Another test. ![another alt text](https://example.com/another-image.png) Here is some **bold text**.",
|
||||||
|
},
|
||||||
|
];
|
||||||
|
|
||||||
|
const result =
|
||||||
|
webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
|
||||||
|
expect(result).toEqual(expectedDocuments);
|
||||||
|
});
|
||||||
|
it("should replace multiple image paths within the documents", () => {
|
||||||
|
const webScraperDataProvider = new WebScraperDataProvider();
|
||||||
|
const documents = [
|
||||||
|
{
|
||||||
|
metadata: { sourceURL: "https://example.com/page" },
|
||||||
|
content:
|
||||||
|
"This is a test. ![alt text](/image1.png) Here is a link: [Example](https://example.com). ![alt text](/image2.png)",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
metadata: { sourceURL: "https://example.com/another-page" },
|
||||||
|
content:
|
||||||
|
"Another test. ![another alt text](./another-image1.png) Here is some **bold text**. ![another alt text](./another-image2.png)",
|
||||||
|
},
|
||||||
|
];
|
||||||
|
|
||||||
|
const expectedDocuments = [
|
||||||
|
{
|
||||||
|
metadata: { sourceURL: "https://example.com/page" },
|
||||||
|
content:
|
||||||
|
"This is a test. ![alt text](https://example.com/image1.png) Here is a link: [Example](https://example.com). ![alt text](https://example.com/image2.png)",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
metadata: { sourceURL: "https://example.com/another-page" },
|
||||||
|
content:
|
||||||
|
"Another test. ![another alt text](https://example.com/another-image1.png) Here is some **bold text**. ![another alt text](https://example.com/another-image2.png)",
|
||||||
|
},
|
||||||
|
];
|
||||||
|
|
||||||
|
const result =
|
||||||
|
webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
|
||||||
|
expect(result).toEqual(expectedDocuments);
|
||||||
|
});
|
||||||
|
|
||||||
|
it("should replace image paths within the documents with complex URLs", () => {
|
||||||
|
const webScraperDataProvider = new WebScraperDataProvider();
|
||||||
|
const documents = [
|
||||||
|
{
|
||||||
|
metadata: { sourceURL: "https://example.com/page/subpage" },
|
||||||
|
content:
|
||||||
|
"This is a test. ![alt text](/image1.png) Here is a link: [Example](https://example.com). ![alt text](/sub/image2.png)",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
metadata: { sourceURL: "https://example.com/another-page/subpage" },
|
||||||
|
content:
|
||||||
|
"Another test. ![another alt text](/another-page/another-image1.png) Here is some **bold text**. ![another alt text](/another-page/sub/another-image2.png)",
|
||||||
|
},
|
||||||
|
];
|
||||||
|
|
||||||
|
const expectedDocuments = [
|
||||||
|
{
|
||||||
|
metadata: { sourceURL: "https://example.com/page/subpage" },
|
||||||
|
content:
|
||||||
|
"This is a test. ![alt text](https://example.com/image1.png) Here is a link: [Example](https://example.com). ![alt text](https://example.com/sub/image2.png)",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
metadata: { sourceURL: "https://example.com/another-page/subpage" },
|
||||||
|
content:
|
||||||
|
"Another test. ![another alt text](https://example.com/another-page/another-image1.png) Here is some **bold text**. ![another alt text](https://example.com/another-page/sub/another-image2.png)",
|
||||||
|
},
|
||||||
|
];
|
||||||
|
|
||||||
|
const result =
|
||||||
|
webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
|
||||||
|
expect(result).toEqual(expectedDocuments);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
});
|
@ -49,7 +49,8 @@ export class WebScraperDataProvider {
|
|||||||
const results: (Document | null)[] = new Array(urls.length).fill(null);
|
const results: (Document | null)[] = new Array(urls.length).fill(null);
|
||||||
for (let i = 0; i < urls.length; i += this.concurrentRequests) {
|
for (let i = 0; i < urls.length; i += this.concurrentRequests) {
|
||||||
const batchUrls = urls.slice(i, i + this.concurrentRequests);
|
const batchUrls = urls.slice(i, i + this.concurrentRequests);
|
||||||
await Promise.all(batchUrls.map(async (url, index) => {
|
await Promise.all(
|
||||||
|
batchUrls.map(async (url, index) => {
|
||||||
const result = await scrapSingleUrl(url, true);
|
const result = await scrapSingleUrl(url, true);
|
||||||
processedUrls++;
|
processedUrls++;
|
||||||
if (inProgress) {
|
if (inProgress) {
|
||||||
@ -61,7 +62,8 @@ export class WebScraperDataProvider {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
results[i + index] = result;
|
results[i + index] = result;
|
||||||
}));
|
})
|
||||||
|
);
|
||||||
}
|
}
|
||||||
return results.filter((result) => result !== null) as Document[];
|
return results.filter((result) => result !== null) as Document[];
|
||||||
}
|
}
|
||||||
@ -95,33 +97,59 @@ export class WebScraperDataProvider {
|
|||||||
}
|
}
|
||||||
let documents = await this.convertUrlsToDocuments(links, inProgress);
|
let documents = await this.convertUrlsToDocuments(links, inProgress);
|
||||||
documents = await this.getSitemapData(this.urls[0], documents);
|
documents = await this.getSitemapData(this.urls[0], documents);
|
||||||
console.log("documents", documents)
|
documents = this.replaceImgPathsWithAbsolutePaths(documents);
|
||||||
if (this.generateImgAltText) {
|
if (this.generateImgAltText) {
|
||||||
documents = await this.generatesImgAltText(documents);
|
documents = await this.generatesImgAltText(documents);
|
||||||
}
|
}
|
||||||
|
|
||||||
// CACHING DOCUMENTS
|
// CACHING DOCUMENTS
|
||||||
// - parent document
|
// - parent document
|
||||||
const cachedParentDocumentString = await getValue('web-scraper-cache:' + this.normalizeUrl(this.urls[0]));
|
const cachedParentDocumentString = await getValue(
|
||||||
|
"web-scraper-cache:" + this.normalizeUrl(this.urls[0])
|
||||||
|
);
|
||||||
if (cachedParentDocumentString != null) {
|
if (cachedParentDocumentString != null) {
|
||||||
let cachedParentDocument = JSON.parse(cachedParentDocumentString);
|
let cachedParentDocument = JSON.parse(cachedParentDocumentString);
|
||||||
if (!cachedParentDocument.childrenLinks || cachedParentDocument.childrenLinks.length < links.length - 1) {
|
if (
|
||||||
cachedParentDocument.childrenLinks = links.filter((link) => link !== this.urls[0]);
|
!cachedParentDocument.childrenLinks ||
|
||||||
await setValue('web-scraper-cache:' + this.normalizeUrl(this.urls[0]), JSON.stringify(cachedParentDocument), 60 * 60 * 24 * 10); // 10 days
|
cachedParentDocument.childrenLinks.length < links.length - 1
|
||||||
|
) {
|
||||||
|
cachedParentDocument.childrenLinks = links.filter(
|
||||||
|
(link) => link !== this.urls[0]
|
||||||
|
);
|
||||||
|
await setValue(
|
||||||
|
"web-scraper-cache:" + this.normalizeUrl(this.urls[0]),
|
||||||
|
JSON.stringify(cachedParentDocument),
|
||||||
|
60 * 60 * 24 * 10
|
||||||
|
); // 10 days
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
let parentDocument = documents.filter((document) => this.normalizeUrl(document.metadata.sourceURL) === this.normalizeUrl(this.urls[0]))
|
let parentDocument = documents.filter(
|
||||||
|
(document) =>
|
||||||
|
this.normalizeUrl(document.metadata.sourceURL) ===
|
||||||
|
this.normalizeUrl(this.urls[0])
|
||||||
|
);
|
||||||
await this.setCachedDocuments(parentDocument, links);
|
await this.setCachedDocuments(parentDocument, links);
|
||||||
}
|
}
|
||||||
|
|
||||||
await this.setCachedDocuments(documents.filter((document) => this.normalizeUrl(document.metadata.sourceURL) !== this.normalizeUrl(this.urls[0])), []);
|
await this.setCachedDocuments(
|
||||||
|
documents.filter(
|
||||||
|
(document) =>
|
||||||
|
this.normalizeUrl(document.metadata.sourceURL) !==
|
||||||
|
this.normalizeUrl(this.urls[0])
|
||||||
|
),
|
||||||
|
[]
|
||||||
|
);
|
||||||
documents = this.removeChildLinks(documents);
|
documents = this.removeChildLinks(documents);
|
||||||
documents = documents.splice(0, this.limit);
|
documents = documents.splice(0, this.limit);
|
||||||
return documents;
|
return documents;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (this.mode === "single_urls") {
|
if (this.mode === "single_urls") {
|
||||||
let documents = await this.convertUrlsToDocuments(this.urls, inProgress);
|
let documents = await this.convertUrlsToDocuments(
|
||||||
|
this.urls,
|
||||||
|
inProgress
|
||||||
|
);
|
||||||
|
documents = this.replaceImgPathsWithAbsolutePaths(documents);
|
||||||
if (this.generateImgAltText) {
|
if (this.generateImgAltText) {
|
||||||
documents = await this.generatesImgAltText(documents);
|
documents = await this.generatesImgAltText(documents);
|
||||||
}
|
}
|
||||||
@ -135,9 +163,13 @@ export class WebScraperDataProvider {
|
|||||||
}
|
}
|
||||||
if (this.mode === "sitemap") {
|
if (this.mode === "sitemap") {
|
||||||
const links = await getLinksFromSitemap(this.urls[0]);
|
const links = await getLinksFromSitemap(this.urls[0]);
|
||||||
let documents = await this.convertUrlsToDocuments(links.slice(0, this.limit), inProgress);
|
let documents = await this.convertUrlsToDocuments(
|
||||||
|
links.slice(0, this.limit),
|
||||||
|
inProgress
|
||||||
|
);
|
||||||
|
|
||||||
documents = await this.getSitemapData(this.urls[0], documents);
|
documents = await this.getSitemapData(this.urls[0], documents);
|
||||||
|
documents = this.replaceImgPathsWithAbsolutePaths(documents);
|
||||||
if (this.generateImgAltText) {
|
if (this.generateImgAltText) {
|
||||||
documents = await this.generatesImgAltText(documents);
|
documents = await this.generatesImgAltText(documents);
|
||||||
}
|
}
|
||||||
@ -151,11 +183,22 @@ export class WebScraperDataProvider {
|
|||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
|
|
||||||
let documents = await this.getCachedDocuments(this.urls.slice(0, this.limit));
|
let documents = await this.getCachedDocuments(
|
||||||
|
this.urls.slice(0, this.limit)
|
||||||
|
);
|
||||||
if (documents.length < this.limit) {
|
if (documents.length < this.limit) {
|
||||||
const newDocuments: Document[] = await this.getDocuments(false, inProgress);
|
const newDocuments: Document[] = await this.getDocuments(
|
||||||
newDocuments.forEach(doc => {
|
false,
|
||||||
if (!documents.some(d => this.normalizeUrl(d.metadata.sourceURL) === this.normalizeUrl(doc.metadata?.sourceURL))) {
|
inProgress
|
||||||
|
);
|
||||||
|
newDocuments.forEach((doc) => {
|
||||||
|
if (
|
||||||
|
!documents.some(
|
||||||
|
(d) =>
|
||||||
|
this.normalizeUrl(d.metadata.sourceURL) ===
|
||||||
|
this.normalizeUrl(doc.metadata?.sourceURL)
|
||||||
|
)
|
||||||
|
) {
|
||||||
documents.push(doc);
|
documents.push(doc);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
@ -171,17 +214,23 @@ export class WebScraperDataProvider {
|
|||||||
const url = new URL(document.metadata.sourceURL);
|
const url = new URL(document.metadata.sourceURL);
|
||||||
const path = url.pathname;
|
const path = url.pathname;
|
||||||
|
|
||||||
if (this.excludes.length > 0 && this.excludes[0] !== '') {
|
if (this.excludes.length > 0 && this.excludes[0] !== "") {
|
||||||
// Check if the link should be excluded
|
// Check if the link should be excluded
|
||||||
if (this.excludes.some(excludePattern => new RegExp(excludePattern).test(path))) {
|
if (
|
||||||
|
this.excludes.some((excludePattern) =>
|
||||||
|
new RegExp(excludePattern).test(path)
|
||||||
|
)
|
||||||
|
) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (this.includes.length > 0 && this.includes[0] !== '') {
|
if (this.includes.length > 0 && this.includes[0] !== "") {
|
||||||
// Check if the link matches the include patterns, if any are specified
|
// Check if the link matches the include patterns, if any are specified
|
||||||
if (this.includes.length > 0) {
|
if (this.includes.length > 0) {
|
||||||
return this.includes.some(includePattern => new RegExp(includePattern).test(path));
|
return this.includes.some((includePattern) =>
|
||||||
|
new RegExp(includePattern).test(path)
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
@ -198,7 +247,7 @@ export class WebScraperDataProvider {
|
|||||||
private removeChildLinks(documents: Document[]): Document[] {
|
private removeChildLinks(documents: Document[]): Document[] {
|
||||||
for (let document of documents) {
|
for (let document of documents) {
|
||||||
if (document?.childrenLinks) delete document.childrenLinks;
|
if (document?.childrenLinks) delete document.childrenLinks;
|
||||||
};
|
}
|
||||||
return documents;
|
return documents;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -208,10 +257,14 @@ export class WebScraperDataProvider {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
const normalizedUrl = this.normalizeUrl(document.metadata.sourceURL);
|
const normalizedUrl = this.normalizeUrl(document.metadata.sourceURL);
|
||||||
await setValue('web-scraper-cache:' + normalizedUrl, JSON.stringify({
|
await setValue(
|
||||||
|
"web-scraper-cache:" + normalizedUrl,
|
||||||
|
JSON.stringify({
|
||||||
...document,
|
...document,
|
||||||
childrenLinks: childrenLinks || []
|
childrenLinks: childrenLinks || [],
|
||||||
}), 60 * 60 * 24 * 10); // 10 days
|
}),
|
||||||
|
60 * 60 * 24 * 10
|
||||||
|
); // 10 days
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -219,8 +272,12 @@ export class WebScraperDataProvider {
|
|||||||
let documents: Document[] = [];
|
let documents: Document[] = [];
|
||||||
for (const url of urls) {
|
for (const url of urls) {
|
||||||
const normalizedUrl = this.normalizeUrl(url);
|
const normalizedUrl = this.normalizeUrl(url);
|
||||||
console.log("Getting cached document for web-scraper-cache:" + normalizedUrl)
|
console.log(
|
||||||
const cachedDocumentString = await getValue('web-scraper-cache:' + normalizedUrl);
|
"Getting cached document for web-scraper-cache:" + normalizedUrl
|
||||||
|
);
|
||||||
|
const cachedDocumentString = await getValue(
|
||||||
|
"web-scraper-cache:" + normalizedUrl
|
||||||
|
);
|
||||||
if (cachedDocumentString) {
|
if (cachedDocumentString) {
|
||||||
const cachedDocument = JSON.parse(cachedDocumentString);
|
const cachedDocument = JSON.parse(cachedDocumentString);
|
||||||
documents.push(cachedDocument);
|
documents.push(cachedDocument);
|
||||||
@ -228,10 +285,18 @@ export class WebScraperDataProvider {
|
|||||||
// get children documents
|
// get children documents
|
||||||
for (const childUrl of cachedDocument.childrenLinks) {
|
for (const childUrl of cachedDocument.childrenLinks) {
|
||||||
const normalizedChildUrl = this.normalizeUrl(childUrl);
|
const normalizedChildUrl = this.normalizeUrl(childUrl);
|
||||||
const childCachedDocumentString = await getValue('web-scraper-cache:' + normalizedChildUrl);
|
const childCachedDocumentString = await getValue(
|
||||||
|
"web-scraper-cache:" + normalizedChildUrl
|
||||||
|
);
|
||||||
if (childCachedDocumentString) {
|
if (childCachedDocumentString) {
|
||||||
const childCachedDocument = JSON.parse(childCachedDocumentString);
|
const childCachedDocument = JSON.parse(childCachedDocumentString);
|
||||||
if (!documents.find((doc) => doc.metadata.sourceURL === childCachedDocument.metadata.sourceURL)) {
|
if (
|
||||||
|
!documents.find(
|
||||||
|
(doc) =>
|
||||||
|
doc.metadata.sourceURL ===
|
||||||
|
childCachedDocument.metadata.sourceURL
|
||||||
|
)
|
||||||
|
) {
|
||||||
documents.push(childCachedDocument);
|
documents.push(childCachedDocument);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -246,7 +311,6 @@ export class WebScraperDataProvider {
|
|||||||
throw new Error("Urls are required");
|
throw new Error("Urls are required");
|
||||||
}
|
}
|
||||||
|
|
||||||
console.log("options", options.crawlerOptions?.excludes)
|
|
||||||
this.urls = options.urls;
|
this.urls = options.urls;
|
||||||
this.mode = options.mode;
|
this.mode = options.mode;
|
||||||
this.concurrentRequests = options.concurrentRequests ?? 20;
|
this.concurrentRequests = options.concurrentRequests ?? 20;
|
||||||
@ -255,12 +319,11 @@ export class WebScraperDataProvider {
|
|||||||
this.maxCrawledLinks = options.crawlerOptions?.maxCrawledLinks ?? 1000;
|
this.maxCrawledLinks = options.crawlerOptions?.maxCrawledLinks ?? 1000;
|
||||||
this.returnOnlyUrls = options.crawlerOptions?.returnOnlyUrls ?? false;
|
this.returnOnlyUrls = options.crawlerOptions?.returnOnlyUrls ?? false;
|
||||||
this.limit = options.crawlerOptions?.limit ?? 10000;
|
this.limit = options.crawlerOptions?.limit ?? 10000;
|
||||||
this.generateImgAltText = options.crawlerOptions?.generateImgAltText ?? false;
|
this.generateImgAltText =
|
||||||
|
options.crawlerOptions?.generateImgAltText ?? false;
|
||||||
|
|
||||||
//! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
|
//! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
|
||||||
this.excludes = this.excludes.filter(item => item !== '');
|
this.excludes = this.excludes.filter((item) => item !== "");
|
||||||
|
|
||||||
|
|
||||||
// make sure all urls start with https://
|
// make sure all urls start with https://
|
||||||
this.urls = this.urls.map((url) => {
|
this.urls = this.urls.map((url) => {
|
||||||
@ -272,10 +335,14 @@ export class WebScraperDataProvider {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private async getSitemapData(baseUrl: string, documents: Document[]) {
|
private async getSitemapData(baseUrl: string, documents: Document[]) {
|
||||||
const sitemapData = await fetchSitemapData(baseUrl)
|
const sitemapData = await fetchSitemapData(baseUrl);
|
||||||
if (sitemapData) {
|
if (sitemapData) {
|
||||||
for (let i = 0; i < documents.length; i++) {
|
for (let i = 0; i < documents.length; i++) {
|
||||||
const docInSitemapData = sitemapData.find((data) => this.normalizeUrl(data.loc) === this.normalizeUrl(documents[i].metadata.sourceURL))
|
const docInSitemapData = sitemapData.find(
|
||||||
|
(data) =>
|
||||||
|
this.normalizeUrl(data.loc) ===
|
||||||
|
this.normalizeUrl(documents[i].metadata.sourceURL)
|
||||||
|
);
|
||||||
if (docInSitemapData) {
|
if (docInSitemapData) {
|
||||||
let sitemapDocData: Partial<SitemapEntry> = {};
|
let sitemapDocData: Partial<SitemapEntry> = {};
|
||||||
if (docInSitemapData.changefreq) {
|
if (docInSitemapData.changefreq) {
|
||||||
@ -296,30 +363,83 @@ export class WebScraperDataProvider {
|
|||||||
return documents;
|
return documents;
|
||||||
}
|
}
|
||||||
generatesImgAltText = async (documents: Document[]): Promise<Document[]> => {
|
generatesImgAltText = async (documents: Document[]): Promise<Document[]> => {
|
||||||
await Promise.all(documents.map(async (document) => {
|
await Promise.all(
|
||||||
const baseUrl = new URL(document.metadata.sourceURL).origin;
|
documents.map(async (document) => {
|
||||||
const images = document.content.match(/!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g) || [];
|
const images = document.content.match(/!\[.*?\]\((.*?)\)/g) || [];
|
||||||
|
|
||||||
await Promise.all(images.map(async (image) => {
|
await Promise.all(
|
||||||
|
images.map(async (image: string) => {
|
||||||
let imageUrl = image.match(/\(([^)]+)\)/)[1];
|
let imageUrl = image.match(/\(([^)]+)\)/)[1];
|
||||||
let altText = image.match(/\[(.*?)\]/)[1];
|
let altText = image.match(/\[(.*?)\]/)[1];
|
||||||
let newImageUrl = '';
|
|
||||||
|
|
||||||
if (!altText && !imageUrl.startsWith("data:image") && /\.(png|jpeg|gif|webp)$/.test(imageUrl)) {
|
if (
|
||||||
newImageUrl = baseUrl + imageUrl;
|
!altText &&
|
||||||
|
!imageUrl.startsWith("data:image") &&
|
||||||
|
/\.(png|jpeg|gif|webp)$/.test(imageUrl)
|
||||||
|
) {
|
||||||
const imageIndex = document.content.indexOf(image);
|
const imageIndex = document.content.indexOf(image);
|
||||||
const contentLength = document.content.length;
|
const contentLength = document.content.length;
|
||||||
let backText = document.content.substring(imageIndex + image.length, Math.min(imageIndex + image.length + 1000, contentLength));
|
let backText = document.content.substring(
|
||||||
|
imageIndex + image.length,
|
||||||
|
Math.min(imageIndex + image.length + 1000, contentLength)
|
||||||
|
);
|
||||||
let frontTextStartIndex = Math.max(imageIndex - 1000, 0);
|
let frontTextStartIndex = Math.max(imageIndex - 1000, 0);
|
||||||
let frontText = document.content.substring(frontTextStartIndex, imageIndex);
|
let frontText = document.content.substring(
|
||||||
altText = await getImageDescription(newImageUrl, backText, frontText);
|
frontTextStartIndex,
|
||||||
|
imageIndex
|
||||||
|
);
|
||||||
|
altText = await getImageDescription(
|
||||||
|
imageUrl,
|
||||||
|
backText,
|
||||||
|
frontText
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
document.content = document.content.replace(image, `![${altText}](${newImageUrl})`);
|
document.content = document.content.replace(
|
||||||
}));
|
image,
|
||||||
}));
|
`![${altText}](${imageUrl})`
|
||||||
|
);
|
||||||
|
})
|
||||||
|
);
|
||||||
|
})
|
||||||
|
);
|
||||||
|
|
||||||
return documents;
|
return documents;
|
||||||
|
};
|
||||||
|
|
||||||
|
replaceImgPathsWithAbsolutePaths = (documents: Document[]): Document[] => {
|
||||||
|
try {
|
||||||
|
documents.forEach((document) => {
|
||||||
|
const baseUrl = new URL(document.metadata.sourceURL).origin;
|
||||||
|
const images =
|
||||||
|
document.content.match(
|
||||||
|
/!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g
|
||||||
|
) || [];
|
||||||
|
|
||||||
|
images.forEach((image: string) => {
|
||||||
|
let imageUrl = image.match(/\(([^)]+)\)/)[1];
|
||||||
|
let altText = image.match(/\[(.*?)\]/)[1];
|
||||||
|
|
||||||
|
if (!imageUrl.startsWith("data:image")) {
|
||||||
|
if (!imageUrl.startsWith("http")) {
|
||||||
|
if (imageUrl.startsWith("/")) {
|
||||||
|
imageUrl = imageUrl.substring(1);
|
||||||
|
}
|
||||||
|
imageUrl = new URL(imageUrl, baseUrl).toString();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
document.content = document.content.replace(
|
||||||
|
image,
|
||||||
|
`![${altText}](${imageUrl})`
|
||||||
|
);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
return documents;
|
||||||
|
} catch (error) {
|
||||||
|
console.error("Error replacing img paths with absolute paths", error);
|
||||||
|
return documents;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user