Nick: fixes most of it

commit bfccaf670d
parent d91043376c
@@ -76,9 +76,22 @@ export class WebCrawler {
       // Check if the link matches the include patterns, if any are specified
       if (this.includes.length > 0 && this.includes[0] !== "") {
-        return this.includes.some((includePattern) =>
+        if (!this.includes.some((includePattern) =>
           new RegExp(includePattern).test(path)
-        );
+        )) {
+          return false;
+        }
       }
 
+      // Normalize the initial URL and the link to account for www and non-www versions
+      const normalizedInitialUrl = new URL(this.initialUrl);
+      const normalizedLink = new URL(link);
+      const initialHostname = normalizedInitialUrl.hostname.replace(/^www\./, '');
+      const linkHostname = normalizedLink.hostname.replace(/^www\./, '');
+
+      // Ensure the protocol and hostname match, and the path starts with the initial URL's path
+      if (linkHostname !== initialHostname || !normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)) {
+        return false;
+      }
 
       const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? true;
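The hunk above turns the include-pattern check into a guard, so execution can fall through to the new hostname comparison, and strips a leading `www.` before comparing hosts. Despite its comment, the committed guard compares hostname and path only, not protocol. A minimal standalone sketch of that comparison; the function name and sample URLs are illustrative, not part of the commit:

```ts
// Sketch: compare two URLs the way the new guard does, ignoring a leading
// "www." and requiring the link's path to extend the start URL's path.
function isSameSite(initialUrl: string, link: string): boolean {
  const base = new URL(initialUrl);
  const candidate = new URL(link);
  const baseHost = base.hostname.replace(/^www\./, '');
  const candidateHost = candidate.hostname.replace(/^www\./, '');
  return candidateHost === baseHost &&
    candidate.pathname.startsWith(base.pathname);
}

// "www" and non-www now match:
console.log(isSameSite("https://agentops.ai/blog", "https://www.agentops.ai/blog/post")); // true
// A different host is still rejected:
console.log(isSameSite("https://agentops.ai/blog", "https://example.com/blog"));          // false
```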
@@ -88,10 +101,6 @@ export class WebCrawler {
         return false;
       }
 
-      if (!this.initialUrl.includes(link)) {
-        return false;
-      }
-
       return true;
     })
     .slice(0, limit);
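The guard deleted above was inverted: `this.initialUrl.includes(link)` is true only when the link is a substring of the start URL, so every page *below* the start URL failed the check and was dropped. A quick illustration with hypothetical values:

```ts
const initialUrl = "https://agentops.ai/blog";
const link = "https://agentops.ai/blog/some-post";

// The removed check: true only when the link is a substring of the start URL.
console.log(initialUrl.includes(link)); // false -> child pages were rejected
// The direction a prefix check would normally go:
console.log(link.includes(initialUrl)); // true
```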
@@ -109,11 +118,15 @@ export class WebCrawler {
       this.robots = robotsParser(this.robotsTxtUrl, response.data);
     } catch (error) {
       console.error(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`);
     }
 
+    console.log("Initial URL: ", this.initialUrl);
+
     const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
     if (sitemapLinks.length > 0) {
       let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
+      console.log("Filtered links: ", filteredLinks.length);
       return filteredLinks.map(link => ({ url: link, html: "" }));
     }
 
@@ -310,7 +323,21 @@ export class WebCrawler {
       }
     } catch (error) {
       // Error handling for failed sitemap fetch
+      // console.error(`Failed to fetch sitemap from ${sitemapUrl}: ${error}`);
     }
 
+    // If the first one doesn't work, try the base URL
+    const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`;
+    try {
+      const response = await axios.get(baseUrlSitemap);
+      if (response.status === 200) {
+        return await getLinksFromSitemap(baseUrlSitemap);
+      }
+    } catch (error) {
+      // Error handling for failed base URL sitemap fetch
+      console.error(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
+    }
+
     return [];
   }
 }
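The new block gives the sitemap probe a second chance: if the first sitemap fetch fails, it tries `${this.baseUrl}/sitemap.xml` before returning an empty list. A self-contained sketch of the same fallback shape; the candidate URLs, `fetchSitemap` (a stand-in for the diff's `getLinksFromSitemap`), and the naive `<loc>` regex are all assumptions for illustration:

```ts
import axios from "axios";

// Hypothetical stand-in for the diff's getLinksFromSitemap.
async function fetchSitemap(url: string): Promise<string[]> {
  const res = await axios.get(url);
  // Naive <loc> extraction, just to keep the sketch self-contained.
  return (String(res.data).match(/<loc>(.*?)<\/loc>/g) ?? [])
    .map((m) => m.replace(/<\/?loc>/g, ""));
}

// Same two-step probe shape as the committed code.
async function trySitemaps(url: string, baseUrl: string): Promise<string[]> {
  for (const candidate of [`${url}/sitemap.xml`, `${baseUrl}/sitemap.xml`]) {
    try {
      const response = await axios.get(candidate);
      if (response.status === 200) {
        return await fetchSitemap(candidate);
      }
    } catch (error) {
      // Swallow and fall through to the next candidate, as the diff does.
    }
  }
  return [];
}
```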
@@ -130,6 +130,21 @@ export class WebScraperDataProvider {
     }
   }
 
+  private async cleanIrrelevantPath(links: string[]){
+    return links.filter(link => {
+      const normalizedInitialUrl = new URL(this.urls[0]);
+      const normalizedLink = new URL(link);
+
+      // Normalize the hostname to account for www and non-www versions
+      const initialHostname = normalizedInitialUrl.hostname.replace(/^www\./, '');
+      const linkHostname = normalizedLink.hostname.replace(/^www\./, '');
+
+      // Ensure the protocol and hostname match, and the path starts with the initial URL's path
+      return linkHostname === initialHostname &&
+        normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname);
+    });
+  }
+
   private async handleCrawlMode(
     inProgress?: (progress: Progress) => void
   ): Promise<Document[]> {
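`cleanIrrelevantPath` packages the same www-insensitive host and path-prefix filter that the first hunk added to `WebCrawler`, so sitemap links pointing at other hosts or outside the starting path are dropped in one shared place (the sitemap mode switches to it two hunks below, while the crawl-mode call stays commented out). Note it is declared `async` although nothing inside awaits, so it resolves to a `Promise<string[]>` and callers must `await` it. A standalone sketch of equivalent behavior, with made-up inputs; `start` plays the role of `this.urls[0]`:

```ts
// Equivalent free-function form of cleanIrrelevantPath, for illustration only.
function cleanIrrelevantPath(start: string, links: string[]): string[] {
  const base = new URL(start);
  const baseHost = base.hostname.replace(/^www\./, '');
  return links.filter((link) => {
    const candidate = new URL(link);
    return candidate.hostname.replace(/^www\./, '') === baseHost &&
      candidate.pathname.startsWith(base.pathname);
  });
}

// With start = "https://www.agentops.ai/blog":
// kept:    https://agentops.ai/blog/effortless-hr-management-with-saas
// dropped: https://agentops.ai/pricing       (path outside /blog)
// dropped: https://other-site.com/blog/post  (different hostname)
```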
@@ -149,11 +164,11 @@ export class WebScraperDataProvider {
     let allLinks = links.map((e) => e.url);
     const allHtmls = links.map((e)=> e.html);
 
-    allLinks = allLinks.filter(link => {
-      const normalizedInitialUrl = this.urls[0].endsWith('/') ? this.urls[0] : `${this.urls[0]}/`;
-      const normalizedLink = link.endsWith('/') ? link : `${link}/`;
-      return normalizedLink.startsWith(normalizedInitialUrl);
-    });
+    console.log(">>>>>> all links >>>>", {allLinks})
+    // allLinks = await this.cleanIrrelevantPath(allLinks);
+
     console.log('>>>>>??>?>?>?>?.', {allLinks})
 
     if (this.returnOnlyUrls) {
@@ -183,13 +198,7 @@ export class WebScraperDataProvider {
     inProgress?: (progress: Progress) => void
   ): Promise<Document[]> {
     let links = await getLinksFromSitemap(this.urls[0]);
-    links = links.filter(link => {
-      const normalizedInitialUrl = this.urls[0].endsWith('/') ? this.urls[0] : `${this.urls[0]}/`;
-      const normalizedLink = link.endsWith('/') ? link : `${link}/`;
-      return normalizedLink.startsWith(normalizedInitialUrl);
-    });
-
-    console.log('>>>>>??>?>?>?>?.', {links})
+    links = await this.cleanIrrelevantPath(links);
 
     if (this.returnOnlyUrls) {
       return this.returnOnlyUrlsResponse(links, inProgress);
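The inline filter deleted in this hunk compared raw strings after appending a trailing slash, so a `www.` mismatch between sitemap entries and the start URL rejected everything; the shared helper's URL-based comparison tolerates it. For example, with hypothetical URLs:

```ts
const start = "https://agentops.ai/blog/";
const link = "https://www.agentops.ai/blog/some-post/";

// Old string-prefix check: fails purely because of the "www." prefix.
console.log(link.startsWith(start)); // false

// URL-based check, as in cleanIrrelevantPath: passes.
const sameHost =
  new URL(link).hostname.replace(/^www\./, '') ===
  new URL(start).hostname.replace(/^www\./, '');
console.log(sameHost &&
  new URL(link).pathname.startsWith(new URL(start).pathname)); // true
```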
@@ -27,7 +27,7 @@
     ]
   },
   {
-    "website": "https://agentops.ai",
+    "website": "https://agentops.ai/blog",
     "expected_min_num_of_pages": 7,
     "expected_crawled_pages": [
       "https://www.agentops.ai/blog/effortless-hr-management-with-saas",