
not working yet

rafaelsideguide 2024-05-15 18:54:40 -03:00
parent fa014defc7
commit d91043376c
2 changed files with 11 additions and 7 deletions


@@ -133,6 +133,7 @@ export class WebScraperDataProvider {
   private async handleCrawlMode(
     inProgress?: (progress: Progress) => void
   ): Promise<Document[]> {
+    console.log('??? >>>', this.urls[0])
     const crawler = new WebCrawler({
       initialUrl: this.urls[0],
       includes: this.includes,
@@ -148,15 +149,16 @@ export class WebScraperDataProvider {
     let allLinks = links.map((e) => e.url);
     const allHtmls = links.map((e)=> e.html);
-    if (this.returnOnlyUrls) {
-      return this.returnOnlyUrlsResponse(allLinks , inProgress);
-    }
     allLinks = allLinks.filter(link => {
       const normalizedInitialUrl = this.urls[0].endsWith('/') ? this.urls[0] : `${this.urls[0]}/`;
       const normalizedLink = link.endsWith('/') ? link : `${link}/`;
-      return !normalizedLink.startsWith(normalizedInitialUrl) || normalizedLink !== normalizedInitialUrl;
+      return normalizedLink.startsWith(normalizedInitialUrl);
     });
+    console.log('>>>>>??>?>?>?>?.', {allLinks})
+    if (this.returnOnlyUrls) {
+      return this.returnOnlyUrlsResponse(allLinks , inProgress);
+    }
     let documents = [];
     // check if fast mode is enabled and there is html inside the links
@@ -184,9 +186,11 @@ export class WebScraperDataProvider {
     links = links.filter(link => {
       const normalizedInitialUrl = this.urls[0].endsWith('/') ? this.urls[0] : `${this.urls[0]}/`;
       const normalizedLink = link.endsWith('/') ? link : `${link}/`;
-      return !normalizedLink.startsWith(normalizedInitialUrl) || normalizedLink !== normalizedInitialUrl;
+      return normalizedLink.startsWith(normalizedInitialUrl);
     });
+    console.log('>>>>>??>?>?>?>?.', {links})
     if (this.returnOnlyUrls) {
       return this.returnOnlyUrlsResponse(links, inProgress);
     }
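
The predicate change is the substance of both hunks above. The old condition `!normalizedLink.startsWith(normalizedInitialUrl) || normalizedLink !== normalizedInitialUrl` is true for every link except the initial URL itself, so off-site links were kept and the start page was dropped; the new condition keeps exactly the links at or under the initial URL. Moving the `returnOnlyUrls` early return below the filter also means URL-only responses now go through the same filtering. A minimal standalone sketch of the corrected behavior (the `filterLinksToInitialUrl` helper and the example URLs are illustrative, not part of the codebase):

// Keep only links that live at or under the crawl's initial URL; trailing
// slashes are normalized so "https://a.com" and "https://a.com/" compare equal.
function filterLinksToInitialUrl(initialUrl: string, links: string[]): string[] {
  const normalizedInitialUrl = initialUrl.endsWith('/') ? initialUrl : `${initialUrl}/`;
  return links.filter((link) => {
    const normalizedLink = link.endsWith('/') ? link : `${link}/`;
    return normalizedLink.startsWith(normalizedInitialUrl);
  });
}

// The initial URL itself and pages beneath it survive; everything else is dropped.
console.log(filterLinksToInitialUrl('https://example.com/docs', [
  'https://example.com/docs',         // kept (equals the initial URL)
  'https://example.com/docs/intro',   // kept (under the initial URL)
  'https://example.com/blog/post-1',  // dropped (different section)
  'https://other-site.com/docs',      // dropped (different host)
]));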


@@ -102,7 +102,7 @@ describe("Crawling Checkup (E2E)", () => {
       }
       // checks that crawled pages do not contain expected_not_crawled_pages
-      if (websiteData.expected_not_crawled_pages && websiteData.expected_not_crawled_pages.length > 0 && websiteData.expected_not_crawled_pages.filter(page => completedResponse.body.data.some((d: { url: string }) => d.url === page)).length > 0) {
+      if (websiteData.expected_not_crawled_pages && websiteData.expected_not_crawled_pages.length > 0 && completedResponse.body.data && websiteData.expected_not_crawled_pages.filter(page => completedResponse.body.data.some((d: { url: string }) => d.url === page)).length > 0) {
         errorLog.push({
           website: websiteData.website,
           prompt: 'CRAWL',
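
The test-suite hunk adds a single truthiness guard, `completedResponse.body.data &&`, so the expected_not_crawled_pages check no longer throws a TypeError on `.some` when a crawl completes without a data payload. A minimal sketch of that guard under assumed types (the `CrawlBody` interface and `pageWasCrawled` helper are hypothetical):

interface CrawledDoc { url: string; }
interface CrawlBody { data?: CrawledDoc[]; }

function pageWasCrawled(body: CrawlBody, page: string): boolean {
  // Mirrors the guard in the diff: without `body.data &&`, calling .some on an
  // undefined payload would throw and fail the whole checkup run.
  return Boolean(body.data && body.data.some((d) => d.url === page));
}

console.log(pageWasCrawled({}, 'https://example.com/private'));  // false: no data, no throw
console.log(pageWasCrawled(
  { data: [{ url: 'https://example.com/private' }] },
  'https://example.com/private'
));  // true: page appears in the crawl results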