not working yet

commit d91043376c
parent fa014defc7
@@ -133,6 +133,7 @@ export class WebScraperDataProvider {
   private async handleCrawlMode(
     inProgress?: (progress: Progress) => void
   ): Promise<Document[]> {
+    console.log('??? >>>', this.urls[0])
     const crawler = new WebCrawler({
       initialUrl: this.urls[0],
       includes: this.includes,
@@ -148,15 +149,16 @@ export class WebScraperDataProvider {
     let allLinks = links.map((e) => e.url);
     const allHtmls = links.map((e)=> e.html);

-    if (this.returnOnlyUrls) {
-      return this.returnOnlyUrlsResponse(allLinks , inProgress);
-    }
-
     allLinks = allLinks.filter(link => {
       const normalizedInitialUrl = this.urls[0].endsWith('/') ? this.urls[0] : `${this.urls[0]}/`;
       const normalizedLink = link.endsWith('/') ? link : `${link}/`;
-      return !normalizedLink.startsWith(normalizedInitialUrl) || normalizedLink !== normalizedInitialUrl;
+      return normalizedLink.startsWith(normalizedInitialUrl);
     });
+    console.log('>>>>>??>?>?>?>?.', {allLinks})
+
+    if (this.returnOnlyUrls) {
+      return this.returnOnlyUrlsResponse(allLinks , inProgress);
+    }

     let documents = [];
     // check if fast mode is enabled and there is html inside the links
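The filter predicate is the substantive fix in this hunk. The old expression, `!normalizedLink.startsWith(normalizedInitialUrl) || normalizedLink !== normalizedInitialUrl`, is false only when the link equals the initial URL itself, so links outside the initial URL's subtree were never excluded; the new expression keeps only links inside that subtree. A minimal standalone sketch of the two predicates (the sample URLs are hypothetical):

```ts
// Compare the old and new filter predicates; sample URLs are hypothetical.
const initialUrl = "https://example.com/docs";
const links = [
  "https://example.com/docs",      // the initial URL itself
  "https://example.com/docs/page", // inside the subtree
  "https://example.com/blog/post", // outside the subtree
];

const normalize = (u: string) => (u.endsWith("/") ? u : `${u}/`);
const base = normalize(initialUrl);

// Old predicate: false only when the link equals the initial URL,
// so subtree filtering never actually happened.
const oldKeep = (link: string) => {
  const l = normalize(link);
  return !l.startsWith(base) || l !== base;
};

// New predicate: true only for links under (or equal to) the initial URL.
const newKeep = (link: string) => normalize(link).startsWith(base);

console.log(links.filter(oldKeep)); // [ .../docs/page, .../blog/post ]
console.log(links.filter(newKeep)); // [ .../docs, .../docs/page ]
```

Note the hunk also moves the `returnOnlyUrls` early return below the filter, so a URL-only response now reflects the filtered link set rather than the raw crawl output.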
@@ -184,9 +186,11 @@ export class WebScraperDataProvider {
     links = links.filter(link => {
       const normalizedInitialUrl = this.urls[0].endsWith('/') ? this.urls[0] : `${this.urls[0]}/`;
       const normalizedLink = link.endsWith('/') ? link : `${link}/`;
-      return !normalizedLink.startsWith(normalizedInitialUrl) || normalizedLink !== normalizedInitialUrl;
+      return normalizedLink.startsWith(normalizedInitialUrl);
     });

+    console.log('>>>>>??>?>?>?>?.', {links})
+
     if (this.returnOnlyUrls) {
       return this.returnOnlyUrlsResponse(links, inProgress);
     }
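The same normalize-and-filter block now appears in both the crawl path above and this fast-mode path. As a refactoring sketch only (the helper name `filterToSubtree` is hypothetical, not part of this commit), the duplication could be factored out:

```ts
// Hypothetical helper, not part of this commit: keep only links that fall
// under the initial URL's subtree, tolerating a missing trailing slash.
function filterToSubtree(initialUrl: string, links: string[]): string[] {
  const base = initialUrl.endsWith("/") ? initialUrl : `${initialUrl}/`;
  return links.filter((link) => {
    const normalized = link.endsWith("/") ? link : `${link}/`;
    return normalized.startsWith(base);
  });
}

// Both call sites would then reduce to one line each, e.g.:
// allLinks = filterToSubtree(this.urls[0], allLinks);
```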
@@ -102,7 +102,7 @@ describe("Crawling Checkup (E2E)", () => {
     }

     // checks if crawled pages not contain expected_not_crawled_pages
-    if (websiteData.expected_not_crawled_pages && websiteData.expected_not_crawled_pages.length > 0 && websiteData.expected_not_crawled_pages.filter(page => completedResponse.body.data.some((d: { url: string }) => d.url === page)).length > 0) {
+    if (websiteData.expected_not_crawled_pages && websiteData.expected_not_crawled_pages.length > 0 && completedResponse.body.data && websiteData.expected_not_crawled_pages.filter(page => completedResponse.body.data.some((d: { url: string }) => d.url === page)).length > 0) {
       errorLog.push({
         website: websiteData.website,
         prompt: 'CRAWL',
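The test fix adds a `completedResponse.body.data &&` guard so `.some(...)` is never called on a missing `data` array. A small sketch of the failure mode it avoids (the response shape is an assumption based on the surrounding test):

```ts
// Sketch of the guard; the response shape is an assumption.
type CrawlResponse = { body: { data?: { url: string }[] } };

const expectedNotCrawled = ["https://example.com/private"];
const response: CrawlResponse = { body: {} }; // e.g. crawl returned no data

const data = response.body.data;

// Without the guard, data.some(...) throws:
// TypeError: Cannot read properties of undefined (reading 'some')
const leaked =
  data !== undefined &&
  expectedNotCrawled.filter((page) => data.some((d) => d.url === page))
    .length > 0;

console.log(leaked); // false: the check is skipped when data is missing
```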