Nicolas 2024-04-19 15:36:00 -07:00
parent 005ac8f839
commit 84cebf618b
2 changed files with 11 additions and 5 deletions


@@ -42,6 +42,7 @@ describe('E2E Tests for API Routes', () => {
         .set('Authorization', `Bearer this_is_just_a_preview_token`)
         .set('Content-Type', 'application/json')
         .send({ url: 'https://firecrawl.dev' });
       expect(response.statusCode).toBe(200);
     }, 10000); // 10 seconds timeout
@@ -51,6 +52,8 @@ describe('E2E Tests for API Routes', () => {
         .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
         .set('Content-Type', 'application/json')
         .send({ url: 'https://firecrawl.dev' });
+      await new Promise((r) => setTimeout(r, 2000));
       expect(response.statusCode).toBe(200);
       expect(response.body).toHaveProperty('data');
       expect(response.body.data).toHaveProperty('content');
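
The added `await new Promise((r) => setTimeout(r, 2000));` pauses the test for two seconds before the assertions, presumably to give the scrape time to finish. A minimal sketch of the same wait pulled into a reusable helper; the `sleep` name is illustrative and not part of this commit:

// Illustrative helper (not in this commit): resolves after `ms` milliseconds.
const sleep = (ms: number): Promise<void> =>
  new Promise((resolve) => setTimeout(resolve, ms));

// Usage inside the test, equivalent to the added line:
// await sleep(2000);
// expect(response.statusCode).toBe(200);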


@@ -88,7 +88,7 @@ export class WebScraperDataProvider {
       }));
     }
-    let pdfLinks = links.filter((link) => isUrlAPdf({url: link, fastMode: true}));
+    let pdfLinks = links.filter(async (link) => await isUrlAPdf({url: link, fastMode: true}));
     let pdfDocuments: Document[] = [];
     for (let pdfLink of pdfLinks) {
       const pdfContent = await fetchAndProcessPdf(pdfLink);
@@ -98,7 +98,7 @@ export class WebScraperDataProvider {
         provider: "web-scraper"
       });
     }
-    links = links.filter((link) => !isUrlAPdf({url: link, fastMode: true}));
+    links = links.filter(async (link) => !await isUrlAPdf({url: link, fastMode: true}));
     let documents = await this.convertUrlsToDocuments(links, inProgress);
     documents = await this.getSitemapData(this.urls[0], documents);
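
Note that `Array.prototype.filter` does not await its callback: an async predicate returns a Promise, which is always truthy, so both filters above (and the matching ones in the sitemap branch below) keep every link instead of splitting PDFs from regular pages. A minimal async-aware filter sketch, assuming `isUrlAPdf` resolves to a boolean; the `filterAsync` helper is illustrative and not part of this commit:

// Illustrative helper (not in this commit): filter with an async predicate.
async function filterAsync<T>(
  items: T[],
  predicate: (item: T) => Promise<boolean>
): Promise<T[]> {
  // Run all predicates in parallel, then keep items whose predicate resolved true.
  const results = await Promise.all(items.map(predicate));
  return items.filter((_, index) => results[index]);
}

// Usage sketch:
// let pdfLinks = await filterAsync(links, (link) => isUrlAPdf({ url: link, fastMode: true }));
// links = await filterAsync(links, async (link) => !(await isUrlAPdf({ url: link, fastMode: true })));
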
@@ -157,10 +157,12 @@ export class WebScraperDataProvider {
     }
     if (this.mode === "single_urls") {
+      console.log("Single urls mode");
       let pdfDocuments: Document[] = [];
       let nonPdfUrls: string[] = [];
       for (let url of this.urls) {
-        if (isUrlAPdf({url: url, fastMode: false})) {
+        console.log("Checking if url is a pdf", url);
+        if (await isUrlAPdf({url: url, fastMode: false})) {
           const pdfContent = await fetchAndProcessPdf(url);
           pdfDocuments.push({
             content: pdfContent,
@@ -169,6 +171,7 @@ export class WebScraperDataProvider {
           });
         } else {
           nonPdfUrls.push(url);
+          console.log("Fetching and processing url", url);
         }
       }
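
In single_urls mode each URL is now checked with `await isUrlAPdf({ url, fastMode: false })` inside the loop, so the checks run one after another. A minimal sketch of the same PDF/non-PDF split done in parallel with `Promise.all`, assuming `isUrlAPdf` resolves to a boolean; the structure is illustrative, not the commit's code:

// Illustrative sketch (not in this commit): classify all URLs in parallel,
// then split them into PDF and non-PDF buckets.
const checks = await Promise.all(
  this.urls.map(async (url) => ({
    url,
    isPdf: await isUrlAPdf({ url, fastMode: false }),
  }))
);
const pdfUrls = checks.filter((c) => c.isPdf).map((c) => c.url);
const nonPdfUrls = checks.filter((c) => !c.isPdf).map((c) => c.url);
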
@@ -197,7 +200,7 @@ export class WebScraperDataProvider {
     }
     if (this.mode === "sitemap") {
       let links = await getLinksFromSitemap(this.urls[0]);
-      let pdfLinks = links.filter((link) => isUrlAPdf({url: link, fastMode: true}));
+      let pdfLinks = links.filter(async (link) => await isUrlAPdf({url: link, fastMode: true}));
       let pdfDocuments: Document[] = [];
       for (let pdfLink of pdfLinks) {
         const pdfContent = await fetchAndProcessPdf(pdfLink);
@@ -207,7 +210,7 @@ export class WebScraperDataProvider {
           provider: "web-scraper"
         });
       }
-      links = links.filter((link) => !isUrlAPdf({url: link, fastMode: true}));
+      links = links.filter(async (link) => !await isUrlAPdf({url: link, fastMode: true}));
       let documents = await this.convertUrlsToDocuments(
         links.slice(0, this.limit),