0

Changed the PDF detection to use a HEAD request (inspecting Content-Type) instead of fetching the full response

This commit is contained in:
rafaelsideguide 2024-04-29 15:15:32 -03:00
parent f1dd97af0f
commit f8b207793f
3 changed files with 70 additions and 21 deletions

View File

@ -170,6 +170,45 @@ describe('E2E Tests for API Routes', () => {
expect(completedResponse.body.data[0]).toHaveProperty('metadata');
expect(completedResponse.body.data[0].content).toContain('🔥 FireCrawl');
}, 60000); // 60 seconds
// TODO: add a dedicated test for a crawl job whose URL has an explicit .pdf extension.
it('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension', async () => {
  // Kick off a crawl of an arXiv abstract page; the PDF it links to has no ".pdf" suffix,
  // so detection must fall back to the HEAD-request Content-Type check.
  const crawlResponse = await request(TEST_URL)
    .post('/v0/crawl')
    .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
    .set('Content-Type', 'application/json')
    .send({ url: 'https://arxiv.org/abs/astro-ph/9301001', crawlerOptions: { limit: 5 }});
  expect(crawlResponse.statusCode).toBe(200);

  // Immediately after submission the job should be queued and reported as active.
  const response = await request(TEST_URL)
    .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
    .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`);
  expect(response.statusCode).toBe(200);
  expect(response.body).toHaveProperty('status');
  expect(response.body.status).toBe('active');

  // Wait 60 seconds for the crawl (including PDF download + parsing) to complete.
  await new Promise((r) => setTimeout(r, 60000));

  const completedResponse = await request(TEST_URL)
    .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
    .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`);
  expect(completedResponse.statusCode).toBe(200);
  expect(completedResponse.body).toHaveProperty('status');
  expect(completedResponse.body.status).toBe('completed');
  expect(completedResponse.body).toHaveProperty('data');
  expect(completedResponse.body.data.length).toBeGreaterThan(1);
  expect(completedResponse.body.data[0]).toHaveProperty('content');
  expect(completedResponse.body.data[0]).toHaveProperty('markdown');
  expect(completedResponse.body.data[0]).toHaveProperty('metadata');
  // Title of the arXiv paper proves the PDF body was actually extracted.
  expect(completedResponse.body.data[0].content).toContain('The Peculiar Balmer Line Profiles of OQ 208');
}, 90000); // 90-second Jest timeout: covers the fixed 60 s wait plus request latency
});
describe('GET /is-production', () => {

View File

@ -88,9 +88,17 @@ export class WebScraperDataProvider {
}));
}
let pdfLinks = links.filter(
async (link) => await isUrlAPdf({ url: link, fastMode: true })
);
let pdfLinks = [];
let notPdfLinks = [];
for (let link of links) {
if (await isUrlAPdf({ url: link })) {
pdfLinks.push(link);
} else {
notPdfLinks.push(link);
}
}
console.log("crawl", {pdfLinks})
let pdfDocuments: Document[] = [];
for (let pdfLink of pdfLinks) {
const pdfContent = await fetchAndProcessPdf(pdfLink);
@ -100,11 +108,8 @@ export class WebScraperDataProvider {
provider: "web-scraper",
});
}
links = links.filter(
async (link) => !(await isUrlAPdf({ url: link, fastMode: true }))
);
let documents = await this.convertUrlsToDocuments(links, inProgress);
let documents = await this.convertUrlsToDocuments(notPdfLinks, inProgress);
documents = await this.getSitemapData(this.urls[0], documents);
if (this.replaceAllPathsWithAbsolutePaths) {
@ -164,7 +169,7 @@ export class WebScraperDataProvider {
let pdfDocuments: Document[] = [];
let nonPdfUrls: string[] = [];
for (let url of this.urls) {
if (await isUrlAPdf({ url: url, fastMode: false })) {
if (await isUrlAPdf({ url: url })) {
const pdfContent = await fetchAndProcessPdf(url);
pdfDocuments.push({
content: pdfContent,
@ -201,9 +206,17 @@ export class WebScraperDataProvider {
}
if (this.mode === "sitemap") {
let links = await getLinksFromSitemap(this.urls[0]);
let pdfLinks = links.filter(
async (link) => await isUrlAPdf({ url: link, fastMode: true })
);
let pdfLinks = [];
let nonPdfLinks = [];
for (let link of links) {
if (await isUrlAPdf({ url: link })) {
pdfLinks.push(link);
} else {
nonPdfLinks.push(link);
}
}
let pdfDocuments: Document[] = [];
for (let pdfLink of pdfLinks) {
const pdfContent = await fetchAndProcessPdf(pdfLink);
@ -213,12 +226,9 @@ export class WebScraperDataProvider {
provider: "web-scraper",
});
}
links = links.filter(
async (link) => !(await isUrlAPdf({ url: link, fastMode: true }))
);
let documents = await this.convertUrlsToDocuments(
links.slice(0, this.limit),
nonPdfLinks.slice(0, this.limit),
inProgress
);

View File

@ -114,10 +114,10 @@ async function processPdf(file: string) {
*/
export async function isUrlAPdf({
url,
fastMode,
fastMode = false,
}: {
url: string;
fastMode: boolean;
fastMode?: boolean;
}): Promise<boolean> {
try {
if (url.endsWith(".pdf")) {
@ -127,11 +127,11 @@ export async function isUrlAPdf({
if (fastMode) {
return false;
}
const response = await fetch(url, { method: "HEAD" });
const contentType = response.headers.get("Content-Type");
return contentType !== null && contentType.includes("application/pdf");
const response = await axios.head(url);
const contentType = response.headers['content-type'];
return contentType.includes('application/pdf');
} catch (error) {
console.error("Error making HEAD request:", error);
// console.error("Error making HEAD request:", error);
return false;
}
}