changed the request to do a HEAD to check for a PDF instead
This commit is contained in:
parent
f1dd97af0f
commit
f8b207793f
@ -170,6 +170,45 @@ describe('E2E Tests for API Routes', () => {
|
||||
expect(completedResponse.body.data[0]).toHaveProperty('metadata');
|
||||
expect(completedResponse.body.data[0].content).toContain('🔥 FireCrawl');
|
||||
}, 60000); // 60 seconds
|
||||
|
||||
// it('should return a successful response for a valid crawl job with PDF content', async () => {
|
||||
|
||||
// });
|
||||
|
||||
it('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension', async () => {
|
||||
const crawlResponse = await request(TEST_URL)
|
||||
.post('/v0/crawl')
|
||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set('Content-Type', 'application/json')
|
||||
.send({ url: 'https://arxiv.org/abs/astro-ph/9301001', crawlerOptions: { limit: 5 }});
|
||||
expect(crawlResponse.statusCode).toBe(200);
|
||||
|
||||
const response = await request(TEST_URL)
|
||||
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
|
||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`);
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty('status');
|
||||
expect(response.body.status).toBe('active');
|
||||
|
||||
// wait for 60 seconds
|
||||
await new Promise((r) => setTimeout(r, 60000));
|
||||
|
||||
const completedResponse = await request(TEST_URL)
|
||||
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
|
||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`);
|
||||
console.log(completedResponse.body.data)
|
||||
expect(completedResponse.statusCode).toBe(200);
|
||||
expect(completedResponse.body).toHaveProperty('status');
|
||||
expect(completedResponse.body.status).toBe('completed');
|
||||
expect(completedResponse.body).toHaveProperty('data');
|
||||
expect(completedResponse.body.data.length).toBeGreaterThan(1);
|
||||
expect(completedResponse.body.data[0]).toHaveProperty('content');
|
||||
expect(completedResponse.body.data[0]).toHaveProperty('markdown');
|
||||
expect(completedResponse.body.data[0]).toHaveProperty('metadata');
|
||||
expect(completedResponse.body.data[0].content).toContain('The Peculiar Balmer Line Profiles of OQ 208');
|
||||
}, 90000); // 90 seconds
|
||||
|
||||
|
||||
});
|
||||
|
||||
describe('GET /is-production', () => {
|
||||
|
@ -88,9 +88,17 @@ export class WebScraperDataProvider {
|
||||
}));
|
||||
}
|
||||
|
||||
let pdfLinks = links.filter(
|
||||
async (link) => await isUrlAPdf({ url: link, fastMode: true })
|
||||
);
|
||||
let pdfLinks = [];
|
||||
let notPdfLinks = [];
|
||||
for (let link of links) {
|
||||
if (await isUrlAPdf({ url: link })) {
|
||||
pdfLinks.push(link);
|
||||
} else {
|
||||
notPdfLinks.push(link);
|
||||
}
|
||||
}
|
||||
|
||||
console.log("crawl", {pdfLinks})
|
||||
let pdfDocuments: Document[] = [];
|
||||
for (let pdfLink of pdfLinks) {
|
||||
const pdfContent = await fetchAndProcessPdf(pdfLink);
|
||||
@ -100,11 +108,8 @@ export class WebScraperDataProvider {
|
||||
provider: "web-scraper",
|
||||
});
|
||||
}
|
||||
links = links.filter(
|
||||
async (link) => !(await isUrlAPdf({ url: link, fastMode: true }))
|
||||
);
|
||||
|
||||
let documents = await this.convertUrlsToDocuments(links, inProgress);
|
||||
let documents = await this.convertUrlsToDocuments(notPdfLinks, inProgress);
|
||||
documents = await this.getSitemapData(this.urls[0], documents);
|
||||
|
||||
if (this.replaceAllPathsWithAbsolutePaths) {
|
||||
@ -164,7 +169,7 @@ export class WebScraperDataProvider {
|
||||
let pdfDocuments: Document[] = [];
|
||||
let nonPdfUrls: string[] = [];
|
||||
for (let url of this.urls) {
|
||||
if (await isUrlAPdf({ url: url, fastMode: false })) {
|
||||
if (await isUrlAPdf({ url: url })) {
|
||||
const pdfContent = await fetchAndProcessPdf(url);
|
||||
pdfDocuments.push({
|
||||
content: pdfContent,
|
||||
@ -201,9 +206,17 @@ export class WebScraperDataProvider {
|
||||
}
|
||||
if (this.mode === "sitemap") {
|
||||
let links = await getLinksFromSitemap(this.urls[0]);
|
||||
let pdfLinks = links.filter(
|
||||
async (link) => await isUrlAPdf({ url: link, fastMode: true })
|
||||
);
|
||||
|
||||
let pdfLinks = [];
|
||||
let nonPdfLinks = [];
|
||||
for (let link of links) {
|
||||
if (await isUrlAPdf({ url: link })) {
|
||||
pdfLinks.push(link);
|
||||
} else {
|
||||
nonPdfLinks.push(link);
|
||||
}
|
||||
}
|
||||
|
||||
let pdfDocuments: Document[] = [];
|
||||
for (let pdfLink of pdfLinks) {
|
||||
const pdfContent = await fetchAndProcessPdf(pdfLink);
|
||||
@ -213,12 +226,9 @@ export class WebScraperDataProvider {
|
||||
provider: "web-scraper",
|
||||
});
|
||||
}
|
||||
links = links.filter(
|
||||
async (link) => !(await isUrlAPdf({ url: link, fastMode: true }))
|
||||
);
|
||||
|
||||
let documents = await this.convertUrlsToDocuments(
|
||||
links.slice(0, this.limit),
|
||||
nonPdfLinks.slice(0, this.limit),
|
||||
inProgress
|
||||
);
|
||||
|
||||
|
@ -114,10 +114,10 @@ async function processPdf(file: string) {
|
||||
*/
|
||||
export async function isUrlAPdf({
|
||||
url,
|
||||
fastMode,
|
||||
fastMode = false,
|
||||
}: {
|
||||
url: string;
|
||||
fastMode: boolean;
|
||||
fastMode?: boolean;
|
||||
}): Promise<boolean> {
|
||||
try {
|
||||
if (url.endsWith(".pdf")) {
|
||||
@ -127,11 +127,11 @@ export async function isUrlAPdf({
|
||||
if (fastMode) {
|
||||
return false;
|
||||
}
|
||||
const response = await fetch(url, { method: "HEAD" });
|
||||
const contentType = response.headers.get("Content-Type");
|
||||
return contentType !== null && contentType.includes("application/pdf");
|
||||
const response = await axios.head(url);
|
||||
const contentType = response.headers['content-type'];
|
||||
return contentType.includes('application/pdf');
|
||||
} catch (error) {
|
||||
console.error("Error making HEAD request:", error);
|
||||
// console.error("Error making HEAD request:", error);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user