changed the request to do a HEAD to check for a PDF instead
This commit is contained in:
parent
f1dd97af0f
commit
f8b207793f
@@ -170,6 +170,45 @@ describe('E2E Tests for API Routes', () => {
       expect(completedResponse.body.data[0]).toHaveProperty('metadata');
       expect(completedResponse.body.data[0].content).toContain('🔥 FireCrawl');
     }, 60000); // 60 seconds
 
+    // it('should return a successful response for a valid crawl job with PDF content', async () => {
+    // });
+
+    it('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension', async () => {
+      const crawlResponse = await request(TEST_URL)
+        .post('/v0/crawl')
+        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+        .set('Content-Type', 'application/json')
+        .send({ url: 'https://arxiv.org/abs/astro-ph/9301001', crawlerOptions: { limit: 5 }});
+      expect(crawlResponse.statusCode).toBe(200);
+
+      const response = await request(TEST_URL)
+        .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`);
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty('status');
+      expect(response.body.status).toBe('active');
+
+      // wait for 60 seconds
+      await new Promise((r) => setTimeout(r, 60000));
+
+      const completedResponse = await request(TEST_URL)
+        .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`);
+      console.log(completedResponse.body.data)
+      expect(completedResponse.statusCode).toBe(200);
+      expect(completedResponse.body).toHaveProperty('status');
+      expect(completedResponse.body.status).toBe('completed');
+      expect(completedResponse.body).toHaveProperty('data');
+      expect(completedResponse.body.data.length).toBeGreaterThan(1);
+      expect(completedResponse.body.data[0]).toHaveProperty('content');
+      expect(completedResponse.body.data[0]).toHaveProperty('markdown');
+      expect(completedResponse.body.data[0]).toHaveProperty('metadata');
+      expect(completedResponse.body.data[0].content).toContain('The Peculiar Balmer Line Profiles of OQ 208');
+    }, 90000); // 90 seconds
+
   });
 
   describe('GET /is-production', () => {
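The new test crawls an arXiv abstract page, where the linked PDFs carry no `.pdf` extension, so it exercises the HEAD-based detection added below. As written it sleeps for a fixed 60 seconds before reading the job status; a hedged sketch of a polling alternative, built only from the status endpoint and supertest calls already shown above (the helper name, 5-second interval, and 90-second cap are hypothetical):

```typescript
import request from "supertest";

// Hypothetical helper, not part of this commit: poll /v0/crawl/status until the
// job reports "completed" instead of sleeping for a fixed 60 seconds.
// TEST_URL and TEST_API_KEY are the same values the tests above rely on.
async function waitForCrawlCompletion(jobId: string, timeoutMs = 90000) {
  const deadline = Date.now() + timeoutMs;
  while (Date.now() < deadline) {
    const statusResponse = await request(TEST_URL)
      .get(`/v0/crawl/status/${jobId}`)
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
    if (statusResponse.body.status === "completed") {
      return statusResponse;
    }
    await new Promise((resolve) => setTimeout(resolve, 5000)); // re-check every 5 seconds
  }
  throw new Error(`crawl job ${jobId} did not complete within ${timeoutMs}ms`);
}
```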
@@ -88,9 +88,17 @@ export class WebScraperDataProvider {
         }));
       }
 
-      let pdfLinks = links.filter(
-        async (link) => await isUrlAPdf({ url: link, fastMode: true })
-      );
+      let pdfLinks = [];
+      let notPdfLinks = [];
+      for (let link of links) {
+        if (await isUrlAPdf({ url: link })) {
+          pdfLinks.push(link);
+        } else {
+          notPdfLinks.push(link);
+        }
+      }
+
+      console.log("crawl", {pdfLinks})
       let pdfDocuments: Document[] = [];
       for (let pdfLink of pdfLinks) {
         const pdfContent = await fetchAndProcessPdf(pdfLink);
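A note on why the `filter` calls had to go: `Array.prototype.filter` never awaits an async callback, so the predicate returns a pending `Promise`, which is always truthy, and every link was kept regardless of the check. The sequential loop above fixes that. A minimal sketch of the pitfall and of a reusable alternative, assuming `isUrlAPdf` is the helper modified later in this commit (the `partitionAsync` name is hypothetical):

```typescript
// Broken: filter does not await the async predicate; a pending Promise is
// always truthy, so every link is kept.
// const pdfLinks = links.filter(async (link) => await isUrlAPdf({ url: link }));

// A generic async partition, awaiting each predicate in turn just like the
// for-loop introduced in this commit.
async function partitionAsync<T>(
  items: T[],
  predicate: (item: T) => Promise<boolean>
): Promise<[T[], T[]]> {
  const pass: T[] = [];
  const fail: T[] = [];
  for (const item of items) {
    if (await predicate(item)) {
      pass.push(item);
    } else {
      fail.push(item);
    }
  }
  return [pass, fail];
}

// Usage mirroring the new loop:
// const [pdfLinks, notPdfLinks] = await partitionAsync(links, (link) =>
//   isUrlAPdf({ url: link })
// );
```

Because each check is awaited in sequence, only one HEAD request is in flight at a time; wrapping the predicates in `Promise.all` would parallelize them if large crawls make this a bottleneck.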
@@ -100,11 +108,8 @@ export class WebScraperDataProvider {
           provider: "web-scraper",
         });
       }
-      links = links.filter(
-        async (link) => !(await isUrlAPdf({ url: link, fastMode: true }))
-      );
 
-      let documents = await this.convertUrlsToDocuments(links, inProgress);
+      let documents = await this.convertUrlsToDocuments(notPdfLinks, inProgress);
       documents = await this.getSitemapData(this.urls[0], documents);
 
       if (this.replaceAllPathsWithAbsolutePaths) {
@@ -164,7 +169,7 @@ export class WebScraperDataProvider {
     let pdfDocuments: Document[] = [];
     let nonPdfUrls: string[] = [];
     for (let url of this.urls) {
-      if (await isUrlAPdf({ url: url, fastMode: false })) {
+      if (await isUrlAPdf({ url: url })) {
         const pdfContent = await fetchAndProcessPdf(url);
         pdfDocuments.push({
           content: pdfContent,
@@ -201,9 +206,17 @@ export class WebScraperDataProvider {
     }
     if (this.mode === "sitemap") {
       let links = await getLinksFromSitemap(this.urls[0]);
-      let pdfLinks = links.filter(
-        async (link) => await isUrlAPdf({ url: link, fastMode: true })
-      );
+      let pdfLinks = [];
+      let nonPdfLinks = [];
+      for (let link of links) {
+        if (await isUrlAPdf({ url: link })) {
+          pdfLinks.push(link);
+        } else {
+          nonPdfLinks.push(link);
+        }
+      }
+
       let pdfDocuments: Document[] = [];
       for (let pdfLink of pdfLinks) {
         const pdfContent = await fetchAndProcessPdf(pdfLink);
@@ -213,12 +226,9 @@ export class WebScraperDataProvider {
           provider: "web-scraper",
         });
       }
-      links = links.filter(
-        async (link) => !(await isUrlAPdf({ url: link, fastMode: true }))
-      );
 
       let documents = await this.convertUrlsToDocuments(
-        links.slice(0, this.limit),
+        nonPdfLinks.slice(0, this.limit),
         inProgress
       );
 
@@ -114,10 +114,10 @@ async function processPdf(file: string) {
  */
 export async function isUrlAPdf({
   url,
-  fastMode,
+  fastMode = false,
 }: {
   url: string;
-  fastMode: boolean;
+  fastMode?: boolean;
 }): Promise<boolean> {
   try {
     if (url.endsWith(".pdf")) {
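With `fastMode` now optional and defaulting to `false`, the crawler call sites above simply omit it to get the thorough check, while `fastMode: true` remains available for the cheap extension-only test. A small usage sketch; the URLs are illustrative, not from the repository:

```typescript
// No fastMode: if the URL lacks a ".pdf" extension, a HEAD request decides
// based on the Content-Type header.
const viaHead = await isUrlAPdf({ url: "https://example.com/paper" });

// fastMode: true skips the network entirely; only the ".pdf" extension counts.
const viaExtension = await isUrlAPdf({ url: "https://example.com/paper.pdf", fastMode: true });
```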
@@ -127,11 +127,11 @@ export async function isUrlAPdf({
     if (fastMode) {
       return false;
     }
-    const response = await fetch(url, { method: "HEAD" });
-    const contentType = response.headers.get("Content-Type");
-    return contentType !== null && contentType.includes("application/pdf");
+    const response = await axios.head(url);
+    const contentType = response.headers['content-type'];
+    return contentType.includes('application/pdf');
   } catch (error) {
-    console.error("Error making HEAD request:", error);
+    // console.error("Error making HEAD request:", error);
     return false;
   }
 }
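Two behavioural notes on the axios version: axios exposes response headers as a plain object keyed in lower case, hence `response.headers['content-type']`, and it rejects on network failures and non-2xx statuses, so those cases fall into the `catch` and report `false`. The old null check on the header was dropped, which means a response without a Content-Type throws on `.includes` and likewise ends up returning `false` via the `catch`. A self-contained sketch of the same check with the guard kept explicit (the `looksLikePdf` name is illustrative):

```typescript
import axios from "axios";

// HEAD-based PDF detection, equivalent to the non-fastMode path in this commit.
async function looksLikePdf(url: string): Promise<boolean> {
  try {
    if (url.endsWith(".pdf")) {
      return true; // cheap extension check first
    }
    const response = await axios.head(url);
    const contentType = response.headers["content-type"];
    // Guard against a missing header rather than relying on the catch block.
    return typeof contentType === "string" && contentType.includes("application/pdf");
  } catch {
    // Network errors, non-2xx statuses, and servers that refuse HEAD requests.
    return false;
  }
}
```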