Added check during scraping to deal with PDFs

Checks whether the URL is a PDF during the scraping process (single_url.ts). TODO: run integration tests to check whether this strategy affects the running time. P.S. Some comments need to be removed if we decide to proceed with this strategy.
This commit is contained in:
parent 5a2712fa5a
commit f4348024c6
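For orientation before the hunks: the intended flow, assembled from the commented-out lines in the WebScraperDataProvider hunks below, is to split the crawled links into PDF and non-PDF sets, fetch the PDF set through the PDF pipeline, and merge the results. A minimal sketch of that flow as it would read inside the handle*Mode methods (splitPdfLinks is added by this commit; fetchPdfDocuments is referenced but never defined in this diff, so its use here is an assumption):

    // Sketch only; mirrors the commented-out call sites in the diff below.
    const [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links);
    const pdfDocuments = await this.fetchPdfDocuments(pdfLinks); // helper not defined in this diff
    // Presumably notPdfLinks (not links) once enabled, so PDFs are not scraped twice:
    let documents = await this.processLinks(notPdfLinks, inProgress);
    documents.push(...pdfDocuments);
    return this.cacheAndFinalizeDocuments(documents, links);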
@@ -117,7 +117,7 @@ describe("E2E Tests for API Routes", () => {
       expect(response.body.data).toHaveProperty('content');
       expect(response.body.data).toHaveProperty('metadata');
       expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
-    }, 30000); // 30 seconds
+    }, 60000); // 60 seconds
 
     it('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => {
       const response = await request(TEST_URL)
@@ -132,7 +132,7 @@ describe("E2E Tests for API Routes", () => {
       expect(response.body.data).toHaveProperty('content');
       expect(response.body.data).toHaveProperty('metadata');
       expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
-    }, 30000); // 30 seconds
+    }, 60000); // 60 seconds
   });
 
   describe("POST /v0/crawl", () => {
@@ -427,10 +427,8 @@ describe("E2E Tests for API Routes", () => {
       .send({ url: "https://jestjs.io" });
     expect(crawlResponse.statusCode).toBe(200);
-
-
 
     // wait for 30 seconds
-    await new Promise((r) => setTimeout(r, 10000));
+    await new Promise((r) => setTimeout(r, 20000));
 
     const response = await request(TEST_URL)
       .delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`)
@@ -439,7 +437,7 @@ describe("E2E Tests for API Routes", () => {
     expect(response.body).toHaveProperty("status");
     expect(response.body.status).toBe("cancelled");
 
-    await new Promise((r) => setTimeout(r, 20000));
+    await new Promise((r) => setTimeout(r, 10000));
 
     const completedResponse = await request(TEST_URL)
       .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
@@ -456,8 +454,6 @@ describe("E2E Tests for API Routes", () => {
 
   }, 60000); // 60 seconds
-
-
 
   describe("POST /v0/scrape with LLM Extraction", () => {
     it("should extract data using LLM extraction mode", async () => {
       const response = await request(TEST_URL)
@@ -144,14 +144,23 @@ export class WebScraperDataProvider {
       return this.returnOnlyUrlsResponse(links, inProgress);
     }
 
+    // const [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links);
+    // const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
+
     let documents = await this.processLinks(links, inProgress);
+    // documents.push(...pdfDocuments);
     return this.cacheAndFinalizeDocuments(documents, links);
   }
 
   private async handleSingleUrlsMode(
     inProgress?: (progress: Progress) => void
   ): Promise<Document[]> {
-    let documents = await this.processLinks(this.urls, inProgress);
+    const links = this.urls;
+    // const [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links);
+    // const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
+
+    let documents = await this.processLinks(links, inProgress);
+    // documents.push(...pdfDocuments);
     return documents;
   }
 
@@ -163,7 +172,11 @@ export class WebScraperDataProvider {
       return this.returnOnlyUrlsResponse(links, inProgress);
     }
 
+    // let [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links);
+    // const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
+
     let documents = await this.processLinks(links, inProgress);
+    // documents.push(...pdfDocuments);
     return this.cacheAndFinalizeDocuments(documents, links);
   }
 
@@ -220,6 +233,19 @@ export class WebScraperDataProvider {
     );
   }
 
+  private async splitPdfLinks(links: string[]): Promise<[string[], string[]]> {
+    const checks = links.map(async (link) => ({
+      link,
+      isPdf: await isUrlAPdf({ url: link })
+    }));
+
+    const results = await Promise.all(checks);
+    const pdfLinks = results.filter(result => result.isPdf).map(result => result.link);
+    const notPdfLinks = results.filter(result => !result.isPdf).map(result => result.link);
+
+    return [pdfLinks, notPdfLinks];
+  }
+
   private applyPathReplacements(documents: Document[]): Document[] {
     return this.replaceAllPathsWithAbsolutePaths
       ? replacePathsWithAbsolutePaths(documents)
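The commented-out call sites above reference a fetchPdfDocuments helper that this diff never defines. A minimal sketch of what it could look like as another WebScraperDataProvider method, assuming fetchAndProcessPdf from pdfProcessor.ts resolves to the PDF's extracted text (hypothetical; only the Document fields used here are shown):

    // Hypothetical companion to splitPdfLinks; not part of this commit.
    private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
      return Promise.all(
        pdfLinks.map(async (url) => {
          const content = await fetchAndProcessPdf(url); // assumed to return a string
          return { content, metadata: { sourceURL: url } } as Document;
        })
      );
    }

Note that splitPdfLinks fires one HEAD request per link via isUrlAPdf, and Promise.all runs them all concurrently; for large crawls this is exactly the running-time question the commit message flags.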
@@ -6,6 +6,7 @@ import { Document, PageOptions } from "../../lib/entities";
 import { parseMarkdown } from "../../lib/html-to-markdown";
 import { excludeNonMainTags } from "./utils/excludeTags";
 import { urlSpecificParams } from "./utils/custom/website_params";
+import { fetchAndProcessPdf } from "./utils/pdfProcessor";
 
 dotenv.config();
 
@@ -66,9 +67,17 @@ export async function scrapWithScrapingBee(
      );
      return "";
    }
-    const decoder = new TextDecoder();
-    const text = decoder.decode(response.data);
-    return text;
+    // Check the content type of the response
+    const contentType = response.headers['content-type'];
+    if (contentType && contentType.includes('application/pdf')) {
+      // Handle PDF content type
+      return fetchAndProcessPdf(url);
+    } else {
+      // Assume the content is text and decode it
+      const decoder = new TextDecoder();
+      const text = decoder.decode(response.data);
+      return text;
+    }
  } catch (error) {
    console.error(`Error scraping with Scraping Bee: ${error}`);
    return "";
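One point relevant to the running-time TODO: the detection in this branch reuses headers from the response ScrapingBee already returned, so the single-URL path needs no extra HEAD request (though fetchAndProcessPdf presumably re-downloads the file); the HEAD-request cost only appears in the crawl paths that would call splitPdfLinks. The routing decision itself reduces to a small pure function (a sketch with a hypothetical name, not code from this commit):

    // Standalone sketch of the routing decision made above.
    function isPdfResponse(headers: Record<string, string | undefined>): boolean {
      const contentType = headers["content-type"];
      return Boolean(contentType && contentType.includes("application/pdf"));
    }

    // Usage sketch: isPdfResponse({ "content-type": "application/pdf" }) === true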
@@ -77,12 +77,12 @@ export async function processPdfToText(filePath: string): Promise<string> {
      } else {
        // If the status code is not 200, increment the attempt counter and wait
        attempt++;
-        await new Promise((resolve) => setTimeout(resolve, 250)); // Wait for 2 seconds
+        await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds
      }
    } catch (error) {
-      console.error("Error fetching result:", error);
+      console.error("Error fetching result:", error.data.detail || '');
      attempt++;
-      await new Promise((resolve) => setTimeout(resolve, 250)); // Wait for 2 seconds before retrying
+      await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds before retrying
      // You may want to handle specific errors differently
    }
  }
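For context, this hunk sits inside a retry loop in processPdfToText that polls an external conversion service until a result arrives or the attempt limit is hit; the change doubles the wait from 250 ms to 500 ms and fixes the stale "2 seconds" comments. One hedged caveat: error.data.detail assumes the caught error carries a data field directly; for an axios error the server payload usually lives at error.response.data, so this log line may print '' more often than intended. The surrounding pattern, as a generic sketch (names hypothetical):

    // Bounded-polling sketch of the pattern patched above.
    async function pollUntilReady<T>(
      fetchOnce: () => Promise<T | null>,
      maxAttempts = 10,
      waitMs = 500
    ): Promise<T | null> {
      for (let attempt = 0; attempt < maxAttempts; attempt++) {
        try {
          const result = await fetchOnce(); // null means "not ready yet"
          if (result !== null) return result;
        } catch (error) {
          console.error("Error fetching result:", error);
        }
        await new Promise((resolve) => setTimeout(resolve, waitMs));
      }
      return null; // gave up after maxAttempts
    }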
@@ -127,7 +127,10 @@ export async function isUrlAPdf({
    if (fastMode) {
      return false;
    }
+    const before = Date.now();
    const response = await axios.head(url);
+    const after = Date.now();
+    console.log(`${after - before}ms - HEAD Request for ${url}`);
    const contentType = response.headers['content-type'];
    return contentType.includes('application/pdf');
  } catch (error) {
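The before/after logging in this last hunk is what would answer the TODO in the commit message. If HEAD latency proves significant, one mitigation (an assumption, not in this diff) is to bound each probe with axios's built-in timeout option, falling back to "not a PDF" just as the existing catch branch does:

    import axios from "axios";

    // Sketch: HEAD probe with a hard time budget. On timeout axios rejects,
    // and we treat the URL as non-PDF, matching isUrlAPdf's error path.
    async function isPdfByHead(url: string, timeoutMs = 5000): Promise<boolean> {
      try {
        const response = await axios.head(url, { timeout: timeoutMs });
        const contentType = response.headers["content-type"];
        return Boolean(contentType && contentType.includes("application/pdf"));
      } catch {
        return false;
      }
    }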