
Added a check during scraping to handle PDFs

Checks if the URL is a PDF during the scraping process (single_url.ts).

TODO: Run integration tests - does this strategy affect the running time?

P.S. Some comments need to be removed if we decide to proceed with this strategy.
rafaelsideguide 2024-05-13 09:13:42 -03:00
parent 5a2712fa5a
commit f4348024c6
4 changed files with 49 additions and 15 deletions
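
In outline, the strategy is to classify a URL as a PDF before handing it to the normal scraper, using the Content-Type header from a HEAD request (the diff below adds this in isUrlAPdf, with a fastMode flag that skips the network round trip). A minimal standalone sketch of the idea, assuming axios; the function name and the extension fast path are illustrative, not the exact implementation in this commit:

```typescript
import axios from "axios";

// Illustrative sketch of the PDF-detection idea (assumed names).
async function looksLikePdf(url: string, fastMode = false): Promise<boolean> {
  // Fast path: an explicit .pdf extension needs no network round trip.
  if (url.toLowerCase().split("?")[0].endsWith(".pdf")) return true;

  // In fast mode, skip the extra HEAD request entirely.
  if (fastMode) return false;

  try {
    // A HEAD request exposes the Content-Type header without downloading the body.
    const response = await axios.head(url);
    const contentType = response.headers["content-type"];
    return typeof contentType === "string" && contentType.includes("application/pdf");
  } catch {
    // Unreachable or HEAD-less servers: treat as a normal page and move on.
    return false;
  }
}
```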


@@ -117,7 +117,7 @@ describe("E2E Tests for API Routes", () => {
      expect(response.body.data).toHaveProperty('content');
      expect(response.body.data).toHaveProperty('metadata');
      expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
-    }, 30000); // 30 seconds
+    }, 60000); // 60 seconds

    it('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => {
      const response = await request(TEST_URL)

@@ -132,7 +132,7 @@ describe("E2E Tests for API Routes", () => {
      expect(response.body.data).toHaveProperty('content');
      expect(response.body.data).toHaveProperty('metadata');
      expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
-    }, 30000); // 30 seconds
+    }, 60000); // 60 seconds
  });

  describe("POST /v0/crawl", () => {

@@ -427,10 +427,8 @@ describe("E2E Tests for API Routes", () => {
        .send({ url: "https://jestjs.io" });
      expect(crawlResponse.statusCode).toBe(200);
      // wait for 30 seconds
-      await new Promise((r) => setTimeout(r, 10000));
+      await new Promise((r) => setTimeout(r, 20000));
      const response = await request(TEST_URL)
        .delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`)

@@ -439,7 +437,7 @@
      expect(response.body).toHaveProperty("status");
      expect(response.body.status).toBe("cancelled");
-      await new Promise((r) => setTimeout(r, 20000));
+      await new Promise((r) => setTimeout(r, 10000));
      const completedResponse = await request(TEST_URL)
        .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)

@@ -456,8 +454,6 @@
    }, 60000); // 60 seconds

  describe("POST /v0/scrape with LLM Extraction", () => {
    it("should extract data using LLM extraction mode", async () => {
      const response = await request(TEST_URL)


@@ -144,14 +144,23 @@ export class WebScraperDataProvider {
      return this.returnOnlyUrlsResponse(links, inProgress);
    }

+    // const [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links);
+    // const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
    let documents = await this.processLinks(links, inProgress);
+    // documents.push(...pdfDocuments);
    return this.cacheAndFinalizeDocuments(documents, links);
  }

  private async handleSingleUrlsMode(
    inProgress?: (progress: Progress) => void
  ): Promise<Document[]> {
-    let documents = await this.processLinks(this.urls, inProgress);
+    const links = this.urls;
+    // const [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links);
+    // const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
+    let documents = await this.processLinks(links, inProgress);
+    // documents.push(...pdfDocuments);
    return documents;
  }

@@ -163,7 +172,11 @@ export class WebScraperDataProvider {
      return this.returnOnlyUrlsResponse(links, inProgress);
    }

+    // let [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links);
+    // const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
    let documents = await this.processLinks(links, inProgress);
+    // documents.push(...pdfDocuments);
    return this.cacheAndFinalizeDocuments(documents, links);
  }

@@ -220,6 +233,19 @@ export class WebScraperDataProvider {
    );
  }

+  private async splitPdfLinks(links: string[]): Promise<[string[], string[]]> {
+    const checks = links.map(async (link) => ({
+      link,
+      isPdf: await isUrlAPdf({ url: link })
+    }));
+    const results = await Promise.all(checks);
+    const pdfLinks = results.filter(result => result.isPdf).map(result => result.link);
+    const notPdfLinks = results.filter(result => !result.isPdf).map(result => result.link);
+    return [pdfLinks, notPdfLinks];
+  }
+
  private applyPathReplacements(documents: Document[]): Document[] {
    return this.replaceAllPathsWithAbsolutePaths
      ? replacePathsWithAbsolutePaths(documents)
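
The commented-out calls above also reference a fetchPdfDocuments helper that is not part of this diff. A hypothetical sketch of one plausible shape, written as a free function for brevity and reusing the fetchAndProcessPdf helper imported in single_url.ts below; the Document field names here are assumptions:

```typescript
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
import { Document } from "../../lib/entities";

// Hypothetical: one plausible shape for the fetchPdfDocuments referenced above.
async function fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
  return Promise.all(
    pdfLinks.map(async (url) => {
      const content = await fetchAndProcessPdf(url); // download and extract the PDF's text
      return {
        content,
        metadata: { sourceURL: url }, // assumed field names
        provider: "web-scraper",
      } as Document;
    })
  );
}
```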


@@ -6,6 +6,7 @@ import { Document, PageOptions } from "../../lib/entities";
import { parseMarkdown } from "../../lib/html-to-markdown";
import { excludeNonMainTags } from "./utils/excludeTags";
import { urlSpecificParams } from "./utils/custom/website_params";
+import { fetchAndProcessPdf } from "./utils/pdfProcessor";

dotenv.config();

@@ -66,9 +67,17 @@ export async function scrapWithScrapingBee(
      );
      return "";
    }
-    const decoder = new TextDecoder();
-    const text = decoder.decode(response.data);
-    return text;
+    // Check the content type of the response
+    const contentType = response.headers['content-type'];
+    if (contentType && contentType.includes('application/pdf')) {
+      // Handle PDF content type
+      return fetchAndProcessPdf(url);
+    } else {
+      // Assume the content is text and decode it
+      const decoder = new TextDecoder();
+      const text = decoder.decode(response.data);
+      return text;
+    }
  } catch (error) {
    console.error(`Error scraping with Scraping Bee: ${error}`);
    return "";


@@ -77,12 +77,12 @@ export async function processPdfToText(filePath: string): Promise<string> {
      } else {
        // If the status code is not 200, increment the attempt counter and wait
        attempt++;
-        await new Promise((resolve) => setTimeout(resolve, 250)); // Wait for 2 seconds
+        await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds
      }
    } catch (error) {
-      console.error("Error fetching result:", error);
+      console.error("Error fetching result:", error.data.detail || '');
      attempt++;
-      await new Promise((resolve) => setTimeout(resolve, 250)); // Wait for 2 seconds before retrying
+      await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds before retrying
      // You may want to handle specific errors differently
    }
  }

@@ -127,7 +127,10 @@ export async function isUrlAPdf({
    if (fastMode) {
      return false;
    }
+    const before = Date.now();
    const response = await axios.head(url);
+    const after = Date.now();
+    console.log(`${after - before}ms - HEAD Request for ${url}`);
    const contentType = response.headers['content-type'];
    return contentType.includes('application/pdf');
  } catch (error) {
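
On the TODO about running time: the before/after instrumentation above logs the cost of each individual HEAD request. Because splitPdfLinks awaits Promise.all, the checks run concurrently, so the wall-clock overhead per batch should track the slowest single HEAD request rather than the sum over all links. A sketch of measuring that batch cost; splitPdfLinks is a private method in the diff above and is declared as a free function here only to keep the sketch self-contained:

```typescript
// Illustrative only: stand-in declaration for the private method shown above.
declare function splitPdfLinks(links: string[]): Promise<[string[], string[]]>;

async function timeSplit(links: string[]): Promise<void> {
  const before = Date.now();
  const [pdfLinks, notPdfLinks] = await splitPdfLinks(links);
  // Concurrent HEAD checks: expect this to track the slowest link, not the sum.
  console.log(
    `${Date.now() - before}ms - classified ${links.length} links ` +
    `(${pdfLinks.length} PDFs, ${notPdfLinks.length} pages)`
  );
}
```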