Cleaned up
This commit is contained in:
parent
f4348024c6
commit
8eb2e95f19
@ -10,7 +10,7 @@ import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";
|
||||
import { WebCrawler } from "./crawler";
|
||||
import { getValue, setValue } from "../../services/redis";
|
||||
import { getImageDescription } from "./utils/imageDescription";
|
||||
import { fetchAndProcessPdf, isUrlAPdf } from "./utils/pdfProcessor";
|
||||
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
|
||||
import {
|
||||
replaceImgPathsWithAbsolutePaths,
|
||||
replacePathsWithAbsolutePaths,
|
||||
@ -144,11 +144,7 @@ export class WebScraperDataProvider {
|
||||
return this.returnOnlyUrlsResponse(links, inProgress);
|
||||
}
|
||||
|
||||
// const [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links);
|
||||
// const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
|
||||
|
||||
let documents = await this.processLinks(links, inProgress);
|
||||
// documents.push(...pdfDocuments);
|
||||
return this.cacheAndFinalizeDocuments(documents, links);
|
||||
}
|
||||
|
||||
@ -156,11 +152,8 @@ export class WebScraperDataProvider {
|
||||
inProgress?: (progress: Progress) => void
|
||||
): Promise<Document[]> {
|
||||
const links = this.urls;
|
||||
// const [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links);
|
||||
// const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
|
||||
|
||||
let documents = await this.processLinks(links, inProgress);
|
||||
// documents.push(...pdfDocuments);
|
||||
return documents;
|
||||
}
|
||||
|
||||
@ -172,11 +165,7 @@ export class WebScraperDataProvider {
|
||||
return this.returnOnlyUrlsResponse(links, inProgress);
|
||||
}
|
||||
|
||||
// let [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links);
|
||||
// const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
|
||||
|
||||
let documents = await this.processLinks(links, inProgress);
|
||||
// documents.push(...pdfDocuments);
|
||||
return this.cacheAndFinalizeDocuments(documents, links);
|
||||
}
|
||||
|
||||
@ -233,19 +222,6 @@ export class WebScraperDataProvider {
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Partitions `links` into PDF and non-PDF URLs.
 *
 * `isUrlAPdf` is invoked once per link; all checks are started before any of
 * them is awaited, and the relative order of the input is preserved in both
 * output lists.
 *
 * @param links The URLs to classify.
 * @returns A tuple `[pdfLinks, notPdfLinks]`.
 */
private async splitPdfLinks(links: string[]): Promise<[string[], string[]]> {
  // One flag per link, at the same index as the link it describes.
  const pdfFlags = await Promise.all(
    links.map((link) => isUrlAPdf({ url: link }))
  );

  const pdfLinks: string[] = [];
  const notPdfLinks: string[] = [];
  links.forEach((link, index) => {
    const bucket = pdfFlags[index] ? pdfLinks : notPdfLinks;
    bucket.push(link);
  });

  return [pdfLinks, notPdfLinks];
}
|
||||
|
||||
private applyPathReplacements(documents: Document[]): Document[] {
|
||||
return this.replaceAllPathsWithAbsolutePaths
|
||||
? replacePathsWithAbsolutePaths(documents)
|
||||
|
@ -67,13 +67,11 @@ export async function scrapWithScrapingBee(
|
||||
);
|
||||
return "";
|
||||
}
|
||||
// Check the content type of the response
|
||||
|
||||
const contentType = response.headers['content-type'];
|
||||
if (contentType && contentType.includes('application/pdf')) {
|
||||
// Handle PDF content type
|
||||
return fetchAndProcessPdf(url);
|
||||
} else {
|
||||
// Assume the content is text and decode it
|
||||
const decoder = new TextDecoder();
|
||||
const text = decoder.decode(response.data);
|
||||
return text;
|
||||
@ -104,9 +102,14 @@ export async function scrapWithPlaywright(url: string): Promise<string> {
|
||||
return "";
|
||||
}
|
||||
|
||||
const data = await response.json();
|
||||
const html = data.content;
|
||||
return html ?? "";
|
||||
const contentType = response.headers['content-type'];
|
||||
if (contentType && contentType.includes('application/pdf')) {
|
||||
return fetchAndProcessPdf(url);
|
||||
} else {
|
||||
const data = await response.json();
|
||||
const html = data.content;
|
||||
return html ?? "";
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(`Error scraping with Puppeteer: ${error}`);
|
||||
return "";
|
||||
@ -173,7 +176,13 @@ export async function scrapSingleUrl(
|
||||
);
|
||||
return "";
|
||||
}
|
||||
text = await response.text();
|
||||
|
||||
const contentType = response.headers['content-type'];
|
||||
if (contentType && contentType.includes('application/pdf')) {
|
||||
return fetchAndProcessPdf(url);
|
||||
} else {
|
||||
text = await response.text();
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(`Error scraping URL: ${error}`);
|
||||
return "";
|
||||
|
@ -105,36 +105,4 @@ async function processPdf(file: string) {
|
||||
const fileContent = fs.readFileSync(file);
|
||||
const data = await pdf(fileContent);
|
||||
return data.text;
|
||||
}
|
||||
/**
 * Check if a url is a pdf
 *
 * The url is first inspected textually: it counts as a pdf when it ends with
 * ".pdf" (case-insensitive), either as written or once any query string or
 * fragment has been stripped. Otherwise, unless `fastMode` is set, a HEAD
 * request is issued and the `content-type` response header decides.
 *
 * @param url The url to check
 * @param fastMode If true, the function will return false if the url does not end with .pdf
 *   (no HEAD request is made)
 * @returns A promise that resolves to true if the url is a pdf, false otherwise.
 *   Any error (including a failed HEAD request) resolves to false; the promise never rejects.
 */
export async function isUrlAPdf({
  url,
  fastMode = false,
}: {
  url: string;
  fastMode?: boolean;
}): Promise<boolean> {
  try {
    const lowerUrl = url.toLowerCase();
    // Drop "?query" / "#fragment" so "doc.pdf?download=1" is still recognised.
    const withoutQueryOrFragment = lowerUrl.split(/[?#]/, 1)[0] ?? lowerUrl;
    // The raw-url check is kept so every url that matched before still matches.
    if (lowerUrl.endsWith(".pdf") || withoutQueryOrFragment.endsWith(".pdf")) {
      return true;
    }

    // If fast mode is enabled, we skip the HEAD request and return false
    if (fastMode) {
      return false;
    }

    const before = Date.now();
    const response = await axios.head(url);
    const after = Date.now();
    console.log(`${after - before}ms - HEAD Request for ${url}`);

    // The header may be absent; treat that as "not a pdf" explicitly instead
    // of relying on a thrown TypeError being swallowed by the catch below.
    const contentType: unknown = response.headers['content-type'];
    return (
      typeof contentType === "string" &&
      contentType.toLowerCase().includes('application/pdf')
    );
  } catch (error) {
    // Best-effort check: an unreachable url or invalid input is reported as "not a pdf".
    return false;
  }
}
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user