Merge branch 'main' of https://github.com/mendableai/firecrawl
This commit is contained in:
commit
15cfc01f5d
@ -22,6 +22,7 @@ export type WebScraperOptions = {
|
|||||||
maxCrawledLinks?: number;
|
maxCrawledLinks?: number;
|
||||||
limit?: number;
|
limit?: number;
|
||||||
generateImgAltText?: boolean;
|
generateImgAltText?: boolean;
|
||||||
|
replaceAllPathsWithAbsolutePaths?: boolean;
|
||||||
};
|
};
|
||||||
pageOptions?: PageOptions;
|
pageOptions?: PageOptions;
|
||||||
concurrentRequests?: number;
|
concurrentRequests?: number;
|
||||||
|
@ -1,179 +0,0 @@
|
|||||||
import { WebScraperDataProvider } from "../index";
|
|
||||||
|
|
||||||
describe("WebScraperDataProvider", () => {
|
|
||||||
describe("replaceImgPathsWithAbsolutePaths", () => {
|
|
||||||
it("should replace image paths with absolute paths", () => {
|
|
||||||
const webScraperDataProvider = new WebScraperDataProvider();
|
|
||||||
const documents = [
|
|
||||||
{
|
|
||||||
metadata: { sourceURL: "https://example.com/page" },
|
|
||||||
content: "![alt text](/image.png)",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
metadata: { sourceURL: "https://example.com/another-page" },
|
|
||||||
content: "![another alt text](./another-image.png)",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
metadata: { sourceURL: "https://example.com/another-page" },
|
|
||||||
content: "![another alt text](./another-image.webp)",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
metadata: { sourceURL: "https://example.com/data-image" },
|
|
||||||
content: "![data image](data:image/png;base64,...)",
|
|
||||||
},
|
|
||||||
];
|
|
||||||
|
|
||||||
const expectedDocuments = [
|
|
||||||
{
|
|
||||||
metadata: { sourceURL: "https://example.com/page" },
|
|
||||||
content: "![alt text](https://example.com/image.png)",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
metadata: { sourceURL: "https://example.com/another-page" },
|
|
||||||
content: "![another alt text](https://example.com/another-image.png)",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
metadata: { sourceURL: "https://example.com/another-page" },
|
|
||||||
content: "![another alt text](https://example.com/another-image.webp)",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
metadata: { sourceURL: "https://example.com/data-image" },
|
|
||||||
content: "![data image](data:image/png;base64,...)",
|
|
||||||
},
|
|
||||||
];
|
|
||||||
|
|
||||||
const result =
|
|
||||||
webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
|
|
||||||
expect(result).toEqual(expectedDocuments);
|
|
||||||
});
|
|
||||||
|
|
||||||
it("should handle absolute URLs without modification", () => {
|
|
||||||
const webScraperDataProvider = new WebScraperDataProvider();
|
|
||||||
const documents = [
|
|
||||||
{
|
|
||||||
metadata: { sourceURL: "https://example.com/page" },
|
|
||||||
content: "![alt text](https://example.com/image.png)",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
metadata: { sourceURL: "https://example.com/another-page" },
|
|
||||||
content:
|
|
||||||
"![another alt text](http://anotherexample.com/another-image.png)",
|
|
||||||
},
|
|
||||||
];
|
|
||||||
|
|
||||||
const expectedDocuments = [
|
|
||||||
{
|
|
||||||
metadata: { sourceURL: "https://example.com/page" },
|
|
||||||
content: "![alt text](https://example.com/image.png)",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
metadata: { sourceURL: "https://example.com/another-page" },
|
|
||||||
content:
|
|
||||||
"![another alt text](http://anotherexample.com/another-image.png)",
|
|
||||||
},
|
|
||||||
];
|
|
||||||
|
|
||||||
const result =
|
|
||||||
webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
|
|
||||||
expect(result).toEqual(expectedDocuments);
|
|
||||||
});
|
|
||||||
|
|
||||||
it("should not replace non-image content within the documents", () => {
|
|
||||||
const webScraperDataProvider = new WebScraperDataProvider();
|
|
||||||
const documents = [
|
|
||||||
{
|
|
||||||
metadata: { sourceURL: "https://example.com/page" },
|
|
||||||
content:
|
|
||||||
"This is a test. ![alt text](/image.png) Here is a link: [Example](https://example.com).",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
metadata: { sourceURL: "https://example.com/another-page" },
|
|
||||||
content:
|
|
||||||
"Another test. ![another alt text](./another-image.png) Here is some **bold text**.",
|
|
||||||
},
|
|
||||||
];
|
|
||||||
|
|
||||||
const expectedDocuments = [
|
|
||||||
{
|
|
||||||
metadata: { sourceURL: "https://example.com/page" },
|
|
||||||
content:
|
|
||||||
"This is a test. ![alt text](https://example.com/image.png) Here is a link: [Example](https://example.com).",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
metadata: { sourceURL: "https://example.com/another-page" },
|
|
||||||
content:
|
|
||||||
"Another test. ![another alt text](https://example.com/another-image.png) Here is some **bold text**.",
|
|
||||||
},
|
|
||||||
];
|
|
||||||
|
|
||||||
const result =
|
|
||||||
webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
|
|
||||||
expect(result).toEqual(expectedDocuments);
|
|
||||||
});
|
|
||||||
it("should replace multiple image paths within the documents", () => {
|
|
||||||
const webScraperDataProvider = new WebScraperDataProvider();
|
|
||||||
const documents = [
|
|
||||||
{
|
|
||||||
metadata: { sourceURL: "https://example.com/page" },
|
|
||||||
content:
|
|
||||||
"This is a test. ![alt text](/image1.png) Here is a link: [Example](https://example.com). ![alt text](/image2.png)",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
metadata: { sourceURL: "https://example.com/another-page" },
|
|
||||||
content:
|
|
||||||
"Another test. ![another alt text](./another-image1.png) Here is some **bold text**. ![another alt text](./another-image2.png)",
|
|
||||||
},
|
|
||||||
];
|
|
||||||
|
|
||||||
const expectedDocuments = [
|
|
||||||
{
|
|
||||||
metadata: { sourceURL: "https://example.com/page" },
|
|
||||||
content:
|
|
||||||
"This is a test. ![alt text](https://example.com/image1.png) Here is a link: [Example](https://example.com). ![alt text](https://example.com/image2.png)",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
metadata: { sourceURL: "https://example.com/another-page" },
|
|
||||||
content:
|
|
||||||
"Another test. ![another alt text](https://example.com/another-image1.png) Here is some **bold text**. ![another alt text](https://example.com/another-image2.png)",
|
|
||||||
},
|
|
||||||
];
|
|
||||||
|
|
||||||
const result =
|
|
||||||
webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
|
|
||||||
expect(result).toEqual(expectedDocuments);
|
|
||||||
});
|
|
||||||
|
|
||||||
it("should replace image paths within the documents with complex URLs", () => {
|
|
||||||
const webScraperDataProvider = new WebScraperDataProvider();
|
|
||||||
const documents = [
|
|
||||||
{
|
|
||||||
metadata: { sourceURL: "https://example.com/page/subpage" },
|
|
||||||
content:
|
|
||||||
"This is a test. ![alt text](/image1.png) Here is a link: [Example](https://example.com). ![alt text](/sub/image2.png)",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
metadata: { sourceURL: "https://example.com/another-page/subpage" },
|
|
||||||
content:
|
|
||||||
"Another test. ![another alt text](/another-page/another-image1.png) Here is some **bold text**. ![another alt text](/another-page/sub/another-image2.png)",
|
|
||||||
},
|
|
||||||
];
|
|
||||||
|
|
||||||
const expectedDocuments = [
|
|
||||||
{
|
|
||||||
metadata: { sourceURL: "https://example.com/page/subpage" },
|
|
||||||
content:
|
|
||||||
"This is a test. ![alt text](https://example.com/image1.png) Here is a link: [Example](https://example.com). ![alt text](https://example.com/sub/image2.png)",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
metadata: { sourceURL: "https://example.com/another-page/subpage" },
|
|
||||||
content:
|
|
||||||
"Another test. ![another alt text](https://example.com/another-page/another-image1.png) Here is some **bold text**. ![another alt text](https://example.com/another-page/sub/another-image2.png)",
|
|
||||||
},
|
|
||||||
];
|
|
||||||
|
|
||||||
const result =
|
|
||||||
webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
|
|
||||||
expect(result).toEqual(expectedDocuments);
|
|
||||||
});
|
|
||||||
});
|
|
||||||
});
|
|
@ -6,6 +6,7 @@ import { WebCrawler } from "./crawler";
|
|||||||
import { getValue, setValue } from "../../services/redis";
|
import { getValue, setValue } from "../../services/redis";
|
||||||
import { getImageDescription } from "./utils/gptVision";
|
import { getImageDescription } from "./utils/gptVision";
|
||||||
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
|
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
|
||||||
|
import { replaceImgPathsWithAbsolutePaths, replacePathsWithAbsolutePaths } from "./utils/replacePaths";
|
||||||
|
|
||||||
|
|
||||||
export class WebScraperDataProvider {
|
export class WebScraperDataProvider {
|
||||||
@ -19,6 +20,7 @@ export class WebScraperDataProvider {
|
|||||||
private concurrentRequests: number = 20;
|
private concurrentRequests: number = 20;
|
||||||
private generateImgAltText: boolean = false;
|
private generateImgAltText: boolean = false;
|
||||||
private pageOptions?: PageOptions;
|
private pageOptions?: PageOptions;
|
||||||
|
private replaceAllPathsWithAbsolutePaths?: boolean = false;
|
||||||
|
|
||||||
authorize(): void {
|
authorize(): void {
|
||||||
throw new Error("Method not implemented.");
|
throw new Error("Method not implemented.");
|
||||||
@ -100,7 +102,13 @@ export class WebScraperDataProvider {
|
|||||||
|
|
||||||
let documents = await this.convertUrlsToDocuments(links, inProgress);
|
let documents = await this.convertUrlsToDocuments(links, inProgress);
|
||||||
documents = await this.getSitemapData(this.urls[0], documents);
|
documents = await this.getSitemapData(this.urls[0], documents);
|
||||||
documents = this.replaceImgPathsWithAbsolutePaths(documents);
|
|
||||||
|
if (this.replaceAllPathsWithAbsolutePaths) {
|
||||||
|
documents = replacePathsWithAbsolutePaths(documents);
|
||||||
|
} else {
|
||||||
|
documents = replaceImgPathsWithAbsolutePaths(documents);
|
||||||
|
}
|
||||||
|
|
||||||
if (this.generateImgAltText) {
|
if (this.generateImgAltText) {
|
||||||
documents = await this.generatesImgAltText(documents);
|
documents = await this.generatesImgAltText(documents);
|
||||||
}
|
}
|
||||||
@ -164,7 +172,13 @@ export class WebScraperDataProvider {
|
|||||||
this.urls.filter((link) => !link.endsWith(".pdf")),
|
this.urls.filter((link) => !link.endsWith(".pdf")),
|
||||||
inProgress
|
inProgress
|
||||||
);
|
);
|
||||||
documents = this.replaceImgPathsWithAbsolutePaths(documents);
|
|
||||||
|
if (this.replaceAllPathsWithAbsolutePaths) {
|
||||||
|
documents = replacePathsWithAbsolutePaths(documents);
|
||||||
|
} else {
|
||||||
|
documents = replaceImgPathsWithAbsolutePaths(documents);
|
||||||
|
}
|
||||||
|
|
||||||
if (this.generateImgAltText) {
|
if (this.generateImgAltText) {
|
||||||
documents = await this.generatesImgAltText(documents);
|
documents = await this.generatesImgAltText(documents);
|
||||||
}
|
}
|
||||||
@ -197,7 +211,13 @@ export class WebScraperDataProvider {
|
|||||||
);
|
);
|
||||||
|
|
||||||
documents = await this.getSitemapData(this.urls[0], documents);
|
documents = await this.getSitemapData(this.urls[0], documents);
|
||||||
documents = this.replaceImgPathsWithAbsolutePaths(documents);
|
|
||||||
|
if (this.replaceAllPathsWithAbsolutePaths) {
|
||||||
|
documents = replacePathsWithAbsolutePaths(documents);
|
||||||
|
} else {
|
||||||
|
documents = replaceImgPathsWithAbsolutePaths(documents);
|
||||||
|
}
|
||||||
|
|
||||||
if (this.generateImgAltText) {
|
if (this.generateImgAltText) {
|
||||||
documents = await this.generatesImgAltText(documents);
|
documents = await this.generatesImgAltText(documents);
|
||||||
}
|
}
|
||||||
@ -351,6 +371,7 @@ export class WebScraperDataProvider {
|
|||||||
this.generateImgAltText =
|
this.generateImgAltText =
|
||||||
options.crawlerOptions?.generateImgAltText ?? false;
|
options.crawlerOptions?.generateImgAltText ?? false;
|
||||||
this.pageOptions = options.pageOptions ?? {onlyMainContent: false};
|
this.pageOptions = options.pageOptions ?? {onlyMainContent: false};
|
||||||
|
this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
|
||||||
|
|
||||||
//! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find the source of the issue, so adding this check
|
//! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find the source of the issue, so adding this check
|
||||||
this.excludes = this.excludes.filter((item) => item !== "");
|
this.excludes = this.excludes.filter((item) => item !== "");
|
||||||
@ -436,40 +457,4 @@ export class WebScraperDataProvider {
|
|||||||
|
|
||||||
return documents;
|
return documents;
|
||||||
};
|
};
|
||||||
|
|
||||||
replaceImgPathsWithAbsolutePaths = (documents: Document[]): Document[] => {
|
|
||||||
try {
|
|
||||||
documents.forEach((document) => {
|
|
||||||
const baseUrl = new URL(document.metadata.sourceURL).origin;
|
|
||||||
const images =
|
|
||||||
document.content.match(
|
|
||||||
/!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g
|
|
||||||
) || [];
|
|
||||||
|
|
||||||
images.forEach((image: string) => {
|
|
||||||
let imageUrl = image.match(/\(([^)]+)\)/)[1];
|
|
||||||
let altText = image.match(/\[(.*?)\]/)[1];
|
|
||||||
|
|
||||||
if (!imageUrl.startsWith("data:image")) {
|
|
||||||
if (!imageUrl.startsWith("http")) {
|
|
||||||
if (imageUrl.startsWith("/")) {
|
|
||||||
imageUrl = imageUrl.substring(1);
|
|
||||||
}
|
|
||||||
imageUrl = new URL(imageUrl, baseUrl).toString();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
document.content = document.content.replace(
|
|
||||||
image,
|
|
||||||
`![${altText}](${imageUrl})`
|
|
||||||
);
|
|
||||||
});
|
|
||||||
});
|
|
||||||
|
|
||||||
return documents;
|
|
||||||
} catch (error) {
|
|
||||||
console.error("Error replacing img paths with absolute paths", error);
|
|
||||||
return documents;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
}
|
}
|
||||||
|
@ -1,40 +1,47 @@
|
|||||||
import * as pdfProcessor from '../pdfProcessor';
|
import * as pdfProcessor from '../pdfProcessor';
|
||||||
|
|
||||||
describe('PDF Processing Module - Integration Test', () => {
|
describe('PDF Processing Module - Integration Test', () => {
|
||||||
it('should download and read a simple PDF file by URL', async () => {
|
it('should correctly process a simple PDF file without the LLAMAPARSE_API_KEY', async () => {
|
||||||
|
delete process.env.LLAMAPARSE_API_KEY;
|
||||||
const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf');
|
const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf');
|
||||||
expect(pdfContent).toEqual("Dummy PDF file");
|
expect(pdfContent.trim()).toEqual("Dummy PDF file");
|
||||||
});
|
});
|
||||||
|
|
||||||
it('should download and read a complex PDF file by URL', async () => {
|
// We're hitting the LLAMAPARSE rate limit 🫠
|
||||||
const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://arxiv.org/pdf/2307.06435.pdf');
|
// it('should download and read a simple PDF file by URL', async () => {
|
||||||
|
// const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf');
|
||||||
|
// expect(pdfContent).toEqual("Dummy PDF file");
|
||||||
|
// });
|
||||||
|
|
||||||
const expectedContent = 'A Comprehensive Overview of Large Language Models\n' +
|
// it('should download and read a complex PDF file by URL', async () => {
|
||||||
' a a,∗ b,∗ c,d,∗ e,f e,f g,i\n' +
|
// const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://arxiv.org/pdf/2307.06435.pdf');
|
||||||
' Humza Naveed , Asad Ullah Khan , Shi Qiu , Muhammad Saqib , Saeed Anwar , Muhammad Usman , Naveed Akhtar ,\n' +
|
|
||||||
' Nick Barnes h, Ajmal Mian i\n' +
|
// const expectedContent = 'A Comprehensive Overview of Large Language Models\n' +
|
||||||
' aUniversity of Engineering and Technology (UET), Lahore, Pakistan\n' +
|
// ' a a,∗ b,∗ c,d,∗ e,f e,f g,i\n' +
|
||||||
' bThe Chinese University of Hong Kong (CUHK), HKSAR, China\n' +
|
// ' Humza Naveed , Asad Ullah Khan , Shi Qiu , Muhammad Saqib , Saeed Anwar , Muhammad Usman , Naveed Akhtar ,\n' +
|
||||||
' cUniversity of Technology Sydney (UTS), Sydney, Australia\n' +
|
// ' Nick Barnes h, Ajmal Mian i\n' +
|
||||||
' dCommonwealth Scientific and Industrial Research Organisation (CSIRO), Sydney, Australia\n' +
|
// ' aUniversity of Engineering and Technology (UET), Lahore, Pakistan\n' +
|
||||||
' eKing Fahd University of Petroleum and Minerals (KFUPM), Dhahran, Saudi Arabia\n' +
|
// ' bThe Chinese University of Hong Kong (CUHK), HKSAR, China\n' +
|
||||||
' fSDAIA-KFUPM Joint Research Center for Artificial Intelligence (JRCAI), Dhahran, Saudi Arabia\n' +
|
// ' cUniversity of Technology Sydney (UTS), Sydney, Australia\n' +
|
||||||
' gThe University of Melbourne (UoM), Melbourne, Australia\n' +
|
// ' dCommonwealth Scientific and Industrial Research Organisation (CSIRO), Sydney, Australia\n' +
|
||||||
' hAustralian National University (ANU), Canberra, Australia\n' +
|
// ' eKing Fahd University of Petroleum and Minerals (KFUPM), Dhahran, Saudi Arabia\n' +
|
||||||
' iThe University of Western Australia (UWA), Perth, Australia\n' +
|
// ' fSDAIA-KFUPM Joint Research Center for Artificial Intelligence (JRCAI), Dhahran, Saudi Arabia\n' +
|
||||||
' Abstract\n' +
|
// ' gThe University of Melbourne (UoM), Melbourne, Australia\n' +
|
||||||
' Large Language Models (LLMs) have recently demonstrated remarkable capabilities in natural language processing tasks and\n' +
|
// ' hAustralian National University (ANU), Canberra, Australia\n' +
|
||||||
' beyond. This success of LLMs has led to a large influx of research contributions in this direction. These works encompass diverse\n' +
|
// ' iThe University of Western Australia (UWA), Perth, Australia\n' +
|
||||||
' topics such as architectural innovations, better training strategies, context length improvements, fine-tuning, multi-modal LLMs,\n' +
|
// ' Abstract\n' +
|
||||||
' robotics, datasets, benchmarking, efficiency, and more. With the rapid development of techniques and regular breakthroughs in\n' +
|
// ' Large Language Models (LLMs) have recently demonstrated remarkable capabilities in natural language processing tasks and\n' +
|
||||||
' LLM research, it has become considerably challenging to perceive the bigger picture of the advances in this direction. Considering\n' +
|
// ' beyond. This success of LLMs has led to a large influx of research contributions in this direction. These works encompass diverse\n' +
|
||||||
' the rapidly emerging plethora of literature on LLMs, it is imperative that the research community is able to benefit from a concise\n' +
|
// ' topics such as architectural innovations, better training strategies, context length improvements, fine-tuning, multi-modal LLMs,\n' +
|
||||||
' yet comprehensive overview of the recent developments in this field. This article provides an overview of the existing literature\n' +
|
// ' robotics, datasets, benchmarking, efficiency, and more. With the rapid development of techniques and regular breakthroughs in\n' +
|
||||||
' on a broad range of LLM-related concepts. Our self-contained comprehensive overview of LLMs discusses relevant background\n' +
|
// ' LLM research, it has become considerably challenging to perceive the bigger picture of the advances in this direction. Considering\n' +
|
||||||
' concepts along with covering the advanced topics at the frontier of research in LLMs. This review article is intended to not only\n' +
|
// ' the rapidly emerging plethora of literature on LLMs, it is imperative that the research community is able to benefit from a concise\n' +
|
||||||
' provide a systematic survey but also a quick comprehensive reference for the researchers and practitioners to draw insights from\n' +
|
// ' yet comprehensive overview of the recent developments in this field. This article provides an overview of the existing literature\n' +
|
||||||
' extensive informative summaries of the existing works to advance the LLM research.\n'
|
// ' on a broad range of LLM-related concepts. Our self-contained comprehensive overview of LLMs discusses relevant background\n' +
|
||||||
expect(pdfContent).toContain(expectedContent);
|
// ' concepts along with covering the advanced topics at the frontier of research in LLMs. This review article is intended to not only\n' +
|
||||||
}, 60000);
|
// ' provide a systematic survey but also a quick comprehensive reference for the researchers and practitioners to draw insights from\n' +
|
||||||
|
// ' extensive informative summaries of the existing works to advance the LLM research.\n'
|
||||||
|
// expect(pdfContent).toContain(expectedContent);
|
||||||
|
// }, 60000);
|
||||||
|
|
||||||
});
|
});
|
@ -0,0 +1,114 @@
|
|||||||
|
import { Document } from "../../../../lib/entities";
|
||||||
|
import { replacePathsWithAbsolutePaths, replaceImgPathsWithAbsolutePaths } from "../replacePaths";
|
||||||
|
|
||||||
|
describe('replacePaths', () => {
  // Both functions under test mutate the documents they receive and return the same array.
  // "No change" cases therefore snapshot the input first; comparing the result with the
  // input itself would always pass.
  const snapshot = (documents: Document[]): Document[] =>
    documents.map((doc) => ({ ...doc }));

  describe('replacePathsWithAbsolutePaths', () => {
    it('should replace relative paths with absolute paths', () => {
      const documents: Document[] = [{
        metadata: { sourceURL: 'https://example.com' },
        content: 'This is a [link](/path/to/resource) and an image ![alt text](/path/to/image.jpg).'
      }];

      const expectedDocuments: Document[] = [{
        metadata: { sourceURL: 'https://example.com' },
        content: 'This is a [link](https://example.com/path/to/resource) and an image ![alt text](https://example.com/path/to/image.jpg).'
      }];

      const result = replacePathsWithAbsolutePaths(documents);
      expect(result).toEqual(expectedDocuments);
    });

    it('should not alter absolute URLs', () => {
      const documents: Document[] = [{
        metadata: { sourceURL: 'https://example.com' },
        content: 'This is an [external link](https://external.com/path) and an image ![alt text](https://example.com/path/to/image.jpg).'
      }];
      const expectedDocuments = snapshot(documents);

      const result = replacePathsWithAbsolutePaths(documents);
      expect(result).toEqual(expectedDocuments); // Expect no change
    });

    it('should not alter data URLs for images', () => {
      const documents: Document[] = [{
        metadata: { sourceURL: 'https://example.com' },
        // A real data URL is required here; an empty `()` target would not exercise the data: branch.
        content: 'This is an image: ![alt text](data:image/png;base64,ABC123==).'
      }];
      const expectedDocuments = snapshot(documents);

      const result = replacePathsWithAbsolutePaths(documents);
      expect(result).toEqual(expectedDocuments); // Expect no change
    });

    it('should handle multiple links and images correctly', () => {
      const documents: Document[] = [{
        metadata: { sourceURL: 'https://example.com' },
        content: 'Here are two links: [link1](/path1) and [link2](/path2), and two images: ![img1](/img1.jpg) ![img2](/img2.jpg).'
      }];

      const expectedDocuments: Document[] = [{
        metadata: { sourceURL: 'https://example.com' },
        content: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2), and two images: ![img1](https://example.com/img1.jpg) ![img2](https://example.com/img2.jpg).'
      }];

      const result = replacePathsWithAbsolutePaths(documents);
      expect(result).toEqual(expectedDocuments);
    });

    it('should correctly handle a mix of absolute and relative paths', () => {
      const documents: Document[] = [{
        metadata: { sourceURL: 'https://example.com' },
        content: 'Mixed paths: [relative](/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).'
      }];

      const expectedDocuments: Document[] = [{
        metadata: { sourceURL: 'https://example.com' },
        content: 'Mixed paths: [relative](https://example.com/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).'
      }];

      const result = replacePathsWithAbsolutePaths(documents);
      expect(result).toEqual(expectedDocuments);
    });
  });

  describe('replaceImgPathsWithAbsolutePaths', () => {
    it('should replace relative image paths with absolute paths', () => {
      const documents: Document[] = [{
        metadata: { sourceURL: 'https://example.com' },
        content: 'Here is an image: ![alt text](/path/to/image.jpg).'
      }];

      const expectedDocuments: Document[] = [{
        metadata: { sourceURL: 'https://example.com' },
        content: 'Here is an image: ![alt text](https://example.com/path/to/image.jpg).'
      }];

      const result = replaceImgPathsWithAbsolutePaths(documents);
      expect(result).toEqual(expectedDocuments);
    });

    it('should not alter data:image URLs', () => {
      const documents: Document[] = [{
        metadata: { sourceURL: 'https://example.com' },
        content: 'An image with a data URL: ![alt text](data:image/png;base64,ABC123==).'
      }];
      const expectedDocuments = snapshot(documents);

      const result = replaceImgPathsWithAbsolutePaths(documents);
      expect(result).toEqual(expectedDocuments); // Expect no change
    });

    it('should handle multiple images with a mix of data and relative URLs', () => {
      const documents: Document[] = [{
        metadata: { sourceURL: 'https://example.com' },
        content: 'Multiple images: ![img1](/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](/img3.jpg).'
      }];

      const expectedDocuments: Document[] = [{
        metadata: { sourceURL: 'https://example.com' },
        content: 'Multiple images: ![img1](https://example.com/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](https://example.com/img3.jpg).'
      }];

      const result = replaceImgPathsWithAbsolutePaths(documents);
      expect(result).toEqual(expectedDocuments);
    });
  });
});
|
80
apps/api/src/scraper/WebScraper/utils/replacePaths.ts
Normal file
80
apps/api/src/scraper/WebScraper/utils/replacePaths.ts
Normal file
@ -0,0 +1,80 @@
|
|||||||
|
import { Document } from "../../../lib/entities";
|
||||||
|
|
||||||
|
/**
 * Rewrites relative link and image targets in each document's content to absolute URLs.
 *
 * Handles markdown links (`[text](url)`), markdown images (`![alt](url)`) and HTML
 * `href="url"` attributes. Targets that are empty, start with `data:` or start with
 * `http` are left untouched. Everything else is resolved against the origin of the
 * document's `metadata.sourceURL`.
 *
 * The documents are mutated in place and the same array is returned. Failures are
 * logged and never thrown: a document whose `sourceURL` cannot be parsed is left
 * unchanged without affecting the other documents.
 *
 * @param documents - Documents whose `content` should be rewritten.
 * @returns The same `documents` array, with `content` updated where applicable.
 */
export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[] => {
  // Group 1: markdown label including brackets (and the leading `!` for images).
  // Group 2: markdown target (up to two levels of nested parentheses).
  // Group 3: value of an HTML href attribute.
  // The alternatives inside `(...)*` consume a single character each so that unbalanced
  // parentheses cannot trigger exponential backtracking; the matched text is the same.
  const pathPattern =
    /(!?\[.*?\])\(((?:[^()]|\((?:[^()]|\([^()]*\))*\))*)\)|href="([^"]+)"/g;

  // Resolves one target against the page origin, returning it unchanged when it is
  // empty, already absolute, a data URL, or cannot be parsed.
  const toAbsoluteUrl = (url: string, baseUrl: string): string => {
    if (url === "" || url.startsWith("data:") || url.startsWith("http")) {
      return url;
    }
    try {
      // No leading-slash stripping: against an origin, "/x" and "x" resolve identically,
      // and keeping the slashes lets protocol-relative "//host/x" resolve to its own host.
      return new URL(url, baseUrl).toString();
    } catch {
      return url;
    }
  };

  try {
    documents.forEach((document) => {
      try {
        const baseUrl = new URL(document.metadata.sourceURL).origin;

        // A replacer function is used (rather than a replacement string) so that `$`
        // sequences inside URLs are inserted literally, and so every match is rewritten
        // exactly where it was found.
        document.content = document.content.replace(
          pathPattern,
          (
            _match: string,
            label: string | undefined,
            markdownUrl: string | undefined,
            hrefUrl: string | undefined
          ): string => {
            if (hrefUrl !== undefined) {
              return `href="${toAbsoluteUrl(hrefUrl, baseUrl)}"`;
            }
            return `${label}(${toAbsoluteUrl(markdownUrl ?? "", baseUrl)})`;
          }
        );
      } catch (error) {
        console.error("Error replacing paths with absolute paths", error);
      }
    });

    return documents;
  } catch (error) {
    console.error("Error replacing paths with absolute paths", error);
    return documents;
  }
};
|
||||||
|
|
||||||
|
/**
 * Rewrites relative markdown image targets (`![alt](url)`) in each document's content
 * to absolute URLs. Regular links are not touched.
 *
 * Targets that are empty, start with `data:image` or start with `http` are left
 * untouched. Everything else is resolved against the origin of the document's
 * `metadata.sourceURL`.
 *
 * The documents are mutated in place and the same array is returned. Failures are
 * logged and never thrown: a document whose `sourceURL` cannot be parsed is left
 * unchanged without affecting the other documents.
 *
 * @param documents - Documents whose `content` should be rewritten.
 * @returns The same `documents` array, with `content` updated where applicable.
 */
export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Document[] => {
  // Group 1: alt text. Group 2: image target (up to two levels of nested parentheses).
  // The alternatives inside `(...)*` consume a single character each so that unbalanced
  // parentheses cannot trigger exponential backtracking; the matched text is the same.
  const imagePattern =
    /!\[(.*?)\]\(((?:[^()]|\((?:[^()]|\([^()]*\))*\))*)\)/g;

  // Resolves one image target against the page origin, returning it unchanged when it
  // is empty, already absolute, an inline data image, or cannot be parsed.
  const toAbsoluteUrl = (imageUrl: string, baseUrl: string): string => {
    if (
      imageUrl === "" ||
      imageUrl.startsWith("data:image") ||
      imageUrl.startsWith("http")
    ) {
      return imageUrl;
    }
    try {
      // No leading-slash stripping: against an origin, "/x" and "x" resolve identically,
      // and keeping the slashes lets protocol-relative "//host/x" resolve to its own host.
      return new URL(imageUrl, baseUrl).toString();
    } catch {
      return imageUrl;
    }
  };

  try {
    documents.forEach((document) => {
      try {
        const baseUrl = new URL(document.metadata.sourceURL).origin;

        // A replacer function is used (rather than a replacement string) so that `$`
        // sequences inside alt text or URLs are inserted literally, and the alt text
        // and URL come from the match's own capture groups.
        document.content = document.content.replace(
          imagePattern,
          (_match: string, altText: string, imageUrl: string): string =>
            `![${altText}](${toAbsoluteUrl(imageUrl, baseUrl)})`
        );
      } catch (error) {
        console.error("Error replacing img paths with absolute paths", error);
      }
    });

    return documents;
  } catch (error) {
    console.error("Error replacing img paths with absolute paths", error);
    return documents;
  }
};
|
@ -3,7 +3,6 @@ import { getWebScraperQueue } from "./queue-service";
|
|||||||
import "dotenv/config";
|
import "dotenv/config";
|
||||||
import { logtail } from "./logtail";
|
import { logtail } from "./logtail";
|
||||||
import { startWebScraperPipeline } from "../main/runWebScraper";
|
import { startWebScraperPipeline } from "../main/runWebScraper";
|
||||||
import { WebScraperDataProvider } from "../scraper/WebScraper";
|
|
||||||
import { callWebhook } from "./webhook";
|
import { callWebhook } from "./webhook";
|
||||||
|
|
||||||
getWebScraperQueue().process(
|
getWebScraperQueue().process(
|
||||||
|
@ -10,13 +10,26 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
|
|||||||
import axios from 'axios';
|
import axios from 'axios';
|
||||||
import dotenv from 'dotenv';
|
import dotenv from 'dotenv';
|
||||||
dotenv.config();
|
dotenv.config();
|
||||||
|
/**
|
||||||
|
* Main class for interacting with the Firecrawl API.
|
||||||
|
*/
|
||||||
export default class FirecrawlApp {
|
export default class FirecrawlApp {
|
||||||
|
/**
|
||||||
|
* Initializes a new instance of the FirecrawlApp class.
|
||||||
|
* @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance.
|
||||||
|
*/
|
||||||
constructor({ apiKey = null }) {
|
constructor({ apiKey = null }) {
|
||||||
this.apiKey = apiKey || process.env.FIRECRAWL_API_KEY || '';
|
this.apiKey = apiKey || process.env.FIRECRAWL_API_KEY || '';
|
||||||
if (!this.apiKey) {
|
if (!this.apiKey) {
|
||||||
throw new Error('No API key provided');
|
throw new Error('No API key provided');
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
/**
|
||||||
|
* Scrapes a URL using the Firecrawl API.
|
||||||
|
* @param {string} url - The URL to scrape.
|
||||||
|
* @param {Params | null} params - Additional parameters for the scrape request.
|
||||||
|
* @returns {Promise<ScrapeResponse>} The response from the scrape operation.
|
||||||
|
*/
|
||||||
scrapeUrl(url_1) {
|
scrapeUrl(url_1) {
|
||||||
return __awaiter(this, arguments, void 0, function* (url, params = null) {
|
return __awaiter(this, arguments, void 0, function* (url, params = null) {
|
||||||
const headers = {
|
const headers = {
|
||||||
@ -32,7 +45,7 @@ export default class FirecrawlApp {
|
|||||||
if (response.status === 200) {
|
if (response.status === 200) {
|
||||||
const responseData = response.data;
|
const responseData = response.data;
|
||||||
if (responseData.success) {
|
if (responseData.success) {
|
||||||
return responseData.data;
|
return responseData;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
throw new Error(`Failed to scrape URL. Error: ${responseData.error}`);
|
throw new Error(`Failed to scrape URL. Error: ${responseData.error}`);
|
||||||
@ -45,8 +58,17 @@ export default class FirecrawlApp {
|
|||||||
catch (error) {
|
catch (error) {
|
||||||
throw new Error(error.message);
|
throw new Error(error.message);
|
||||||
}
|
}
|
||||||
|
return { success: false, error: 'Internal server error.' };
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
/**
|
||||||
|
* Initiates a crawl job for a URL using the Firecrawl API.
|
||||||
|
* @param {string} url - The URL to crawl.
|
||||||
|
* @param {Params | null} params - Additional parameters for the crawl request.
|
||||||
|
* @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete.
|
||||||
|
* @param {number} timeout - Timeout in seconds for job status checks.
|
||||||
|
* @returns {Promise<CrawlResponse>} The response from the crawl operation.
|
||||||
|
*/
|
||||||
crawlUrl(url_1) {
|
crawlUrl(url_1) {
|
||||||
return __awaiter(this, arguments, void 0, function* (url, params = null, waitUntilDone = true, timeout = 2) {
|
return __awaiter(this, arguments, void 0, function* (url, params = null, waitUntilDone = true, timeout = 2) {
|
||||||
const headers = this.prepareHeaders();
|
const headers = this.prepareHeaders();
|
||||||
@ -62,7 +84,7 @@ export default class FirecrawlApp {
|
|||||||
return this.monitorJobStatus(jobId, headers, timeout);
|
return this.monitorJobStatus(jobId, headers, timeout);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
return { jobId };
|
return { success: true, jobId };
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
@ -73,8 +95,14 @@ export default class FirecrawlApp {
|
|||||||
console.log(error);
|
console.log(error);
|
||||||
throw new Error(error.message);
|
throw new Error(error.message);
|
||||||
}
|
}
|
||||||
|
return { success: false, error: 'Internal server error.' };
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
/**
|
||||||
|
* Checks the status of a crawl job using the Firecrawl API.
|
||||||
|
* @param {string} jobId - The job ID of the crawl operation.
|
||||||
|
* @returns {Promise<JobStatusResponse>} The response containing the job status.
|
||||||
|
*/
|
||||||
checkCrawlStatus(jobId) {
|
checkCrawlStatus(jobId) {
|
||||||
return __awaiter(this, void 0, void 0, function* () {
|
return __awaiter(this, void 0, void 0, function* () {
|
||||||
const headers = this.prepareHeaders();
|
const headers = this.prepareHeaders();
|
||||||
@ -90,20 +118,45 @@ export default class FirecrawlApp {
|
|||||||
catch (error) {
|
catch (error) {
|
||||||
throw new Error(error.message);
|
throw new Error(error.message);
|
||||||
}
|
}
|
||||||
|
return { success: false, status: 'unknown', error: 'Internal server error.' };
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
/**
|
||||||
|
* Prepares the headers for an API request.
|
||||||
|
* @returns {AxiosRequestHeaders} The prepared headers.
|
||||||
|
*/
|
||||||
prepareHeaders() {
|
prepareHeaders() {
|
||||||
return {
|
return {
|
||||||
'Content-Type': 'application/json',
|
'Content-Type': 'application/json',
|
||||||
'Authorization': `Bearer ${this.apiKey}`,
|
'Authorization': `Bearer ${this.apiKey}`,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
/**
|
||||||
|
* Sends a POST request to the specified URL.
|
||||||
|
* @param {string} url - The URL to send the request to.
|
||||||
|
* @param {Params} data - The data to send in the request.
|
||||||
|
* @param {AxiosRequestHeaders} headers - The headers for the request.
|
||||||
|
* @returns {Promise<AxiosResponse>} The response from the POST request.
|
||||||
|
*/
|
||||||
postRequest(url, data, headers) {
|
postRequest(url, data, headers) {
|
||||||
return axios.post(url, data, { headers });
|
return axios.post(url, data, { headers });
|
||||||
}
|
}
|
||||||
|
/**
|
||||||
|
* Sends a GET request to the specified URL.
|
||||||
|
* @param {string} url - The URL to send the request to.
|
||||||
|
* @param {AxiosRequestHeaders} headers - The headers for the request.
|
||||||
|
* @returns {Promise<AxiosResponse>} The response from the GET request.
|
||||||
|
*/
|
||||||
getRequest(url, headers) {
|
getRequest(url, headers) {
|
||||||
return axios.get(url, { headers });
|
return axios.get(url, { headers });
|
||||||
}
|
}
|
||||||
|
/**
|
||||||
|
* Monitors the status of a crawl job until completion or failure.
|
||||||
|
* @param {string} jobId - The job ID of the crawl operation.
|
||||||
|
* @param {AxiosRequestHeaders} headers - The headers for the request.
|
||||||
|
* @param {number} timeout - Timeout in seconds for job status checks.
|
||||||
|
* @returns {Promise<any>} The final job status or data.
|
||||||
|
*/
|
||||||
monitorJobStatus(jobId, headers, timeout) {
|
monitorJobStatus(jobId, headers, timeout) {
|
||||||
return __awaiter(this, void 0, void 0, function* () {
|
return __awaiter(this, void 0, void 0, function* () {
|
||||||
while (true) {
|
while (true) {
|
||||||
@ -134,6 +187,11 @@ export default class FirecrawlApp {
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
/**
|
||||||
|
* Handles errors from API responses.
|
||||||
|
* @param {AxiosResponse} response - The response from the API.
|
||||||
|
* @param {string} action - The action being performed when the error occurred.
|
||||||
|
*/
|
||||||
handleError(response, action) {
|
handleError(response, action) {
|
||||||
if ([402, 409, 500].includes(response.status)) {
|
if ([402, 409, 500].includes(response.status)) {
|
||||||
const errorMessage = response.data.error || 'Unknown error occurred';
|
const errorMessage = response.data.error || 'Unknown error occurred';
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@mendable/firecrawl-js",
|
"name": "@mendable/firecrawl-js",
|
||||||
"version": "0.0.9",
|
"version": "0.0.10",
|
||||||
"description": "JavaScript SDK for Firecrawl API",
|
"description": "JavaScript SDK for Firecrawl API",
|
||||||
"main": "build/index.js",
|
"main": "build/index.js",
|
||||||
"type": "module",
|
"type": "module",
|
||||||
|
@ -2,17 +2,60 @@ import axios, { AxiosResponse, AxiosRequestHeaders } from 'axios';
|
|||||||
import dotenv from 'dotenv';
|
import dotenv from 'dotenv';
|
||||||
dotenv.config();
|
dotenv.config();
|
||||||
|
|
||||||
interface FirecrawlAppConfig {
|
/**
|
||||||
|
* Configuration interface for FirecrawlApp.
|
||||||
|
*/
|
||||||
|
export interface FirecrawlAppConfig {
|
||||||
apiKey?: string | null;
|
apiKey?: string | null;
|
||||||
}
|
}
|
||||||
|
|
||||||
interface Params {
|
/**
|
||||||
|
* Generic parameter interface.
|
||||||
|
*/
|
||||||
|
export interface Params {
|
||||||
[key: string]: any;
|
[key: string]: any;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Response interface for scraping operations.
|
||||||
|
*/
|
||||||
|
export interface ScrapeResponse {
|
||||||
|
success: boolean;
|
||||||
|
data?: any;
|
||||||
|
error?: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Response interface for crawling operations.
|
||||||
|
*/
|
||||||
|
export interface CrawlResponse {
|
||||||
|
success: boolean;
|
||||||
|
jobId?: string;
|
||||||
|
data?: any;
|
||||||
|
error?: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Response interface for job status checks.
|
||||||
|
*/
|
||||||
|
export interface JobStatusResponse {
|
||||||
|
success: boolean;
|
||||||
|
status: string;
|
||||||
|
jobId?: string;
|
||||||
|
data?: any;
|
||||||
|
error?: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Main class for interacting with the Firecrawl API.
|
||||||
|
*/
|
||||||
export default class FirecrawlApp {
|
export default class FirecrawlApp {
|
||||||
private apiKey: string;
|
private apiKey: string;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Initializes a new instance of the FirecrawlApp class.
|
||||||
|
* @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance.
|
||||||
|
*/
|
||||||
constructor({ apiKey = null }: FirecrawlAppConfig) {
|
constructor({ apiKey = null }: FirecrawlAppConfig) {
|
||||||
this.apiKey = apiKey || process.env.FIRECRAWL_API_KEY || '';
|
this.apiKey = apiKey || process.env.FIRECRAWL_API_KEY || '';
|
||||||
if (!this.apiKey) {
|
if (!this.apiKey) {
|
||||||
@ -20,7 +63,13 @@ export default class FirecrawlApp {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async scrapeUrl(url: string, params: Params | null = null): Promise<any> {
|
/**
|
||||||
|
* Scrapes a URL using the Firecrawl API.
|
||||||
|
* @param {string} url - The URL to scrape.
|
||||||
|
* @param {Params | null} params - Additional parameters for the scrape request.
|
||||||
|
* @returns {Promise<ScrapeResponse>} The response from the scrape operation.
|
||||||
|
*/
|
||||||
|
async scrapeUrl(url: string, params: Params | null = null): Promise<ScrapeResponse> {
|
||||||
const headers: AxiosRequestHeaders = {
|
const headers: AxiosRequestHeaders = {
|
||||||
'Content-Type': 'application/json',
|
'Content-Type': 'application/json',
|
||||||
'Authorization': `Bearer ${this.apiKey}`,
|
'Authorization': `Bearer ${this.apiKey}`,
|
||||||
@ -34,7 +83,7 @@ export default class FirecrawlApp {
|
|||||||
if (response.status === 200) {
|
if (response.status === 200) {
|
||||||
const responseData = response.data;
|
const responseData = response.data;
|
||||||
if (responseData.success) {
|
if (responseData.success) {
|
||||||
return responseData.data;
|
return responseData;
|
||||||
} else {
|
} else {
|
||||||
throw new Error(`Failed to scrape URL. Error: ${responseData.error}`);
|
throw new Error(`Failed to scrape URL. Error: ${responseData.error}`);
|
||||||
}
|
}
|
||||||
@ -44,9 +93,18 @@ export default class FirecrawlApp {
|
|||||||
} catch (error: any) {
|
} catch (error: any) {
|
||||||
throw new Error(error.message);
|
throw new Error(error.message);
|
||||||
}
|
}
|
||||||
|
return { success: false, error: 'Internal server error.' };
|
||||||
}
|
}
|
||||||
|
|
||||||
async crawlUrl(url: string, params: Params | null = null, waitUntilDone: boolean = true, timeout: number = 2): Promise<any> {
|
/**
|
||||||
|
* Initiates a crawl job for a URL using the Firecrawl API.
|
||||||
|
* @param {string} url - The URL to crawl.
|
||||||
|
* @param {Params | null} params - Additional parameters for the crawl request.
|
||||||
|
* @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete.
|
||||||
|
* @param {number} timeout - Timeout in seconds for job status checks.
|
||||||
|
* @returns {Promise<CrawlResponse>} The response from the crawl operation.
|
||||||
|
*/
|
||||||
|
async crawlUrl(url: string, params: Params | null = null, waitUntilDone: boolean = true, timeout: number = 2): Promise<CrawlResponse> {
|
||||||
const headers = this.prepareHeaders();
|
const headers = this.prepareHeaders();
|
||||||
let jsonData: Params = { url };
|
let jsonData: Params = { url };
|
||||||
if (params) {
|
if (params) {
|
||||||
@ -59,7 +117,7 @@ export default class FirecrawlApp {
|
|||||||
if (waitUntilDone) {
|
if (waitUntilDone) {
|
||||||
return this.monitorJobStatus(jobId, headers, timeout);
|
return this.monitorJobStatus(jobId, headers, timeout);
|
||||||
} else {
|
} else {
|
||||||
return { jobId };
|
return { success: true, jobId };
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
this.handleError(response, 'start crawl job');
|
this.handleError(response, 'start crawl job');
|
||||||
@ -68,9 +126,15 @@ export default class FirecrawlApp {
|
|||||||
console.log(error)
|
console.log(error)
|
||||||
throw new Error(error.message);
|
throw new Error(error.message);
|
||||||
}
|
}
|
||||||
|
return { success: false, error: 'Internal server error.' };
|
||||||
}
|
}
|
||||||
|
|
||||||
async checkCrawlStatus(jobId: string): Promise<any> {
|
/**
|
||||||
|
* Checks the status of a crawl job using the Firecrawl API.
|
||||||
|
* @param {string} jobId - The job ID of the crawl operation.
|
||||||
|
* @returns {Promise<JobStatusResponse>} The response containing the job status.
|
||||||
|
*/
|
||||||
|
async checkCrawlStatus(jobId: string): Promise<JobStatusResponse> {
|
||||||
const headers: AxiosRequestHeaders = this.prepareHeaders();
|
const headers: AxiosRequestHeaders = this.prepareHeaders();
|
||||||
try {
|
try {
|
||||||
const response: AxiosResponse = await this.getRequest(`https://api.firecrawl.dev/v0/crawl/status/${jobId}`, headers);
|
const response: AxiosResponse = await this.getRequest(`https://api.firecrawl.dev/v0/crawl/status/${jobId}`, headers);
|
||||||
@ -82,8 +146,13 @@ export default class FirecrawlApp {
|
|||||||
} catch (error: any) {
|
} catch (error: any) {
|
||||||
throw new Error(error.message);
|
throw new Error(error.message);
|
||||||
}
|
}
|
||||||
|
return { success: false, status: 'unknown', error: 'Internal server error.' };
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Prepares the headers for an API request.
|
||||||
|
* @returns {AxiosRequestHeaders} The prepared headers.
|
||||||
|
*/
|
||||||
prepareHeaders(): AxiosRequestHeaders {
|
prepareHeaders(): AxiosRequestHeaders {
|
||||||
return {
|
return {
|
||||||
'Content-Type': 'application/json',
|
'Content-Type': 'application/json',
|
||||||
@ -91,14 +160,34 @@ export default class FirecrawlApp {
|
|||||||
} as AxiosRequestHeaders;
|
} as AxiosRequestHeaders;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sends a POST request to the specified URL.
|
||||||
|
* @param {string} url - The URL to send the request to.
|
||||||
|
* @param {Params} data - The data to send in the request.
|
||||||
|
* @param {AxiosRequestHeaders} headers - The headers for the request.
|
||||||
|
* @returns {Promise<AxiosResponse>} The response from the POST request.
|
||||||
|
*/
|
||||||
postRequest(url: string, data: Params, headers: AxiosRequestHeaders): Promise<AxiosResponse> {
|
postRequest(url: string, data: Params, headers: AxiosRequestHeaders): Promise<AxiosResponse> {
|
||||||
return axios.post(url, data, { headers });
|
return axios.post(url, data, { headers });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sends a GET request to the specified URL.
|
||||||
|
* @param {string} url - The URL to send the request to.
|
||||||
|
* @param {AxiosRequestHeaders} headers - The headers for the request.
|
||||||
|
* @returns {Promise<AxiosResponse>} The response from the GET request.
|
||||||
|
*/
|
||||||
getRequest(url: string, headers: AxiosRequestHeaders): Promise<AxiosResponse> {
|
getRequest(url: string, headers: AxiosRequestHeaders): Promise<AxiosResponse> {
|
||||||
return axios.get(url, { headers });
|
return axios.get(url, { headers });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Monitors the status of a crawl job until completion or failure.
|
||||||
|
* @param {string} jobId - The job ID of the crawl operation.
|
||||||
|
* @param {AxiosRequestHeaders} headers - The headers for the request.
|
||||||
|
* @param {number} timeout - Timeout in seconds for job status checks.
|
||||||
|
* @returns {Promise<any>} The final job status or data.
|
||||||
|
*/
|
||||||
async monitorJobStatus(jobId: string, headers: AxiosRequestHeaders, timeout: number): Promise<any> {
|
async monitorJobStatus(jobId: string, headers: AxiosRequestHeaders, timeout: number): Promise<any> {
|
||||||
while (true) {
|
while (true) {
|
||||||
const statusResponse: AxiosResponse = await this.getRequest(`https://api.firecrawl.dev/v0/crawl/status/${jobId}`, headers);
|
const statusResponse: AxiosResponse = await this.getRequest(`https://api.firecrawl.dev/v0/crawl/status/${jobId}`, headers);
|
||||||
@ -124,6 +213,11 @@ export default class FirecrawlApp {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Handles errors from API responses.
|
||||||
|
* @param {AxiosResponse} response - The response from the API.
|
||||||
|
* @param {string} action - The action being performed when the error occurred.
|
||||||
|
*/
|
||||||
handleError(response: AxiosResponse, action: string): void {
|
handleError(response: AxiosResponse, action: string): void {
|
||||||
if ([402, 409, 500].includes(response.status)) {
|
if ([402, 409, 500].includes(response.status)) {
|
||||||
const errorMessage: string = response.data.error || 'Unknown error occurred';
|
const errorMessage: string = response.data.error || 'Unknown error occurred';
|
||||||
|
Loading…
Reference in New Issue
Block a user