Added metadata.pageStatusCode and metadata.pageError properties to the responses
This commit is contained in:
parent
d48c0df6c5
commit
bb859ae9a7
@ -492,7 +492,7 @@
|
||||
"html": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Raw HTML content of the page if `includeHtml` is true"
|
||||
"description": "Raw HTML content of the page if `includeHtml` is true"
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
@ -507,9 +507,126 @@
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"keywords": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"robots": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"ogTitle": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"ogDescription": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"ogUrl": {
|
||||
"type": "string",
|
||||
"format": "uri",
|
||||
"nullable": true
|
||||
},
|
||||
"ogImage": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"ogAudio": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"ogDeterminer": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"ogLocale": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"ogLocaleAlternate": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"nullable": true
|
||||
},
|
||||
"ogSiteName": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"ogVideo": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"dctermsCreated": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"dcDateCreated": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"dcDate": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"dctermsType": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"dcType": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"dctermsAudience": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"dctermsSubject": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"dcSubject": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"dcDescription": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"dctermsKeywords": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"modifiedTime": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"publishedTime": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"articleTag": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"articleSection": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"sourceURL": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
},
|
||||
"pageStatusCode": {
|
||||
"type": "integer",
|
||||
"description": "The status code of the page"
|
||||
},
|
||||
"pageError": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "The error message of the page"
|
||||
}
|
||||
}
|
||||
},
|
||||
@ -558,9 +675,126 @@
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"keywords": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"robots": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"ogTitle": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"ogDescription": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"ogUrl": {
|
||||
"type": "string",
|
||||
"format": "uri",
|
||||
"nullable": true
|
||||
},
|
||||
"ogImage": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"ogAudio": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"ogDeterminer": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"ogLocale": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"ogLocaleAlternate": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"nullable": true
|
||||
},
|
||||
"ogSiteName": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"ogVideo": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"dctermsCreated": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"dcDateCreated": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"dcDate": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"dctermsType": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"dcType": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"dctermsAudience": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"dctermsSubject": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"dcSubject": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"dcDescription": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"dctermsKeywords": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"modifiedTime": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"publishedTime": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"articleTag": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"articleSection": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"sourceURL": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
},
|
||||
"pageStatusCode": {
|
||||
"type": "integer",
|
||||
"description": "The status code of the page"
|
||||
},
|
||||
"pageError": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "The error message of the page"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -83,6 +83,8 @@ describe("E2E Tests for API Routes", () => {
|
||||
expect(response.body.data).toHaveProperty("metadata");
|
||||
expect(response.body.data).not.toHaveProperty("html");
|
||||
expect(response.body.data.content).toContain("_Roast_");
|
||||
expect(response.body.data.metadata.pageStatusCode).toBe(200);
|
||||
expect(response.body.data.metadata.pageError).toBeUndefined();
|
||||
}, 30000); // 30 seconds timeout
|
||||
|
||||
it.concurrent("should return a successful response with a valid API key and includeHtml set to true", async () => {
|
||||
@ -103,6 +105,8 @@ describe("E2E Tests for API Routes", () => {
|
||||
expect(response.body.data.content).toContain("_Roast_");
|
||||
expect(response.body.data.markdown).toContain("_Roast_");
|
||||
expect(response.body.data.html).toContain("<h1");
|
||||
expect(response.body.data.metadata.pageStatusCode).toBe(200);
|
||||
expect(response.body.data.metadata.pageError).toBeUndefined();
|
||||
}, 30000); // 30 seconds timeout
|
||||
|
||||
it.concurrent('should return a successful response for a valid scrape with PDF file', async () => {
|
||||
@ -118,6 +122,8 @@ describe("E2E Tests for API Routes", () => {
|
||||
expect(response.body.data).toHaveProperty('content');
|
||||
expect(response.body.data).toHaveProperty('metadata');
|
||||
expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
|
||||
expect(response.body.data.metadata.pageStatusCode).toBe(200);
|
||||
expect(response.body.data.metadata.pageError).toBeUndefined();
|
||||
}, 60000); // 60 seconds
|
||||
|
||||
it.concurrent('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => {
|
||||
@ -133,6 +139,8 @@ describe("E2E Tests for API Routes", () => {
|
||||
expect(response.body.data).toHaveProperty('content');
|
||||
expect(response.body.data).toHaveProperty('metadata');
|
||||
expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
|
||||
expect(response.body.data.metadata.pageStatusCode).toBe(200);
|
||||
expect(response.body.data.metadata.pageError).toBeUndefined();
|
||||
}, 60000); // 60 seconds
|
||||
|
||||
// TODO: add this test back once we nail the waitFor option to be more deterministic
|
||||
@ -155,6 +163,23 @@ describe("E2E Tests for API Routes", () => {
|
||||
// expect(response.body.data.content).toContain("🔥 Firecrawl");
|
||||
// expect(duration).toBeGreaterThanOrEqual(7000);
|
||||
// }, 12000); // 12 seconds timeout
|
||||
|
||||
it.concurrent('should return a successful response for a scrape with 404 page', async () => {
|
||||
const response = await request(TEST_URL)
|
||||
.post('/v0/scrape')
|
||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set('Content-Type', 'application/json')
|
||||
.send({ url: 'https://mendable.ai/alshdiasuhdasd' });
|
||||
await new Promise((r) => setTimeout(r, 6000));
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty('data');
|
||||
expect(response.body.data).toHaveProperty('content');
|
||||
expect(response.body.data).toHaveProperty('metadata');
|
||||
expect(response.body.data.content).toContain('Mendable');
|
||||
expect(response.body.data.metadata.pageStatusCode).toBe(404);
|
||||
expect(response.body.data.metadata.pageError).toBe("Not Found");
|
||||
}, 60000); // 60 seconds
|
||||
});
|
||||
|
||||
describe("POST /v0/crawl", () => {
|
||||
@ -270,6 +295,8 @@ describe("E2E Tests for API Routes", () => {
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||
expect(completedResponse.body.data[0].content).toContain("Mendable");
|
||||
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
|
||||
expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
|
||||
}, 60000); // 60 seconds
|
||||
|
||||
it.concurrent("should return a successful response with a valid API key and valid excludes option", async () => {
|
||||
@ -351,6 +378,8 @@ describe("E2E Tests for API Routes", () => {
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||
expect(completedResponse.body.data[0].content).toContain("Mendable");
|
||||
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
|
||||
expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
|
||||
}, 60000); // 60 seconds
|
||||
|
||||
it.concurrent("should return a successful response with max depth option for a valid crawl job", async () => {
|
||||
@ -383,6 +412,8 @@ describe("E2E Tests for API Routes", () => {
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("content");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
|
||||
expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
|
||||
const urls = completedResponse.body.data.map(
|
||||
(item: any) => item.metadata?.sourceURL
|
||||
);
|
||||
@ -481,6 +512,8 @@ describe("E2E Tests for API Routes", () => {
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("content");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
|
||||
expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
|
||||
|
||||
// 120 seconds
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("html");
|
||||
@ -488,6 +521,8 @@ describe("E2E Tests for API Routes", () => {
|
||||
expect(completedResponse.body.data[0].content).toContain("_Roast_");
|
||||
expect(completedResponse.body.data[0].markdown).toContain("_Roast_");
|
||||
expect(completedResponse.body.data[0].html).toContain("<h1");
|
||||
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
|
||||
expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
|
||||
}, 60000);
|
||||
});
|
||||
|
||||
@ -622,6 +657,8 @@ describe("E2E Tests for API Routes", () => {
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||
expect(completedResponse.body.data[0].content).toContain("Mendable");
|
||||
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
|
||||
expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
|
||||
|
||||
const childrenLinks = completedResponse.body.data.filter(doc =>
|
||||
doc.metadata && doc.metadata.sourceURL && doc.metadata.sourceURL.includes("mendable.ai/blog")
|
||||
@ -665,6 +702,9 @@ describe("E2E Tests for API Routes", () => {
|
||||
})
|
||||
])
|
||||
);
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
|
||||
expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
|
||||
}, 120000); // 120 seconds
|
||||
|
||||
it.concurrent("should return a successful response with max depth option for a valid crawl job", async () => {
|
||||
@ -700,6 +740,9 @@ describe("E2E Tests for API Routes", () => {
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("content");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
|
||||
expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
|
||||
|
||||
const urls = completedResponse.body.data.map(
|
||||
(item: any) => item.metadata?.sourceURL
|
||||
);
|
||||
@ -759,6 +802,8 @@ describe("E2E Tests for API Routes", () => {
|
||||
expect(completedResponse.body.data[0].content).toContain("_Roast_");
|
||||
expect(completedResponse.body.data[0].markdown).toContain("_Roast_");
|
||||
expect(completedResponse.body.data[0].html).toContain("<h1");
|
||||
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
|
||||
expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
|
||||
}, 60000);
|
||||
}); // 60 seconds
|
||||
|
||||
@ -802,6 +847,8 @@ describe("E2E Tests for API Routes", () => {
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("html");
|
||||
expect(completedResponse.body.data[0].content).toContain("Mendable");
|
||||
expect(completedResponse.body.data[0].markdown).toContain("Mendable");
|
||||
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
|
||||
expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
|
||||
|
||||
const onlyChildrenLinks = completedResponse.body.data.filter(doc => {
|
||||
return doc.metadata && doc.metadata.sourceURL && doc.metadata.sourceURL.includes("mendable.ai/blog")
|
||||
@ -842,7 +889,8 @@ describe("E2E Tests for API Routes", () => {
|
||||
expect(completedResponse.body.partial_data[0]).toHaveProperty("content");
|
||||
expect(completedResponse.body.partial_data[0]).toHaveProperty("markdown");
|
||||
expect(completedResponse.body.partial_data[0]).toHaveProperty("metadata");
|
||||
|
||||
expect(completedResponse.body.partial_data[0].metadata.pageStatusCode).toBe(200);
|
||||
expect(completedResponse.body.partial_data[0].metadata.pageError).toBeUndefined();
|
||||
}, 60000); // 60 seconds
|
||||
|
||||
describe("POST /v0/scrape with LLM Extraction", () => {
|
||||
@ -997,6 +1045,10 @@ describe("E2E Tests for API Routes", () => {
|
||||
expect(statusResponse.body).toHaveProperty("data");
|
||||
expect(statusResponse.body.data[0]).toHaveProperty("content");
|
||||
expect(statusResponse.body.data[0]).toHaveProperty("markdown");
|
||||
expect(statusResponse.body.data[0]).toHaveProperty("metadata");
|
||||
expect(statusResponse.body.data[0].metadata.pageStatusCode).toBe(200);
|
||||
expect(statusResponse.body.data[0].metadata.pageError).toBeUndefined();
|
||||
|
||||
const results = statusResponse.body.data;
|
||||
// results.forEach((result, i) => {
|
||||
// console.log(result.metadata.sourceURL);
|
||||
|
@ -119,4 +119,7 @@ export class SearchResult {
|
||||
export interface FireEngineResponse {
|
||||
html: string;
|
||||
screenshot: string;
|
||||
}
|
||||
pageStatusCode?: number;
|
||||
pageError?: string;
|
||||
}
|
||||
|
||||
|
@ -224,7 +224,7 @@ export class WebCrawler {
|
||||
return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
|
||||
}
|
||||
|
||||
async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string}[]> {
|
||||
async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string, pageStatusCode?: number, pageError?: string}[]> {
|
||||
const normalizedUrl = this.normalizeCrawlUrl(url);
|
||||
if (this.visited.has(normalizedUrl) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
|
||||
return [];
|
||||
@ -244,20 +244,27 @@ export class WebCrawler {
|
||||
|
||||
try {
|
||||
let content: string = "";
|
||||
let pageStatusCode: number;
|
||||
let pageError: string | undefined = undefined;
|
||||
|
||||
// If it is the first link, fetch with single url
|
||||
if (this.visited.size === 1) {
|
||||
const page = await scrapSingleUrl(url, { ...pageOptions, includeHtml: true });
|
||||
content = page.html ?? "";
|
||||
pageStatusCode = page.metadata?.pageStatusCode;
|
||||
pageError = page.metadata?.pageError || undefined;
|
||||
} else {
|
||||
const response = await axios.get(url);
|
||||
content = response.data ?? "";
|
||||
pageStatusCode = response.status;
|
||||
pageError = response.statusText != "OK" ? response.statusText : undefined;
|
||||
}
|
||||
const $ = load(content);
|
||||
let links: { url: string, html: string }[] = [];
|
||||
let links: { url: string, html: string, pageStatusCode?: number, pageError?: string }[] = [];
|
||||
|
||||
// Add the initial URL to the list of links
|
||||
if (this.visited.size === 1) {
|
||||
links.push({ url, html: content });
|
||||
links.push({ url, html: content, pageStatusCode, pageError });
|
||||
}
|
||||
|
||||
$("a").each((_, element) => {
|
||||
@ -279,7 +286,7 @@ export class WebCrawler {
|
||||
!this.matchesExcludes(path) &&
|
||||
this.robots.isAllowed(fullUrl, "FireCrawlAgent")
|
||||
) {
|
||||
links.push({ url: fullUrl, html: content });
|
||||
links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
|
||||
}
|
||||
}
|
||||
});
|
||||
|
@ -241,7 +241,7 @@ export class WebScraperDataProvider {
|
||||
content: "",
|
||||
html: this.pageOptions?.includeHtml ? "" : undefined,
|
||||
markdown: "",
|
||||
metadata: { sourceURL: url },
|
||||
metadata: { sourceURL: url, pageStatusCode: 200 },
|
||||
}));
|
||||
}
|
||||
|
||||
@ -280,10 +280,10 @@ export class WebScraperDataProvider {
|
||||
private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
|
||||
return Promise.all(
|
||||
pdfLinks.map(async (pdfLink) => {
|
||||
const pdfContent = await fetchAndProcessPdf(pdfLink);
|
||||
const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(pdfLink);
|
||||
return {
|
||||
content: pdfContent,
|
||||
metadata: { sourceURL: pdfLink },
|
||||
content: content,
|
||||
metadata: { sourceURL: pdfLink, pageStatusCode, pageError },
|
||||
provider: "web-scraper",
|
||||
};
|
||||
})
|
||||
@ -292,10 +292,10 @@ export class WebScraperDataProvider {
|
||||
private async fetchDocxDocuments(docxLinks: string[]): Promise<Document[]> {
|
||||
return Promise.all(
|
||||
docxLinks.map(async (p) => {
|
||||
const docXDocument = await fetchAndProcessDocx(p);
|
||||
const { content, pageStatusCode, pageError } = await fetchAndProcessDocx(p);
|
||||
return {
|
||||
content: docXDocument,
|
||||
metadata: { sourceURL: p },
|
||||
content,
|
||||
metadata: { sourceURL: p, pageStatusCode, pageError },
|
||||
provider: "web-scraper",
|
||||
};
|
||||
})
|
||||
|
@ -88,12 +88,13 @@ export async function scrapWithFireEngine(
|
||||
|
||||
const contentType = response.headers["content-type"];
|
||||
if (contentType && contentType.includes("application/pdf")) {
|
||||
return { html: await fetchAndProcessPdf(url), screenshot: "" };
|
||||
const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url);
|
||||
return { html: content, screenshot: "", pageStatusCode, pageError };
|
||||
} else {
|
||||
const data = response.data;
|
||||
const html = data.content;
|
||||
const screenshot = data.screenshot;
|
||||
return { html: html ?? "", screenshot: screenshot ?? "" };
|
||||
return { html: html ?? "", screenshot: screenshot ?? "", pageStatusCode: data.pageStatusCode, pageError: data.error };
|
||||
}
|
||||
} catch (error) {
|
||||
if (error.code === 'ECONNABORTED') {
|
||||
@ -109,35 +110,39 @@ export async function scrapWithScrapingBee(
|
||||
url: string,
|
||||
wait_browser: string = "domcontentloaded",
|
||||
timeout: number = universalTimeout
|
||||
): Promise<string> {
|
||||
): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
|
||||
try {
|
||||
const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
|
||||
const clientParams = await generateRequestParams(
|
||||
url,
|
||||
wait_browser,
|
||||
timeout
|
||||
timeout,
|
||||
);
|
||||
|
||||
const response = await client.get(clientParams);
|
||||
|
||||
if (response.status !== 200 && response.status !== 404) {
|
||||
console.error(
|
||||
`[ScrapingBee] Error fetching url: ${url} with status code ${response.status}`
|
||||
);
|
||||
return "";
|
||||
}
|
||||
const response = await client.get({
|
||||
...clientParams,
|
||||
params: {
|
||||
...clientParams.params,
|
||||
'transparent_status_code': 'True'
|
||||
}
|
||||
});
|
||||
|
||||
const contentType = response.headers["content-type"];
|
||||
if (contentType && contentType.includes("application/pdf")) {
|
||||
return fetchAndProcessPdf(url);
|
||||
return await fetchAndProcessPdf(url);
|
||||
} else {
|
||||
const decoder = new TextDecoder();
|
||||
const text = decoder.decode(response.data);
|
||||
return text;
|
||||
let text = "";
|
||||
try {
|
||||
const decoder = new TextDecoder();
|
||||
text = decoder.decode(response.data);
|
||||
} catch (decodeError) {
|
||||
console.error(`[ScrapingBee][c] Error decoding response data for url: ${url} -> ${decodeError}`);
|
||||
}
|
||||
return { content: text, pageStatusCode: response.status, pageError: response.statusText != "OK" ? response.statusText : undefined };
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(`[ScrapingBee][c] Error fetching url: ${url} -> ${error}`);
|
||||
return "";
|
||||
return { content: "" };
|
||||
}
|
||||
}
|
||||
|
||||
@ -145,7 +150,7 @@ export async function scrapWithPlaywright(
|
||||
url: string,
|
||||
waitFor: number = 0,
|
||||
headers?: Record<string, string>
|
||||
): Promise<string> {
|
||||
): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
|
||||
try {
|
||||
const reqParams = await generateRequestParams(url);
|
||||
// If the user has passed a wait parameter in the request, use that
|
||||
@ -167,21 +172,21 @@ export async function scrapWithPlaywright(
|
||||
console.error(
|
||||
`[Playwright] Error fetching url: ${url} with status: ${response.status}`
|
||||
);
|
||||
return "";
|
||||
return { content: "" };
|
||||
}
|
||||
|
||||
const contentType = response.headers["content-type"];
|
||||
if (contentType && contentType.includes("application/pdf")) {
|
||||
return fetchAndProcessPdf(url);
|
||||
return await fetchAndProcessPdf(url);
|
||||
} else {
|
||||
const textData = response.data;
|
||||
try {
|
||||
const data = JSON.parse(textData);
|
||||
const html = data.content;
|
||||
return html ?? "";
|
||||
return { content: html ?? "", pageStatusCode: data.pageStatusCode, pageError: data.pageError };
|
||||
} catch (jsonError) {
|
||||
console.error(`[Playwright] Error parsing JSON response for url: ${url} -> ${jsonError}`);
|
||||
return "";
|
||||
return { content: "" };
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
@ -190,11 +195,11 @@ export async function scrapWithPlaywright(
|
||||
} else {
|
||||
console.error(`[Playwright] Error fetching url: ${url} -> ${error}`);
|
||||
}
|
||||
return "";
|
||||
return { content: "" };
|
||||
}
|
||||
}
|
||||
|
||||
export async function scrapWithFetch(url: string): Promise<string> {
|
||||
export async function scrapWithFetch(url: string): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
|
||||
try {
|
||||
const response = await axios.get(url, {
|
||||
headers: {
|
||||
@ -208,15 +213,15 @@ export async function scrapWithFetch(url: string): Promise<string> {
|
||||
console.error(
|
||||
`[Axios] Error fetching url: ${url} with status: ${response.status}`
|
||||
);
|
||||
return "";
|
||||
return { content: "", pageStatusCode: response.status, pageError: response.statusText };
|
||||
}
|
||||
|
||||
const contentType = response.headers["content-type"];
|
||||
if (contentType && contentType.includes("application/pdf")) {
|
||||
return fetchAndProcessPdf(url);
|
||||
return await fetchAndProcessPdf(url);
|
||||
} else {
|
||||
const text = response.data;
|
||||
return text;
|
||||
return { content: text, pageStatusCode: 200 };
|
||||
}
|
||||
} catch (error) {
|
||||
if (error.code === 'ECONNABORTED') {
|
||||
@ -224,7 +229,7 @@ export async function scrapWithFetch(url: string): Promise<string> {
|
||||
} else {
|
||||
console.error(`[Axios] Error fetching url: ${url} -> ${error}`);
|
||||
}
|
||||
return "";
|
||||
return { content: "" };
|
||||
}
|
||||
}
|
||||
|
||||
@ -317,7 +322,7 @@ export async function scrapSingleUrl(
|
||||
url: string,
|
||||
method: (typeof baseScrapers)[number]
|
||||
) => {
|
||||
let text = "";
|
||||
let scraperResponse: { text: string, screenshot: string, metadata: { pageStatusCode?: number, pageError?: string | null } } = { text: "", screenshot: "", metadata: {} };
|
||||
let screenshot = "";
|
||||
switch (method) {
|
||||
case "fire-engine":
|
||||
@ -329,38 +334,52 @@ export async function scrapSingleUrl(
|
||||
pageOptions.screenshot,
|
||||
pageOptions.headers
|
||||
);
|
||||
text = response.html;
|
||||
screenshot = response.screenshot;
|
||||
scraperResponse.text = response.html;
|
||||
scraperResponse.screenshot = response.screenshot;
|
||||
scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
|
||||
scraperResponse.metadata.pageError = response.pageError;
|
||||
}
|
||||
break;
|
||||
case "scrapingBee":
|
||||
if (process.env.SCRAPING_BEE_API_KEY) {
|
||||
text = await scrapWithScrapingBee(
|
||||
const response = await scrapWithScrapingBee(
|
||||
url,
|
||||
"domcontentloaded",
|
||||
pageOptions.fallback === false ? 7000 : 15000
|
||||
);
|
||||
scraperResponse.text = response.content;
|
||||
scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
|
||||
scraperResponse.metadata.pageError = response.pageError;
|
||||
}
|
||||
break;
|
||||
case "playwright":
|
||||
if (process.env.PLAYWRIGHT_MICROSERVICE_URL) {
|
||||
text = await scrapWithPlaywright(url, pageOptions.waitFor, pageOptions.headers);
|
||||
const response = await scrapWithPlaywright(url, pageOptions.waitFor, pageOptions.headers);
|
||||
scraperResponse.text = response.content;
|
||||
scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
|
||||
scraperResponse.metadata.pageError = response.pageError;
|
||||
}
|
||||
break;
|
||||
case "scrapingBeeLoad":
|
||||
if (process.env.SCRAPING_BEE_API_KEY) {
|
||||
text = await scrapWithScrapingBee(url, "networkidle2");
|
||||
const response = await scrapWithScrapingBee(url, "networkidle2");
|
||||
scraperResponse.text = response.content;
|
||||
scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
|
||||
scraperResponse.metadata.pageError = response.pageError;
|
||||
}
|
||||
break;
|
||||
case "fetch":
|
||||
text = await scrapWithFetch(url);
|
||||
const response = await scrapWithFetch(url);
|
||||
scraperResponse.text = response.content;
|
||||
scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
|
||||
scraperResponse.metadata.pageError = response.pageError;
|
||||
break;
|
||||
}
|
||||
|
||||
let customScrapedContent : FireEngineResponse | null = null;
|
||||
|
||||
// Check for custom scraping conditions
|
||||
const customScraperResult = await handleCustomScraping(text, url);
|
||||
const customScraperResult = await handleCustomScraping(scraperResponse.text, url);
|
||||
|
||||
if (customScraperResult){
|
||||
switch (customScraperResult.scraper) {
|
||||
@ -371,23 +390,30 @@ export async function scrapSingleUrl(
|
||||
}
|
||||
break;
|
||||
case "pdf":
|
||||
customScrapedContent = { html: await fetchAndProcessPdf(customScraperResult.url), screenshot }
|
||||
const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(customScraperResult.url);
|
||||
customScrapedContent = { html: content, screenshot, pageStatusCode, pageError }
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (customScrapedContent) {
|
||||
text = customScrapedContent.html;
|
||||
scraperResponse.text = customScrapedContent.html;
|
||||
screenshot = customScrapedContent.screenshot;
|
||||
}
|
||||
|
||||
//* TODO: add an optional to return markdown or structured/extracted content
|
||||
let cleanedHtml = removeUnwantedElements(text, pageOptions);
|
||||
let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
|
||||
|
||||
return [await parseMarkdown(cleanedHtml), text, screenshot];
|
||||
return {
|
||||
text: await parseMarkdown(cleanedHtml),
|
||||
html: scraperResponse.text,
|
||||
screenshot: scraperResponse.screenshot,
|
||||
pageStatusCode: scraperResponse.metadata.pageStatusCode,
|
||||
pageError: scraperResponse.metadata.pageError || undefined
|
||||
};
|
||||
};
|
||||
try {
|
||||
let [text, html, screenshot] = ["", "", ""];
|
||||
let { text, html, screenshot, pageStatusCode, pageError } = { text: "", html: "", screenshot: "", pageStatusCode: 200, pageError: undefined };
|
||||
let urlKey = urlToScrap;
|
||||
try {
|
||||
urlKey = new URL(urlToScrap).hostname.replace(/^www\./, "");
|
||||
@ -410,7 +436,14 @@ export async function scrapSingleUrl(
|
||||
html = existingHtml;
|
||||
break;
|
||||
}
|
||||
[text, html, screenshot] = await attemptScraping(urlToScrap, scraper);
|
||||
|
||||
const attempt = await attemptScraping(urlToScrap, scraper);
|
||||
text = attempt.text ?? '';
|
||||
html = attempt.html ?? '';
|
||||
screenshot = attempt.screenshot ?? '';
|
||||
pageStatusCode = attempt.pageStatusCode;
|
||||
pageError = attempt.pageError;
|
||||
|
||||
if (text && text.trim().length >= 100) break;
|
||||
const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1;
|
||||
if (nextScraperIndex < scrapersInOrder.length) {
|
||||
@ -435,6 +468,8 @@ export async function scrapSingleUrl(
|
||||
...metadata,
|
||||
screenshot: screenshot,
|
||||
sourceURL: urlToScrap,
|
||||
pageStatusCode: pageStatusCode,
|
||||
pageError: pageError
|
||||
},
|
||||
};
|
||||
} else {
|
||||
@ -442,7 +477,12 @@ export async function scrapSingleUrl(
|
||||
content: text,
|
||||
markdown: text,
|
||||
html: pageOptions.includeHtml ? html : undefined,
|
||||
metadata: { ...metadata, sourceURL: urlToScrap },
|
||||
metadata: {
|
||||
...metadata,
|
||||
sourceURL: urlToScrap,
|
||||
pageStatusCode: pageStatusCode,
|
||||
pageError: pageError
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -3,11 +3,13 @@ import * as docxProcessor from "../docxProcessor";
|
||||
describe("DOCX Processing Module - Integration Test", () => {
|
||||
it("should correctly process a simple DOCX file without the LLAMAPARSE_API_KEY", async () => {
|
||||
delete process.env.LLAMAPARSE_API_KEY;
|
||||
const docxContent = await docxProcessor.fetchAndProcessDocx(
|
||||
const { content, pageStatusCode, pageError } = await docxProcessor.fetchAndProcessDocx(
|
||||
"https://nvca.org/wp-content/uploads/2019/06/NVCA-Model-Document-Stock-Purchase-Agreement.docx"
|
||||
);
|
||||
expect(docxContent.trim()).toContain(
|
||||
expect(content.trim()).toContain(
|
||||
"SERIES A PREFERRED STOCK PURCHASE AGREEMENT"
|
||||
);
|
||||
expect(pageStatusCode).toBe(200);
|
||||
expect(pageError).toBeUndefined();
|
||||
});
|
||||
});
|
||||
|
@ -3,8 +3,10 @@ import * as pdfProcessor from '../pdfProcessor';
|
||||
describe('PDF Processing Module - Integration Test', () => {
|
||||
it('should correctly process a simple PDF file without the LLAMAPARSE_API_KEY', async () => {
|
||||
delete process.env.LLAMAPARSE_API_KEY;
|
||||
const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf');
|
||||
expect(pdfContent.trim()).toEqual("Dummy PDF file");
|
||||
const { content, pageStatusCode, pageError } = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf');
|
||||
expect(content.trim()).toEqual("Dummy PDF file");
|
||||
expect(pageStatusCode).toEqual(200);
|
||||
expect(pageError).toBeUndefined();
|
||||
});
|
||||
|
||||
// We're hitting the LLAMAPARSE rate limit 🫠
|
||||
|
@ -5,14 +5,14 @@ import path from "path";
|
||||
import os from "os";
|
||||
import mammoth from "mammoth";
|
||||
|
||||
export async function fetchAndProcessDocx(url: string): Promise<string> {
|
||||
const tempFilePath = await downloadDocx(url);
|
||||
export async function fetchAndProcessDocx(url: string): Promise<{ content: string; pageStatusCode: number; pageError: string }> {
|
||||
const { tempFilePath, pageStatusCode, pageError } = await downloadDocx(url);
|
||||
const content = await processDocxToText(tempFilePath);
|
||||
fs.unlinkSync(tempFilePath); // Clean up the temporary file
|
||||
return content;
|
||||
return { content, pageStatusCode, pageError };
|
||||
}
|
||||
|
||||
async function downloadDocx(url: string): Promise<string> {
|
||||
async function downloadDocx(url: string): Promise<{ tempFilePath: string; pageStatusCode: number; pageError: string }> {
|
||||
const response = await axios({
|
||||
url,
|
||||
method: "GET",
|
||||
@ -25,7 +25,7 @@ async function downloadDocx(url: string): Promise<string> {
|
||||
response.data.pipe(writer);
|
||||
|
||||
return new Promise((resolve, reject) => {
|
||||
writer.on("finish", () => resolve(tempFilePath));
|
||||
writer.on("finish", () => resolve({ tempFilePath, pageStatusCode: response.status, pageError: response.statusText != "OK" ? response.statusText : undefined }));
|
||||
writer.on("error", reject);
|
||||
});
|
||||
}
|
||||
|
@ -29,6 +29,9 @@ interface Metadata {
|
||||
publishedTime?: string;
|
||||
articleTag?: string;
|
||||
articleSection?: string;
|
||||
sourceURL?: string;
|
||||
pageStatusCode?: number;
|
||||
pageError?: string;
|
||||
}
|
||||
|
||||
export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
|
||||
@ -61,6 +64,9 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
|
||||
let publishedTime: string | null = null;
|
||||
let articleTag: string | null = null;
|
||||
let articleSection: string | null = null;
|
||||
let sourceURL: string | null = null;
|
||||
let pageStatusCode: number | null = null;
|
||||
let pageError: string | null = null;
|
||||
|
||||
try {
|
||||
title = soup("title").text() || null;
|
||||
@ -132,5 +138,8 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
|
||||
...(publishedTime ? { publishedTime } : {}),
|
||||
...(articleTag ? { articleTag } : {}),
|
||||
...(articleSection ? { articleSection } : {}),
|
||||
...(sourceURL ? { sourceURL } : {}),
|
||||
...(pageStatusCode ? { pageStatusCode } : {}),
|
||||
...(pageError ? { pageError } : {}),
|
||||
};
|
||||
}
|
||||
|
@ -9,14 +9,14 @@ import os from "os";
|
||||
|
||||
dotenv.config();
|
||||
|
||||
export async function fetchAndProcessPdf(url: string): Promise<string> {
|
||||
const tempFilePath = await downloadPdf(url);
|
||||
const content = await processPdfToText(tempFilePath);
|
||||
export async function fetchAndProcessPdf(url: string): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
|
||||
const { tempFilePath, pageStatusCode, pageError } = await downloadPdf(url);
|
||||
const content = await processPdfToText(tempFilePath);
|
||||
fs.unlinkSync(tempFilePath); // Clean up the temporary file
|
||||
return content;
|
||||
return { content, pageStatusCode, pageError };
|
||||
}
|
||||
|
||||
async function downloadPdf(url: string): Promise<string> {
|
||||
async function downloadPdf(url: string): Promise<{ tempFilePath: string, pageStatusCode?: number, pageError?: string }> {
|
||||
const response = await axios({
|
||||
url,
|
||||
method: "GET",
|
||||
@ -29,7 +29,7 @@ async function downloadPdf(url: string): Promise<string> {
|
||||
response.data.pipe(writer);
|
||||
|
||||
return new Promise((resolve, reject) => {
|
||||
writer.on("finish", () => resolve(tempFilePath));
|
||||
writer.on("finish", () => resolve({ tempFilePath, pageStatusCode: response.status, pageError: response.statusText != "OK" ? response.statusText : undefined }));
|
||||
writer.on("error", reject);
|
||||
});
|
||||
}
|
||||
|
63
apps/playwright-service/get_error.py
Normal file
63
apps/playwright-service/get_error.py
Normal file
@ -0,0 +1,63 @@
|
||||
from typing import Optional

# Human-readable messages keyed by HTTP status code. Built once at import
# time instead of on every call.
# NOTE(review): the 309-317 entries do not appear to be registered HTTP
# status codes; they are kept verbatim so existing responses do not change.
# Confirm against the IANA registry before relying on them.
_ERROR_MESSAGES = {
    300: "Multiple Choices",
    301: "Moved Permanently",
    302: "Found",
    303: "See Other",
    304: "Not Modified",
    305: "Use Proxy",
    307: "Temporary Redirect",
    308: "Permanent Redirect",
    309: "Resume Incomplete",
    310: "Too Many Redirects",
    311: "Unavailable For Legal Reasons",
    312: "Previously Used",
    313: "I'm Used",
    314: "Switch Proxy",
    315: "Temporary Redirect",
    316: "Resume Incomplete",
    317: "Too Many Redirects",
    400: "Bad Request",
    401: "Unauthorized",
    402: "Payment Required",
    403: "Forbidden",
    404: "Not Found",
    405: "Method Not Allowed",
    406: "Not Acceptable",
    407: "Proxy Authentication Required",
    408: "Request Timeout",
    409: "Conflict",
    410: "Gone",
    411: "Length Required",
    412: "Precondition Failed",
    413: "Payload Too Large",
    414: "URI Too Long",
    415: "Unsupported Media Type",
    416: "Range Not Satisfiable",
    417: "Expectation Failed",
    418: "I'm a teapot",
    421: "Misdirected Request",
    422: "Unprocessable Entity",
    423: "Locked",
    424: "Failed Dependency",
    425: "Too Early",
    426: "Upgrade Required",
    428: "Precondition Required",
    429: "Too Many Requests",
    431: "Request Header Fields Too Large",
    451: "Unavailable For Legal Reasons",
    500: "Internal Server Error",
    501: "Not Implemented",
    502: "Bad Gateway",
    503: "Service Unavailable",
    504: "Gateway Timeout",
    505: "HTTP Version Not Supported",
    506: "Variant Also Negotiates",
    507: "Insufficient Storage",
    508: "Loop Detected",
    510: "Not Extended",
    511: "Network Authentication Required",
    599: "Network Connect Timeout Error",
}


def get_error(status_code: int) -> Optional[str]:
    """Return an error message for an HTTP status code, or None on success.

    Args:
        status_code: Numeric HTTP status code of the page response.

    Returns:
        None when ``status_code`` is below 300 (treated as "no error"),
        the mapped message when the code is known, and ``"Unknown Error"``
        for any other code of 300 or above.
    """
    # Codes below 300 are not errors, so there is no message to report.
    if status_code < 300:
        return None
    return _ERROR_MESSAGES.get(status_code, "Unknown Error")
|
@ -9,6 +9,7 @@ from fastapi import FastAPI
|
||||
from fastapi.responses import JSONResponse
|
||||
from playwright.async_api import Browser, async_playwright
|
||||
from pydantic import BaseModel
|
||||
from get_error import get_error
|
||||
|
||||
PROXY_SERVER = environ.get("PROXY_SERVER", None)
|
||||
PROXY_USERNAME = environ.get("PROXY_USERNAME", None)
|
||||
@ -73,16 +74,22 @@ async def root(body: UrlModel):
|
||||
if body.headers:
|
||||
await page.set_extra_http_headers(body.headers)
|
||||
|
||||
await page.goto(
|
||||
response = await page.goto(
|
||||
body.url,
|
||||
wait_until="load",
|
||||
timeout=body.timeout,
|
||||
)
|
||||
page_status_code = response.status
|
||||
page_error = get_error(page_status_code)
|
||||
# Wait != timeout. Wait is the time to wait after the page is loaded - useful in some cases where "load" / "networkidle" is not enough
|
||||
if body.wait_after_load > 0:
|
||||
await page.wait_for_timeout(body.wait_after_load)
|
||||
|
||||
page_content = await page.content()
|
||||
await context.close()
|
||||
json_compatible_item_data = {"content": page_content}
|
||||
return JSONResponse(content=json_compatible_item_data)
|
||||
json_compatible_item_data = {
|
||||
"content": page_content,
|
||||
"pageStatusCode": page_status_code,
|
||||
"pageError": page_error
|
||||
}
|
||||
return JSONResponse(content=json_compatible_item_data)
|
Loading…
x
Reference in New Issue
Block a user