
Merge pull request #1 from mendableai/main

Fix FIRECRAWL_API_URL bug, also various PyLint fixes
Matt Joyce 2024-05-23 09:16:03 +10:00 committed by GitHub
commit 96630154d3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
26 changed files with 1708 additions and 108 deletions

View File

@@ -25,6 +25,9 @@ env:
  SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
  SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
  TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
  HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
  HDX_NODE_BETA_MODE: 1

jobs:
  pre-deploy:

View File

@@ -31,3 +31,8 @@ POSTHOG_HOST= # set if you'd like to send posthog events like job logs
STRIPE_PRICE_ID_STANDARD=
STRIPE_PRICE_ID_SCALE=
HYPERDX_API_KEY=
HDX_NODE_BETA_MODE=1
FIRE_ENGINE_BETA_URL= # set if you'd like to use the fire engine closed beta

View File

@@ -479,6 +479,16 @@
            "format": "uri"
          }
        }
      },
      "llm_extraction": {
        "type": "object",
        "description": "Displayed when using LLM Extraction. Extracted data from the page following the schema defined.",
        "nullable": true
      },
      "warning": {
        "type": "string",
        "nullable": true,
        "description": "Can be displayed when using LLM Extraction. Warning message will let you know any issues with the extraction."
      }
    }
  }
}
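For reference, the two fields documented above surface in the scrape response next to the existing content/markdown/metadata fields. A minimal client-side sketch of that shape, assuming the field names from the schema; ScrapeResponseData is a hypothetical name and not part of this commit:

// Hypothetical client-side type for the response fields documented above (sketch only).
interface ScrapeResponseData {
  content: string;
  markdown?: string;
  html?: string;
  metadata: Record<string, unknown>;
  // Present only when extractorOptions.mode is "llm-extraction".
  llm_extraction?: Record<string, unknown> | null;
  // Set when the page had to be trimmed to fit the model's token limit.
  warning?: string | null;
}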

View File

@@ -48,6 +48,7 @@
    "@bull-board/express": "^5.8.0",
    "@devil7softwares/pos": "^1.0.2",
    "@dqbd/tiktoken": "^1.0.13",
    "@hyperdx/node-opentelemetry": "^0.7.0",
    "@logtail/node": "^0.4.12",
    "@nangohq/node": "^0.36.33",
    "@sentry/node": "^7.48.0",

File diff suppressed because it is too large.

View File

@@ -81,7 +81,7 @@ describe("E2E Tests for API Routes", () => {
      expect(response.body.data).toHaveProperty("markdown");
      expect(response.body.data).toHaveProperty("metadata");
      expect(response.body.data).not.toHaveProperty("html");
-     expect(response.body.data.content).toContain("🔥 FireCrawl");
+     expect(response.body.data.content).toContain("🔥 Firecrawl");
    }, 30000); // 30 seconds timeout

    it("should return a successful response with a valid API key and includeHtml set to true", async () => {
@@ -99,8 +99,8 @@ describe("E2E Tests for API Routes", () => {
      expect(response.body.data).toHaveProperty("markdown");
      expect(response.body.data).toHaveProperty("html");
      expect(response.body.data).toHaveProperty("metadata");
-     expect(response.body.data.content).toContain("🔥 FireCrawl");
-     expect(response.body.data.markdown).toContain("🔥 FireCrawl");
+     expect(response.body.data.content).toContain("🔥 Firecrawl");
+     expect(response.body.data.markdown).toContain("🔥 Firecrawl");
      expect(response.body.data.html).toContain("<h1");
    }, 30000); // 30 seconds timeout
@@ -266,7 +266,7 @@ describe("E2E Tests for API Routes", () => {
      urls.forEach((url: string) => {
        expect(url.startsWith("https://wwww.mendable.ai/blog/")).toBeFalsy();
      });
-   }, 60000); // 60 seconds
+   }, 90000); // 90 seconds

    it("should return a successful response with a valid API key and limit to 3", async () => {
      const crawlResponse = await request(TEST_URL)
@@ -440,8 +440,8 @@ describe("E2E Tests for API Routes", () => {
      // 120 seconds
      expect(completedResponse.body.data[0]).toHaveProperty("html");
      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
-     expect(completedResponse.body.data[0].content).toContain("🔥 FireCrawl");
-     expect(completedResponse.body.data[0].markdown).toContain("FireCrawl");
+     expect(completedResponse.body.data[0].content).toContain("🔥 Firecrawl");
+     expect(completedResponse.body.data[0].markdown).toContain("Firecrawl");
      expect(completedResponse.body.data[0].html).toContain("<h1");
    }, 60000);
  });
@@ -576,7 +576,7 @@ describe("E2E Tests for API Routes", () => {
      expect(completedResponse.body.data[0]).toHaveProperty("content");
      expect(completedResponse.body.data[0]).toHaveProperty("markdown");
      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
-     expect(completedResponse.body.data[0].content).toContain("🔥 FireCrawl");
+     expect(completedResponse.body.data[0].content).toContain("🔥 Firecrawl");
    }, 60000); // 60 seconds

    it('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension', async () => {
@@ -697,8 +697,8 @@ describe("E2E Tests for API Routes", () => {
      // 120 seconds
      expect(completedResponse.body.data[0]).toHaveProperty("html");
      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
-     expect(completedResponse.body.data[0].content).toContain("🔥 FireCrawl");
-     expect(completedResponse.body.data[0].markdown).toContain("FireCrawl");
+     expect(completedResponse.body.data[0].content).toContain("🔥 Firecrawl");
+     expect(completedResponse.body.data[0].markdown).toContain("Firecrawl");
      expect(completedResponse.body.data[0].html).toContain("<h1");
    }, 60000);
  }); // 60 seconds

View File

@@ -4,11 +4,22 @@ import { AuthResponse, RateLimiterMode } from "../../src/types";
import { supabase_service } from "../../src/services/supabase";
import { withAuth } from "../../src/lib/withAuth";
import { RateLimiterRedis } from "rate-limiter-flexible";
import { setTraceAttributes } from '@hyperdx/node-opentelemetry';

export async function authenticateUser(req, res, mode?: RateLimiterMode) : Promise<AuthResponse> {
  return withAuth(supaAuthenticateUser)(req, res, mode);
}

function setTrace(team_id: string, api_key: string) {
  try {
    setTraceAttributes({
      team_id,
      api_key
    });
  } catch (error) {
    console.error('Error setting trace attributes:', error);
  }
}

export async function supaAuthenticateUser(
  req,
  res,
@@ -78,11 +89,13 @@ export async function supaAuthenticateUser(
      status: 401,
    };
  }

  const team_id = data[0].team_id;
  const plan = getPlanByPriceId(data[0].price_id);
  // HyperDX Logging
  setTrace(team_id, normalizedApi);
  subscriptionData = {
-   team_id: data[0].team_id,
-   plan: getPlanByPriceId(data[0].price_id)
+   team_id: team_id,
+   plan: plan
  }
  switch (mode) {
    case RateLimiterMode.Crawl:

View File

@@ -106,6 +106,9 @@ export async function scrapeController(req: Request, res: Response) {
    const extractorOptions = req.body.extractorOptions ?? {
      mode: "markdown"
    }
    if (extractorOptions.mode === "llm-extraction") {
      pageOptions.onlyMainContent = true;
    }
    const origin = req.body.origin ?? "api";
    const timeout = req.body.timeout ?? 30000; // Default timeout of 30 seconds
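In other words, requesting llm-extraction now implies onlyMainContent. A rough request sketch against /v0/scrape, assuming the v0 body shape; the extractionPrompt value is illustrative and not part of this commit:

// Illustrative /v0/scrape call; only extractorOptions.mode is taken from the change above.
const res = await fetch("https://api.firecrawl.dev/v0/scrape", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
  },
  body: JSON.stringify({
    url: "https://example.com",
    extractorOptions: {
      mode: "llm-extraction", // forces pageOptions.onlyMainContent = true server-side
      extractionPrompt: "Extract the page title and a one-sentence summary.",
    },
  }),
});
const { data } = await res.json();
console.log(data.llm_extraction, data.warning);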

View File

@@ -5,6 +5,8 @@ import "dotenv/config";
import { getWebScraperQueue } from "./services/queue-service";
import { redisClient } from "./services/rate-limiter";
import { v0Router } from "./routes/v0";
import { initSDK } from '@hyperdx/node-opentelemetry';

const { createBullBoard } = require("@bull-board/api");
const { BullAdapter } = require("@bull-board/api/bullAdapter");
const { ExpressAdapter } = require("@bull-board/express");
@@ -47,6 +49,11 @@ const DEFAULT_PORT = process.env.PORT ?? 3002;
const HOST = process.env.HOST ?? "localhost";
redisClient.connect();

// HyperDX OpenTelemetry
if(process.env.ENV === 'production') {
  initSDK({ consoleCapture: true, additionalInstrumentations: []});
}

export function startServer(port = DEFAULT_PORT) {
  const server = app.listen(Number(port), HOST, () => {

View File

@@ -1,25 +1,38 @@
import OpenAI from "openai";
import { Document } from "../../lib/entities";
import { numTokensFromString } from "./helpers";

export type ScraperCompletionResult = {
  data: any | null;
  url: string;
};

const maxTokens = 32000;
const modifier = 4;

const defaultPrompt =
  "You are a professional web scraper. Extract the contents of the webpage";

function prepareOpenAIDoc(
  document: Document
-): OpenAI.Chat.Completions.ChatCompletionContentPart[] {
+): [OpenAI.Chat.Completions.ChatCompletionContentPart[], number] {
-  // Check if the markdown content exists in the document
-  if (!document.markdown) {
+  let markdown = document.markdown;
+
+  // Check if the markdown content exists in the document
+  if (!markdown) {
    throw new Error(
      "Markdown content is missing in the document. This is likely due to an error in the scraping process. Please try again or reach out to help@mendable.ai"
    );
  }
-  return [{ type: "text", text: document.markdown }];
+  // count number of tokens
+  const numTokens = numTokensFromString(document.markdown, "gpt-4");
+  if (numTokens > maxTokens) {
+    // trim the document to the maximum number of tokens, tokens != characters
+    markdown = markdown.slice(0, (maxTokens * modifier));
+  }
+  return [[{ type: "text", text: markdown }], numTokens];
}

export async function generateOpenAICompletions({
@@ -38,7 +51,7 @@ export async function generateOpenAICompletions({
  temperature?: number;
}): Promise<Document> {
  const openai = client as OpenAI;
-  const content = prepareOpenAIDoc(document);
+  const [content, numTokens] = prepareOpenAIDoc(document);

  const completion = await openai.chat.completions.create({
    model,
@@ -72,6 +85,7 @@ export async function generateOpenAICompletions({
  return {
    ...document,
    llm_extraction: llmExtraction,
    warning: numTokens > maxTokens ? `Page was trimmed to fit the maximum token limit defined by the LLM model (Max: ${maxTokens} tokens, Attemped: ${numTokens} tokens). If results are not good, email us at help@mendable.ai so we can help you.` : undefined,
  };
}
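The trim above works in characters, not tokens: with modifier = 4 the code assumes roughly four characters per token, so an over-long page is cut to maxTokens * modifier = 128,000 characters before being sent to the model. A standalone restatement of that heuristic (trimToTokenBudget is a hypothetical helper, not part of this commit):

// Sketch of the trimming heuristic above, assuming ~4 characters per token.
const maxTokens = 32000;
const modifier = 4; // rough chars-per-token ratio

function trimToTokenBudget(markdown: string, numTokens: number): string {
  // Keep only the first maxTokens * modifier characters when over budget.
  return numTokens > maxTokens ? markdown.slice(0, maxTokens * modifier) : markdown;
}

// Example: a ~50,000-token page is cut to 128,000 characters; shorter pages pass through unchanged.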

View File

@@ -72,6 +72,7 @@ export class Document {
  };
  childrenLinks?: string[];
  provider?: string;
  warning?: string;

  constructor(data: Partial<Document>) {
    if (!data.content) {

View File

@@ -0,0 +1,42 @@
import { scrapWithFireEngine } from "../../src/scraper/WebScraper/single_url";

const delay = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));

const scrapInBatches = async (
  urls: string[],
  batchSize: number,
  delayMs: number
) => {
  let successCount = 0;
  let errorCount = 0;

  for (let i = 0; i < urls.length; i += batchSize) {
    const batch = urls
      .slice(i, i + batchSize)
      .map((url) => scrapWithFireEngine(url));
    try {
      const results = await Promise.all(batch);
      results.forEach((data, index) => {
        if (data.trim() === "") {
          errorCount++;
        } else {
          successCount++;
          console.log(
            `Scraping result ${i + index + 1}:`,
            data.trim().substring(0, 20) + "..."
          );
        }
      });
    } catch (error) {
      console.error("Error during scraping:", error);
    }
    await delay(delayMs);
  }

  console.log(`Total successful scrapes: ${successCount}`);
  console.log(`Total errored scrapes: ${errorCount}`);
};

function run() {
  const urls = Array.from({ length: 200 }, () => "https://scrapethissite.com");
  scrapInBatches(urls, 10, 1000);
}
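As shown, run() paces 200 URLs in batches of 10 with a 1-second pause (roughly 20 batches), but the call site is not visible in this hunk. A minimal invocation sketch, assuming the file is executed directly:

// Hypothetical entry point; the hunk above defines run() but does not show a call site.
// Executing the file directly (e.g. via ts-node) would kick off ~20 batches of 10 requests.
run();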

View File

@@ -25,7 +25,7 @@ export class WebCrawler {
    initialUrl,
    includes,
    excludes,
-   maxCrawledLinks,
+   maxCrawledLinks = 10000,
    limit = 10000,
    generateImgAltText = false,
    maxCrawledDepth = 10,
@@ -117,7 +117,7 @@ export class WebCrawler {
      const response = await axios.get(this.robotsTxtUrl);
      this.robots = robotsParser(this.robotsTxtUrl, response.data);
    } catch (error) {
-     console.error(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`);
+     console.log(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`);
    }
@@ -152,7 +152,7 @@ export class WebCrawler {
    inProgress?: (progress: Progress) => void,
  ): Promise<{ url: string, html: string }[]> {
    const queue = async.queue(async (task: string, callback) => {
-     if (this.crawledUrls.size >= this.maxCrawledLinks) {
+     if (this.crawledUrls.size >= Math.min(this.maxCrawledLinks, this.limit)) {
        if (callback && typeof callback === "function") {
          callback();
        }
@@ -176,14 +176,14 @@ export class WebCrawler {
      if (inProgress && newUrls.length > 0) {
        inProgress({
          current: this.crawledUrls.size,
-         total: this.maxCrawledLinks,
+         total: Math.min(this.maxCrawledLinks, this.limit),
          status: "SCRAPING",
          currentDocumentUrl: newUrls[newUrls.length - 1].url,
        });
      } else if (inProgress) {
        inProgress({
          current: this.crawledUrls.size,
-         total: this.maxCrawledLinks,
+         total: Math.min(this.maxCrawledLinks, this.limit),
          status: "SCRAPING",
          currentDocumentUrl: task,
        });
@@ -324,6 +324,12 @@ export class WebCrawler {
      // ".docx",
      ".xlsx",
      ".xml",
      ".avi",
      ".flv",
      ".woff",
      ".ttf",
      ".woff2",
      ".webp"
    ];
    return fileExtensions.some((ext) => url.endsWith(ext));
  }
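The net effect of the Math.min change above is that a crawl stops at whichever of maxCrawledLinks (now defaulting to 10000) or limit (default 10000) is smaller. A small sketch of that cap (effectiveCap is a hypothetical helper, not part of this commit):

// Effective crawl cap after the change above: the smaller of the two settings wins.
const effectiveCap = (maxCrawledLinks = 10000, limit = 10000) =>
  Math.min(maxCrawledLinks, limit);

console.log(effectiveCap(50, 3)); // 3 -> a job with limit = 3 crawls at most 3 pages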

View File

@@ -10,6 +10,15 @@ import { fetchAndProcessPdf } from "./utils/pdfProcessor";
dotenv.config();

const baseScrapers = [
  "fire-engine",
  "scrapingBee",
  "playwright",
  "scrapingBeeLoad",
  "fetch",
] as const;

export async function generateRequestParams(
  url: string,
  wait_browser: string = "domcontentloaded",
@@ -33,15 +42,39 @@ export async function generateRequestParams(
    return defaultParams;
  }
}

-export async function scrapWithCustomFirecrawl(
+export async function scrapWithFireEngine(
  url: string,
  options?: any
): Promise<string> {
  try {
-   // TODO: merge the custom firecrawl scraper into mono-repo when ready
-   return null;
+   const reqParams = await generateRequestParams(url);
+   const wait_playwright = reqParams["params"]?.wait ?? 0;
+
+   const response = await fetch(process.env.FIRE_ENGINE_BETA_URL + "/scrape", {
+     method: "POST",
+     headers: {
+       "Content-Type": "application/json",
+     },
+     body: JSON.stringify({ url: url, wait: wait_playwright }),
+   });
+
+   if (!response.ok) {
+     console.error(
+       `[Fire-Engine] Error fetching url: ${url} with status: ${response.status}`
+     );
+     return "";
+   }
+
+   const contentType = response.headers['content-type'];
+   if (contentType && contentType.includes('application/pdf')) {
+     return fetchAndProcessPdf(url);
+   } else {
+     const data = await response.json();
+     const html = data.content;
+     return html ?? "";
+   }
  } catch (error) {
-   console.error(`Error scraping with custom firecrawl-scraper: ${error}`);
+   console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`);
    return "";
  }
}
@@ -63,7 +96,7 @@ export async function scrapWithScrapingBee(
    if (response.status !== 200 && response.status !== 404) {
      console.error(
-       `Scraping bee error in ${url} with status code ${response.status}`
+       `[ScrapingBee] Error fetching url: ${url} with status code ${response.status}`
      );
      return "";
    }
@@ -77,7 +110,7 @@ export async function scrapWithScrapingBee(
      return text;
    }
  } catch (error) {
-   console.error(`Error scraping with Scraping Bee: ${error}`);
+   console.error(`[ScrapingBee][c] Error fetching url: ${url} -> ${error}`);
    return "";
  }
}
@@ -97,7 +130,7 @@ export async function scrapWithPlaywright(url: string): Promise<string> {
    if (!response.ok) {
      console.error(
-       `Error fetching w/ playwright server -> URL: ${url} with status: ${response.status}`
+       `[Playwright] Error fetching url: ${url} with status: ${response.status}`
      );
      return "";
    }
@@ -111,11 +144,62 @@ export async function scrapWithPlaywright(url: string): Promise<string> {
      return html ?? "";
    }
  } catch (error) {
-   console.error(`Error scraping with Puppeteer: ${error}`);
+   console.error(`[Playwright][c] Error fetching url: ${url} -> ${error}`);
    return "";
  }
}

export async function scrapWithFetch(url: string): Promise<string> {
  try {
    const response = await fetch(url);
    if (!response.ok) {
      console.error(
        `[Fetch] Error fetching url: ${url} with status: ${response.status}`
      );
      return "";
    }

    const contentType = response.headers['content-type'];
    if (contentType && contentType.includes('application/pdf')) {
      return fetchAndProcessPdf(url);
    } else {
      const text = await response.text();
      return text;
    }
  } catch (error) {
    console.error(`[Fetch][c] Error fetching url: ${url} -> ${error}`);
    return "";
  }
}

/**
 * Get the order of scrapers to be used for scraping a URL
 * If the user doesn't have envs set for a specific scraper, it will be removed from the order.
 * @param defaultScraper The default scraper to use if the URL does not have a specific scraper order defined
 * @returns The order of scrapers to be used for scraping a URL
 */
function getScrapingFallbackOrder(defaultScraper?: string) {
  const availableScrapers = baseScrapers.filter(scraper => {
    switch (scraper) {
      case "scrapingBee":
      case "scrapingBeeLoad":
        return !!process.env.SCRAPING_BEE_API_KEY;
      case "fire-engine":
        return !!process.env.FIRE_ENGINE_BETA_URL;
      case "playwright":
        return !!process.env.PLAYWRIGHT_MICROSERVICE_URL;
      default:
        return true;
    }
  });

  const defaultOrder = ["scrapingBee", "fire-engine", "playwright", "scrapingBeeLoad", "fetch"];
  const filteredDefaultOrder = defaultOrder.filter((scraper: typeof baseScrapers[number]) => availableScrapers.includes(scraper));
  const uniqueScrapers = new Set(defaultScraper ? [defaultScraper, ...filteredDefaultOrder, ...availableScrapers] : [...filteredDefaultOrder, ...availableScrapers]);
  const scrapersInOrder = Array.from(uniqueScrapers);
  return scrapersInOrder as typeof baseScrapers[number][];
}

export async function scrapSingleUrl(
  urlToScrap: string,
  pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false },
@@ -137,17 +221,14 @@ export async function scrapSingleUrl(
  const attemptScraping = async (
    url: string,
-   method:
-     | "firecrawl-scraper"
-     | "scrapingBee"
-     | "playwright"
-     | "scrapingBeeLoad"
-     | "fetch"
+   method: typeof baseScrapers[number]
  ) => {
    let text = "";
    switch (method) {
-     case "firecrawl-scraper":
-       text = await scrapWithCustomFirecrawl(url);
+     case "fire-engine":
+       if (process.env.FIRE_ENGINE_BETA_URL) {
+         text = await scrapWithFireEngine(url);
+       }
        break;
      case "scrapingBee":
        if (process.env.SCRAPING_BEE_API_KEY) {
@@ -169,25 +250,7 @@ export async function scrapSingleUrl(
        }
        break;
      case "fetch":
-       try {
-         const response = await fetch(url);
-         if (!response.ok) {
-           console.error(
-             `Error fetching URL: ${url} with status: ${response.status}`
-           );
-           return "";
-         }
-         const contentType = response.headers['content-type'];
-         if (contentType && contentType.includes('application/pdf')) {
-           return fetchAndProcessPdf(url);
-         } else {
-           text = await response.text();
-         }
-       } catch (error) {
-         console.error(`Error scraping URL: ${error}`);
-         return "";
-       }
+       text = await scrapWithFetch(url);
        break;
    }
@@ -205,15 +268,7 @@ export async function scrapSingleUrl(
    console.error(`Invalid URL key, trying: ${urlToScrap}`);
  }
  const defaultScraper = urlSpecificParams[urlKey]?.defaultScraper ?? "";
- const scrapersInOrder = defaultScraper
-   ? [
-       defaultScraper,
-       "scrapingBee",
-       "playwright",
-       "scrapingBeeLoad",
-       "fetch",
-     ]
-   : ["scrapingBee", "playwright", "scrapingBeeLoad", "fetch"];
+ const scrapersInOrder = getScrapingFallbackOrder(defaultScraper)

  for (const scraper of scrapersInOrder) {
    // If exists text coming from crawler, use it
@@ -225,7 +280,10 @@ export async function scrapSingleUrl(
    }
    [text, html] = await attemptScraping(urlToScrap, scraper);
    if (text && text.trim().length >= 100) break;
-   console.log(`Falling back to ${scraper}`);
+   const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1;
+   if (nextScraperIndex < scrapersInOrder.length) {
+     console.info(`Falling back to ${scrapersInOrder[nextScraperIndex]}`);
+   }
  }

  if (!text) {
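To make the new fallback behaviour concrete, here is a rough trace of getScrapingFallbackOrder above, under the assumption that only SCRAPING_BEE_API_KEY and PLAYWRIGHT_MICROSERVICE_URL are configured (no FIRE_ENGINE_BETA_URL):

// availableScrapers    -> ["scrapingBee", "playwright", "scrapingBeeLoad", "fetch"]
// filteredDefaultOrder -> ["scrapingBee", "playwright", "scrapingBeeLoad", "fetch"]
const order = getScrapingFallbackOrder("playwright");
// -> ["playwright", "scrapingBee", "scrapingBeeLoad", "fetch"]
// The per-URL defaultScraper is tried first; "fetch" always remains as the last resort.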

View File

@@ -63,7 +63,7 @@ export const urlSpecificParams = {
    },
  },
  "ycombinator.com":{
-   defaultScraper: "playwright",
+   defaultScraper: "fire-engine",
    params: {
      wait_browser: "networkidle2",
      block_resources: false,

View File

@@ -34,8 +34,6 @@ export const excludeNonMainTags = [
  "#nav",
  ".breadcrumbs",
  "#breadcrumbs",
- ".form",
- "form",
  "#search-form",
  ".search",
  "#search",
@@ -51,10 +49,6 @@ export const excludeNonMainTags = [
  "#tag",
  ".category",
  "#category",
- ".comment",
- "#comment",
- ".reply",
- "#reply",
- ".author",
- "#author",
+ ".cookie",
+ "#cookie"
];

View File

@@ -227,10 +227,11 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
    if (creditUsages && creditUsages.length > 0) {
      totalCreditsUsed = creditUsages[0].total_credits_used;
-     console.log("Total Credits Used:", totalCreditsUsed);
+     // console.log("Total Credits Used:", totalCreditsUsed);
    }
  } catch (error) {
    console.error("Error calculating credit usage:", error);
  }

  // Adjust total credits used by subtracting coupon value
  const adjustedCreditsUsed = Math.max(0, totalCreditsUsed - couponCredits);

View File

@@ -5,6 +5,11 @@ import { logtail } from "./logtail";
import { startWebScraperPipeline } from "../main/runWebScraper";
import { callWebhook } from "./webhook";
import { logJob } from "./logging/log_job";
import { initSDK } from '@hyperdx/node-opentelemetry';

if(process.env.ENV === 'production') {
  initSDK({ consoleCapture: true, additionalInstrumentations: []});
}

getWebScraperQueue().process(
  Math.floor(Number(process.env.NUM_WORKERS_PER_QUEUE ?? 8)),

View File

@@ -54,7 +54,7 @@ export const testSuiteRateLimiter = new RateLimiterRedis({
export function getRateLimiter(mode: RateLimiterMode, token: string, plan?: string){
  // Special test suite case. TODO: Change this later.
- if (token.includes("5089cefa58")){
+ if (token.includes("5089cefa58") || token.includes("6254cf9")){
    return testSuiteRateLimiter;
  }
  switch (mode) {

View File

@@ -4,10 +4,11 @@ import requests
import time

class FirecrawlApp:
-   def __init__(self, api_key=None):
+   def __init__(self, api_key=None, api_url='https://api.firecrawl.dev'):
        self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
        if self.api_key is None:
            raise ValueError('No API key provided')
        self.api_url = api_url or os.getenv('FIRECRAWL_API_URL')
@@ -38,7 +39,7 @@ class FirecrawlApp:
            scrape_params[key] = value
        # Make the POST request with the prepared headers and JSON data
        response = requests.post(
-           'https://api.firecrawl.dev/v0/scrape',
+           f'{self.api_url}/v0/scrape',
            headers=headers,
            json=scrape_params
        )
@@ -48,7 +49,7 @@ class FirecrawlApp:
                return response['data']
            else:
                raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
-       elif response.status_code in [402, 409, 500]:
+       elif response.status_code in [402, 408, 409, 500]:
            error_message = response.json().get('error', 'Unknown error occurred')
            raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}')
        else:
@@ -63,7 +64,7 @@ class FirecrawlApp:
        if params:
            json_data.update(params)
        response = requests.post(
-           'https://api.firecrawl.dev/v0/search',
+           f'{self.api_url}/v0/search',
            headers=headers,
            json=json_data
        )
@@ -85,7 +86,7 @@ class FirecrawlApp:
        json_data = {'url': url}
        if params:
            json_data.update(params)
-       response = self._post_request('https://api.firecrawl.dev/v0/crawl', json_data, headers)
+       response = self._post_request(f'{self.api_url}/v0/crawl', json_data, headers)
        if response.status_code == 200:
            job_id = response.json().get('jobId')
            if wait_until_done:
@@ -97,7 +98,7 @@ class FirecrawlApp:
    def check_crawl_status(self, job_id):
        headers = self._prepare_headers()
-       response = self._get_request(f'https://api.firecrawl.dev/v0/crawl/status/{job_id}', headers)
+       response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
        if response.status_code == 200:
            return response.json()
        else:
@@ -130,7 +131,7 @@ class FirecrawlApp:
    def _monitor_job_status(self, job_id, headers, timeout):
        import time
        while True:
-           status_response = self._get_request(f'https://api.firecrawl.dev/v0/crawl/status/{job_id}', headers)
+           status_response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
            if status_response.status_code == 200:
                status_data = status_response.json()
                if status_data['status'] == 'completed':
@@ -148,7 +149,7 @@ class FirecrawlApp:
                    self._handle_error(status_response, 'check crawl status')

    def _handle_error(self, response, action):
-       if response.status_code in [402, 409, 500]:
+       if response.status_code in [402, 408, 409, 500]:
            error_message = response.json().get('error', 'Unknown error occurred')
            raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}')
        else:

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: firecrawl-py
-Version: 0.0.8
+Version: 0.0.9
Summary: Python SDK for Firecrawl API
Home-page: https://github.com/mendableai/firecrawl
Author: Mendable.ai

View File

@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
setup(
    name='firecrawl-py',
-   version='0.0.8',
+   version='0.0.9',
    url='https://github.com/mendableai/firecrawl',
    author='Mendable.ai',
    author_email='nick@mendable.ai',