import { Document, ExtractorOptions, PageOptions } from "../lib/entities";
import { Request, Response } from "express";
import { WebScraperDataProvider } from "../scraper/WebScraper";
import { billTeam, checkTeamCredits } from "../services/billing/credit_billing";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../types";
import { logJob } from "../services/logging/log_job";
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
import { numTokensFromString } from "../lib/LLM-extraction/helpers";
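
// Scrapes a single URL for a team: validates and blocklist-checks the URL,
// runs the scraper, bills credits for the returned documents, and returns
// the first non-empty document.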
export async function scrapeHelper(
  req: Request,
  team_id: string,
  crawlerOptions: any,
  pageOptions: PageOptions,
  extractorOptions: ExtractorOptions,
  includeHtml: boolean = false
): Promise<{
  success: boolean;
  error?: string;
  data?: Document;
  returnCode: number;
}> {
  const url = req.body.url;
  if (!url) {
    return { success: false, error: "Url is required", returnCode: 400 };
  }
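
  // Reject URLs on the blocklist (e.g. social media sites) before scraping.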
  if (isUrlBlocked(url)) {
    return {
      success: false,
      error:
        "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
      returnCode: 403,
    };
  }
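
  // Run the scraper in single-URL mode with the caller-provided options.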
  const scraper = new WebScraperDataProvider();
  await scraper.setOptions({
    mode: "single_urls",
    urls: [url],
    crawlerOptions: {
      ...crawlerOptions,
    },
    pageOptions: pageOptions,
    extractorOptions: extractorOptions,
    includeHtml: includeHtml,
  });
  const docs = await scraper.getDocuments(false);

  // Make sure doc.content is not empty.
  const filteredDocs = docs.filter(
    (doc: { content?: string }) => doc.content && doc.content.trim().length > 0
  );
  if (filteredDocs.length === 0) {
    return { success: true, error: "No page found", returnCode: 200 };
  }
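
  // Bill one credit per returned document, plus a per-document surcharge
  // when LLM extraction is used.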
  let creditsToBeBilled = filteredDocs.length;
  const creditsPerLLMExtract = 5;

  if (extractorOptions.mode === "llm-extraction") {
    creditsToBeBilled = creditsToBeBilled + creditsPerLLMExtract * filteredDocs.length;
  }

  const billingResult = await billTeam(team_id, creditsToBeBilled);
  if (!billingResult.success) {
    return {
      success: false,
      error:
        "Failed to bill team. Insufficient credits or subscription not found.",
      returnCode: 402,
    };
  }

  return {
    success: true,
    data: filteredDocs[0],
    returnCode: 200,
  };
}
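
// Express handler for scrape requests: authenticates the caller, verifies
// credits, runs scrapeHelper, logs the job, and returns the result.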
export async function scrapeController(req: Request, res: Response) {
  try {
    // Authenticate the user first (Authorization: Bearer <token>).
    const { success, team_id, error, status } = await authenticateUser(
      req,
      res,
      RateLimiterMode.Scrape
    );
    if (!success) {
      return res.status(status).json({ error });
    }
    const crawlerOptions = req.body.crawlerOptions ?? {};
    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
    const extractorOptions = req.body.extractorOptions ?? {
      mode: "markdown",
    };
    const origin = req.body.origin ?? "api";
    const includeHtml = req.body.includeHtml ?? false;
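
    // Pre-flight credit check before doing any scraping work.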
    try {
      const { success: creditsCheckSuccess, message: creditsCheckMessage } =
        await checkTeamCredits(team_id, 1);
      if (!creditsCheckSuccess) {
        return res.status(402).json({ error: "Insufficient credits" });
      }
    } catch (error) {
      console.error(error);
      return res.status(500).json({ error: "Internal server error" });
    }

    const startTime = new Date().getTime();
    const result = await scrapeHelper(
      req,
      team_id,
      crawlerOptions,
      pageOptions,
      extractorOptions,
      includeHtml
    );
    const endTime = new Date().getTime();
    const timeTakenInSeconds = (endTime - startTime) / 1000;
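
    // Estimate the token count of the returned markdown for job logging.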
    const numTokens =
      result.data && result.data.markdown
        ? numTokensFromString(result.data.markdown, "gpt-3.5-turbo")
        : 0;
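
    // Log the job with its options, timing, and outcome.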
    logJob({
      success: result.success,
      message: result.error,
      num_docs: 1,
      docs: [result.data],
      time_taken: timeTakenInSeconds,
      team_id: team_id,
      mode: "scrape",
      url: req.body.url,
      crawlerOptions: crawlerOptions,
      pageOptions: pageOptions,
      origin: origin,
      extractor_options: extractorOptions,
      num_tokens: numTokens,
      includeHtml: includeHtml,
    });

    return res.status(result.returnCode).json(result);
  } catch (error) {
    console.error(error);
    return res.status(500).json({ error: error.message });
  }
}