From ff622739b74f3ad0b3215dbb7c589a387f0fb882 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Wed, 17 Apr 2024 11:01:19 -0300 Subject: [PATCH] Added a html to markdown table parser --- apps/api/src/scraper/WebScraper/single_url.ts | 4 +- .../scraper/WebScraper/utils/parseTable.ts | 66 +++++++++++++++++++ 2 files changed, 69 insertions(+), 1 deletion(-) create mode 100644 apps/api/src/scraper/WebScraper/utils/parseTable.ts diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 0cdbe51..faba56c 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -4,6 +4,7 @@ import { extractMetadata } from "./utils/metadata"; import dotenv from "dotenv"; import { Document } from "../../lib/entities"; import { parseMarkdown } from "../../lib/html-to-markdown"; +import { parseTablesToMarkdown } from "./utils/parseTable"; // import puppeteer from "puppeteer"; dotenv.config(); @@ -132,7 +133,8 @@ export async function scrapSingleUrl( } break; } - const cleanedHtml = removeUnwantedElements(text); + let cleanedHtml = removeUnwantedElements(text); + cleanedHtml = await parseTablesToMarkdown(cleanedHtml); return [await parseMarkdown(cleanedHtml), text]; }; diff --git a/apps/api/src/scraper/WebScraper/utils/parseTable.ts b/apps/api/src/scraper/WebScraper/utils/parseTable.ts new file mode 100644 index 0000000..fdd90a7 --- /dev/null +++ b/apps/api/src/scraper/WebScraper/utils/parseTable.ts @@ -0,0 +1,66 @@ +import cheerio from "cheerio"; + +export const parseTablesToMarkdown = async (html: string) => { + let soup = cheerio.load(html, { + xmlMode: true, + withStartIndices: true, + withEndIndices: true + }); + let tables = soup("table"); + let replacements = []; + + + if (tables.length) { + for (const table of Array.from(tables)) { + const start = table.startIndex; + const end = table.endIndex; + const markdownTable = await convertTableElementToMarkdown(cheerio.load(table)); + replacements.push({ start, end, markdownTable }); + }; + } + + replacements.sort((a, b) => b.start - a.start); + + let modifiedHtml = html; + for (const { start, end, markdownTable } of replacements) { + modifiedHtml = modifiedHtml.slice(0, start) + `