From ff622739b74f3ad0b3215dbb7c589a387f0fb882 Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Wed, 17 Apr 2024 11:01:19 -0300
Subject: [PATCH] Added a html to markdown table parser

---
Review notes:
- parseTable.ts: the end offset used when splicing is now endIndex + 1.
  endIndex addresses the element's last character, so slicing at endIndex
  left the closing ">" of "</table>" in the output.
- parseTable.ts: a table nested inside another table is no longer spliced
  separately; overlapping ranges corrupted the result.
- parseTable.ts: a literal "|" inside a cell is escaped, and the helpers
  carry explicit types.
- NOTE(review): the whitespace of this patch was reconstructed from a copy
  whose line breaks were lost. Confirm that the single_url.ts context lines
  still apply before merging.
- NOTE(review): that copy showed the replacement wrapped in bare newlines.
  If the original wrapped it in an HTML element, restore that wrapper.

 apps/api/src/scraper/WebScraper/single_url.ts |   4 +-
 .../scraper/WebScraper/utils/parseTable.ts    | 125 ++++++++++++++++++
 2 files changed, 128 insertions(+), 1 deletion(-)
 create mode 100644 apps/api/src/scraper/WebScraper/utils/parseTable.ts

diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index 0cdbe51..faba56c 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -4,6 +4,7 @@ import { extractMetadata } from "./utils/metadata";
 import dotenv from "dotenv";
 import { Document } from "../../lib/entities";
 import { parseMarkdown } from "../../lib/html-to-markdown";
+import { parseTablesToMarkdown } from "./utils/parseTable";
 // import puppeteer from "puppeteer";
 
 dotenv.config();
@@ -132,7 +133,8 @@ export async function scrapSingleUrl(
         }
         break;
     }
-    const cleanedHtml = removeUnwantedElements(text);
+    let cleanedHtml = removeUnwantedElements(text);
+    cleanedHtml = await parseTablesToMarkdown(cleanedHtml);
     return [await parseMarkdown(cleanedHtml), text];
   };
 
diff --git a/apps/api/src/scraper/WebScraper/utils/parseTable.ts b/apps/api/src/scraper/WebScraper/utils/parseTable.ts
new file mode 100644
--- /dev/null
+++ b/apps/api/src/scraper/WebScraper/utils/parseTable.ts
@@ -0,0 +1,125 @@
+import cheerio from "cheerio";
+
+/** Cheerio document handle, as returned by `cheerio.load`. */
+type Soup = ReturnType<typeof cheerio.load>;
+
+/** Offsets of one `<table>` in the source markup plus its markdown form. */
+interface TableReplacement {
+  start: number;
+  /** Exclusive: one past the last character of the element. */
+  end: number;
+  markdownTable: string;
+}
+
+/**
+ * Replaces each outermost `<table>` element in `html` with a markdown table.
+ *
+ * The markup is parsed with start/end indices enabled so every table can be
+ * spliced out of the original string by offset; text outside the tables is
+ * returned untouched.
+ *
+ * @param html - Markup to scan for tables.
+ * @returns The markup with each outermost table replaced by its markdown form.
+ */
+export const parseTablesToMarkdown = async (html: string): Promise<string> => {
+  const soup = cheerio.load(html, {
+    xmlMode: true,
+    withStartIndices: true,
+    withEndIndices: true
+  });
+
+  const replacements: TableReplacement[] = [];
+  for (const table of soup("table").toArray()) {
+    const { startIndex, endIndex } = table;
+    if (startIndex == null || endIndex == null) {
+      continue;
+    }
+    replacements.push({
+      start: startIndex,
+      // endIndex is the offset of the element's last character; add one for an
+      // exclusive bound so the closing `>` of `</table>` is removed as well.
+      end: endIndex + 1,
+      markdownTable: convertTableElementToMarkdown(cheerio.load(table))
+    });
+  }
+  replacements.sort((a, b) => a.start - b.start);
+
+  // A table that starts inside the previously kept range is nested in it. Its
+  // rows are already part of the outer table's markdown, and splicing
+  // overlapping ranges would corrupt the output, so only outermost tables stay.
+  const outermost: TableReplacement[] = [];
+  for (const replacement of replacements) {
+    const previous = outermost[outermost.length - 1];
+    if (previous === undefined || replacement.start >= previous.end) {
+      outermost.push(replacement);
+    }
+  }
+
+  // Splice back to front so the offsets of earlier tables stay valid.
+  let modifiedHtml = html;
+  for (const { start, end, markdownTable } of outermost.reverse()) {
+    modifiedHtml =
+      modifiedHtml.slice(0, start) + `\n${markdownTable}\n` + modifiedHtml.slice(end);
+  }
+
+  return modifiedHtml;
+};
+
+/**
+ * Converts one table into markdown, one line per `<tr>`.
+ *
+ * @param tableSoup - Cheerio handle loaded from the table element.
+ * @returns The markdown rows joined with newlines.
+ */
+const convertTableElementToMarkdown = (tableSoup: Soup): string => {
+  const rows: string[] = [];
+
+  tableSoup("tr").each((i, tr) => {
+    rows.push(convertTableRowElementToMarkdown(cheerio.load(tr), i));
+  });
+
+  return rows.join("\n");
+};
+
+/**
+ * Converts one table row into a markdown row. The first row is followed by a
+ * divider row so that it renders as the table header.
+ *
+ * @param rowSoup - Cheerio handle loaded from the `<tr>` element.
+ * @param rowNumber - Zero-based index of the row within its table.
+ * @returns The markdown line(s) for the row.
+ */
+function convertTableRowElementToMarkdown(rowSoup: Soup, rowNumber: number): string {
+  const cells: string[] = [];
+  const cellEls = rowSoup("td, th");
+
+  cellEls.each((_, cell) => {
+    const cellText = cheerio
+      .load(cell)
+      .text()
+      .replace(/\n/g, " ")
+      .trim()
+      // A literal pipe would otherwise be read as a column separator.
+      .replace(/\|/g, "\\|");
+    cells.push(cellText + " |");
+  });
+
+  let row = "| " + cells.join(" ");
+
+  if (rowNumber === 0) {
+    row += "\n" + createMarkdownDividerRow(cellEls.length);
+  }
+
+  return row;
+}
+
+/**
+ * Builds the `| --- | --- |` divider placed under the header row.
+ *
+ * @param cellCount - Number of columns in the table.
+ * @returns The markdown divider row.
+ */
+function createMarkdownDividerRow(cellCount: number): string {
+  const dividerCells = Array(cellCount).fill("--- |");
+  return "| " + dividerCells.join(" ");
+}