0

Added an HTML-to-Markdown table parser

This commit is contained in:
rafaelsideguide 2024-04-17 11:01:19 -03:00
parent a12f4d96a2
commit ff622739b7
2 changed files with 69 additions and 1 deletions

View File

@ -4,6 +4,7 @@ import { extractMetadata } from "./utils/metadata";
import dotenv from "dotenv"; import dotenv from "dotenv";
import { Document } from "../../lib/entities"; import { Document } from "../../lib/entities";
import { parseMarkdown } from "../../lib/html-to-markdown"; import { parseMarkdown } from "../../lib/html-to-markdown";
import { parseTablesToMarkdown } from "./utils/parseTable";
// import puppeteer from "puppeteer"; // import puppeteer from "puppeteer";
dotenv.config(); dotenv.config();
@ -132,7 +133,8 @@ export async function scrapSingleUrl(
} }
break; break;
} }
const cleanedHtml = removeUnwantedElements(text); let cleanedHtml = removeUnwantedElements(text);
cleanedHtml = await parseTablesToMarkdown(cleanedHtml);
return [await parseMarkdown(cleanedHtml), text]; return [await parseMarkdown(cleanedHtml), text];
}; };

View File

@ -0,0 +1,66 @@
import cheerio from "cheerio";
/**
 * Replaces every top-level `<table>` element in `html` with a `<div>` whose
 * text content is the Markdown rendering of that table.
 *
 * Tables are located by parsing `html` with source indices enabled and are
 * then spliced out of the ORIGINAL string, so all markup outside the tables
 * is returned byte-for-byte unchanged.
 *
 * Tables nested inside another table are not replaced separately: the outer
 * table's conversion already walks every `<tr>` beneath it, and splicing both
 * ranges would apply stale offsets to the outer one.
 *
 * @param html - The HTML document (or fragment) to transform.
 * @returns The HTML with each top-level table replaced by `<div>markdown</div>`.
 */
export const parseTablesToMarkdown = async (html: string): Promise<string> => {
  // xmlMode makes cheerio parse with htmlparser2, the parser that honours the
  // withStartIndices / withEndIndices options used below.
  const soup = cheerio.load(html, {
    xmlMode: true,
    withStartIndices: true,
    withEndIndices: true
  });

  // Selections are expected to be in document order already; sorting makes the
  // single forward pass below independent of that assumption.
  const tables = Array.from(soup("table")).sort(
    (a, b) => (a.startIndex ?? 0) - (b.startIndex ?? 0)
  );

  let modifiedHtml = "";
  // Offset in `html` up to which output has already been emitted.
  let cursor = 0;

  for (const table of tables) {
    const { startIndex, endIndex } = table;
    // Without both offsets the element cannot be located in the source string.
    if (startIndex == null || endIndex == null) continue;
    // Starts inside a table that was already replaced => nested table; skip.
    if (startIndex < cursor) continue;

    const markdownTable = await convertTableElementToMarkdown(cheerio.load(table));

    // NOTE(review): markdownTable is plain text inserted as-is; cell text that
    // contains markup characters is not re-escaped here.
    modifiedHtml += html.slice(cursor, startIndex) + `<div>${markdownTable}</div>`;

    // endIndex is inclusive (it points at the closing '>'), so resume one
    // character later; otherwise a stray '>' is left behind after the <div>.
    cursor = endIndex + 1;
  }

  return modifiedHtml + html.slice(cursor);
}
/**
 * Renders one table as Markdown text, one line per `<tr>`.
 *
 * Each row is converted by `convertTableRowElementToMarkdown`, which receives
 * the row's position so that it can append the header divider after row 0.
 *
 * @param tableSoup - A cheerio instance loaded with the table element.
 * @returns The Markdown rows joined with newlines ('' when there are no rows).
 */
const convertTableElementToMarkdown = async (tableSoup) => {
  const markdownRows = tableSoup("tr")
    .toArray()
    .map((tr, rowIndex) =>
      convertTableRowElementToMarkdown(cheerio.load(tr), rowIndex)
    );

  return markdownRows.join('\n');
}
/**
 * Renders a single table row as a Markdown table line.
 *
 * Every `<td>` / `<th>` contributes its text content with newlines turned
 * into spaces and surrounding whitespace trimmed. For the first row
 * (`rowNumber === 0`) a divider line with the same number of columns is
 * appended on the following line.
 *
 * NOTE(review): '|' characters inside cell text are emitted unescaped.
 *
 * @param rowSoup - A cheerio instance loaded with the `<tr>` element.
 * @param rowNumber - Zero-based position of the row within its table.
 * @returns The Markdown line (plus the divider line for the first row).
 */
function convertTableRowElementToMarkdown(rowSoup, rowNumber) {
  const cellEls = rowSoup("td, th");

  const renderedCells = cellEls.toArray().map((cell) => {
    const flattened = cheerio.load(cell).text().replace(/\n/g, " ").trim();
    return flattened + ' |';
  });

  const line = '| ' + renderedCells.join(" ");
  if (rowNumber !== 0) {
    return line;
  }
  return line + '\n' + createMarkdownDividerRow(cellEls.length);
}
/**
 * Builds the Markdown header divider line (e.g. `| --- | --- |`) for a table
 * with the given number of columns.
 *
 * @param cellCount - Number of columns in the table.
 * @returns The divider line; `'| '` when `cellCount` is 0.
 */
function createMarkdownDividerRow(cellCount: number): string {
  const columns = new Array<string>(cellCount);
  columns.fill('--- |');
  return `| ${columns.join(" ")}`;
}