Added an HTML-to-Markdown table parser
This commit is contained in:
parent
a12f4d96a2
commit
ff622739b7
@ -4,6 +4,7 @@ import { extractMetadata } from "./utils/metadata";
|
||||
import dotenv from "dotenv";
|
||||
import { Document } from "../../lib/entities";
|
||||
import { parseMarkdown } from "../../lib/html-to-markdown";
|
||||
import { parseTablesToMarkdown } from "./utils/parseTable";
|
||||
// import puppeteer from "puppeteer";
|
||||
|
||||
dotenv.config();
|
||||
@ -132,7 +133,8 @@ export async function scrapSingleUrl(
|
||||
}
|
||||
break;
|
||||
}
|
||||
const cleanedHtml = removeUnwantedElements(text);
|
||||
let cleanedHtml = removeUnwantedElements(text);
|
||||
cleanedHtml = await parseTablesToMarkdown(cleanedHtml);
|
||||
return [await parseMarkdown(cleanedHtml), text];
|
||||
};
|
||||
|
||||
|
66
apps/api/src/scraper/WebScraper/utils/parseTable.ts
Normal file
66
apps/api/src/scraper/WebScraper/utils/parseTable.ts
Normal file
@ -0,0 +1,66 @@
|
||||
import cheerio from "cheerio";
|
||||
|
||||
/**
 * Replaces each outermost `<table>` element in an HTML string with a `<div>`
 * that wraps a Markdown rendering of the table.
 *
 * The document is parsed with source offsets enabled so that every table can
 * be swapped out by slicing the original string; markup outside the tables is
 * left untouched.
 *
 * @param html - HTML source text to process.
 * @returns The HTML text with each outermost table replaced by
 *          `<div>…markdown…</div>`. The input is returned unchanged when it
 *          contains no tables.
 */
export const parseTablesToMarkdown = async (html: string) => {
  const soup = cheerio.load(html, {
    xmlMode: true,
    withStartIndices: true,
    withEndIndices: true
  });

  // Gather the source range of every table the parser could locate.
  const located = [];
  for (const table of Array.from(soup("table"))) {
    const start = table.startIndex;
    const lastCharIndex = table.endIndex;
    // Offsets are missing when the parser could not record them; such a
    // table cannot be spliced safely, so it is left as-is.
    if (start == null || lastCharIndex == null) {
      continue;
    }
    // `endIndex` points AT the last character of the element, while
    // `String.prototype.slice` needs an exclusive end, hence the + 1.
    // Without it the final ">" of "</table>" leaks into the output.
    located.push({ table, start, end: lastCharIndex + 1 });
  }
  located.sort((a, b) => a.start - b.start);

  const replacements: { start: number; end: number; markdownTable: string }[] = [];
  // Exclusive end offset of the most recently accepted (outermost) table.
  let coveredUntil = 0;
  for (const { table, start, end } of located) {
    // A table that begins inside an already accepted range is nested in that
    // table. Replacing it separately would shift the outer table's offsets
    // and corrupt the splice, so only the outermost table is replaced.
    if (start < coveredUntil) {
      continue;
    }
    const markdownTable = await convertTableElementToMarkdown(cheerio.load(table));
    replacements.push({ start, end, markdownTable });
    coveredUntil = end;
  }

  // Splice from the back of the string to the front so that the offsets of
  // the replacements still to be applied remain valid.
  let modifiedHtml = html;
  for (let i = replacements.length - 1; i >= 0; i--) {
    const { start, end, markdownTable } = replacements[i];
    modifiedHtml = modifiedHtml.slice(0, start) + `<div>${markdownTable}</div>` + modifiedHtml.slice(end);
  }

  return modifiedHtml;
}
|
||||
|
||||
/**
 * Renders a table as Markdown text, producing one entry per `tr` element.
 *
 * @param tableSoup - Selector function scoped to the table; the caller in
 *                    this file passes the result of `cheerio.load(table)`.
 * @returns The Markdown for every matched row, joined with newlines.
 */
const convertTableElementToMarkdown = async (tableSoup) => {
  const markdownRows = tableSoup("tr")
    .toArray()
    .map((rowEl, rowIndex) =>
      // Each row is re-loaded so the row converter can query only its cells.
      convertTableRowElementToMarkdown(cheerio.load(rowEl), rowIndex)
    );

  return markdownRows.join('\n');
}
|
||||
|
||||
/**
 * Renders one table row as a Markdown table line.
 *
 * Every `td` / `th` cell contributes its text with newlines turned into
 * spaces and surrounding whitespace trimmed. For the row numbered 0 a
 * divider line is appended after the row, so that row acts as the header.
 *
 * @param rowSoup - Selector function scoped to the row.
 * @param rowNumber - Zero-based position of the row within its table.
 * @returns The Markdown line, followed by a divider line when `rowNumber` is 0.
 */
function convertTableRowElementToMarkdown(rowSoup, rowNumber) {
  const cellEls = rowSoup("td, th");

  const renderedCells = cellEls.toArray().map((cellEl) => {
    const flattened = cheerio.load(cellEl).text().replace(/\n/g, " ").trim();
    return flattened + ' |';
  });

  const row = '| ' + renderedCells.join(" ");

  // Only the first row is followed by the header divider.
  if (rowNumber !== 0) {
    return row;
  }

  return row + '\n' + createMarkdownDividerRow(cellEls.length);
}
|
||||
|
||||
/**
 * Builds the Markdown divider line that separates a table header from its
 * body.
 *
 * @param cellCount - Number of columns the divider should span.
 * @returns A line such as `| --- | --- |`; with zero columns the result is
 *          just the leading `| `.
 */
function createMarkdownDividerRow(cellCount) {
  const dividerCell = '--- |';
  const segments = Array.from(Array(cellCount), () => dividerCell);
  return `| ${segments.join(" ")}`;
}
|
Loading…
Reference in New Issue
Block a user