Added an HTML-to-Markdown table parser
This commit is contained in:
parent
a12f4d96a2
commit
ff622739b7
@ -4,6 +4,7 @@ import { extractMetadata } from "./utils/metadata";
|
|||||||
import dotenv from "dotenv";
|
import dotenv from "dotenv";
|
||||||
import { Document } from "../../lib/entities";
|
import { Document } from "../../lib/entities";
|
||||||
import { parseMarkdown } from "../../lib/html-to-markdown";
|
import { parseMarkdown } from "../../lib/html-to-markdown";
|
||||||
|
import { parseTablesToMarkdown } from "./utils/parseTable";
|
||||||
// import puppeteer from "puppeteer";
|
// import puppeteer from "puppeteer";
|
||||||
|
|
||||||
dotenv.config();
|
dotenv.config();
|
||||||
@ -132,7 +133,8 @@ export async function scrapSingleUrl(
|
|||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
const cleanedHtml = removeUnwantedElements(text);
|
let cleanedHtml = removeUnwantedElements(text);
|
||||||
|
cleanedHtml = await parseTablesToMarkdown(cleanedHtml);
|
||||||
return [await parseMarkdown(cleanedHtml), text];
|
return [await parseMarkdown(cleanedHtml), text];
|
||||||
};
|
};
|
||||||
|
|
||||||
|
66
apps/api/src/scraper/WebScraper/utils/parseTable.ts
Normal file
66
apps/api/src/scraper/WebScraper/utils/parseTable.ts
Normal file
@ -0,0 +1,66 @@
|
|||||||
|
import cheerio from "cheerio";
|
||||||
|
|
||||||
|
/**
 * Replaces each top-level `<table>` in an HTML string with a Markdown
 * rendering of that table wrapped in a `<div>`.
 *
 * The markup is parsed with source offsets enabled so every table can be
 * spliced out of the original string by position; text outside the tables
 * is returned unchanged.
 *
 * @param html - HTML markup to scan for `<table>` elements.
 * @returns The markup with each table swapped for `<div>{markdown}</div>`.
 */
export const parseTablesToMarkdown = async (html: string): Promise<string> => {
  const soup = cheerio.load(html, {
    xmlMode: true,
    withStartIndices: true,
    withEndIndices: true
  });

  const replacements: { start: number; end: number; markdownTable: string }[] = [];
  // Offset just past the most recently queued table; used to detect nesting.
  let coveredUntil = 0;

  for (const table of Array.from(soup("table"))) {
    const { startIndex, endIndex } = table;
    // Without both offsets the table cannot be located in the source string.
    if (startIndex == null || endIndex == null) continue;

    // A table that begins inside an already queued table is nested in it.
    // Its rows are rendered as part of the enclosing table, and splicing it
    // separately would invalidate the enclosing table's offsets.
    // Assumes matches are yielded in document order (outer before inner).
    if (startIndex < coveredUntil) continue;

    // endIndex is the offset of the element's last character (inclusive);
    // add 1 to get the exclusive bound that slice() expects, otherwise the
    // closing tag's ">" is left behind in the output.
    const end = endIndex + 1;
    const markdownTable = await convertTableElementToMarkdown(cheerio.load(table));
    replacements.push({ start: startIndex, end, markdownTable });
    coveredUntil = end;
  }

  // Splice from the end of the document backwards so earlier offsets stay valid.
  replacements.sort((a, b) => b.start - a.start);

  let modifiedHtml = html;
  for (const { start, end, markdownTable } of replacements) {
    modifiedHtml = modifiedHtml.slice(0, start) + `<div>${markdownTable}</div>` + modifiedHtml.slice(end);
  }

  return modifiedHtml;
}
|
||||||
|
|
||||||
|
/**
 * Renders a table as Markdown text, one line per `<tr>`.
 *
 * Every `<tr>` found through `tableSoup` is converted with
 * `convertTableRowElementToMarkdown`, which receives the row's zero-based
 * position so it can append the header divider after the first row.
 *
 * @param tableSoup - cheerio root loaded from a single `<table>` element.
 * @returns The row lines joined with newlines (empty string when no rows).
 */
const convertTableElementToMarkdown = async (
  tableSoup: ReturnType<typeof cheerio.load>
): Promise<string> => {
  const markdownRows: string[] = Array.from(tableSoup("tr")).map((tr, rowIndex) =>
    // Each row gets its own cheerio root so the row converter's "td, th"
    // query is scoped to that row.
    convertTableRowElementToMarkdown(cheerio.load(tr), rowIndex)
  );

  return markdownRows.join('\n');
}
|
||||||
|
|
||||||
|
/**
 * Renders one table row as a Markdown table line.
 *
 * Each `<td>`/`<th>` contributes its text content with every run of
 * whitespace (including CR/LF and indentation) collapsed to a single space,
 * so a cell can never break the one-line-per-row table layout.
 *
 * @param rowSoup - cheerio root scoped to a single `<tr>` element.
 * @param rowNumber - Zero-based position of the row within its table.
 * @returns `| a | b |`; for row 0 the divider row follows on a second line.
 */
function convertTableRowElementToMarkdown(
  rowSoup: ReturnType<typeof cheerio.load>,
  rowNumber: number
): string {
  const cellEls = rowSoup("td, th");

  const cells: string[] = Array.from(cellEls).map((cell) => {
    // Wrap the existing node instead of loading a new document per cell.
    const cellText = rowSoup(cell).text().replace(/\s+/g, " ").trim();
    return cellText + ' |';
  });

  const row = '| ' + cells.join(" ");

  if (rowNumber !== 0) {
    return row;
  }

  // The first row is treated as the header, so the divider goes right after it.
  return row + '\n' + createMarkdownDividerRow(cellEls.length);
}
|
||||||
|
|
||||||
|
/**
 * Builds the Markdown divider row that separates a table header from its body.
 *
 * @param cellCount - Number of columns in the table.
 * @returns A line such as `| --- | --- |` with one `---` segment per column;
 *          `| ` when `cellCount` is 0.
 */
function createMarkdownDividerRow(cellCount: number): string {
  // One "--- |" segment per column, separated by single spaces.
  const dividerCells = new Array<string>(cellCount).fill('--- |');
  return '| ' + dividerCells.join(" ");
}
|
Loading…
Reference in New Issue
Block a user