Merge pull request #11 from mendableai/feat/parse-to-markdown-tables
[Feat] Added HTML-to-Markdown table parser
This commit is contained in:
commit
460763ba5f
@ -82,6 +82,7 @@
|
|||||||
"scrapingbee": "^1.7.4",
|
"scrapingbee": "^1.7.4",
|
||||||
"stripe": "^12.2.0",
|
"stripe": "^12.2.0",
|
||||||
"turndown": "^7.1.3",
|
"turndown": "^7.1.3",
|
||||||
|
"turndown-plugin-gfm": "^1.0.2",
|
||||||
"typesense": "^1.5.4",
|
"typesense": "^1.5.4",
|
||||||
"unstructured-client": "^0.9.4",
|
"unstructured-client": "^0.9.4",
|
||||||
"uuid": "^9.0.1",
|
"uuid": "^9.0.1",
|
||||||
|
@ -134,6 +134,9 @@ dependencies:
|
|||||||
turndown:
|
turndown:
|
||||||
specifier: ^7.1.3
|
specifier: ^7.1.3
|
||||||
version: 7.1.3
|
version: 7.1.3
|
||||||
|
turndown-plugin-gfm:
|
||||||
|
specifier: ^1.0.2
|
||||||
|
version: 1.0.2
|
||||||
typesense:
|
typesense:
|
||||||
specifier: ^1.5.4
|
specifier: ^1.5.4
|
||||||
version: 1.7.2(@babel/runtime@7.24.0)
|
version: 1.7.2(@babel/runtime@7.24.0)
|
||||||
@ -5783,6 +5786,10 @@ packages:
|
|||||||
resolution: {integrity: sha512-AEYxH93jGFPn/a2iVAwW87VuUIkR1FVUKB77NwMF7nBTDkDrrT/Hpt/IrCJ0QXhW27jTBDcf5ZY7w6RiqTMw2Q==}
|
resolution: {integrity: sha512-AEYxH93jGFPn/a2iVAwW87VuUIkR1FVUKB77NwMF7nBTDkDrrT/Hpt/IrCJ0QXhW27jTBDcf5ZY7w6RiqTMw2Q==}
|
||||||
dev: false
|
dev: false
|
||||||
|
|
||||||
|
/turndown-plugin-gfm@1.0.2:
|
||||||
|
resolution: {integrity: sha512-vwz9tfvF7XN/jE0dGoBei3FXWuvll78ohzCZQuOb+ZjWrs3a0XhQVomJEb2Qh4VHTPNRO4GPZh0V7VRbiWwkRg==}
|
||||||
|
dev: false
|
||||||
|
|
||||||
/turndown@7.1.3:
|
/turndown@7.1.3:
|
||||||
resolution: {integrity: sha512-Z3/iJ6IWh8VBiACWQJaA5ulPQE5E1QwvBHj00uGzdQxdRnd8fh1DPqNOJqzQDu6DkOstORrtXzf/9adB+vMtEA==}
|
resolution: {integrity: sha512-Z3/iJ6IWh8VBiACWQJaA5ulPQE5E1QwvBHj00uGzdQxdRnd8fh1DPqNOJqzQDu6DkOstORrtXzf/9adB+vMtEA==}
|
||||||
dependencies:
|
dependencies:
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
export function parseMarkdown(html: string) {
|
export function parseMarkdown(html: string) {
|
||||||
var TurndownService = require("turndown");
|
var TurndownService = require("turndown");
|
||||||
|
var turndownPluginGfm = require("turndown-plugin-gfm");
|
||||||
|
|
||||||
const turndownService = new TurndownService();
|
const turndownService = new TurndownService();
|
||||||
turndownService.addRule("inlineLink", {
|
turndownService.addRule("inlineLink", {
|
||||||
@ -16,7 +17,8 @@ export function parseMarkdown(html: string) {
|
|||||||
return "[" + content.trim() + "](" + href + title + ")\n";
|
return "[" + content.trim() + "](" + href + title + ")\n";
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
|
var gfm = turndownPluginGfm.gfm;
|
||||||
|
turndownService.use(gfm);
|
||||||
let markdownContent = turndownService.turndown(html);
|
let markdownContent = turndownService.turndown(html);
|
||||||
|
|
||||||
// multiple line links
|
// multiple line links
|
||||||
|
@ -4,6 +4,7 @@ import { extractMetadata } from "./utils/metadata";
|
|||||||
import dotenv from "dotenv";
|
import dotenv from "dotenv";
|
||||||
import { Document } from "../../lib/entities";
|
import { Document } from "../../lib/entities";
|
||||||
import { parseMarkdown } from "../../lib/html-to-markdown";
|
import { parseMarkdown } from "../../lib/html-to-markdown";
|
||||||
|
import { parseTablesToMarkdown } from "./utils/parseTable";
|
||||||
// import puppeteer from "puppeteer";
|
// import puppeteer from "puppeteer";
|
||||||
|
|
||||||
dotenv.config();
|
dotenv.config();
|
||||||
@ -132,7 +133,7 @@ export async function scrapSingleUrl(
|
|||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
const cleanedHtml = removeUnwantedElements(text);
|
let cleanedHtml = removeUnwantedElements(text);
|
||||||
return [await parseMarkdown(cleanedHtml), text];
|
return [await parseMarkdown(cleanedHtml), text];
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -0,0 +1,128 @@
|
|||||||
|
import { parseTablesToMarkdown, convertTableElementToMarkdown, convertTableRowElementToMarkdown, createMarkdownDividerRow } from '../parseTable';
|
||||||
|
import cheerio from 'cheerio';
|
||||||
|
|
||||||
|
// Data-driven suite for parseTablesToMarkdown: every scenario feeds one HTML
// fixture through the parser and compares the trimmed result verbatim.
describe('parseTablesToMarkdown', () => {
  // Mirrors a multi-line template literal: the fixture starts and ends with a
  // newline and has one markup fragment per line.
  const fixture = (...fragments: string[]): string => `\n${fragments.join('\n')}\n`;

  const scenarios: Array<{ title: string; input: string; expected: string }> = [
    {
      title: 'converts a simple HTML table to Markdown',
      input: fixture(
        '<table>',
        '<tr><th>Header 1</th><th>Header 2</th></tr>',
        '<tr><td>Row 1 Col 1</td><td>Row 1 Col 2</td></tr>',
        '<tr><td>Row 2 Col 1</td><td>Row 2 Col 2</td></tr>',
        '</table>'
      ),
      expected: '<div>| Header 1 | Header 2 |\n| --- | --- |\n| Row 1 Col 1 | Row 1 Col 2 |\n| Row 2 Col 1 | Row 2 Col 2 |</div>',
    },
    {
      title: 'converts a table with a single row to Markdown',
      input: fixture(
        '<table>',
        '<tr><th>Header 1</th><th>Header 2</th></tr>',
        '<tr><td>Row 1 Col 1</td><td>Row 1 Col 2</td></tr>',
        '</table>'
      ),
      expected: '<div>| Header 1 | Header 2 |\n| --- | --- |\n| Row 1 Col 1 | Row 1 Col 2 |</div>',
    },
    {
      title: 'converts a table with a single column to Markdown',
      input: fixture(
        '<table>',
        '<tr><th>Header 1</th></tr>',
        '<tr><td>Row 1 Col 1</td></tr>',
        '<tr><td>Row 2 Col 1</td></tr>',
        '</table>'
      ),
      expected: '<div>| Header 1 |\n| --- |\n| Row 1 Col 1 |\n| Row 2 Col 1 |</div>',
    },
    {
      title: 'converts a table with a single cell to Markdown',
      input: fixture(
        '<table>',
        '<tr><th>Header 1</th></tr>',
        '<tr><td>Row 1 Col 1</td></tr>',
        '</table>'
      ),
      expected: '<div>| Header 1 |\n| --- |\n| Row 1 Col 1 |</div>',
    },
    {
      title: 'converts a table with no header to Markdown',
      input: fixture(
        '<table>',
        '<tr><td>Row 1 Col 1</td><td>Row 1 Col 2</td></tr>',
        '<tr><td>Row 2 Col 1</td><td>Row 2 Col 2</td></tr>',
        '</table>'
      ),
      expected: '<div>| Row 1 Col 1 | Row 1 Col 2 |\n| Row 2 Col 1 | Row 2 Col 2 |</div>',
    },
    {
      title: 'converts a table with no rows to Markdown',
      input: fixture('<table>', '</table>'),
      expected: '<div></div>',
    },
    {
      title: 'converts a table with no cells to Markdown',
      input: fixture('<table>', '<tr></tr>', '</table>'),
      expected: '<div></div>',
    },
    {
      title: 'converts a table with no columns to Markdown',
      input: fixture('<table>', '<tr><th></th></tr>', '</table>'),
      expected: '<div></div>',
    },
    {
      title: 'converts a table with no table to Markdown',
      input: '',
      expected: '',
    },
    {
      title: 'converts a table inside of a bunch of html noise',
      input: fixture(
        '<div>',
        '<p>Some text before</p>',
        '<table>',
        '<tr><td>Row 1 Col 1</td><td>Row 1 Col 2</td></tr>',
        '<tr><td>Row 2 Col 1</td><td>Row 2 Col 2</td></tr>',
        '</table>',
        '<p>Some text after</p>',
        '</div>'
      ),
      // Markup around the table must survive untouched; only the table itself
      // is swapped for a <div> holding the Markdown rows.
      expected: [
        '<div>',
        '<p>Some text before</p>',
        '<div>| Row 1 Col 1 | Row 1 Col 2 |',
        '| Row 2 Col 1 | Row 2 Col 2 |</div>',
        '<p>Some text after</p>',
        '</div>',
      ].join('\n'),
    },
  ];

  scenarios.forEach(({ title, input, expected }) => {
    it(title, async () => {
      const markdown = await parseTablesToMarkdown(input);
      expect(markdown).toBe(expected);
    });
  });
});
|
74
apps/api/src/scraper/WebScraper/utils/parseTable.ts
Normal file
74
apps/api/src/scraper/WebScraper/utils/parseTable.ts
Normal file
@ -0,0 +1,74 @@
|
|||||||
|
import cheerio, { CheerioAPI } from "cheerio";
|
||||||
|
|
||||||
|
/**
 * One pending substitution of a `<table>` element inside the source HTML
 * string.
 */
interface Replacement {
  /** Offset in the source string where the table markup begins. */
  start: number;
  /** Offset just past the table markup; used as an exclusive slice bound. */
  end: number;
  /** Markdown text that takes the table's place (may be empty). */
  markdownTable: string;
}
|
||||||
|
|
||||||
|
/**
 * Replaces each top-level `<table>` in `html` with a `<div>` that wraps a
 * Markdown rendering of that table; every other character of the input is
 * left as it was.
 *
 * Tables whose rendering contains nothing but pipes, dashes, spaces and
 * newlines are replaced by an empty `<div></div>`.
 *
 * @param html - Markup to scan for tables.
 * @returns The rewritten markup, trimmed of leading/trailing whitespace.
 */
export const parseTablesToMarkdown = async (html: string): Promise<string> => {
  const soup: CheerioAPI = cheerio.load(html, {
    xmlMode: true,
    withStartIndices: true,
    withEndIndices: true
  });

  // Only rewrite outermost tables. A nested table's range lies inside its
  // ancestor's range, so splicing both would apply the outer offsets to a
  // string whose length has already changed and corrupt the result.
  // Ancestry is resolved for all tables up front, before any element is
  // handed to another cheerio instance below.
  const topLevelTables = soup("table")
    .toArray()
    .filter((tableElement) => soup(tableElement).parents("table").length === 0);

  const replacements: Replacement[] = [];
  for (const tableElement of topLevelTables) {
    const { startIndex, endIndex } = tableElement;
    if (startIndex == null || endIndex == null) {
      // Without source offsets there is nothing safe to splice; leave the
      // table markup untouched.
      continue;
    }

    let markdownTable = convertTableElementToMarkdown(cheerio.load(tableElement));
    const isTableEmpty = markdownTable.replace(/[|\- \n]/g, '').length === 0;
    if (isTableEmpty) {
      markdownTable = '';
    }

    // endIndex addresses the element's last character; +1 turns it into an
    // exclusive bound so the closing tag is fully covered.
    replacements.push({ start: startIndex, end: endIndex + 1, markdownTable });
  }

  // Apply from the end of the string backwards so earlier offsets stay valid.
  replacements.sort((a, b) => b.start - a.start);

  let modifiedHtml = html;
  for (const { start, end, markdownTable } of replacements) {
    modifiedHtml = modifiedHtml.slice(0, start) + `<div>${markdownTable}</div>` + modifiedHtml.slice(end);
  }

  return modifiedHtml.trim();
};
|
||||||
|
|
||||||
|
/**
 * Renders the rows found in `tableSoup` as Markdown table rows.
 *
 * Every `<tr>` that has at least one `<th>`/`<td>` becomes a `| a | b |` line.
 * When the first row contains a `<th>`, a `| --- |` divider with one column
 * per cell of that row is emitted right after it.
 *
 * Cell text is normalised so it cannot break the row it sits in: runs of
 * whitespace (including newlines) collapse to a single space and literal
 * pipes are backslash-escaped.
 *
 * @param tableSoup - Cheerio instance whose document holds the table rows.
 * @returns The Markdown rows joined by newlines, or an empty string when no
 *          row has any cells.
 */
export const convertTableElementToMarkdown = (tableSoup: CheerioAPI): string => {
  const rows: string[] = [];
  let headerRowFound = false;

  tableSoup("tr").each((rowIndex, tr) => {
    const cellElements = tableSoup(tr).find("th, td");

    const cells: string = cellElements
      .map((_, cell) => {
        const cellNode = tableSoup(cell);
        if (cellNode.is("th")) {
          headerRowFound = true;
        }
        const cellText = cellNode
          .text()
          .replace(/\s+/g, " ") // a newline inside a cell would split the row
          .trim()
          .replace(/\|/g, "\\|"); // an unescaped pipe would add a bogus column
        return ` ${cellText} |`;
      })
      .get()
      .join("");

    if (cells) {
      rows.push(`|${cells}`);
    }

    // The divider is only emitted under the first row, and only if a header
    // cell has been seen by then.
    if (headerRowFound && rowIndex === 0) {
      rows.push(createMarkdownDividerRow(cellElements.length));
    }
  });

  return rows.join('\n').trim();
};
|
||||||
|
|
||||||
|
/**
 * Renders every `<td>`/`<th>` found in `rowSoup` as one Markdown table row.
 *
 * @param rowSoup - Cheerio instance whose document holds the row's cells.
 * @param rowNumber - Position of the row; accepted for callers but not read
 *                    by this function.
 * @returns A line of the form `| a | b |`, or a lone `|` when there are no
 *          cells.
 */
export function convertTableRowElementToMarkdown(rowSoup: CheerioAPI, rowNumber: number): string {
  const renderedCells: string[] = [];

  rowSoup("td, th").each((_, cellElement) => {
    const trimmedText = rowSoup(cellElement).text().trim();
    renderedCells.push(` ${trimmedText} |`);
  });

  return "|" + renderedCells.join("");
}
|
||||||
|
|
||||||
|
/**
 * Builds the Markdown header/body divider row (`| --- | --- |`) for a table
 * with `cellCount` columns.
 *
 * @param cellCount - Number of columns. Fractional values are rounded down.
 * @returns The divider row, or an empty string when `cellCount` is zero,
 *          negative or not a finite number (a divider without columns is not
 *          meaningful Markdown).
 */
export function createMarkdownDividerRow(cellCount: number): string {
  // Array(n) throws RangeError for negative, fractional or NaN lengths, so
  // normalise the count before building the row.
  const columnCount = Number.isFinite(cellCount) ? Math.floor(cellCount) : 0;
  if (columnCount <= 0) {
    return '';
  }
  return '| ' + Array.from({ length: columnCount }, () => '---').join(' | ') + ' |';
}
|
Loading…
Reference in New Issue
Block a user