diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/parseTable.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/parseTable.test.ts new file mode 100644 index 0000000..8d644c7 --- /dev/null +++ b/apps/api/src/scraper/WebScraper/utils/__tests__/parseTable.test.ts @@ -0,0 +1,128 @@ +import { parseTablesToMarkdown, convertTableElementToMarkdown, convertTableRowElementToMarkdown, createMarkdownDividerRow } from '../parseTable'; +import cheerio from 'cheerio'; + +describe('parseTablesToMarkdown', () => { + it('converts a simple HTML table to Markdown', async () => { + const html = ` + + + + +
Header 1Header 2
Row 1 Col 1Row 1 Col 2
Row 2 Col 1Row 2 Col 2
+ `; + const expectedMarkdown = `
| Header 1 | Header 2 |\n| --- | --- |\n| Row 1 Col 1 | Row 1 Col 2 |\n| Row 2 Col 1 | Row 2 Col 2 |
`; + const markdown = await parseTablesToMarkdown(html); + expect(markdown).toBe(expectedMarkdown); + }); + + it('converts a table with a single row to Markdown', async () => { + const html = ` + + + +
Header 1Header 2
Row 1 Col 1Row 1 Col 2
+ `; + const expectedMarkdown = `
| Header 1 | Header 2 |\n| --- | --- |\n| Row 1 Col 1 | Row 1 Col 2 |
`; + const markdown = await parseTablesToMarkdown(html); + expect(markdown).toBe(expectedMarkdown); + }); + + it('converts a table with a single column to Markdown', async () => { + const html = ` + + + + +
Header 1
Row 1 Col 1
Row 2 Col 1
+ `; + const expectedMarkdown = `
| Header 1 |\n| --- |\n| Row 1 Col 1 |\n| Row 2 Col 1 |
`; + const markdown = await parseTablesToMarkdown(html); + expect(markdown).toBe(expectedMarkdown); + }); + + it('converts a table with a single cell to Markdown', async () => { + const html = ` + + + +
Header 1
Row 1 Col 1
+ `; + const expectedMarkdown = `
| Header 1 |\n| --- |\n| Row 1 Col 1 |
`; + const markdown = await parseTablesToMarkdown(html); + expect(markdown).toBe(expectedMarkdown); + }); + + it('converts a table with no header to Markdown', async () => { + const html = ` + + + +
Row 1 Col 1Row 1 Col 2
Row 2 Col 1Row 2 Col 2
+ `; + const expectedMarkdown = `
| Row 1 Col 1 | Row 1 Col 2 |\n| Row 2 Col 1 | Row 2 Col 2 |
`; + const markdown = await parseTablesToMarkdown(html); + expect(markdown).toBe(expectedMarkdown); + }); + + it('converts a table with no rows to Markdown', async () => { + const html = ` + +
+ `; + const expectedMarkdown = `
`; + const markdown = await parseTablesToMarkdown(html); + expect(markdown).toBe(expectedMarkdown); + }); + + it('converts a table with no cells to Markdown', async () => { + const html = ` + + +
+ `; + const expectedMarkdown = `
`; + const markdown = await parseTablesToMarkdown(html); + expect(markdown).toBe(expectedMarkdown); + }); + + it('converts a table with no columns to Markdown', async () => { + const html = ` + + +
+ `; + const expectedMarkdown = `
`; + const markdown = await parseTablesToMarkdown(html); + expect(markdown).toBe(expectedMarkdown); + }); + + it('converts a table with no table to Markdown', async () => { + const html = ``; + const expectedMarkdown = ``; + const markdown = await parseTablesToMarkdown(html); + expect(markdown).toBe(expectedMarkdown); + }); + +it('converts a table inside of a bunch of html noise', async () => { + const html = ` +
+

Some text before

+ + + +
Row 1 Col 1Row 1 Col 2
Row 2 Col 1Row 2 Col 2
+

Some text after

+
+ `; + const expectedMarkdown = `
+

Some text before

+
| Row 1 Col 1 | Row 1 Col 2 | +| Row 2 Col 1 | Row 2 Col 2 |
+

Some text after

+
`; + + const markdown = await parseTablesToMarkdown(html); + expect(markdown).toBe(expectedMarkdown); +}); + +}); diff --git a/apps/api/src/scraper/WebScraper/utils/parseTable.ts b/apps/api/src/scraper/WebScraper/utils/parseTable.ts index fdd90a7..7d0a602 100644 --- a/apps/api/src/scraper/WebScraper/utils/parseTable.ts +++ b/apps/api/src/scraper/WebScraper/utils/parseTable.ts @@ -1,66 +1,75 @@ -import cheerio from "cheerio"; +import cheerio, { CheerioAPI } from "cheerio"; -export const parseTablesToMarkdown = async (html: string) => { - let soup = cheerio.load(html, { +interface Replacement { + start: number; + end: number; + markdownTable: string; +} + +export const parseTablesToMarkdown = async (html: string): Promise => { + const soup: CheerioAPI = cheerio.load(html, { xmlMode: true, withStartIndices: true, withEndIndices: true }); let tables = soup("table"); - let replacements = []; - + let replacements: Replacement[] = []; if (tables.length) { - for (const table of Array.from(tables)) { - const start = table.startIndex; - const end = table.endIndex; - const markdownTable = await convertTableElementToMarkdown(cheerio.load(table)); + tables.each((_, tableElement) => { + const start: number = tableElement.startIndex; + const end: number = tableElement.endIndex + 1; // Include the closing tag properly + let markdownTable: string = convertTableElementToMarkdown(cheerio.load(tableElement)); + const isTableEmpty: boolean = markdownTable.replace(/[|\- \n]/g, '').length === 0; + if (isTableEmpty) { + markdownTable = ''; + } + console.log({markdownTable}) replacements.push({ start, end, markdownTable }); - }; + }); } replacements.sort((a, b) => b.start - a.start); - let modifiedHtml = html; - for (const { start, end, markdownTable } of replacements) { + let modifiedHtml: string = html; + replacements.forEach(({ start, end, markdownTable }) => { modifiedHtml = modifiedHtml.slice(0, start) + `
${markdownTable}
` + modifiedHtml.slice(end); - } - - return modifiedHtml; -} - -const convertTableElementToMarkdown = async (tableSoup) => { - const rows = []; - const trEls = tableSoup("tr"); - - trEls.each((i, tr) => { - const markdownRow = convertTableRowElementToMarkdown(cheerio.load(tr), i); - rows.push(markdownRow); }); - return rows.join('\n'); -} + return modifiedHtml.trim(); +}; -function convertTableRowElementToMarkdown(rowSoup, rowNumber) { - const cells = []; - const cellEls = rowSoup("td, th"); - - cellEls.each((i, cell) => { - let cellText = cheerio.load(cell).text(); - cellText = cellText.replace(/\n/g, " ").trim(); - cells.push(cellText + ' |'); +export const convertTableElementToMarkdown = (tableSoup: CheerioAPI): string => { + let rows: string[] = []; + let headerRowFound: boolean = false; + tableSoup("tr").each((i, tr) => { + const cells: string = tableSoup(tr).find("th, td").map((_, cell) => { + let cellText: string = tableSoup(cell).text().trim(); + if (tableSoup(cell).is("th") && !headerRowFound) { + headerRowFound = true; + } + return ` ${cellText} |`; + }).get().join(""); + if (cells) { + rows.push(`|${cells}`); + } + if (headerRowFound && i === 0) { // Header row + rows.push(createMarkdownDividerRow(tableSoup(tr).find("th, td").length)); + } }); - let row = '| ' + cells.join(" "); + return rows.join('\n').trim(); +}; - if (rowNumber === 0) { - row += '\n' + createMarkdownDividerRow(cellEls.length); - } +export function convertTableRowElementToMarkdown(rowSoup: CheerioAPI, rowNumber: number): string { + const cells: string = rowSoup("td, th").map((_, cell) => { + let cellText: string = rowSoup(cell).text().trim(); + return ` ${cellText} |`; + }).get().join(""); - return row; -} + return `|${cells}`; +}; -function createMarkdownDividerRow(cellCount) { - const dividerCells = Array(cellCount).fill('--- |'); - return '| ' + dividerCells.join(" "); +export function createMarkdownDividerRow(cellCount: number): string { + return '| ' + Array(cellCount).fill('---').join(' | ') + ' |'; } \ No newline at end of file