From ee8a097252e8f7ca19b8fc3d23c50599ed554773 Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Wed, 17 Apr 2024 15:56:01 -0300
Subject: [PATCH] adding unit tests and fixing the parse function
---
.../utils/__tests__/parseTable.test.ts | 128 ++++++++++++++++++
.../scraper/WebScraper/utils/parseTable.ts | 95 +++++++------
2 files changed, 180 insertions(+), 43 deletions(-)
create mode 100644 apps/api/src/scraper/WebScraper/utils/__tests__/parseTable.test.ts
diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/parseTable.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/parseTable.test.ts
new file mode 100644
index 0000000..8d644c7
--- /dev/null
+++ b/apps/api/src/scraper/WebScraper/utils/__tests__/parseTable.test.ts
@@ -0,0 +1,128 @@
+import { parseTablesToMarkdown, convertTableElementToMarkdown, convertTableRowElementToMarkdown, createMarkdownDividerRow } from '../parseTable';
+import cheerio from 'cheerio';
+
+// Test suite for parseTablesToMarkdown: every case passes an HTML string to the
+// parser and compares the returned string with an exact expectation via toBe.
+// NOTE(review): the fixtures below show no <table>/<tr>/<td> markup as written here;
+// confirm they match the original commit before relying on these expectations.
+describe('parseTablesToMarkdown', () => {
+ // Header row followed by two body rows; a divider row is expected after the header.
+ it('converts a simple HTML table to Markdown', async () => {
+ const html = `
+
+ Header 1 | Header 2 |
+ Row 1 Col 1 | Row 1 Col 2 |
+ Row 2 Col 1 | Row 2 Col 2 |
+
+ `;
+ const expectedMarkdown = `| Header 1 | Header 2 |\n| --- | --- |\n| Row 1 Col 1 | Row 1 Col 2 |\n| Row 2 Col 1 | Row 2 Col 2 |
`;
+ const markdown = await parseTablesToMarkdown(html);
+ expect(markdown).toBe(expectedMarkdown);
+ });
+
+ // Header row plus a single body row.
+ it('converts a table with a single row to Markdown', async () => {
+ const html = `
+
+ Header 1 | Header 2 |
+ Row 1 Col 1 | Row 1 Col 2 |
+
+ `;
+ const expectedMarkdown = `| Header 1 | Header 2 |\n| --- | --- |\n| Row 1 Col 1 | Row 1 Col 2 |
`;
+ const markdown = await parseTablesToMarkdown(html);
+ expect(markdown).toBe(expectedMarkdown);
+ });
+
+ // One column: the divider row is expected to contain a single '---' cell.
+ it('converts a table with a single column to Markdown', async () => {
+ const html = `
+
+ Header 1 |
+ Row 1 Col 1 |
+ Row 2 Col 1 |
+
+ `;
+ const expectedMarkdown = `| Header 1 |\n| --- |\n| Row 1 Col 1 |\n| Row 2 Col 1 |
`;
+ const markdown = await parseTablesToMarkdown(html);
+ expect(markdown).toBe(expectedMarkdown);
+ });
+
+ // One header cell and one body cell.
+ it('converts a table with a single cell to Markdown', async () => {
+ const html = `
+
+ Header 1 |
+ Row 1 Col 1 |
+
+ `;
+ const expectedMarkdown = `| Header 1 |\n| --- |\n| Row 1 Col 1 |
`;
+ const markdown = await parseTablesToMarkdown(html);
+ expect(markdown).toBe(expectedMarkdown);
+ });
+
+ // Without a header row the expected output contains no '---' divider row.
+ it('converts a table with no header to Markdown', async () => {
+ const html = `
+
+ Row 1 Col 1 | Row 1 Col 2 |
+ Row 2 Col 1 | Row 2 Col 2 |
+
+ `;
+ const expectedMarkdown = `| Row 1 Col 1 | Row 1 Col 2 |\n| Row 2 Col 1 | Row 2 Col 2 |
`;
+ const markdown = await parseTablesToMarkdown(html);
+ expect(markdown).toBe(expectedMarkdown);
+ });
+
+ // NOTE(review): the fixtures of the next three tests are identical as written here
+ // (whitespace only) and each expects an empty string; verify what distinguishes them.
+ it('converts a table with no rows to Markdown', async () => {
+ const html = `
+
+ `;
+ const expectedMarkdown = ``;
+ const markdown = await parseTablesToMarkdown(html);
+ expect(markdown).toBe(expectedMarkdown);
+ });
+
+ it('converts a table with no cells to Markdown', async () => {
+ const html = `
+
+ `;
+ const expectedMarkdown = ``;
+ const markdown = await parseTablesToMarkdown(html);
+ expect(markdown).toBe(expectedMarkdown);
+ });
+
+ it('converts a table with no columns to Markdown', async () => {
+ const html = `
+
+ `;
+ const expectedMarkdown = ``;
+ const markdown = await parseTablesToMarkdown(html);
+ expect(markdown).toBe(expectedMarkdown);
+ });
+
+ // Empty input is expected to come back as an empty string.
+ it('converts a table with no table to Markdown', async () => {
+ const html = ``;
+ const expectedMarkdown = ``;
+ const markdown = await parseTablesToMarkdown(html);
+ expect(markdown).toBe(expectedMarkdown);
+ });
+
+// Text placed before and after the table rows must still be present in the output.
+it('converts a table inside of a bunch of html noise', async () => {
+ const html = `
+
+
Some text before
+
+ Row 1 Col 1 | Row 1 Col 2 |
+ Row 2 Col 1 | Row 2 Col 2 |
+
+
Some text after
+
+ `;
+ const expectedMarkdown = `
+
Some text before
+
| Row 1 Col 1 | Row 1 Col 2 |
+| Row 2 Col 1 | Row 2 Col 2 |
+
Some text after
+
`;
+
+ const markdown = await parseTablesToMarkdown(html);
+ expect(markdown).toBe(expectedMarkdown);
+});
+
+});
diff --git a/apps/api/src/scraper/WebScraper/utils/parseTable.ts b/apps/api/src/scraper/WebScraper/utils/parseTable.ts
index fdd90a7..7d0a602 100644
--- a/apps/api/src/scraper/WebScraper/utils/parseTable.ts
+++ b/apps/api/src/scraper/WebScraper/utils/parseTable.ts
@@ -1,66 +1,75 @@
-import cheerio from "cheerio";
+import cheerio, { CheerioAPI } from "cheerio";
-export const parseTablesToMarkdown = async (html: string) => {
- let soup = cheerio.load(html, {
+/**
+ * One pending substitution in the source HTML string: the characters in the
+ * half-open range [start, end) are swapped for `markdownTable`.
+ */
+interface Replacement {
+ /** Offset of the first character to replace (the table element's startIndex). */
+ start: number;
+ /** Offset just past the last character to replace (the table element's endIndex + 1). */
+ end: number;
+ /** Text spliced in place of the table; set to '' when the table yields no content. */
+ markdownTable: string;
+}
+
+/**
+ * Replaces each `<table>` element found in `html` with a Markdown rendering of
+ * its rows and returns the resulting string with leading and trailing
+ * whitespace removed. Markup outside the tables is left untouched.
+ *
+ * Tables that produce no cell content are replaced by an empty string. Tables
+ * nested inside another table are not replaced separately, because their rows
+ * are already part of the enclosing table's rendering.
+ *
+ * @param html - Source markup to scan for tables.
+ * @returns The markup with every top-level table substituted.
+ */
+export const parseTablesToMarkdown = async (html: string): Promise<string> => {
+  const soup: CheerioAPI = cheerio.load(html, {
xmlMode: true,
withStartIndices: true,
withEndIndices: true
});
let tables = soup("table");
- let replacements = [];
-
+  const replacements: Replacement[] = [];
if (tables.length) {
- for (const table of Array.from(tables)) {
- const start = table.startIndex;
- const end = table.endIndex;
- const markdownTable = await convertTableElementToMarkdown(cheerio.load(table));
+    tables.each((_, tableElement) => {
+      const { startIndex, endIndex } = tableElement;
+      // Without source offsets there is nothing safe to splice, so leave the table as is.
+      if (startIndex == null || endIndex == null) {
+        return;
+      }
+      // A nested table lies inside its parent's [start, end) span; replacing both
+      // would leave the parent's offsets stale once the inner splice changes the length.
+      if (soup(tableElement).parents("table").length > 0) {
+        return;
+      }
+      const start = startIndex;
+      // endIndex points at the last character of the closing tag; +1 makes it exclusive.
+      const end = endIndex + 1;
+      const rendered = convertTableElementToMarkdown(cheerio.load(tableElement));
+      // A rendering made only of pipes, dashes, spaces and newlines carries no content.
+      const markdownTable = /[^|\- \n]/.test(rendered) ? rendered : '';
replacements.push({ start, end, markdownTable });
- };
+    });
}
replacements.sort((a, b) => b.start - a.start);
- let modifiedHtml = html;
- for (const { start, end, markdownTable } of replacements) {
+  // Splice from the end of the document backwards so earlier offsets stay valid.
+  let modifiedHtml: string = html;
+  replacements.forEach(({ start, end, markdownTable }) => {
modifiedHtml = modifiedHtml.slice(0, start) + `${markdownTable}
` + modifiedHtml.slice(end);
- }
-
- return modifiedHtml;
-}
-
-const convertTableElementToMarkdown = async (tableSoup) => {
- const rows = [];
- const trEls = tableSoup("tr");
-
- trEls.each((i, tr) => {
- const markdownRow = convertTableRowElementToMarkdown(cheerio.load(tr), i);
- rows.push(markdownRow);
});
- return rows.join('\n');
-}
+  return modifiedHtml.trim();
+};
-function convertTableRowElementToMarkdown(rowSoup, rowNumber) {
- const cells = [];
- const cellEls = rowSoup("td, th");
-
- cellEls.each((i, cell) => {
- let cellText = cheerio.load(cell).text();
- cellText = cellText.replace(/\n/g, " ").trim();
- cells.push(cellText + ' |');
+/**
+ * Renders every `<tr>` of a loaded table as one Markdown table row.
+ *
+ * Each `<th>`/`<td>` becomes a ` text |` cell. Line breaks inside a cell are
+ * collapsed to single spaces so one table row always stays on one Markdown
+ * line. Rows without any cell are skipped. When the first row contains a
+ * `<th>`, a `---` divider row with the same number of cells follows it.
+ *
+ * @param tableSoup - Cheerio instance loaded with the table element.
+ * @returns The Markdown rows joined with newlines, trimmed.
+ */
+export const convertTableElementToMarkdown = (tableSoup: CheerioAPI): string => {
+  const rows: string[] = [];
+  tableSoup("tr").each((i, tr) => {
+    const cellEls = tableSoup(tr).find("th, td");
+    const cells = cellEls
+      .map((_, cell) => {
+        // A raw newline inside a cell would end the Markdown row prematurely.
+        const cellText = tableSoup(cell).text().replace(/\s*\n\s*/g, " ").trim();
+        return ` ${cellText} |`;
+      })
+      .get()
+      .join("");
+    if (cells) {
+      rows.push(`|${cells}`);
+    }
+    // Only a first row that holds at least one <th> counts as a header row.
+    if (i === 0 && cellEls.is("th")) {
+      rows.push(createMarkdownDividerRow(cellEls.length));
+    }
});
- let row = '| ' + cells.join(" ");
+  return rows.join('\n').trim();
+};
- if (rowNumber === 0) {
- row += '\n' + createMarkdownDividerRow(cellEls.length);
- }
+/**
+ * Renders the cells of a single loaded table row as one Markdown table row.
+ *
+ * Line breaks inside a cell are collapsed to single spaces so the row stays on
+ * one line.
+ *
+ * @param rowSoup - Cheerio instance loaded with the row element.
+ * @param rowNumber - Index of the row within its table. Currently unused; kept
+ *   so existing callers that pass it keep compiling.
+ * @returns `|` followed by one ` text |` segment per `<td>`/`<th>` cell.
+ */
+export function convertTableRowElementToMarkdown(rowSoup: CheerioAPI, rowNumber: number): string {
+  const cells = rowSoup("td, th")
+    .map((_, cell) => {
+      // A raw newline inside a cell would end the Markdown row prematurely.
+      const cellText = rowSoup(cell).text().replace(/\s*\n\s*/g, " ").trim();
+      return ` ${cellText} |`;
+    })
+    .get()
+    .join("");
- return row;
-}
+  return `|${cells}`;
+}
-function createMarkdownDividerRow(cellCount) {
- const dividerCells = Array(cellCount).fill('--- |');
- return '| ' + dividerCells.join(" ");
+/**
+ * Builds the Markdown divider row that separates a table header from its body.
+ *
+ * @param cellCount - Number of columns in the table.
+ * @returns A row of `---` cells, e.g. `| --- | --- |` for two columns.
+ */
+export function createMarkdownDividerRow(cellCount: number): string {
+  const dashes = new Array<string>(cellCount).fill('---');
+  return `| ${dashes.join(' | ')} |`;
}
\ No newline at end of file