From ff622739b74f3ad0b3215dbb7c589a387f0fb882 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Wed, 17 Apr 2024 11:01:19 -0300 Subject: [PATCH 1/3] Added a html to markdown table parser --- apps/api/src/scraper/WebScraper/single_url.ts | 4 +- .../scraper/WebScraper/utils/parseTable.ts | 66 +++++++++++++++++++ 2 files changed, 69 insertions(+), 1 deletion(-) create mode 100644 apps/api/src/scraper/WebScraper/utils/parseTable.ts diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 0cdbe51..faba56c 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -4,6 +4,7 @@ import { extractMetadata } from "./utils/metadata"; import dotenv from "dotenv"; import { Document } from "../../lib/entities"; import { parseMarkdown } from "../../lib/html-to-markdown"; +import { parseTablesToMarkdown } from "./utils/parseTable"; // import puppeteer from "puppeteer"; dotenv.config(); @@ -132,7 +133,8 @@ export async function scrapSingleUrl( } break; } - const cleanedHtml = removeUnwantedElements(text); + let cleanedHtml = removeUnwantedElements(text); + cleanedHtml = await parseTablesToMarkdown(cleanedHtml); return [await parseMarkdown(cleanedHtml), text]; }; diff --git a/apps/api/src/scraper/WebScraper/utils/parseTable.ts b/apps/api/src/scraper/WebScraper/utils/parseTable.ts new file mode 100644 index 0000000..fdd90a7 --- /dev/null +++ b/apps/api/src/scraper/WebScraper/utils/parseTable.ts @@ -0,0 +1,66 @@ +import cheerio from "cheerio"; + +export const parseTablesToMarkdown = async (html: string) => { + let soup = cheerio.load(html, { + xmlMode: true, + withStartIndices: true, + withEndIndices: true + }); + let tables = soup("table"); + let replacements = []; + + + if (tables.length) { + for (const table of Array.from(tables)) { + const start = table.startIndex; + const end = table.endIndex; + const markdownTable = await convertTableElementToMarkdown(cheerio.load(table)); + replacements.push({ start, end, markdownTable }); + }; + } + + replacements.sort((a, b) => b.start - a.start); + + let modifiedHtml = html; + for (const { start, end, markdownTable } of replacements) { + modifiedHtml = modifiedHtml.slice(0, start) + `
${markdownTable}
` + modifiedHtml.slice(end); + } + + return modifiedHtml; +} + +const convertTableElementToMarkdown = async (tableSoup) => { + const rows = []; + const trEls = tableSoup("tr"); + + trEls.each((i, tr) => { + const markdownRow = convertTableRowElementToMarkdown(cheerio.load(tr), i); + rows.push(markdownRow); + }); + + return rows.join('\n'); +} + +function convertTableRowElementToMarkdown(rowSoup, rowNumber) { + const cells = []; + const cellEls = rowSoup("td, th"); + + cellEls.each((i, cell) => { + let cellText = cheerio.load(cell).text(); + cellText = cellText.replace(/\n/g, " ").trim(); + cells.push(cellText + ' |'); + }); + + let row = '| ' + cells.join(" "); + + if (rowNumber === 0) { + row += '\n' + createMarkdownDividerRow(cellEls.length); + } + + return row; +} + +function createMarkdownDividerRow(cellCount) { + const dividerCells = Array(cellCount).fill('--- |'); + return '| ' + dividerCells.join(" "); +} \ No newline at end of file From ee8a097252e8f7ca19b8fc3d23c50599ed554773 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Wed, 17 Apr 2024 15:56:01 -0300 Subject: [PATCH 2/3] adding unit tests and fixing the parse function --- .../utils/__tests__/parseTable.test.ts | 128 ++++++++++++++++++ .../scraper/WebScraper/utils/parseTable.ts | 95 +++++++------ 2 files changed, 180 insertions(+), 43 deletions(-) create mode 100644 apps/api/src/scraper/WebScraper/utils/__tests__/parseTable.test.ts diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/parseTable.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/parseTable.test.ts new file mode 100644 index 0000000..8d644c7 --- /dev/null +++ b/apps/api/src/scraper/WebScraper/utils/__tests__/parseTable.test.ts @@ -0,0 +1,128 @@ +import { parseTablesToMarkdown, convertTableElementToMarkdown, convertTableRowElementToMarkdown, createMarkdownDividerRow } from '../parseTable'; +import cheerio from 'cheerio'; + +describe('parseTablesToMarkdown', () => { + it('converts a simple HTML table to Markdown', async () => { + const html = ` + + + + +
Header 1Header 2
Row 1 Col 1Row 1 Col 2
Row 2 Col 1Row 2 Col 2
+ `; + const expectedMarkdown = `
| Header 1 | Header 2 |\n| --- | --- |\n| Row 1 Col 1 | Row 1 Col 2 |\n| Row 2 Col 1 | Row 2 Col 2 |
`; + const markdown = await parseTablesToMarkdown(html); + expect(markdown).toBe(expectedMarkdown); + }); + + it('converts a table with a single row to Markdown', async () => { + const html = ` + + + +
Header 1Header 2
Row 1 Col 1Row 1 Col 2
+ `; + const expectedMarkdown = `
| Header 1 | Header 2 |\n| --- | --- |\n| Row 1 Col 1 | Row 1 Col 2 |
`; + const markdown = await parseTablesToMarkdown(html); + expect(markdown).toBe(expectedMarkdown); + }); + + it('converts a table with a single column to Markdown', async () => { + const html = ` + + + + +
Header 1
Row 1 Col 1
Row 2 Col 1
+ `; + const expectedMarkdown = `
| Header 1 |\n| --- |\n| Row 1 Col 1 |\n| Row 2 Col 1 |
`; + const markdown = await parseTablesToMarkdown(html); + expect(markdown).toBe(expectedMarkdown); + }); + + it('converts a table with a single cell to Markdown', async () => { + const html = ` + + + +
Header 1
Row 1 Col 1
+ `; + const expectedMarkdown = `
| Header 1 |\n| --- |\n| Row 1 Col 1 |
`; + const markdown = await parseTablesToMarkdown(html); + expect(markdown).toBe(expectedMarkdown); + }); + + it('converts a table with no header to Markdown', async () => { + const html = ` + + + +
Row 1 Col 1Row 1 Col 2
Row 2 Col 1Row 2 Col 2
+ `; + const expectedMarkdown = `
| Row 1 Col 1 | Row 1 Col 2 |\n| Row 2 Col 1 | Row 2 Col 2 |
`; + const markdown = await parseTablesToMarkdown(html); + expect(markdown).toBe(expectedMarkdown); + }); + + it('converts a table with no rows to Markdown', async () => { + const html = ` + +
+ `; + const expectedMarkdown = `
`; + const markdown = await parseTablesToMarkdown(html); + expect(markdown).toBe(expectedMarkdown); + }); + + it('converts a table with no cells to Markdown', async () => { + const html = ` + + +
+ `; + const expectedMarkdown = `
`; + const markdown = await parseTablesToMarkdown(html); + expect(markdown).toBe(expectedMarkdown); + }); + + it('converts a table with no columns to Markdown', async () => { + const html = ` + + +
+ `; + const expectedMarkdown = `
`; + const markdown = await parseTablesToMarkdown(html); + expect(markdown).toBe(expectedMarkdown); + }); + + it('converts a table with no table to Markdown', async () => { + const html = ``; + const expectedMarkdown = ``; + const markdown = await parseTablesToMarkdown(html); + expect(markdown).toBe(expectedMarkdown); + }); + +it('converts a table inside of a bunch of html noise', async () => { + const html = ` +
+

Some text before

+ + + +
Row 1 Col 1Row 1 Col 2
Row 2 Col 1Row 2 Col 2
+

Some text after

+
+ `; + const expectedMarkdown = `
+

Some text before

+
| Row 1 Col 1 | Row 1 Col 2 | +| Row 2 Col 1 | Row 2 Col 2 |
+

Some text after

+
`; + + const markdown = await parseTablesToMarkdown(html); + expect(markdown).toBe(expectedMarkdown); +}); + +}); diff --git a/apps/api/src/scraper/WebScraper/utils/parseTable.ts b/apps/api/src/scraper/WebScraper/utils/parseTable.ts index fdd90a7..7d0a602 100644 --- a/apps/api/src/scraper/WebScraper/utils/parseTable.ts +++ b/apps/api/src/scraper/WebScraper/utils/parseTable.ts @@ -1,66 +1,75 @@ -import cheerio from "cheerio"; +import cheerio, { CheerioAPI } from "cheerio"; -export const parseTablesToMarkdown = async (html: string) => { - let soup = cheerio.load(html, { +interface Replacement { + start: number; + end: number; + markdownTable: string; +} + +export const parseTablesToMarkdown = async (html: string): Promise => { + const soup: CheerioAPI = cheerio.load(html, { xmlMode: true, withStartIndices: true, withEndIndices: true }); let tables = soup("table"); - let replacements = []; - + let replacements: Replacement[] = []; if (tables.length) { - for (const table of Array.from(tables)) { - const start = table.startIndex; - const end = table.endIndex; - const markdownTable = await convertTableElementToMarkdown(cheerio.load(table)); + tables.each((_, tableElement) => { + const start: number = tableElement.startIndex; + const end: number = tableElement.endIndex + 1; // Include the closing tag properly + let markdownTable: string = convertTableElementToMarkdown(cheerio.load(tableElement)); + const isTableEmpty: boolean = markdownTable.replace(/[|\- \n]/g, '').length === 0; + if (isTableEmpty) { + markdownTable = ''; + } + console.log({markdownTable}) replacements.push({ start, end, markdownTable }); - }; + }); } replacements.sort((a, b) => b.start - a.start); - let modifiedHtml = html; - for (const { start, end, markdownTable } of replacements) { + let modifiedHtml: string = html; + replacements.forEach(({ start, end, markdownTable }) => { modifiedHtml = modifiedHtml.slice(0, start) + `
${markdownTable}
` + modifiedHtml.slice(end); - } - - return modifiedHtml; -} - -const convertTableElementToMarkdown = async (tableSoup) => { - const rows = []; - const trEls = tableSoup("tr"); - - trEls.each((i, tr) => { - const markdownRow = convertTableRowElementToMarkdown(cheerio.load(tr), i); - rows.push(markdownRow); }); - return rows.join('\n'); -} + return modifiedHtml.trim(); +}; -function convertTableRowElementToMarkdown(rowSoup, rowNumber) { - const cells = []; - const cellEls = rowSoup("td, th"); - - cellEls.each((i, cell) => { - let cellText = cheerio.load(cell).text(); - cellText = cellText.replace(/\n/g, " ").trim(); - cells.push(cellText + ' |'); +export const convertTableElementToMarkdown = (tableSoup: CheerioAPI): string => { + let rows: string[] = []; + let headerRowFound: boolean = false; + tableSoup("tr").each((i, tr) => { + const cells: string = tableSoup(tr).find("th, td").map((_, cell) => { + let cellText: string = tableSoup(cell).text().trim(); + if (tableSoup(cell).is("th") && !headerRowFound) { + headerRowFound = true; + } + return ` ${cellText} |`; + }).get().join(""); + if (cells) { + rows.push(`|${cells}`); + } + if (headerRowFound && i === 0) { // Header row + rows.push(createMarkdownDividerRow(tableSoup(tr).find("th, td").length)); + } }); - let row = '| ' + cells.join(" "); + return rows.join('\n').trim(); +}; - if (rowNumber === 0) { - row += '\n' + createMarkdownDividerRow(cellEls.length); - } +export function convertTableRowElementToMarkdown(rowSoup: CheerioAPI, rowNumber: number): string { + const cells: string = rowSoup("td, th").map((_, cell) => { + let cellText: string = rowSoup(cell).text().trim(); + return ` ${cellText} |`; + }).get().join(""); - return row; -} + return `|${cells}`; +}; -function createMarkdownDividerRow(cellCount) { - const dividerCells = Array(cellCount).fill('--- |'); - return '| ' + dividerCells.join(" "); +export function createMarkdownDividerRow(cellCount: number): string { + return '| ' + Array(cellCount).fill('---').join(' | ') + ' |'; } \ No newline at end of file From 08ed68ff5592bc31897f7849f625f277a24a9bf1 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 17 Apr 2024 12:44:23 -0700 Subject: [PATCH 3/3] Nick: fixes --- apps/api/package.json | 1 + apps/api/pnpm-lock.yaml | 7 +++++++ apps/api/src/lib/html-to-markdown.ts | 4 +++- apps/api/src/scraper/WebScraper/single_url.ts | 1 - apps/api/src/scraper/WebScraper/utils/parseTable.ts | 1 - 5 files changed, 11 insertions(+), 3 deletions(-) diff --git a/apps/api/package.json b/apps/api/package.json index 9e3a3d8..e8e5e02 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -82,6 +82,7 @@ "scrapingbee": "^1.7.4", "stripe": "^12.2.0", "turndown": "^7.1.3", + "turndown-plugin-gfm": "^1.0.2", "typesense": "^1.5.4", "unstructured-client": "^0.9.4", "uuid": "^9.0.1", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index 3539868..8142189 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -134,6 +134,9 @@ dependencies: turndown: specifier: ^7.1.3 version: 7.1.3 + turndown-plugin-gfm: + specifier: ^1.0.2 + version: 1.0.2 typesense: specifier: ^1.5.4 version: 1.7.2(@babel/runtime@7.24.0) @@ -5783,6 +5786,10 @@ packages: resolution: {integrity: sha512-AEYxH93jGFPn/a2iVAwW87VuUIkR1FVUKB77NwMF7nBTDkDrrT/Hpt/IrCJ0QXhW27jTBDcf5ZY7w6RiqTMw2Q==} dev: false + /turndown-plugin-gfm@1.0.2: + resolution: {integrity: sha512-vwz9tfvF7XN/jE0dGoBei3FXWuvll78ohzCZQuOb+ZjWrs3a0XhQVomJEb2Qh4VHTPNRO4GPZh0V7VRbiWwkRg==} + dev: false + /turndown@7.1.3: resolution: {integrity: sha512-Z3/iJ6IWh8VBiACWQJaA5ulPQE5E1QwvBHj00uGzdQxdRnd8fh1DPqNOJqzQDu6DkOstORrtXzf/9adB+vMtEA==} dependencies: diff --git a/apps/api/src/lib/html-to-markdown.ts b/apps/api/src/lib/html-to-markdown.ts index 6c816ab..0fd8c93 100644 --- a/apps/api/src/lib/html-to-markdown.ts +++ b/apps/api/src/lib/html-to-markdown.ts @@ -1,5 +1,6 @@ export function parseMarkdown(html: string) { var TurndownService = require("turndown"); + var turndownPluginGfm = require("turndown-plugin-gfm"); const turndownService = new TurndownService(); turndownService.addRule("inlineLink", { @@ -16,7 +17,8 @@ export function parseMarkdown(html: string) { return "[" + content.trim() + "](" + href + title + ")\n"; }, }); - + var gfm = turndownPluginGfm.gfm; + turndownService.use(gfm); let markdownContent = turndownService.turndown(html); // multiple line links diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index faba56c..f71221c 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -134,7 +134,6 @@ export async function scrapSingleUrl( break; } let cleanedHtml = removeUnwantedElements(text); - cleanedHtml = await parseTablesToMarkdown(cleanedHtml); return [await parseMarkdown(cleanedHtml), text]; }; diff --git a/apps/api/src/scraper/WebScraper/utils/parseTable.ts b/apps/api/src/scraper/WebScraper/utils/parseTable.ts index 7d0a602..9855650 100644 --- a/apps/api/src/scraper/WebScraper/utils/parseTable.ts +++ b/apps/api/src/scraper/WebScraper/utils/parseTable.ts @@ -24,7 +24,6 @@ export const parseTablesToMarkdown = async (html: string): Promise => { if (isTableEmpty) { markdownTable = ''; } - console.log({markdownTable}) replacements.push({ start, end, markdownTable }); }); }