From ff622739b74f3ad0b3215dbb7c589a387f0fb882 Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Wed, 17 Apr 2024 11:01:19 -0300
Subject: [PATCH 1/3] Added an HTML-to-Markdown table parser
---
apps/api/src/scraper/WebScraper/single_url.ts | 4 +-
.../scraper/WebScraper/utils/parseTable.ts | 66 +++++++++++++++++++
2 files changed, 69 insertions(+), 1 deletion(-)
create mode 100644 apps/api/src/scraper/WebScraper/utils/parseTable.ts
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index 0cdbe51..faba56c 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -4,6 +4,7 @@ import { extractMetadata } from "./utils/metadata";
import dotenv from "dotenv";
import { Document } from "../../lib/entities";
import { parseMarkdown } from "../../lib/html-to-markdown";
+import { parseTablesToMarkdown } from "./utils/parseTable";
// import puppeteer from "puppeteer";
dotenv.config();
@@ -132,7 +133,8 @@ export async function scrapSingleUrl(
}
break;
}
- const cleanedHtml = removeUnwantedElements(text);
+ let cleanedHtml = removeUnwantedElements(text);
+ cleanedHtml = await parseTablesToMarkdown(cleanedHtml);
return [await parseMarkdown(cleanedHtml), text];
};
diff --git a/apps/api/src/scraper/WebScraper/utils/parseTable.ts b/apps/api/src/scraper/WebScraper/utils/parseTable.ts
new file mode 100644
index 0000000..fdd90a7
--- /dev/null
+++ b/apps/api/src/scraper/WebScraper/utils/parseTable.ts
@@ -0,0 +1,66 @@
+import cheerio from "cheerio";
+
+export const parseTablesToMarkdown = async (html: string) => {
+ let soup = cheerio.load(html, {
+ xmlMode: true,
+ withStartIndices: true,
+ withEndIndices: true
+ });
+ let tables = soup("table");
+ let replacements = [];
+
+
+ if (tables.length) {
+ for (const table of Array.from(tables)) {
+ const start = table.startIndex;
+ const end = table.endIndex;
+ const markdownTable = await convertTableElementToMarkdown(cheerio.load(table));
+ replacements.push({ start, end, markdownTable });
+ };
+ }
+
+ replacements.sort((a, b) => b.start - a.start);
+
+ let modifiedHtml = html;
+ for (const { start, end, markdownTable } of replacements) {
+ modifiedHtml = modifiedHtml.slice(0, start) + `
${markdownTable}
` + modifiedHtml.slice(end);
+ }
+
+ return modifiedHtml;
+}
+
+const convertTableElementToMarkdown = async (tableSoup) => {
+ const rows = [];
+ const trEls = tableSoup("tr");
+
+ trEls.each((i, tr) => {
+ const markdownRow = convertTableRowElementToMarkdown(cheerio.load(tr), i);
+ rows.push(markdownRow);
+ });
+
+ return rows.join('\n');
+}
+
+function convertTableRowElementToMarkdown(rowSoup, rowNumber) {
+ const cells = [];
+ const cellEls = rowSoup("td, th");
+
+ cellEls.each((i, cell) => {
+ let cellText = cheerio.load(cell).text();
+ cellText = cellText.replace(/\n/g, " ").trim();
+ cells.push(cellText + ' |');
+ });
+
+ let row = '| ' + cells.join(" ");
+
+ if (rowNumber === 0) {
+ row += '\n' + createMarkdownDividerRow(cellEls.length);
+ }
+
+ return row;
+}
+
+function createMarkdownDividerRow(cellCount) {
+ const dividerCells = Array(cellCount).fill('--- |');
+ return '| ' + dividerCells.join(" ");
+}
\ No newline at end of file
From ee8a097252e8f7ca19b8fc3d23c50599ed554773 Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Wed, 17 Apr 2024 15:56:01 -0300
Subject: [PATCH 2/3] adding unit tests and fixing the parse function
---
.../utils/__tests__/parseTable.test.ts | 128 ++++++++++++++++++
.../scraper/WebScraper/utils/parseTable.ts | 95 +++++++------
2 files changed, 180 insertions(+), 43 deletions(-)
create mode 100644 apps/api/src/scraper/WebScraper/utils/__tests__/parseTable.test.ts
diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/parseTable.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/parseTable.test.ts
new file mode 100644
index 0000000..8d644c7
--- /dev/null
+++ b/apps/api/src/scraper/WebScraper/utils/__tests__/parseTable.test.ts
@@ -0,0 +1,128 @@
+import { parseTablesToMarkdown, convertTableElementToMarkdown, convertTableRowElementToMarkdown, createMarkdownDividerRow } from '../parseTable';
+import cheerio from 'cheerio';
+
+describe('parseTablesToMarkdown', () => {
+ it('converts a simple HTML table to Markdown', async () => {
+ const html = `
+
+ Header 1 | Header 2 |
+ Row 1 Col 1 | Row 1 Col 2 |
+ Row 2 Col 1 | Row 2 Col 2 |
+
+ `;
+ const expectedMarkdown = `| Header 1 | Header 2 |\n| --- | --- |\n| Row 1 Col 1 | Row 1 Col 2 |\n| Row 2 Col 1 | Row 2 Col 2 |
`;
+ const markdown = await parseTablesToMarkdown(html);
+ expect(markdown).toBe(expectedMarkdown);
+ });
+
+ it('converts a table with a single row to Markdown', async () => {
+ const html = `
+
+ Header 1 | Header 2 |
+ Row 1 Col 1 | Row 1 Col 2 |
+
+ `;
+ const expectedMarkdown = `| Header 1 | Header 2 |\n| --- | --- |\n| Row 1 Col 1 | Row 1 Col 2 |
`;
+ const markdown = await parseTablesToMarkdown(html);
+ expect(markdown).toBe(expectedMarkdown);
+ });
+
+ it('converts a table with a single column to Markdown', async () => {
+ const html = `
+
+ Header 1 |
+ Row 1 Col 1 |
+ Row 2 Col 1 |
+
+ `;
+ const expectedMarkdown = `| Header 1 |\n| --- |\n| Row 1 Col 1 |\n| Row 2 Col 1 |
`;
+ const markdown = await parseTablesToMarkdown(html);
+ expect(markdown).toBe(expectedMarkdown);
+ });
+
+ it('converts a table with a single cell to Markdown', async () => {
+ const html = `
+
+ Header 1 |
+ Row 1 Col 1 |
+
+ `;
+ const expectedMarkdown = `| Header 1 |\n| --- |\n| Row 1 Col 1 |
`;
+ const markdown = await parseTablesToMarkdown(html);
+ expect(markdown).toBe(expectedMarkdown);
+ });
+
+ it('converts a table with no header to Markdown', async () => {
+ const html = `
+
+ Row 1 Col 1 | Row 1 Col 2 |
+ Row 2 Col 1 | Row 2 Col 2 |
+
+ `;
+ const expectedMarkdown = `| Row 1 Col 1 | Row 1 Col 2 |\n| Row 2 Col 1 | Row 2 Col 2 |
`;
+ const markdown = await parseTablesToMarkdown(html);
+ expect(markdown).toBe(expectedMarkdown);
+ });
+
+ it('converts a table with no rows to Markdown', async () => {
+ const html = `
+
+ `;
+ const expectedMarkdown = ``;
+ const markdown = await parseTablesToMarkdown(html);
+ expect(markdown).toBe(expectedMarkdown);
+ });
+
+ it('converts a table with no cells to Markdown', async () => {
+ const html = `
+
+ `;
+ const expectedMarkdown = ``;
+ const markdown = await parseTablesToMarkdown(html);
+ expect(markdown).toBe(expectedMarkdown);
+ });
+
+ it('converts a table with no columns to Markdown', async () => {
+ const html = `
+
+ `;
+ const expectedMarkdown = ``;
+ const markdown = await parseTablesToMarkdown(html);
+ expect(markdown).toBe(expectedMarkdown);
+ });
+
+ it('converts a table with no table to Markdown', async () => {
+ const html = ``;
+ const expectedMarkdown = ``;
+ const markdown = await parseTablesToMarkdown(html);
+ expect(markdown).toBe(expectedMarkdown);
+ });
+
+it('converts a table inside of a bunch of html noise', async () => {
+ const html = `
+
+
Some text before
+
+ Row 1 Col 1 | Row 1 Col 2 |
+ Row 2 Col 1 | Row 2 Col 2 |
+
+
Some text after
+
+ `;
+ const expectedMarkdown = `
+
Some text before
+
| Row 1 Col 1 | Row 1 Col 2 |
+| Row 2 Col 1 | Row 2 Col 2 |
+
Some text after
+
`;
+
+ const markdown = await parseTablesToMarkdown(html);
+ expect(markdown).toBe(expectedMarkdown);
+});
+
+});
diff --git a/apps/api/src/scraper/WebScraper/utils/parseTable.ts b/apps/api/src/scraper/WebScraper/utils/parseTable.ts
index fdd90a7..7d0a602 100644
--- a/apps/api/src/scraper/WebScraper/utils/parseTable.ts
+++ b/apps/api/src/scraper/WebScraper/utils/parseTable.ts
@@ -1,66 +1,75 @@
-import cheerio from "cheerio";
+import cheerio, { CheerioAPI } from "cheerio";
-export const parseTablesToMarkdown = async (html: string) => {
- let soup = cheerio.load(html, {
+interface Replacement {
+ start: number;
+ end: number;
+ markdownTable: string;
+}
+
+export const parseTablesToMarkdown = async (html: string): Promise => {
+ const soup: CheerioAPI = cheerio.load(html, {
xmlMode: true,
withStartIndices: true,
withEndIndices: true
});
let tables = soup("table");
- let replacements = [];
-
+ let replacements: Replacement[] = [];
if (tables.length) {
- for (const table of Array.from(tables)) {
- const start = table.startIndex;
- const end = table.endIndex;
- const markdownTable = await convertTableElementToMarkdown(cheerio.load(table));
+ tables.each((_, tableElement) => {
+ const start: number = tableElement.startIndex;
+ const end: number = tableElement.endIndex + 1; // Include the closing tag properly
+ let markdownTable: string = convertTableElementToMarkdown(cheerio.load(tableElement));
+ const isTableEmpty: boolean = markdownTable.replace(/[|\- \n]/g, '').length === 0;
+ if (isTableEmpty) {
+ markdownTable = '';
+ }
+ console.log({markdownTable})
replacements.push({ start, end, markdownTable });
- };
+ });
}
replacements.sort((a, b) => b.start - a.start);
- let modifiedHtml = html;
- for (const { start, end, markdownTable } of replacements) {
+ let modifiedHtml: string = html;
+ replacements.forEach(({ start, end, markdownTable }) => {
modifiedHtml = modifiedHtml.slice(0, start) + `${markdownTable}
` + modifiedHtml.slice(end);
- }
-
- return modifiedHtml;
-}
-
-const convertTableElementToMarkdown = async (tableSoup) => {
- const rows = [];
- const trEls = tableSoup("tr");
-
- trEls.each((i, tr) => {
- const markdownRow = convertTableRowElementToMarkdown(cheerio.load(tr), i);
- rows.push(markdownRow);
});
- return rows.join('\n');
-}
+ return modifiedHtml.trim();
+};
-function convertTableRowElementToMarkdown(rowSoup, rowNumber) {
- const cells = [];
- const cellEls = rowSoup("td, th");
-
- cellEls.each((i, cell) => {
- let cellText = cheerio.load(cell).text();
- cellText = cellText.replace(/\n/g, " ").trim();
- cells.push(cellText + ' |');
+export const convertTableElementToMarkdown = (tableSoup: CheerioAPI): string => {
+ let rows: string[] = [];
+ let headerRowFound: boolean = false;
+ tableSoup("tr").each((i, tr) => {
+ const cells: string = tableSoup(tr).find("th, td").map((_, cell) => {
+ let cellText: string = tableSoup(cell).text().trim();
+ if (tableSoup(cell).is("th") && !headerRowFound) {
+ headerRowFound = true;
+ }
+ return ` ${cellText} |`;
+ }).get().join("");
+ if (cells) {
+ rows.push(`|${cells}`);
+ }
+ if (headerRowFound && i === 0) { // Header row
+ rows.push(createMarkdownDividerRow(tableSoup(tr).find("th, td").length));
+ }
});
- let row = '| ' + cells.join(" ");
+ return rows.join('\n').trim();
+};
- if (rowNumber === 0) {
- row += '\n' + createMarkdownDividerRow(cellEls.length);
- }
+export function convertTableRowElementToMarkdown(rowSoup: CheerioAPI, rowNumber: number): string {
+ const cells: string = rowSoup("td, th").map((_, cell) => {
+ let cellText: string = rowSoup(cell).text().trim();
+ return ` ${cellText} |`;
+ }).get().join("");
- return row;
-}
+ return `|${cells}`;
+};
-function createMarkdownDividerRow(cellCount) {
- const dividerCells = Array(cellCount).fill('--- |');
- return '| ' + dividerCells.join(" ");
+export function createMarkdownDividerRow(cellCount: number): string {
+ return '| ' + Array(cellCount).fill('---').join(' | ') + ' |';
}
\ No newline at end of file
From 08ed68ff5592bc31897f7849f625f277a24a9bf1 Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Wed, 17 Apr 2024 12:44:23 -0700
Subject: [PATCH 3/3] Nick: fixes (enable turndown-plugin-gfm, drop parseTablesToMarkdown call and debug log)
---
apps/api/package.json | 1 +
apps/api/pnpm-lock.yaml | 7 +++++++
apps/api/src/lib/html-to-markdown.ts | 4 +++-
apps/api/src/scraper/WebScraper/single_url.ts | 1 -
apps/api/src/scraper/WebScraper/utils/parseTable.ts | 1 -
5 files changed, 11 insertions(+), 3 deletions(-)
diff --git a/apps/api/package.json b/apps/api/package.json
index 9e3a3d8..e8e5e02 100644
--- a/apps/api/package.json
+++ b/apps/api/package.json
@@ -82,6 +82,7 @@
"scrapingbee": "^1.7.4",
"stripe": "^12.2.0",
"turndown": "^7.1.3",
+ "turndown-plugin-gfm": "^1.0.2",
"typesense": "^1.5.4",
"unstructured-client": "^0.9.4",
"uuid": "^9.0.1",
diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml
index 3539868..8142189 100644
--- a/apps/api/pnpm-lock.yaml
+++ b/apps/api/pnpm-lock.yaml
@@ -134,6 +134,9 @@ dependencies:
turndown:
specifier: ^7.1.3
version: 7.1.3
+ turndown-plugin-gfm:
+ specifier: ^1.0.2
+ version: 1.0.2
typesense:
specifier: ^1.5.4
version: 1.7.2(@babel/runtime@7.24.0)
@@ -5783,6 +5786,10 @@ packages:
resolution: {integrity: sha512-AEYxH93jGFPn/a2iVAwW87VuUIkR1FVUKB77NwMF7nBTDkDrrT/Hpt/IrCJ0QXhW27jTBDcf5ZY7w6RiqTMw2Q==}
dev: false
+ /turndown-plugin-gfm@1.0.2:
+ resolution: {integrity: sha512-vwz9tfvF7XN/jE0dGoBei3FXWuvll78ohzCZQuOb+ZjWrs3a0XhQVomJEb2Qh4VHTPNRO4GPZh0V7VRbiWwkRg==}
+ dev: false
+
/turndown@7.1.3:
resolution: {integrity: sha512-Z3/iJ6IWh8VBiACWQJaA5ulPQE5E1QwvBHj00uGzdQxdRnd8fh1DPqNOJqzQDu6DkOstORrtXzf/9adB+vMtEA==}
dependencies:
diff --git a/apps/api/src/lib/html-to-markdown.ts b/apps/api/src/lib/html-to-markdown.ts
index 6c816ab..0fd8c93 100644
--- a/apps/api/src/lib/html-to-markdown.ts
+++ b/apps/api/src/lib/html-to-markdown.ts
@@ -1,5 +1,6 @@
export function parseMarkdown(html: string) {
var TurndownService = require("turndown");
+ var turndownPluginGfm = require("turndown-plugin-gfm");
const turndownService = new TurndownService();
turndownService.addRule("inlineLink", {
@@ -16,7 +17,8 @@ export function parseMarkdown(html: string) {
return "[" + content.trim() + "](" + href + title + ")\n";
},
});
-
+ var gfm = turndownPluginGfm.gfm;
+ turndownService.use(gfm);
let markdownContent = turndownService.turndown(html);
// multiple line links
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index faba56c..f71221c 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -134,7 +134,6 @@ export async function scrapSingleUrl(
break;
}
let cleanedHtml = removeUnwantedElements(text);
- cleanedHtml = await parseTablesToMarkdown(cleanedHtml);
return [await parseMarkdown(cleanedHtml), text];
};
diff --git a/apps/api/src/scraper/WebScraper/utils/parseTable.ts b/apps/api/src/scraper/WebScraper/utils/parseTable.ts
index 7d0a602..9855650 100644
--- a/apps/api/src/scraper/WebScraper/utils/parseTable.ts
+++ b/apps/api/src/scraper/WebScraper/utils/parseTable.ts
@@ -24,7 +24,6 @@ export const parseTablesToMarkdown = async (html: string): Promise => {
if (isTableEmpty) {
markdownTable = '';
}
- console.log({markdownTable})
replacements.push({ start, end, markdownTable });
});
}