diff --git a/apps/api/package.json b/apps/api/package.json
index 9e3a3d8..e8e5e02 100644
--- a/apps/api/package.json
+++ b/apps/api/package.json
@@ -82,6 +82,7 @@
"scrapingbee": "^1.7.4",
"stripe": "^12.2.0",
"turndown": "^7.1.3",
+ "turndown-plugin-gfm": "^1.0.2",
"typesense": "^1.5.4",
"unstructured-client": "^0.9.4",
"uuid": "^9.0.1",
diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml
index 3539868..8142189 100644
--- a/apps/api/pnpm-lock.yaml
+++ b/apps/api/pnpm-lock.yaml
@@ -134,6 +134,9 @@ dependencies:
turndown:
specifier: ^7.1.3
version: 7.1.3
+ turndown-plugin-gfm:
+ specifier: ^1.0.2
+ version: 1.0.2
typesense:
specifier: ^1.5.4
version: 1.7.2(@babel/runtime@7.24.0)
@@ -5783,6 +5786,10 @@ packages:
resolution: {integrity: sha512-AEYxH93jGFPn/a2iVAwW87VuUIkR1FVUKB77NwMF7nBTDkDrrT/Hpt/IrCJ0QXhW27jTBDcf5ZY7w6RiqTMw2Q==}
dev: false
+ /turndown-plugin-gfm@1.0.2:
+ resolution: {integrity: sha512-vwz9tfvF7XN/jE0dGoBei3FXWuvll78ohzCZQuOb+ZjWrs3a0XhQVomJEb2Qh4VHTPNRO4GPZh0V7VRbiWwkRg==}
+ dev: false
+
/turndown@7.1.3:
resolution: {integrity: sha512-Z3/iJ6IWh8VBiACWQJaA5ulPQE5E1QwvBHj00uGzdQxdRnd8fh1DPqNOJqzQDu6DkOstORrtXzf/9adB+vMtEA==}
dependencies:
diff --git a/apps/api/src/lib/html-to-markdown.ts b/apps/api/src/lib/html-to-markdown.ts
index 6c816ab..0fd8c93 100644
--- a/apps/api/src/lib/html-to-markdown.ts
+++ b/apps/api/src/lib/html-to-markdown.ts
@@ -1,5 +1,6 @@
export function parseMarkdown(html: string) {
var TurndownService = require("turndown");
+ var turndownPluginGfm = require("turndown-plugin-gfm");
const turndownService = new TurndownService();
turndownService.addRule("inlineLink", {
@@ -16,7 +17,8 @@ export function parseMarkdown(html: string) {
return "[" + content.trim() + "](" + href + title + ")\n";
},
});
-
+ var gfm = turndownPluginGfm.gfm;
+ turndownService.use(gfm);
let markdownContent = turndownService.turndown(html);
// multiple line links
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index 0cdbe51..f71221c 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -4,6 +4,7 @@ import { extractMetadata } from "./utils/metadata";
import dotenv from "dotenv";
import { Document } from "../../lib/entities";
import { parseMarkdown } from "../../lib/html-to-markdown";
+import { parseTablesToMarkdown } from "./utils/parseTable";
// import puppeteer from "puppeteer";
dotenv.config();
@@ -132,7 +133,7 @@ export async function scrapSingleUrl(
}
break;
}
- const cleanedHtml = removeUnwantedElements(text);
+ let cleanedHtml = removeUnwantedElements(text);
return [await parseMarkdown(cleanedHtml), text];
};
diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/parseTable.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/parseTable.test.ts
new file mode 100644
index 0000000..8d644c7
--- /dev/null
+++ b/apps/api/src/scraper/WebScraper/utils/__tests__/parseTable.test.ts
@@ -0,0 +1,128 @@
+import { parseTablesToMarkdown, convertTableElementToMarkdown, convertTableRowElementToMarkdown, createMarkdownDividerRow } from '../parseTable';
+import cheerio from 'cheerio';
+
+// Jest suite for parseTablesToMarkdown: every case passes an HTML string to the
+// function, awaits the result and compares it to an expected string with toBe.
+// NOTE(review): the `html` fixtures below show no <table>/<tr>/<td> markup and
+// the expected strings end in a line break — confirm the fixtures were not
+// stripped of their tags before relying on these cases.
+describe('parseTablesToMarkdown', () => {
+ it('converts a simple HTML table to Markdown', async () => {
+ const html = `
+
+ Header 1 | Header 2 |
+ Row 1 Col 1 | Row 1 Col 2 |
+ Row 2 Col 1 | Row 2 Col 2 |
+
+ `;
+ const expectedMarkdown = `| Header 1 | Header 2 |\n| --- | --- |\n| Row 1 Col 1 | Row 1 Col 2 |\n| Row 2 Col 1 | Row 2 Col 2 |
`;
+ const markdown = await parseTablesToMarkdown(html);
+ expect(markdown).toBe(expectedMarkdown);
+ });
+
+ it('converts a table with a single row to Markdown', async () => {
+ const html = `
+
+ Header 1 | Header 2 |
+ Row 1 Col 1 | Row 1 Col 2 |
+
+ `;
+ const expectedMarkdown = `| Header 1 | Header 2 |\n| --- | --- |\n| Row 1 Col 1 | Row 1 Col 2 |
`;
+ const markdown = await parseTablesToMarkdown(html);
+ expect(markdown).toBe(expectedMarkdown);
+ });
+
+ it('converts a table with a single column to Markdown', async () => {
+ const html = `
+
+ Header 1 |
+ Row 1 Col 1 |
+ Row 2 Col 1 |
+
+ `;
+ const expectedMarkdown = `| Header 1 |\n| --- |\n| Row 1 Col 1 |\n| Row 2 Col 1 |
`;
+ const markdown = await parseTablesToMarkdown(html);
+ expect(markdown).toBe(expectedMarkdown);
+ });
+
+ it('converts a table with a single cell to Markdown', async () => {
+ const html = `
+
+ Header 1 |
+ Row 1 Col 1 |
+
+ `;
+ const expectedMarkdown = `| Header 1 |\n| --- |\n| Row 1 Col 1 |
`;
+ const markdown = await parseTablesToMarkdown(html);
+ expect(markdown).toBe(expectedMarkdown);
+ });
+
+ // The expected output of this case has no `| --- |` divider row.
+ it('converts a table with no header to Markdown', async () => {
+ const html = `
+
+ Row 1 Col 1 | Row 1 Col 2 |
+ Row 2 Col 1 | Row 2 Col 2 |
+
+ `;
+ const expectedMarkdown = `| Row 1 Col 1 | Row 1 Col 2 |\n| Row 2 Col 1 | Row 2 Col 2 |
`;
+ const markdown = await parseTablesToMarkdown(html);
+ expect(markdown).toBe(expectedMarkdown);
+ });
+
+ // The next four cases all expect the empty string as the result.
+ it('converts a table with no rows to Markdown', async () => {
+ const html = `
+
+ `;
+ const expectedMarkdown = ``;
+ const markdown = await parseTablesToMarkdown(html);
+ expect(markdown).toBe(expectedMarkdown);
+ });
+
+ it('converts a table with no cells to Markdown', async () => {
+ const html = `
+
+ `;
+ const expectedMarkdown = ``;
+ const markdown = await parseTablesToMarkdown(html);
+ expect(markdown).toBe(expectedMarkdown);
+ });
+
+ it('converts a table with no columns to Markdown', async () => {
+ const html = `
+
+ `;
+ const expectedMarkdown = ``;
+ const markdown = await parseTablesToMarkdown(html);
+ expect(markdown).toBe(expectedMarkdown);
+ });
+
+ it('converts a table with no table to Markdown', async () => {
+ const html = ``;
+ const expectedMarkdown = ``;
+ const markdown = await parseTablesToMarkdown(html);
+ expect(markdown).toBe(expectedMarkdown);
+ });
+
+// Checks that text surrounding the table is kept in the output.
+it('converts a table inside of a bunch of html noise', async () => {
+ const html = `
+
+
Some text before
+
+ Row 1 Col 1 | Row 1 Col 2 |
+ Row 2 Col 1 | Row 2 Col 2 |
+
+
Some text after
+
+ `;
+ const expectedMarkdown = `
+
Some text before
+
| Row 1 Col 1 | Row 1 Col 2 |
+| Row 2 Col 1 | Row 2 Col 2 |
+
Some text after
+
`;
+
+ const markdown = await parseTablesToMarkdown(html);
+ expect(markdown).toBe(expectedMarkdown);
+});
+
+});
diff --git a/apps/api/src/scraper/WebScraper/utils/parseTable.ts b/apps/api/src/scraper/WebScraper/utils/parseTable.ts
new file mode 100644
index 0000000..9855650
--- /dev/null
+++ b/apps/api/src/scraper/WebScraper/utils/parseTable.ts
@@ -0,0 +1,74 @@
+import cheerio, { CheerioAPI } from "cheerio";
+
+// One pending substitution of a table's span in the input HTML string.
+interface Replacement {
+ // Offset of the table's first character (taken from the node's startIndex).
+ start: number;
+ // Offset just past the table's last character (the node's endIndex + 1).
+ end: number;
+ // Markdown text for the table, or '' when the table was judged empty.
+ markdownTable: string;
+}
+
+/**
+ * Replaces each top-level `<table>` element of an HTML string with a Markdown
+ * rendering of that table and returns the resulting string, trimmed.
+ *
+ * The input is parsed with cheerio in XML mode with start/end indices enabled,
+ * so every table can be cut out of the original string by character offset.
+ * A table whose Markdown contains nothing but `|`, `-`, spaces and newlines is
+ * treated as empty and replaced by an empty string. Each replacement is
+ * followed by a single newline.
+ *
+ * Tables nested inside another table are not replaced on their own: their span
+ * lies inside the ancestor's span, and splicing both would apply stale offsets
+ * to an already-modified string. Their rows are still picked up by the
+ * ancestor's conversion, which selects every `tr` beneath it.
+ *
+ * The function is declared `async` but performs no asynchronous work.
+ *
+ * @param html - HTML source text that may contain tables.
+ * @returns The input with top-level tables swapped for Markdown, trimmed.
+ */
+export const parseTablesToMarkdown = async (html: string): Promise<string> => {
+  const soup: CheerioAPI = cheerio.load(html, {
+    xmlMode: true,
+    withStartIndices: true,
+    withEndIndices: true
+  });
+  const replacements: Replacement[] = [];
+
+  soup("table").each((_, tableElement) => {
+    // Skip nested tables; only non-overlapping spans can be spliced safely.
+    if (soup(tableElement).parents("table").length > 0) {
+      return;
+    }
+
+    // Without source offsets there is nothing to cut out of the string.
+    const { startIndex, endIndex } = tableElement;
+    if (startIndex == null || endIndex == null) {
+      return;
+    }
+
+    let markdownTable = convertTableElementToMarkdown(cheerio.load(tableElement));
+    const isTableEmpty = markdownTable.replace(/[|\- \n]/g, '').length === 0;
+    if (isTableEmpty) {
+      markdownTable = '';
+    }
+
+    // endIndex points at the last character of the closing tag, hence + 1.
+    replacements.push({ start: startIndex, end: endIndex + 1, markdownTable });
+  });
+
+  // Apply from the back of the string so earlier offsets stay valid.
+  replacements.sort((a, b) => b.start - a.start);
+
+  let modifiedHtml = html;
+  for (const { start, end, markdownTable } of replacements) {
+    modifiedHtml = modifiedHtml.slice(0, start) + `${markdownTable}\n` + modifiedHtml.slice(end);
+  }
+
+  return modifiedHtml.trim();
+};
+
+/**
+ * Renders the rows of a parsed table as Markdown table lines joined by `\n`.
+ *
+ * Every `tr` that has at least one `th`/`td` becomes a `| a | b |` line. When
+ * the first `tr` contains a `th`, a `| --- |` divider row with one column per
+ * cell of that row is emitted directly after it; tables without such a header
+ * get no divider.
+ *
+ * Cell text is flattened so it cannot break the row structure: runs of
+ * whitespace (including newlines) collapse to a single space and literal `|`
+ * characters are escaped as `\|`.
+ *
+ * @param tableSoup - Cheerio instance loaded with the table to convert.
+ * @returns The Markdown lines, trimmed; an empty string when no row has cells.
+ */
+export const convertTableElementToMarkdown = (tableSoup: CheerioAPI): string => {
+  const rows: string[] = [];
+
+  tableSoup("tr").each((i, tr) => {
+    // Look the cells up once; they are needed for text, header check and count.
+    const cellElements = tableSoup(tr).find("th, td");
+
+    const cells: string = cellElements
+      .map((_, cell) => {
+        const cellText = tableSoup(cell)
+          .text()
+          .replace(/\s+/g, " ")
+          .trim()
+          .replace(/\|/g, "\\|");
+        return ` ${cellText} |`;
+      })
+      .get()
+      .join("");
+
+    if (cells) {
+      rows.push(`|${cells}`);
+    }
+
+    // Only a first row that carries a <th> is treated as the header row.
+    if (i === 0 && cellElements.is("th")) {
+      rows.push(createMarkdownDividerRow(cellElements.length));
+    }
+  });
+
+  return rows.join('\n').trim();
+};
+
+/**
+ * Builds one Markdown table line from the `td`/`th` cells found in `rowSoup`.
+ *
+ * Each cell contributes its trimmed text as ` text |`; the line starts with a
+ * leading `|`. With no cells the result is just `|`.
+ *
+ * @param rowSoup - Cheerio instance loaded with the row to render.
+ * @param rowNumber - Accepted for callers' convenience; not read by this function.
+ * @returns The Markdown line for the row.
+ */
+export function convertTableRowElementToMarkdown(rowSoup: CheerioAPI, rowNumber: number): string {
+  let line = "|";
+  rowSoup("td, th").each((_, cellNode) => {
+    const trimmedText = rowSoup(cellNode).text().trim();
+    line += ` ${trimmedText} |`;
+  });
+  return line;
+}
+
+/**
+ * Produces the Markdown header divider row, e.g. `| --- | --- |` for two cells.
+ *
+ * @param cellCount - Number of `---` columns to emit.
+ * @returns The divider line; for a count of 0 this is `|  |`.
+ */
+export function createMarkdownDividerRow(cellCount: number): string {
+  const dashes = new Array<string>(cellCount).fill('---');
+  const inner = dashes.join(' | ');
+  return `| ${inner} |`;
+}
\ No newline at end of file