diff --git a/apps/api/package.json b/apps/api/package.json index 9e3a3d8..e8e5e02 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -82,6 +82,7 @@ "scrapingbee": "^1.7.4", "stripe": "^12.2.0", "turndown": "^7.1.3", + "turndown-plugin-gfm": "^1.0.2", "typesense": "^1.5.4", "unstructured-client": "^0.9.4", "uuid": "^9.0.1", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index 3539868..8142189 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -134,6 +134,9 @@ dependencies: turndown: specifier: ^7.1.3 version: 7.1.3 + turndown-plugin-gfm: + specifier: ^1.0.2 + version: 1.0.2 typesense: specifier: ^1.5.4 version: 1.7.2(@babel/runtime@7.24.0) @@ -5783,6 +5786,10 @@ packages: resolution: {integrity: sha512-AEYxH93jGFPn/a2iVAwW87VuUIkR1FVUKB77NwMF7nBTDkDrrT/Hpt/IrCJ0QXhW27jTBDcf5ZY7w6RiqTMw2Q==} dev: false + /turndown-plugin-gfm@1.0.2: + resolution: {integrity: sha512-vwz9tfvF7XN/jE0dGoBei3FXWuvll78ohzCZQuOb+ZjWrs3a0XhQVomJEb2Qh4VHTPNRO4GPZh0V7VRbiWwkRg==} + dev: false + /turndown@7.1.3: resolution: {integrity: sha512-Z3/iJ6IWh8VBiACWQJaA5ulPQE5E1QwvBHj00uGzdQxdRnd8fh1DPqNOJqzQDu6DkOstORrtXzf/9adB+vMtEA==} dependencies: diff --git a/apps/api/src/lib/html-to-markdown.ts b/apps/api/src/lib/html-to-markdown.ts index 6c816ab..0fd8c93 100644 --- a/apps/api/src/lib/html-to-markdown.ts +++ b/apps/api/src/lib/html-to-markdown.ts @@ -1,5 +1,6 @@ export function parseMarkdown(html: string) { var TurndownService = require("turndown"); + var turndownPluginGfm = require("turndown-plugin-gfm"); const turndownService = new TurndownService(); turndownService.addRule("inlineLink", { @@ -16,7 +17,8 @@ export function parseMarkdown(html: string) { return "[" + content.trim() + "](" + href + title + ")\n"; }, }); - + var gfm = turndownPluginGfm.gfm; + turndownService.use(gfm); let markdownContent = turndownService.turndown(html); // multiple line links diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 0cdbe51..f71221c 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -4,6 +4,7 @@ import { extractMetadata } from "./utils/metadata"; import dotenv from "dotenv"; import { Document } from "../../lib/entities"; import { parseMarkdown } from "../../lib/html-to-markdown"; +import { parseTablesToMarkdown } from "./utils/parseTable"; // import puppeteer from "puppeteer"; dotenv.config(); @@ -132,7 +133,7 @@ export async function scrapSingleUrl( } break; } - const cleanedHtml = removeUnwantedElements(text); + let cleanedHtml = removeUnwantedElements(text); return [await parseMarkdown(cleanedHtml), text]; }; diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/parseTable.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/parseTable.test.ts new file mode 100644 index 0000000..8d644c7 --- /dev/null +++ b/apps/api/src/scraper/WebScraper/utils/__tests__/parseTable.test.ts @@ -0,0 +1,128 @@ +import { parseTablesToMarkdown, convertTableElementToMarkdown, convertTableRowElementToMarkdown, createMarkdownDividerRow } from '../parseTable'; +import cheerio from 'cheerio'; + +describe('parseTablesToMarkdown', () => { + it('converts a simple HTML table to Markdown', async () => { + const html = ` + + + + +
Header 1Header 2
Row 1 Col 1Row 1 Col 2
Row 2 Col 1Row 2 Col 2
+ `; + const expectedMarkdown = `
| Header 1 | Header 2 |\n| --- | --- |\n| Row 1 Col 1 | Row 1 Col 2 |\n| Row 2 Col 1 | Row 2 Col 2 |
`; + const markdown = await parseTablesToMarkdown(html); + expect(markdown).toBe(expectedMarkdown); + }); + + it('converts a table with a single row to Markdown', async () => { + const html = ` + + + +
Header 1Header 2
Row 1 Col 1Row 1 Col 2
+ `; + const expectedMarkdown = `
| Header 1 | Header 2 |\n| --- | --- |\n| Row 1 Col 1 | Row 1 Col 2 |
`; + const markdown = await parseTablesToMarkdown(html); + expect(markdown).toBe(expectedMarkdown); + }); + + it('converts a table with a single column to Markdown', async () => { + const html = ` + + + + +
Header 1
Row 1 Col 1
Row 2 Col 1
+ `; + const expectedMarkdown = `
| Header 1 |\n| --- |\n| Row 1 Col 1 |\n| Row 2 Col 1 |
`; + const markdown = await parseTablesToMarkdown(html); + expect(markdown).toBe(expectedMarkdown); + }); + + it('converts a table with a single cell to Markdown', async () => { + const html = ` + + + +
Header 1
Row 1 Col 1
+ `; + const expectedMarkdown = `
| Header 1 |\n| --- |\n| Row 1 Col 1 |
`; + const markdown = await parseTablesToMarkdown(html); + expect(markdown).toBe(expectedMarkdown); + }); + + it('converts a table with no header to Markdown', async () => { + const html = ` + + + +
Row 1 Col 1Row 1 Col 2
Row 2 Col 1Row 2 Col 2
+ `; + const expectedMarkdown = `
| Row 1 Col 1 | Row 1 Col 2 |\n| Row 2 Col 1 | Row 2 Col 2 |
`; + const markdown = await parseTablesToMarkdown(html); + expect(markdown).toBe(expectedMarkdown); + }); + + it('converts a table with no rows to Markdown', async () => { + const html = ` + +
+ `; + const expectedMarkdown = `
`; + const markdown = await parseTablesToMarkdown(html); + expect(markdown).toBe(expectedMarkdown); + }); + + it('converts a table with no cells to Markdown', async () => { + const html = ` + + +
+ `; + const expectedMarkdown = `
`; + const markdown = await parseTablesToMarkdown(html); + expect(markdown).toBe(expectedMarkdown); + }); + + it('converts a table with no columns to Markdown', async () => { + const html = ` + + +
+ `; + const expectedMarkdown = `
`; + const markdown = await parseTablesToMarkdown(html); + expect(markdown).toBe(expectedMarkdown); + }); + + it('converts a table with no table to Markdown', async () => { + const html = ``; + const expectedMarkdown = ``; + const markdown = await parseTablesToMarkdown(html); + expect(markdown).toBe(expectedMarkdown); + }); + +it('converts a table inside of a bunch of html noise', async () => { + const html = ` +
+

Some text before

+ + + +
Row 1 Col 1Row 1 Col 2
Row 2 Col 1Row 2 Col 2
+

Some text after

+
+ `; + const expectedMarkdown = `
+

Some text before

+
| Row 1 Col 1 | Row 1 Col 2 | +| Row 2 Col 1 | Row 2 Col 2 |
+

Some text after

+
`; + + const markdown = await parseTablesToMarkdown(html); + expect(markdown).toBe(expectedMarkdown); +}); + +}); diff --git a/apps/api/src/scraper/WebScraper/utils/parseTable.ts b/apps/api/src/scraper/WebScraper/utils/parseTable.ts new file mode 100644 index 0000000..9855650 --- /dev/null +++ b/apps/api/src/scraper/WebScraper/utils/parseTable.ts @@ -0,0 +1,74 @@ +import cheerio, { CheerioAPI } from "cheerio"; + +interface Replacement { + start: number; + end: number; + markdownTable: string; +} + +export const parseTablesToMarkdown = async (html: string): Promise => { + const soup: CheerioAPI = cheerio.load(html, { + xmlMode: true, + withStartIndices: true, + withEndIndices: true + }); + let tables = soup("table"); + let replacements: Replacement[] = []; + + if (tables.length) { + tables.each((_, tableElement) => { + const start: number = tableElement.startIndex; + const end: number = tableElement.endIndex + 1; // Include the closing tag properly + let markdownTable: string = convertTableElementToMarkdown(cheerio.load(tableElement)); + const isTableEmpty: boolean = markdownTable.replace(/[|\- \n]/g, '').length === 0; + if (isTableEmpty) { + markdownTable = ''; + } + replacements.push({ start, end, markdownTable }); + }); + } + + replacements.sort((a, b) => b.start - a.start); + + let modifiedHtml: string = html; + replacements.forEach(({ start, end, markdownTable }) => { + modifiedHtml = modifiedHtml.slice(0, start) + `
${markdownTable}
` + modifiedHtml.slice(end); + }); + + return modifiedHtml.trim(); +}; + +export const convertTableElementToMarkdown = (tableSoup: CheerioAPI): string => { + let rows: string[] = []; + let headerRowFound: boolean = false; + tableSoup("tr").each((i, tr) => { + const cells: string = tableSoup(tr).find("th, td").map((_, cell) => { + let cellText: string = tableSoup(cell).text().trim(); + if (tableSoup(cell).is("th") && !headerRowFound) { + headerRowFound = true; + } + return ` ${cellText} |`; + }).get().join(""); + if (cells) { + rows.push(`|${cells}`); + } + if (headerRowFound && i === 0) { // Header row + rows.push(createMarkdownDividerRow(tableSoup(tr).find("th, td").length)); + } + }); + + return rows.join('\n').trim(); +}; + +export function convertTableRowElementToMarkdown(rowSoup: CheerioAPI, rowNumber: number): string { + const cells: string = rowSoup("td, th").map((_, cell) => { + let cellText: string = rowSoup(cell).text().trim(); + return ` ${cellText} |`; + }).get().join(""); + + return `|${cells}`; +}; + +export function createMarkdownDividerRow(cellCount: number): string { + return '| ' + Array(cellCount).fill('---').join(' | ') + ' |'; +} \ No newline at end of file