diff --git a/apps/api/package.json b/apps/api/package.json index 9e3a3d8..e8e5e02 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -82,6 +82,7 @@ "scrapingbee": "^1.7.4", "stripe": "^12.2.0", "turndown": "^7.1.3", + "turndown-plugin-gfm": "^1.0.2", "typesense": "^1.5.4", "unstructured-client": "^0.9.4", "uuid": "^9.0.1", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index 3539868..8142189 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -134,6 +134,9 @@ dependencies: turndown: specifier: ^7.1.3 version: 7.1.3 + turndown-plugin-gfm: + specifier: ^1.0.2 + version: 1.0.2 typesense: specifier: ^1.5.4 version: 1.7.2(@babel/runtime@7.24.0) @@ -5783,6 +5786,10 @@ packages: resolution: {integrity: sha512-AEYxH93jGFPn/a2iVAwW87VuUIkR1FVUKB77NwMF7nBTDkDrrT/Hpt/IrCJ0QXhW27jTBDcf5ZY7w6RiqTMw2Q==} dev: false + /turndown-plugin-gfm@1.0.2: + resolution: {integrity: sha512-vwz9tfvF7XN/jE0dGoBei3FXWuvll78ohzCZQuOb+ZjWrs3a0XhQVomJEb2Qh4VHTPNRO4GPZh0V7VRbiWwkRg==} + dev: false + /turndown@7.1.3: resolution: {integrity: sha512-Z3/iJ6IWh8VBiACWQJaA5ulPQE5E1QwvBHj00uGzdQxdRnd8fh1DPqNOJqzQDu6DkOstORrtXzf/9adB+vMtEA==} dependencies: diff --git a/apps/api/src/lib/html-to-markdown.ts b/apps/api/src/lib/html-to-markdown.ts index 6c816ab..0fd8c93 100644 --- a/apps/api/src/lib/html-to-markdown.ts +++ b/apps/api/src/lib/html-to-markdown.ts @@ -1,5 +1,6 @@ export function parseMarkdown(html: string) { var TurndownService = require("turndown"); + var turndownPluginGfm = require("turndown-plugin-gfm"); const turndownService = new TurndownService(); turndownService.addRule("inlineLink", { @@ -16,7 +17,8 @@ export function parseMarkdown(html: string) { return "[" + content.trim() + "](" + href + title + ")\n"; }, }); - + var gfm = turndownPluginGfm.gfm; + turndownService.use(gfm); let markdownContent = turndownService.turndown(html); // multiple line links diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index faba56c..f71221c 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -134,7 +134,6 @@ export async function scrapSingleUrl( break; } let cleanedHtml = removeUnwantedElements(text); - cleanedHtml = await parseTablesToMarkdown(cleanedHtml); return [await parseMarkdown(cleanedHtml), text]; }; diff --git a/apps/api/src/scraper/WebScraper/utils/parseTable.ts b/apps/api/src/scraper/WebScraper/utils/parseTable.ts index 7d0a602..9855650 100644 --- a/apps/api/src/scraper/WebScraper/utils/parseTable.ts +++ b/apps/api/src/scraper/WebScraper/utils/parseTable.ts @@ -24,7 +24,6 @@ export const parseTablesToMarkdown = async (html: string): Promise => { if (isTableEmpty) { markdownTable = ''; } - console.log({markdownTable}) replacements.push({ start, end, markdownTable }); }); }