0
v-firecrawl/apps/api/src/lib/html-to-markdown.ts

56 lines
1.5 KiB
TypeScript
Raw Normal View History

2024-04-23 14:15:11 -04:00
2024-04-15 17:01:47 -04:00
/** The subset of a DOM element that the custom link rule reads. */
interface LinkNodeLike {
  nodeName: string;
  title?: string;
  getAttribute(name: string): string | null;
}

/**
 * Appends a backslash before every newline that falls between an opening
 * `[` and its closing `]`, so multi-line link text is kept together.
 *
 * Backslash-escaped characters (e.g. `\[`, `\]`, `\\`) are copied through
 * untouched and never counted as brackets. Without this, one literal `\[`
 * in the text would leave the bracket counter above zero and every later
 * newline in the document would get a backslash appended.
 */
function escapeNewlinesInsideBrackets(markdown: string): string {
  const pieces: string[] = [];
  let openBrackets = 0;

  for (let i = 0; i < markdown.length; i++) {
    const char = markdown.charAt(i);

    if (char === "\\" && i + 1 < markdown.length) {
      // Escaped pair: emit both characters verbatim and skip the second one.
      // This also leaves an already-escaped newline ("\" + "\n") as it is.
      pieces.push(markdown.slice(i, i + 2));
      i++;
      continue;
    }

    if (char === "[") {
      openBrackets++;
    } else if (char === "]") {
      // Clamp at zero so a stray "]" cannot drive the counter negative.
      openBrackets = Math.max(0, openBrackets - 1);
    }

    pieces.push(openBrackets > 0 && char === "\n" ? "\\\n" : char);
  }

  return pieces.join("");
}

/**
 * Converts an HTML string to Markdown using Turndown with the GFM plugin.
 *
 * On top of the plain conversion it:
 *  - renders inline-style `<a href>` elements as `[text](href "title")`
 *    followed by a newline;
 *  - escapes newlines that occur inside `[...]` (multi-line links);
 *  - removes `[Skip to Content](#...)` links (matched case-insensitively).
 *
 * @param html HTML markup to convert.
 * @returns The resulting Markdown text.
 */
export function parseMarkdown(html: string): string {
  // Loaded lazily inside the function, as in the original implementation.
  const TurndownService = require("turndown");
  const turndownPluginGfm = require("joplin-turndown-plugin-gfm");

  const turndownService = new TurndownService();

  turndownService.addRule("inlineLink", {
    filter: function (
      node: LinkNodeLike,
      options: { linkStyle?: string }
    ): boolean {
      return (
        options.linkStyle === "inlined" &&
        node.nodeName === "A" &&
        Boolean(node.getAttribute("href"))
      );
    },
    replacement: function (content: string, node: LinkNodeLike): string {
      // The filter only accepts nodes with a truthy href; fall back to ""
      // purely to satisfy the nullable return type of getAttribute.
      const href = (node.getAttribute("href") ?? "").trim();
      // A raw double quote would end the quoted title early, so escape it.
      const title = node.title
        ? ' "' + node.title.replace(/"/g, '\\"') + '"'
        : "";
      return "[" + content.trim() + "](" + href + title + ")\n";
    },
  });

  turndownService.use(turndownPluginGfm.gfm);

  const converted: string = turndownService.turndown(html);

  // multiple line links
  const withEscapedNewlines = escapeNewlinesInsideBrackets(converted);

  // Remove [Skip to Content](#page) and [Skip to content](#skip)
  return withEscapedNewlines.replace(/\[Skip to Content\]\(#[^\)]*\)/gi, "");
}