Merge pull request #52 from mendableai/nsc/fix-tables
Fixes table parsing for websites such as news.ycombinator.com (HN)
This commit is contained in:
commit
c70bc08d73
@ -11,8 +11,8 @@
|
|||||||
"start:dev": "nodemon --exec ts-node src/index.ts",
|
"start:dev": "nodemon --exec ts-node src/index.ts",
|
||||||
"build": "tsc",
|
"build": "tsc",
|
||||||
"test": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_noAuth/*'",
|
"test": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_noAuth/*'",
|
||||||
"test:local-no-auth":"npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_withAuth/*'",
|
"test:local-no-auth": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_withAuth/*'",
|
||||||
"test:prod":"npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_noAuth/*'",
|
"test:prod": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_noAuth/*'",
|
||||||
"workers": "nodemon --exec ts-node src/services/queue-worker.ts",
|
"workers": "nodemon --exec ts-node src/services/queue-worker.ts",
|
||||||
"worker:production": "node dist/src/services/queue-worker.js",
|
"worker:production": "node dist/src/services/queue-worker.js",
|
||||||
"mongo-docker": "docker run -d -p 2717:27017 -v ./mongo-data:/data/db --name mongodb mongo:latest",
|
"mongo-docker": "docker run -d -p 2717:27017 -v ./mongo-data:/data/db --name mongodb mongo:latest",
|
||||||
@ -66,6 +66,7 @@
|
|||||||
"glob": "^10.3.12",
|
"glob": "^10.3.12",
|
||||||
"gpt3-tokenizer": "^1.1.5",
|
"gpt3-tokenizer": "^1.1.5",
|
||||||
"ioredis": "^5.3.2",
|
"ioredis": "^5.3.2",
|
||||||
|
"joplin-turndown-plugin-gfm": "^1.0.12",
|
||||||
"keyword-extractor": "^0.0.25",
|
"keyword-extractor": "^0.0.25",
|
||||||
"langchain": "^0.1.25",
|
"langchain": "^0.1.25",
|
||||||
"languagedetect": "^2.0.0",
|
"languagedetect": "^2.0.0",
|
||||||
|
@ -80,6 +80,9 @@ dependencies:
|
|||||||
ioredis:
|
ioredis:
|
||||||
specifier: ^5.3.2
|
specifier: ^5.3.2
|
||||||
version: 5.3.2
|
version: 5.3.2
|
||||||
|
joplin-turndown-plugin-gfm:
|
||||||
|
specifier: ^1.0.12
|
||||||
|
version: 1.0.12
|
||||||
keyword-extractor:
|
keyword-extractor:
|
||||||
specifier: ^0.0.25
|
specifier: ^0.0.25
|
||||||
version: 0.0.25
|
version: 0.0.25
|
||||||
@ -3923,6 +3926,10 @@ packages:
|
|||||||
- ts-node
|
- ts-node
|
||||||
dev: true
|
dev: true
|
||||||
|
|
||||||
|
/joplin-turndown-plugin-gfm@1.0.12:
|
||||||
|
resolution: {integrity: sha512-qL4+1iycQjZ1fs8zk3jSRk7cg3ROBUHk7GKtiLAQLFzLPKErnILUvz5DLszSQvz3s1sTjPbywLDISVUtBY6HaA==}
|
||||||
|
dev: false
|
||||||
|
|
||||||
/js-tiktoken@1.0.10:
|
/js-tiktoken@1.0.10:
|
||||||
resolution: {integrity: sha512-ZoSxbGjvGyMT13x6ACo9ebhDha/0FHdKA+OsQcMOWcm1Zs7r90Rhk5lhERLzji+3rA7EKpXCgwXcM5fF3DMpdA==}
|
resolution: {integrity: sha512-ZoSxbGjvGyMT13x6ACo9ebhDha/0FHdKA+OsQcMOWcm1Zs7r90Rhk5lhERLzji+3rA7EKpXCgwXcM5fF3DMpdA==}
|
||||||
dependencies:
|
dependencies:
|
||||||
|
@ -1,6 +1,8 @@
|
|||||||
|
|
||||||
export function parseMarkdown(html: string) {
|
export function parseMarkdown(html: string) {
|
||||||
var TurndownService = require("turndown");
|
var TurndownService = require("turndown");
|
||||||
var turndownPluginGfm = require("turndown-plugin-gfm");
|
var turndownPluginGfm = require('joplin-turndown-plugin-gfm')
|
||||||
|
|
||||||
|
|
||||||
const turndownService = new TurndownService();
|
const turndownService = new TurndownService();
|
||||||
turndownService.addRule("inlineLink", {
|
turndownService.addRule("inlineLink", {
|
||||||
|
@ -142,6 +142,7 @@ export async function scrapSingleUrl(
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
let cleanedHtml = removeUnwantedElements(text, pageOptions);
|
let cleanedHtml = removeUnwantedElements(text, pageOptions);
|
||||||
|
|
||||||
return [await parseMarkdown(cleanedHtml), text];
|
return [await parseMarkdown(cleanedHtml), text];
|
||||||
};
|
};
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user