0
This commit is contained in:
Nicolas 2024-04-23 11:15:11 -07:00
parent 357914c07d
commit 306cfe4ce1
4 changed files with 14 additions and 3 deletions

View File

@@ -11,8 +11,8 @@
"start:dev": "nodemon --exec ts-node src/index.ts", "start:dev": "nodemon --exec ts-node src/index.ts",
"build": "tsc", "build": "tsc",
"test": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_noAuth/*'", "test": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_noAuth/*'",
"test:local-no-auth":"npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_withAuth/*'", "test:local-no-auth": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_withAuth/*'",
"test:prod":"npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_noAuth/*'", "test:prod": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_noAuth/*'",
"workers": "nodemon --exec ts-node src/services/queue-worker.ts", "workers": "nodemon --exec ts-node src/services/queue-worker.ts",
"worker:production": "node dist/src/services/queue-worker.js", "worker:production": "node dist/src/services/queue-worker.js",
"mongo-docker": "docker run -d -p 2717:27017 -v ./mongo-data:/data/db --name mongodb mongo:latest", "mongo-docker": "docker run -d -p 2717:27017 -v ./mongo-data:/data/db --name mongodb mongo:latest",
@@ -66,6 +66,7 @@
"glob": "^10.3.12", "glob": "^10.3.12",
"gpt3-tokenizer": "^1.1.5", "gpt3-tokenizer": "^1.1.5",
"ioredis": "^5.3.2", "ioredis": "^5.3.2",
"joplin-turndown-plugin-gfm": "^1.0.12",
"keyword-extractor": "^0.0.25", "keyword-extractor": "^0.0.25",
"langchain": "^0.1.25", "langchain": "^0.1.25",
"languagedetect": "^2.0.0", "languagedetect": "^2.0.0",

View File

@@ -80,6 +80,9 @@ dependencies:
ioredis: ioredis:
specifier: ^5.3.2 specifier: ^5.3.2
version: 5.3.2 version: 5.3.2
joplin-turndown-plugin-gfm:
specifier: ^1.0.12
version: 1.0.12
keyword-extractor: keyword-extractor:
specifier: ^0.0.25 specifier: ^0.0.25
version: 0.0.25 version: 0.0.25
@@ -3923,6 +3926,10 @@ packages:
- ts-node - ts-node
dev: true dev: true
/joplin-turndown-plugin-gfm@1.0.12:
resolution: {integrity: sha512-qL4+1iycQjZ1fs8zk3jSRk7cg3ROBUHk7GKtiLAQLFzLPKErnILUvz5DLszSQvz3s1sTjPbywLDISVUtBY6HaA==}
dev: false
/js-tiktoken@1.0.10: /js-tiktoken@1.0.10:
resolution: {integrity: sha512-ZoSxbGjvGyMT13x6ACo9ebhDha/0FHdKA+OsQcMOWcm1Zs7r90Rhk5lhERLzji+3rA7EKpXCgwXcM5fF3DMpdA==} resolution: {integrity: sha512-ZoSxbGjvGyMT13x6ACo9ebhDha/0FHdKA+OsQcMOWcm1Zs7r90Rhk5lhERLzji+3rA7EKpXCgwXcM5fF3DMpdA==}
dependencies: dependencies:

View File

@@ -1,6 +1,8 @@
export function parseMarkdown(html: string) { export function parseMarkdown(html: string) {
var TurndownService = require("turndown"); var TurndownService = require("turndown");
var turndownPluginGfm = require("turndown-plugin-gfm"); var turndownPluginGfm = require('joplin-turndown-plugin-gfm')
const turndownService = new TurndownService(); const turndownService = new TurndownService();
turndownService.addRule("inlineLink", { turndownService.addRule("inlineLink", {

View File

@@ -142,6 +142,7 @@ export async function scrapSingleUrl(
break; break;
} }
let cleanedHtml = removeUnwantedElements(text, pageOptions); let cleanedHtml = removeUnwantedElements(text, pageOptions);
return [await parseMarkdown(cleanedHtml), text]; return [await parseMarkdown(cleanedHtml), text];
}; };