Merge pull request #158 from mendableai/nsc/docx-support
feat: Docx Support
This commit is contained in:
commit
5c1e6d188c
@ -33,6 +33,7 @@
|
||||
"express": "^4.18.2",
|
||||
"jest": "^29.6.3",
|
||||
"jest-fetch-mock": "^3.0.3",
|
||||
"mammoth": "^1.7.2",
|
||||
"nodemon": "^2.0.20",
|
||||
"supabase": "^1.77.9",
|
||||
"supertest": "^6.3.3",
|
||||
|
@ -97,7 +97,7 @@ dependencies:
|
||||
version: 0.0.25
|
||||
langchain:
|
||||
specifier: ^0.1.25
|
||||
version: 0.1.25(@supabase/supabase-js@2.39.7)(axios@1.6.7)(cheerio@1.0.0-rc.12)(ioredis@5.3.2)(pdf-parse@1.1.1)(puppeteer@22.6.3)(redis@4.6.13)(typesense@1.7.2)
|
||||
version: 0.1.25(@supabase/supabase-js@2.39.7)(axios@1.6.7)(cheerio@1.0.0-rc.12)(ioredis@5.3.2)(mammoth@1.7.2)(pdf-parse@1.1.1)(puppeteer@22.6.3)(redis@4.6.13)(typesense@1.7.2)
|
||||
languagedetect:
|
||||
specifier: ^2.0.0
|
||||
version: 2.0.0
|
||||
@ -214,6 +214,9 @@ devDependencies:
|
||||
jest-fetch-mock:
|
||||
specifier: ^3.0.3
|
||||
version: 3.0.3
|
||||
mammoth:
|
||||
specifier: ^1.7.2
|
||||
version: 1.7.2
|
||||
nodemon:
|
||||
specifier: ^2.0.20
|
||||
version: 2.0.22
|
||||
@ -1765,6 +1768,10 @@ packages:
|
||||
dev: false
|
||||
optional: true
|
||||
|
||||
/@xmldom/xmldom@0.8.10:
|
||||
resolution: {integrity: sha512-2WALfTl4xo2SkGCYRt6rDTFfk9R1czmBvUQy12gK2KuRKIpWEhcbbzy8EZXtz/jkRqHX8bFEc6FC1HjX4TUWYw==}
|
||||
engines: {node: '>=10.0.0'}
|
||||
|
||||
/abbrev@1.1.1:
|
||||
resolution: {integrity: sha512-nne9/IiQ/hzIhY6pdDnbBtz7DjPTKrY00P/zvPSm5pOFkl6xuGrGnXn/VtTNNfNtAfZ9/1RtehkszU9qcTii0Q==}
|
||||
dev: true
|
||||
@ -1895,7 +1902,6 @@ packages:
|
||||
resolution: {integrity: sha512-o5Roy6tNG4SL/FOkCAN6RzjiakZS25RLYFrcMttJqbdd8BWrnA+fGz57iN5Pb06pvBGvl5gQ0B48dJlslXvoTg==}
|
||||
dependencies:
|
||||
sprintf-js: 1.0.3
|
||||
dev: true
|
||||
|
||||
/argparse@2.0.1:
|
||||
resolution: {integrity: sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==}
|
||||
@ -2071,7 +2077,6 @@ packages:
|
||||
|
||||
/base64-js@1.5.1:
|
||||
resolution: {integrity: sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==}
|
||||
dev: false
|
||||
|
||||
/basic-ftp@5.0.5:
|
||||
resolution: {integrity: sha512-4Bcg1P8xhUuqcii/S0Z9wiHIrQVPMermM1any+MX5GeGD7faD3/msQUDGLol9wOcz4/jbg/WJnGqoJF6LiBdtg==}
|
||||
@ -2096,6 +2101,9 @@ packages:
|
||||
resolution: {integrity: sha512-nbE1WxOTTrUWIfsfZ4aHGYu5DOuNkbxGokjV6Z2kxfJK3uaAb8zNK1muzOeipoLHZjInT4Br88BHpzevc681xA==}
|
||||
dev: false
|
||||
|
||||
/bluebird@3.4.7:
|
||||
resolution: {integrity: sha512-iD3898SR7sWVRHbiQv+sHUtHnMvC1o3nW5rAcqnq3uOn07DSAppZYUkIGslDz6gXC7HfunPe7YVBgoEJASPcHA==}
|
||||
|
||||
/body-parser@1.20.2:
|
||||
resolution: {integrity: sha512-ml9pReCu3M61kGlqoTm2umSXTlRTuGTx0bfYj+uIUKKYycG5NtSbeetV3faSU6R7ajOPw0g/J1PvK4qNy7s5bA==}
|
||||
engines: {node: '>= 0.8', npm: 1.2.8000 || >= 1.4.16}
|
||||
@ -2421,6 +2429,9 @@ packages:
|
||||
resolution: {integrity: sha512-LDx6oHrK+PhzLKJU9j5S7/Y3jM/mUHvD/DeI1WQmJn652iPC5Y4TBzC9l+5OMOXlyTTA+SmVUPm0HQUwpD5Jqw==}
|
||||
dev: true
|
||||
|
||||
/core-util-is@1.0.3:
|
||||
resolution: {integrity: sha512-ZQBvi1DcpJ4GDqanjucZ2Hj3wEO5pZDS89BWbkcrvdxksJorwUDDZamX9ldFkp9aw2lmBDLgkObEA4DWNJ9FYQ==}
|
||||
|
||||
/cors@2.8.5:
|
||||
resolution: {integrity: sha512-KIHbLJqu73RGr/hnbrO9uBeixNGuvSQjul/jdFvS/KFSIH1hWVd1ng7zOHx+YrEfInLG7q4n6GHQ9cDtxv/P6g==}
|
||||
engines: {node: '>= 0.10'}
|
||||
@ -2659,6 +2670,9 @@ packages:
|
||||
md5: 2.3.0
|
||||
dev: false
|
||||
|
||||
/dingbat-to-unicode@1.0.1:
|
||||
resolution: {integrity: sha512-98l0sW87ZT58pU4i61wa2OHwxbiYSbuxsCBozaVnYX2iCnr3bLM3fIes1/ej7h1YdOKuKt/MLs706TVnALA65w==}
|
||||
|
||||
/dom-serializer@2.0.0:
|
||||
resolution: {integrity: sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==}
|
||||
dependencies:
|
||||
@ -2695,6 +2709,11 @@ packages:
|
||||
engines: {node: '>=12'}
|
||||
dev: false
|
||||
|
||||
/duck@0.1.12:
|
||||
resolution: {integrity: sha512-wkctla1O6VfP89gQ+J/yDesM0S7B7XLXjKGzXxMDVFg7uEn706niAtyYovKbyq1oT9YwDcly721/iUWoc8MVRg==}
|
||||
dependencies:
|
||||
underscore: 1.13.6
|
||||
|
||||
/eastasianwidth@0.2.0:
|
||||
resolution: {integrity: sha512-I88TYZWc9XiYHRQ4/3c5rjjfgkjhLyW2luGIheGERbNQ6OY7yTybanSpDXZa8y7VUP9YmDcYa+eyq4ca7iLqWA==}
|
||||
dev: false
|
||||
@ -3332,6 +3351,9 @@ packages:
|
||||
resolution: {integrity: sha512-Ius2VYcGNk7T90CppJqcIkS5ooHUZyIQK+ClZfMfMNFEF9VSE73Fq+906u/CWu92x4gzZMWOwfFYckPObzdEbA==}
|
||||
dev: true
|
||||
|
||||
/immediate@3.0.6:
|
||||
resolution: {integrity: sha512-XXOFtyqDjNDAQxVfYxuF7g9Il/IbWmmlQg2MYKOH8ExIT1qg6xc4zyS3HaEEATgs1btfzxq15ciUiY7gjSXRGQ==}
|
||||
|
||||
/import-fresh@3.3.0:
|
||||
resolution: {integrity: sha512-veYYhQa+D1QBKznvhUHxb8faxlrwUnxseDAbAp457E0wLNio2bOSKnjYDhMj+YiAq61xrMGhQk9iXVk5FzgQMw==}
|
||||
engines: {node: '>=6'}
|
||||
@ -3462,6 +3484,9 @@ packages:
|
||||
engines: {node: '>=8'}
|
||||
dev: true
|
||||
|
||||
/isarray@1.0.0:
|
||||
resolution: {integrity: sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ==}
|
||||
|
||||
/isexe@2.0.0:
|
||||
resolution: {integrity: sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==}
|
||||
|
||||
@ -4049,6 +4074,14 @@ packages:
|
||||
engines: {node: '>=0.10.0'}
|
||||
dev: false
|
||||
|
||||
/jszip@3.10.1:
|
||||
resolution: {integrity: sha512-xXDvecyTpGLrqFrvkrUSoxxfJI5AH7U8zxxtVclpsUtMCq4JQ290LY8AW5c7Ggnr/Y/oK+bQMbqK2qmtk3pN4g==}
|
||||
dependencies:
|
||||
lie: 3.3.0
|
||||
pako: 1.0.11
|
||||
readable-stream: 2.3.8
|
||||
setimmediate: 1.0.5
|
||||
|
||||
/kareem@2.5.1:
|
||||
resolution: {integrity: sha512-7jFxRVm+jD+rkq3kY0iZDJfsO2/t4BBPeEb2qKn2lR/9KhuksYk5hxzfRYWMPV8P/x2d0kHD306YyWLzjjH+uA==}
|
||||
engines: {node: '>=12.0.0'}
|
||||
@ -4064,7 +4097,7 @@ packages:
|
||||
engines: {node: '>=6'}
|
||||
dev: true
|
||||
|
||||
/langchain@0.1.25(@supabase/supabase-js@2.39.7)(axios@1.6.7)(cheerio@1.0.0-rc.12)(ioredis@5.3.2)(pdf-parse@1.1.1)(puppeteer@22.6.3)(redis@4.6.13)(typesense@1.7.2):
|
||||
/langchain@0.1.25(@supabase/supabase-js@2.39.7)(axios@1.6.7)(cheerio@1.0.0-rc.12)(ioredis@5.3.2)(mammoth@1.7.2)(pdf-parse@1.1.1)(puppeteer@22.6.3)(redis@4.6.13)(typesense@1.7.2):
|
||||
resolution: {integrity: sha512-sfEChvr4H2CklHdSByNBbytwBrFhgtA5kPOnwcBrxuXGg1iOaTzhVxQA0QcNcQucI3hZrsNbZjxGp+Can1ooZQ==}
|
||||
engines: {node: '>=18'}
|
||||
peerDependencies:
|
||||
@ -4238,6 +4271,7 @@ packages:
|
||||
jsonpointer: 5.0.1
|
||||
langchainhub: 0.0.8
|
||||
langsmith: 0.1.13
|
||||
mammoth: 1.7.2
|
||||
ml-distance: 4.0.1
|
||||
openapi-types: 12.1.3
|
||||
p-retry: 4.6.2
|
||||
@ -4344,6 +4378,11 @@ packages:
|
||||
type-check: 0.3.2
|
||||
dev: false
|
||||
|
||||
/lie@3.3.0:
|
||||
resolution: {integrity: sha512-UaiMJzeWRlEujzAuw5LokY1L5ecNQYZKfmyZ9L7wDHb/p5etKaxXhohBcrw0EYby+G/NA52vRSN4N39dxHAIwQ==}
|
||||
dependencies:
|
||||
immediate: 3.0.6
|
||||
|
||||
/lines-and-columns@1.2.4:
|
||||
resolution: {integrity: sha512-7ylylesZQ/PV29jhEDl3Ufjo6ZX7gCqJr5F7PKrqc93v7fzSymt1BpwEU8nAUXs8qzzvqhbjhK5QZg6Mt/HkBg==}
|
||||
|
||||
@ -4380,6 +4419,13 @@ packages:
|
||||
- encoding
|
||||
dev: false
|
||||
|
||||
/lop@0.4.1:
|
||||
resolution: {integrity: sha512-9xyho9why2A2tzm5aIcMWKvzqKsnxrf9B5I+8O30olh6lQU8PH978LqZoI4++37RBgS1Em5i54v1TFs/3wnmXQ==}
|
||||
dependencies:
|
||||
duck: 0.1.12
|
||||
option: 0.2.4
|
||||
underscore: 1.13.6
|
||||
|
||||
/lru-cache@10.2.0:
|
||||
resolution: {integrity: sha512-2bIM8x+VAf6JT4bKAljS1qUWgMsqZRPGJS6FSahIMPVvctcNhyVp7AJu7quxOW9jwkryBReKZY5tY5JYv2n/7Q==}
|
||||
engines: {node: 14 || >=16.14}
|
||||
@ -4423,6 +4469,22 @@ packages:
|
||||
tmpl: 1.0.5
|
||||
dev: true
|
||||
|
||||
/mammoth@1.7.2:
|
||||
resolution: {integrity: sha512-MqWU2hcLf1I5QMKyAbfJCvrLxnv5WztrAQyorfZ+WPq7Hk82vZFmvfR2/64ajIPpM4jlq0TXp1xZvp/FFaL1Ug==}
|
||||
engines: {node: '>=12.0.0'}
|
||||
hasBin: true
|
||||
dependencies:
|
||||
'@xmldom/xmldom': 0.8.10
|
||||
argparse: 1.0.10
|
||||
base64-js: 1.5.1
|
||||
bluebird: 3.4.7
|
||||
dingbat-to-unicode: 1.0.1
|
||||
jszip: 3.10.1
|
||||
lop: 0.4.1
|
||||
path-is-absolute: 1.0.1
|
||||
underscore: 1.13.6
|
||||
xmlbuilder: 10.1.1
|
||||
|
||||
/md5@2.3.0:
|
||||
resolution: {integrity: sha512-T1GITYmFaKuO91vxyoQMFETst+O71VUPEU3ze5GNzDm0OWdP8v1ziTaAEPUr/3kLsY3Sftgz242A1SetQiDL7g==}
|
||||
dependencies:
|
||||
@ -4867,6 +4929,9 @@ packages:
|
||||
resolution: {integrity: sha512-N4YtSYJqghVu4iek2ZUvcN/0aqH1kRDuNqzcycDxhOUpg7GdvLa2F3DgS6yBNhInhv2r/6I0Flkn7CqL8+nIcw==}
|
||||
dev: false
|
||||
|
||||
/option@0.2.4:
|
||||
resolution: {integrity: sha512-pkEqbDyl8ou5cpq+VsnQbe/WlEy5qS7xPzMS1U55OCG9KPvwFD46zDbxQIj3egJSFc3D+XhYOPUzz49zQAVy7A==}
|
||||
|
||||
/optionator@0.8.3:
|
||||
resolution: {integrity: sha512-+IW9pACdk3XWmmTXG8m3upGUJst5XRGzxMRjXzAuJ1XnIFNvfhjjIuYkDvysnPQ7qzqVzLt78BCruntqRhWQbA==}
|
||||
engines: {node: '>= 0.8.0'}
|
||||
@ -4957,6 +5022,9 @@ packages:
|
||||
netmask: 2.0.2
|
||||
dev: false
|
||||
|
||||
/pako@1.0.11:
|
||||
resolution: {integrity: sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw==}
|
||||
|
||||
/parent-module@1.0.1:
|
||||
resolution: {integrity: sha512-GQ2EWRpQV8/o+Aw8YqtfZZPfNRWZYkbidE9k5rpl/hC3vtHHBfGm2Ifi6qWV+coDGkrUKZAxE3Lot5kcsRlh+g==}
|
||||
engines: {node: '>=6'}
|
||||
@ -5002,7 +5070,6 @@ packages:
|
||||
/path-is-absolute@1.0.1:
|
||||
resolution: {integrity: sha512-AVbw3UJ2e9bq64vSaS9Am0fje1Pa8pbGqTTsmXfaIiMpnr5DlDhfJOuLj9Sf95ZPVDAUerDfEk88MPmPe7UCQg==}
|
||||
engines: {node: '>=0.10.0'}
|
||||
dev: true
|
||||
|
||||
/path-key@3.1.1:
|
||||
resolution: {integrity: sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q==}
|
||||
@ -5095,6 +5162,9 @@ packages:
|
||||
react-is: 18.2.0
|
||||
dev: true
|
||||
|
||||
/process-nextick-args@2.0.1:
|
||||
resolution: {integrity: sha512-3ouUOpQhtgrbOa17J7+uxOTpITYWaGP7/AhoR3+A+/1e9skrzelGi/dXzEYyvbxubEF6Wn2ypscTKiKJFFn1ag==}
|
||||
|
||||
/progress@2.0.3:
|
||||
resolution: {integrity: sha512-7PiHtLll5LdnKIMw100I+8xJXR5gW2QwWYkT6iJva0bXitZKa/XMrSbdmg3r2Xnaidz9Qumd0VPaMrZlF9V9sA==}
|
||||
engines: {node: '>=0.4.0'}
|
||||
@ -5251,6 +5321,17 @@ packages:
|
||||
engines: {node: ^14.17.0 || ^16.13.0 || >=18.0.0}
|
||||
dev: true
|
||||
|
||||
/readable-stream@2.3.8:
|
||||
resolution: {integrity: sha512-8p0AUk4XODgIewSi0l8Epjs+EVnWiK7NoDIEGU0HhE7+ZyY8D1IMY7odu5lRrFXGg71L15KG8QrPmum45RTtdA==}
|
||||
dependencies:
|
||||
core-util-is: 1.0.3
|
||||
inherits: 2.0.4
|
||||
isarray: 1.0.0
|
||||
process-nextick-args: 2.0.1
|
||||
safe-buffer: 5.1.2
|
||||
string_decoder: 1.1.1
|
||||
util-deprecate: 1.0.2
|
||||
|
||||
/readdirp@3.6.0:
|
||||
resolution: {integrity: sha512-hOS089on8RduqdbhvQ5Z37A0ESjsqz6qnRcffsMU3495FuTdqSm+7bhJ29JvIOsBDEEnan5DPu9t3To9VRlMzA==}
|
||||
engines: {node: '>=8.10.0'}
|
||||
@ -5347,6 +5428,9 @@ packages:
|
||||
resolution: {integrity: sha512-cLgakCUf6PedEu15t8kbsjnwIFFR2D4RfL+W3iWFJ4iac7z4B0ZI8fxy4R3J956kAI68HclCFGL8MPoUVC3qVA==}
|
||||
dev: false
|
||||
|
||||
/safe-buffer@5.1.2:
|
||||
resolution: {integrity: sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==}
|
||||
|
||||
/safe-buffer@5.2.1:
|
||||
resolution: {integrity: sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==}
|
||||
|
||||
@ -5460,6 +5544,9 @@ packages:
|
||||
gopd: 1.0.1
|
||||
has-property-descriptors: 1.0.2
|
||||
|
||||
/setimmediate@1.0.5:
|
||||
resolution: {integrity: sha512-MATJdZp8sLqDl/68LfQmbP8zKPLQNV6BIZoIgrscFDQ+RsvK/BxeDQOgyxKKoh0y/8h3BqVFnCqQ/gd+reiIXA==}
|
||||
|
||||
/setprototypeof@1.2.0:
|
||||
resolution: {integrity: sha512-E5LDX7Wrp85Kil5bhZv46j8jOeboKq5JMmYM3gVGdGH8xFpPWXUMsNrlODCrkoxMEeNi/XZIwuRvY4XNwYMJpw==}
|
||||
|
||||
@ -5562,7 +5649,6 @@ packages:
|
||||
|
||||
/sprintf-js@1.0.3:
|
||||
resolution: {integrity: sha512-D9cPgkvLlV3t3IzL0D0YLvGA9Ahk4PcvVwUbN0dSGr1aP0Nrt4AEnTUbuGvquEC0mA64Gqt1fzirlRs5ibXx8g==}
|
||||
dev: true
|
||||
|
||||
/sprintf-js@1.1.3:
|
||||
resolution: {integrity: sha512-Oo+0REFV59/rz3gfJNKQiBlwfHaSESl1pcGyABQsnnIfWOFt6JNj5gCog2U6MLZ//IGYD+nA8nI+mTShREReaA==}
|
||||
@ -5631,6 +5717,11 @@ packages:
|
||||
strip-ansi: 7.1.0
|
||||
dev: false
|
||||
|
||||
/string_decoder@1.1.1:
|
||||
resolution: {integrity: sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==}
|
||||
dependencies:
|
||||
safe-buffer: 5.1.2
|
||||
|
||||
/strip-ansi@6.0.1:
|
||||
resolution: {integrity: sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==}
|
||||
engines: {node: '>=8'}
|
||||
@ -5975,7 +6066,6 @@ packages:
|
||||
|
||||
/underscore@1.13.6:
|
||||
resolution: {integrity: sha512-+A5Sja4HP1M08MaXya7p5LvjuM7K6q/2EaC0+iovj/wOcMsTzMvDFbasi/oSapiwOlt252IqsKqPjCl7huKS0A==}
|
||||
dev: false
|
||||
|
||||
/undici-types@5.26.5:
|
||||
resolution: {integrity: sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==}
|
||||
@ -6022,6 +6112,9 @@ packages:
|
||||
resolution: {integrity: sha512-H/A06tKD7sS1O1X2SshBVeA5FLycRpjqiBeqGKmBwBDBy28EnRjORxTNe269KSSr5un5qyWi1iL61wLxpd+ZOg==}
|
||||
dev: false
|
||||
|
||||
/util-deprecate@1.0.2:
|
||||
resolution: {integrity: sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==}
|
||||
|
||||
/utils-merge@1.0.1:
|
||||
resolution: {integrity: sha512-pMZTvIkT1d+TFGvDOqodOclx0QWkkgi6Tdoa8gC8ffGAAqz9pzPTZWAybbsHHoED/ztMtkv/VoYTYyShUn81hA==}
|
||||
engines: {node: '>= 0.4.0'}
|
||||
@ -6182,6 +6275,10 @@ packages:
|
||||
xmlbuilder: 11.0.1
|
||||
dev: false
|
||||
|
||||
/xmlbuilder@10.1.1:
|
||||
resolution: {integrity: sha512-OyzrcFLL/nb6fMGHbiRDuPup9ljBycsdCypwuyg5AAHvyWzGfChJpCXMG88AGTIMFhGZ9RccFN1e6lhg3hkwKg==}
|
||||
engines: {node: '>=4.0'}
|
||||
|
||||
/xmlbuilder@11.0.1:
|
||||
resolution: {integrity: sha512-fDlsI/kFEx7gLvbecc0/ohLG50fugQp8ryHzMTuW9vSa1GJ0XYWKnhsUx7oie3G98+r56aTQIUB4kht42R3JvA==}
|
||||
engines: {node: '>=4.0'}
|
||||
|
@ -321,7 +321,7 @@ export class WebCrawler {
|
||||
".mp4",
|
||||
".mp3",
|
||||
".pptx",
|
||||
".docx",
|
||||
// ".docx",
|
||||
".xlsx",
|
||||
".xml",
|
||||
];
|
||||
|
@ -17,6 +17,7 @@ import {
|
||||
} from "./utils/replacePaths";
|
||||
import { generateCompletions } from "../../lib/LLM-extraction";
|
||||
import { getWebScraperQueue } from "../../../src/services/queue-service";
|
||||
import { fetchAndProcessDocx } from "./utils/docxProcessor";
|
||||
|
||||
export class WebScraperDataProvider {
|
||||
private bullJobId: string;
|
||||
@ -157,7 +158,7 @@ export class WebScraperDataProvider {
|
||||
private async handleCrawlMode(
|
||||
inProgress?: (progress: Progress) => void
|
||||
): Promise<Document[]> {
|
||||
|
||||
|
||||
const crawler = new WebCrawler({
|
||||
initialUrl: this.urls[0],
|
||||
includes: this.includes,
|
||||
@ -237,9 +238,13 @@ export class WebScraperDataProvider {
|
||||
inProgress?: (progress: Progress) => void,
|
||||
allHtmls?: string[]
|
||||
): Promise<Document[]> {
|
||||
let pdfLinks = links.filter((link) => link.endsWith(".pdf"));
|
||||
let pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
|
||||
links = links.filter((link) => !link.endsWith(".pdf"));
|
||||
const pdfLinks = links.filter(link => link.endsWith(".pdf"));
|
||||
const docLinks = links.filter(link => link.endsWith(".doc") || link.endsWith(".docx"));
|
||||
|
||||
const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
|
||||
const docxDocuments = await this.fetchDocxDocuments(docLinks);
|
||||
|
||||
links = links.filter(link => !pdfLinks.includes(link) && !docLinks.includes(link));
|
||||
|
||||
let documents = await this.convertUrlsToDocuments(
|
||||
links,
|
||||
@ -257,7 +262,7 @@ export class WebScraperDataProvider {
|
||||
) {
|
||||
documents = await generateCompletions(documents, this.extractorOptions);
|
||||
}
|
||||
return documents.concat(pdfDocuments);
|
||||
return documents.concat(pdfDocuments).concat(docxDocuments);
|
||||
}
|
||||
|
||||
private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
|
||||
@ -272,6 +277,18 @@ export class WebScraperDataProvider {
|
||||
})
|
||||
);
|
||||
}
|
||||
private async fetchDocxDocuments(docxLinks: string[]): Promise<Document[]> {
|
||||
return Promise.all(
|
||||
docxLinks.map(async (p) => {
|
||||
const docXDocument = await fetchAndProcessDocx(p);
|
||||
return {
|
||||
content: docXDocument,
|
||||
metadata: { sourceURL: p },
|
||||
provider: "web-scraper",
|
||||
};
|
||||
})
|
||||
);
|
||||
}
|
||||
|
||||
private applyPathReplacements(documents: Document[]): Document[] {
|
||||
return this.replaceAllPathsWithAbsolutePaths
|
||||
|
@ -0,0 +1,13 @@
|
||||
import * as docxProcessor from "../docxProcessor";
|
||||
|
||||
describe("DOCX Processing Module - Integration Test", () => {
|
||||
it("should correctly process a simple DOCX file without the LLAMAPARSE_API_KEY", async () => {
|
||||
delete process.env.LLAMAPARSE_API_KEY;
|
||||
const docxContent = await docxProcessor.fetchAndProcessDocx(
|
||||
"https://nvca.org/wp-content/uploads/2019/06/NVCA-Model-Document-Stock-Purchase-Agreement.docx"
|
||||
);
|
||||
expect(docxContent.trim()).toContain(
|
||||
"SERIES A PREFERRED STOCK PURCHASE AGREEMENT"
|
||||
);
|
||||
});
|
||||
});
|
41
apps/api/src/scraper/WebScraper/utils/docxProcessor.ts
Normal file
41
apps/api/src/scraper/WebScraper/utils/docxProcessor.ts
Normal file
@ -0,0 +1,41 @@
|
||||
import axios from "axios";
|
||||
import fs from "fs";
|
||||
import { createWriteStream } from "node:fs";
|
||||
import path from "path";
|
||||
import os from "os";
|
||||
import mammoth from "mammoth";
|
||||
|
||||
export async function fetchAndProcessDocx(url: string): Promise<string> {
|
||||
const tempFilePath = await downloadDocx(url);
|
||||
const content = await processDocxToText(tempFilePath);
|
||||
fs.unlinkSync(tempFilePath); // Clean up the temporary file
|
||||
return content;
|
||||
}
|
||||
|
||||
async function downloadDocx(url: string): Promise<string> {
|
||||
const response = await axios({
|
||||
url,
|
||||
method: "GET",
|
||||
responseType: "stream",
|
||||
});
|
||||
|
||||
const tempFilePath = path.join(os.tmpdir(), `tempDocx-${Date.now()}.docx`);
|
||||
const writer = createWriteStream(tempFilePath);
|
||||
|
||||
response.data.pipe(writer);
|
||||
|
||||
return new Promise((resolve, reject) => {
|
||||
writer.on("finish", () => resolve(tempFilePath));
|
||||
writer.on("error", reject);
|
||||
});
|
||||
}
|
||||
|
||||
export async function processDocxToText(filePath: string): Promise<string> {
|
||||
const content = await extractTextFromDocx(filePath);
|
||||
return content;
|
||||
}
|
||||
|
||||
async function extractTextFromDocx(filePath: string): Promise<string> {
|
||||
const result = await mammoth.extractRawText({ path: filePath });
|
||||
return result.value;
|
||||
}
|
Loading…
Reference in New Issue
Block a user