0

Merge pull request #158 from mendableai/nsc/docx-support

feat: Docx Support
This commit is contained in:
Nicolas 2024-05-16 12:03:16 -07:00 committed by GitHub
commit 5c1e6d188c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 182 additions and 13 deletions

View File

@ -33,6 +33,7 @@
"express": "^4.18.2",
"jest": "^29.6.3",
"jest-fetch-mock": "^3.0.3",
"mammoth": "^1.7.2",
"nodemon": "^2.0.20",
"supabase": "^1.77.9",
"supertest": "^6.3.3",

View File

@ -97,7 +97,7 @@ dependencies:
version: 0.0.25
langchain:
specifier: ^0.1.25
version: 0.1.25(@supabase/supabase-js@2.39.7)(axios@1.6.7)(cheerio@1.0.0-rc.12)(ioredis@5.3.2)(pdf-parse@1.1.1)(puppeteer@22.6.3)(redis@4.6.13)(typesense@1.7.2)
version: 0.1.25(@supabase/supabase-js@2.39.7)(axios@1.6.7)(cheerio@1.0.0-rc.12)(ioredis@5.3.2)(mammoth@1.7.2)(pdf-parse@1.1.1)(puppeteer@22.6.3)(redis@4.6.13)(typesense@1.7.2)
languagedetect:
specifier: ^2.0.0
version: 2.0.0
@ -214,6 +214,9 @@ devDependencies:
jest-fetch-mock:
specifier: ^3.0.3
version: 3.0.3
mammoth:
specifier: ^1.7.2
version: 1.7.2
nodemon:
specifier: ^2.0.20
version: 2.0.22
@ -1765,6 +1768,10 @@ packages:
dev: false
optional: true
/@xmldom/xmldom@0.8.10:
resolution: {integrity: sha512-2WALfTl4xo2SkGCYRt6rDTFfk9R1czmBvUQy12gK2KuRKIpWEhcbbzy8EZXtz/jkRqHX8bFEc6FC1HjX4TUWYw==}
engines: {node: '>=10.0.0'}
/abbrev@1.1.1:
resolution: {integrity: sha512-nne9/IiQ/hzIhY6pdDnbBtz7DjPTKrY00P/zvPSm5pOFkl6xuGrGnXn/VtTNNfNtAfZ9/1RtehkszU9qcTii0Q==}
dev: true
@ -1895,7 +1902,6 @@ packages:
resolution: {integrity: sha512-o5Roy6tNG4SL/FOkCAN6RzjiakZS25RLYFrcMttJqbdd8BWrnA+fGz57iN5Pb06pvBGvl5gQ0B48dJlslXvoTg==}
dependencies:
sprintf-js: 1.0.3
dev: true
/argparse@2.0.1:
resolution: {integrity: sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==}
@ -2071,7 +2077,6 @@ packages:
/base64-js@1.5.1:
resolution: {integrity: sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==}
dev: false
/basic-ftp@5.0.5:
resolution: {integrity: sha512-4Bcg1P8xhUuqcii/S0Z9wiHIrQVPMermM1any+MX5GeGD7faD3/msQUDGLol9wOcz4/jbg/WJnGqoJF6LiBdtg==}
@ -2096,6 +2101,9 @@ packages:
resolution: {integrity: sha512-nbE1WxOTTrUWIfsfZ4aHGYu5DOuNkbxGokjV6Z2kxfJK3uaAb8zNK1muzOeipoLHZjInT4Br88BHpzevc681xA==}
dev: false
/bluebird@3.4.7:
resolution: {integrity: sha512-iD3898SR7sWVRHbiQv+sHUtHnMvC1o3nW5rAcqnq3uOn07DSAppZYUkIGslDz6gXC7HfunPe7YVBgoEJASPcHA==}
/body-parser@1.20.2:
resolution: {integrity: sha512-ml9pReCu3M61kGlqoTm2umSXTlRTuGTx0bfYj+uIUKKYycG5NtSbeetV3faSU6R7ajOPw0g/J1PvK4qNy7s5bA==}
engines: {node: '>= 0.8', npm: 1.2.8000 || >= 1.4.16}
@ -2421,6 +2429,9 @@ packages:
resolution: {integrity: sha512-LDx6oHrK+PhzLKJU9j5S7/Y3jM/mUHvD/DeI1WQmJn652iPC5Y4TBzC9l+5OMOXlyTTA+SmVUPm0HQUwpD5Jqw==}
dev: true
/core-util-is@1.0.3:
resolution: {integrity: sha512-ZQBvi1DcpJ4GDqanjucZ2Hj3wEO5pZDS89BWbkcrvdxksJorwUDDZamX9ldFkp9aw2lmBDLgkObEA4DWNJ9FYQ==}
/cors@2.8.5:
resolution: {integrity: sha512-KIHbLJqu73RGr/hnbrO9uBeixNGuvSQjul/jdFvS/KFSIH1hWVd1ng7zOHx+YrEfInLG7q4n6GHQ9cDtxv/P6g==}
engines: {node: '>= 0.10'}
@ -2659,6 +2670,9 @@ packages:
md5: 2.3.0
dev: false
/dingbat-to-unicode@1.0.1:
resolution: {integrity: sha512-98l0sW87ZT58pU4i61wa2OHwxbiYSbuxsCBozaVnYX2iCnr3bLM3fIes1/ej7h1YdOKuKt/MLs706TVnALA65w==}
/dom-serializer@2.0.0:
resolution: {integrity: sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==}
dependencies:
@ -2695,6 +2709,11 @@ packages:
engines: {node: '>=12'}
dev: false
/duck@0.1.12:
resolution: {integrity: sha512-wkctla1O6VfP89gQ+J/yDesM0S7B7XLXjKGzXxMDVFg7uEn706niAtyYovKbyq1oT9YwDcly721/iUWoc8MVRg==}
dependencies:
underscore: 1.13.6
/eastasianwidth@0.2.0:
resolution: {integrity: sha512-I88TYZWc9XiYHRQ4/3c5rjjfgkjhLyW2luGIheGERbNQ6OY7yTybanSpDXZa8y7VUP9YmDcYa+eyq4ca7iLqWA==}
dev: false
@ -3332,6 +3351,9 @@ packages:
resolution: {integrity: sha512-Ius2VYcGNk7T90CppJqcIkS5ooHUZyIQK+ClZfMfMNFEF9VSE73Fq+906u/CWu92x4gzZMWOwfFYckPObzdEbA==}
dev: true
/immediate@3.0.6:
resolution: {integrity: sha512-XXOFtyqDjNDAQxVfYxuF7g9Il/IbWmmlQg2MYKOH8ExIT1qg6xc4zyS3HaEEATgs1btfzxq15ciUiY7gjSXRGQ==}
/import-fresh@3.3.0:
resolution: {integrity: sha512-veYYhQa+D1QBKznvhUHxb8faxlrwUnxseDAbAp457E0wLNio2bOSKnjYDhMj+YiAq61xrMGhQk9iXVk5FzgQMw==}
engines: {node: '>=6'}
@ -3462,6 +3484,9 @@ packages:
engines: {node: '>=8'}
dev: true
/isarray@1.0.0:
resolution: {integrity: sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ==}
/isexe@2.0.0:
resolution: {integrity: sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==}
@ -4049,6 +4074,14 @@ packages:
engines: {node: '>=0.10.0'}
dev: false
/jszip@3.10.1:
resolution: {integrity: sha512-xXDvecyTpGLrqFrvkrUSoxxfJI5AH7U8zxxtVclpsUtMCq4JQ290LY8AW5c7Ggnr/Y/oK+bQMbqK2qmtk3pN4g==}
dependencies:
lie: 3.3.0
pako: 1.0.11
readable-stream: 2.3.8
setimmediate: 1.0.5
/kareem@2.5.1:
resolution: {integrity: sha512-7jFxRVm+jD+rkq3kY0iZDJfsO2/t4BBPeEb2qKn2lR/9KhuksYk5hxzfRYWMPV8P/x2d0kHD306YyWLzjjH+uA==}
engines: {node: '>=12.0.0'}
@ -4064,7 +4097,7 @@ packages:
engines: {node: '>=6'}
dev: true
/langchain@0.1.25(@supabase/supabase-js@2.39.7)(axios@1.6.7)(cheerio@1.0.0-rc.12)(ioredis@5.3.2)(pdf-parse@1.1.1)(puppeteer@22.6.3)(redis@4.6.13)(typesense@1.7.2):
/langchain@0.1.25(@supabase/supabase-js@2.39.7)(axios@1.6.7)(cheerio@1.0.0-rc.12)(ioredis@5.3.2)(mammoth@1.7.2)(pdf-parse@1.1.1)(puppeteer@22.6.3)(redis@4.6.13)(typesense@1.7.2):
resolution: {integrity: sha512-sfEChvr4H2CklHdSByNBbytwBrFhgtA5kPOnwcBrxuXGg1iOaTzhVxQA0QcNcQucI3hZrsNbZjxGp+Can1ooZQ==}
engines: {node: '>=18'}
peerDependencies:
@ -4238,6 +4271,7 @@ packages:
jsonpointer: 5.0.1
langchainhub: 0.0.8
langsmith: 0.1.13
mammoth: 1.7.2
ml-distance: 4.0.1
openapi-types: 12.1.3
p-retry: 4.6.2
@ -4344,6 +4378,11 @@ packages:
type-check: 0.3.2
dev: false
/lie@3.3.0:
resolution: {integrity: sha512-UaiMJzeWRlEujzAuw5LokY1L5ecNQYZKfmyZ9L7wDHb/p5etKaxXhohBcrw0EYby+G/NA52vRSN4N39dxHAIwQ==}
dependencies:
immediate: 3.0.6
/lines-and-columns@1.2.4:
resolution: {integrity: sha512-7ylylesZQ/PV29jhEDl3Ufjo6ZX7gCqJr5F7PKrqc93v7fzSymt1BpwEU8nAUXs8qzzvqhbjhK5QZg6Mt/HkBg==}
@ -4380,6 +4419,13 @@ packages:
- encoding
dev: false
/lop@0.4.1:
resolution: {integrity: sha512-9xyho9why2A2tzm5aIcMWKvzqKsnxrf9B5I+8O30olh6lQU8PH978LqZoI4++37RBgS1Em5i54v1TFs/3wnmXQ==}
dependencies:
duck: 0.1.12
option: 0.2.4
underscore: 1.13.6
/lru-cache@10.2.0:
resolution: {integrity: sha512-2bIM8x+VAf6JT4bKAljS1qUWgMsqZRPGJS6FSahIMPVvctcNhyVp7AJu7quxOW9jwkryBReKZY5tY5JYv2n/7Q==}
engines: {node: 14 || >=16.14}
@ -4423,6 +4469,22 @@ packages:
tmpl: 1.0.5
dev: true
/mammoth@1.7.2:
resolution: {integrity: sha512-MqWU2hcLf1I5QMKyAbfJCvrLxnv5WztrAQyorfZ+WPq7Hk82vZFmvfR2/64ajIPpM4jlq0TXp1xZvp/FFaL1Ug==}
engines: {node: '>=12.0.0'}
hasBin: true
dependencies:
'@xmldom/xmldom': 0.8.10
argparse: 1.0.10
base64-js: 1.5.1
bluebird: 3.4.7
dingbat-to-unicode: 1.0.1
jszip: 3.10.1
lop: 0.4.1
path-is-absolute: 1.0.1
underscore: 1.13.6
xmlbuilder: 10.1.1
/md5@2.3.0:
resolution: {integrity: sha512-T1GITYmFaKuO91vxyoQMFETst+O71VUPEU3ze5GNzDm0OWdP8v1ziTaAEPUr/3kLsY3Sftgz242A1SetQiDL7g==}
dependencies:
@ -4867,6 +4929,9 @@ packages:
resolution: {integrity: sha512-N4YtSYJqghVu4iek2ZUvcN/0aqH1kRDuNqzcycDxhOUpg7GdvLa2F3DgS6yBNhInhv2r/6I0Flkn7CqL8+nIcw==}
dev: false
/option@0.2.4:
resolution: {integrity: sha512-pkEqbDyl8ou5cpq+VsnQbe/WlEy5qS7xPzMS1U55OCG9KPvwFD46zDbxQIj3egJSFc3D+XhYOPUzz49zQAVy7A==}
/optionator@0.8.3:
resolution: {integrity: sha512-+IW9pACdk3XWmmTXG8m3upGUJst5XRGzxMRjXzAuJ1XnIFNvfhjjIuYkDvysnPQ7qzqVzLt78BCruntqRhWQbA==}
engines: {node: '>= 0.8.0'}
@ -4957,6 +5022,9 @@ packages:
netmask: 2.0.2
dev: false
/pako@1.0.11:
resolution: {integrity: sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw==}
/parent-module@1.0.1:
resolution: {integrity: sha512-GQ2EWRpQV8/o+Aw8YqtfZZPfNRWZYkbidE9k5rpl/hC3vtHHBfGm2Ifi6qWV+coDGkrUKZAxE3Lot5kcsRlh+g==}
engines: {node: '>=6'}
@ -5002,7 +5070,6 @@ packages:
/path-is-absolute@1.0.1:
resolution: {integrity: sha512-AVbw3UJ2e9bq64vSaS9Am0fje1Pa8pbGqTTsmXfaIiMpnr5DlDhfJOuLj9Sf95ZPVDAUerDfEk88MPmPe7UCQg==}
engines: {node: '>=0.10.0'}
dev: true
/path-key@3.1.1:
resolution: {integrity: sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q==}
@ -5095,6 +5162,9 @@ packages:
react-is: 18.2.0
dev: true
/process-nextick-args@2.0.1:
resolution: {integrity: sha512-3ouUOpQhtgrbOa17J7+uxOTpITYWaGP7/AhoR3+A+/1e9skrzelGi/dXzEYyvbxubEF6Wn2ypscTKiKJFFn1ag==}
/progress@2.0.3:
resolution: {integrity: sha512-7PiHtLll5LdnKIMw100I+8xJXR5gW2QwWYkT6iJva0bXitZKa/XMrSbdmg3r2Xnaidz9Qumd0VPaMrZlF9V9sA==}
engines: {node: '>=0.4.0'}
@ -5251,6 +5321,17 @@ packages:
engines: {node: ^14.17.0 || ^16.13.0 || >=18.0.0}
dev: true
/readable-stream@2.3.8:
resolution: {integrity: sha512-8p0AUk4XODgIewSi0l8Epjs+EVnWiK7NoDIEGU0HhE7+ZyY8D1IMY7odu5lRrFXGg71L15KG8QrPmum45RTtdA==}
dependencies:
core-util-is: 1.0.3
inherits: 2.0.4
isarray: 1.0.0
process-nextick-args: 2.0.1
safe-buffer: 5.1.2
string_decoder: 1.1.1
util-deprecate: 1.0.2
/readdirp@3.6.0:
resolution: {integrity: sha512-hOS089on8RduqdbhvQ5Z37A0ESjsqz6qnRcffsMU3495FuTdqSm+7bhJ29JvIOsBDEEnan5DPu9t3To9VRlMzA==}
engines: {node: '>=8.10.0'}
@ -5347,6 +5428,9 @@ packages:
resolution: {integrity: sha512-cLgakCUf6PedEu15t8kbsjnwIFFR2D4RfL+W3iWFJ4iac7z4B0ZI8fxy4R3J956kAI68HclCFGL8MPoUVC3qVA==}
dev: false
/safe-buffer@5.1.2:
resolution: {integrity: sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==}
/safe-buffer@5.2.1:
resolution: {integrity: sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==}
@ -5460,6 +5544,9 @@ packages:
gopd: 1.0.1
has-property-descriptors: 1.0.2
/setimmediate@1.0.5:
resolution: {integrity: sha512-MATJdZp8sLqDl/68LfQmbP8zKPLQNV6BIZoIgrscFDQ+RsvK/BxeDQOgyxKKoh0y/8h3BqVFnCqQ/gd+reiIXA==}
/setprototypeof@1.2.0:
resolution: {integrity: sha512-E5LDX7Wrp85Kil5bhZv46j8jOeboKq5JMmYM3gVGdGH8xFpPWXUMsNrlODCrkoxMEeNi/XZIwuRvY4XNwYMJpw==}
@ -5562,7 +5649,6 @@ packages:
/sprintf-js@1.0.3:
resolution: {integrity: sha512-D9cPgkvLlV3t3IzL0D0YLvGA9Ahk4PcvVwUbN0dSGr1aP0Nrt4AEnTUbuGvquEC0mA64Gqt1fzirlRs5ibXx8g==}
dev: true
/sprintf-js@1.1.3:
resolution: {integrity: sha512-Oo+0REFV59/rz3gfJNKQiBlwfHaSESl1pcGyABQsnnIfWOFt6JNj5gCog2U6MLZ//IGYD+nA8nI+mTShREReaA==}
@ -5631,6 +5717,11 @@ packages:
strip-ansi: 7.1.0
dev: false
/string_decoder@1.1.1:
resolution: {integrity: sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==}
dependencies:
safe-buffer: 5.1.2
/strip-ansi@6.0.1:
resolution: {integrity: sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==}
engines: {node: '>=8'}
@ -5975,7 +6066,6 @@ packages:
/underscore@1.13.6:
resolution: {integrity: sha512-+A5Sja4HP1M08MaXya7p5LvjuM7K6q/2EaC0+iovj/wOcMsTzMvDFbasi/oSapiwOlt252IqsKqPjCl7huKS0A==}
dev: false
/undici-types@5.26.5:
resolution: {integrity: sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==}
@ -6022,6 +6112,9 @@ packages:
resolution: {integrity: sha512-H/A06tKD7sS1O1X2SshBVeA5FLycRpjqiBeqGKmBwBDBy28EnRjORxTNe269KSSr5un5qyWi1iL61wLxpd+ZOg==}
dev: false
/util-deprecate@1.0.2:
resolution: {integrity: sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==}
/utils-merge@1.0.1:
resolution: {integrity: sha512-pMZTvIkT1d+TFGvDOqodOclx0QWkkgi6Tdoa8gC8ffGAAqz9pzPTZWAybbsHHoED/ztMtkv/VoYTYyShUn81hA==}
engines: {node: '>= 0.4.0'}
@ -6182,6 +6275,10 @@ packages:
xmlbuilder: 11.0.1
dev: false
/xmlbuilder@10.1.1:
resolution: {integrity: sha512-OyzrcFLL/nb6fMGHbiRDuPup9ljBycsdCypwuyg5AAHvyWzGfChJpCXMG88AGTIMFhGZ9RccFN1e6lhg3hkwKg==}
engines: {node: '>=4.0'}
/xmlbuilder@11.0.1:
resolution: {integrity: sha512-fDlsI/kFEx7gLvbecc0/ohLG50fugQp8ryHzMTuW9vSa1GJ0XYWKnhsUx7oie3G98+r56aTQIUB4kht42R3JvA==}
engines: {node: '>=4.0'}

View File

@ -321,7 +321,7 @@ export class WebCrawler {
".mp4",
".mp3",
".pptx",
".docx",
// ".docx",
".xlsx",
".xml",
];

View File

@ -17,6 +17,7 @@ import {
} from "./utils/replacePaths";
import { generateCompletions } from "../../lib/LLM-extraction";
import { getWebScraperQueue } from "../../../src/services/queue-service";
import { fetchAndProcessDocx } from "./utils/docxProcessor";
export class WebScraperDataProvider {
private bullJobId: string;
@ -157,7 +158,7 @@ export class WebScraperDataProvider {
private async handleCrawlMode(
inProgress?: (progress: Progress) => void
): Promise<Document[]> {
const crawler = new WebCrawler({
initialUrl: this.urls[0],
includes: this.includes,
@ -237,9 +238,13 @@ export class WebScraperDataProvider {
inProgress?: (progress: Progress) => void,
allHtmls?: string[]
): Promise<Document[]> {
let pdfLinks = links.filter((link) => link.endsWith(".pdf"));
let pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
links = links.filter((link) => !link.endsWith(".pdf"));
const pdfLinks = links.filter(link => link.endsWith(".pdf"));
const docLinks = links.filter(link => link.endsWith(".doc") || link.endsWith(".docx"));
const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
const docxDocuments = await this.fetchDocxDocuments(docLinks);
links = links.filter(link => !pdfLinks.includes(link) && !docLinks.includes(link));
let documents = await this.convertUrlsToDocuments(
links,
@ -257,7 +262,7 @@ export class WebScraperDataProvider {
) {
documents = await generateCompletions(documents, this.extractorOptions);
}
return documents.concat(pdfDocuments);
return documents.concat(pdfDocuments).concat(docxDocuments);
}
private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
@ -272,6 +277,18 @@ export class WebScraperDataProvider {
})
);
}
private async fetchDocxDocuments(docxLinks: string[]): Promise<Document[]> {
return Promise.all(
docxLinks.map(async (p) => {
const docXDocument = await fetchAndProcessDocx(p);
return {
content: docXDocument,
metadata: { sourceURL: p },
provider: "web-scraper",
};
})
);
}
private applyPathReplacements(documents: Document[]): Document[] {
return this.replaceAllPathsWithAbsolutePaths

View File

@ -0,0 +1,13 @@
import * as docxProcessor from "../docxProcessor";
describe("DOCX Processing Module - Integration Test", () => {
it("should correctly process a simple DOCX file without the LLAMAPARSE_API_KEY", async () => {
delete process.env.LLAMAPARSE_API_KEY;
const docxContent = await docxProcessor.fetchAndProcessDocx(
"https://nvca.org/wp-content/uploads/2019/06/NVCA-Model-Document-Stock-Purchase-Agreement.docx"
);
expect(docxContent.trim()).toContain(
"SERIES A PREFERRED STOCK PURCHASE AGREEMENT"
);
});
});

View File

@ -0,0 +1,41 @@
import axios from "axios";
import fs from "fs";
import { createWriteStream } from "node:fs";
import path from "path";
import os from "os";
import mammoth from "mammoth";
export async function fetchAndProcessDocx(url: string): Promise<string> {
const tempFilePath = await downloadDocx(url);
const content = await processDocxToText(tempFilePath);
fs.unlinkSync(tempFilePath); // Clean up the temporary file
return content;
}
async function downloadDocx(url: string): Promise<string> {
const response = await axios({
url,
method: "GET",
responseType: "stream",
});
const tempFilePath = path.join(os.tmpdir(), `tempDocx-${Date.now()}.docx`);
const writer = createWriteStream(tempFilePath);
response.data.pipe(writer);
return new Promise((resolve, reject) => {
writer.on("finish", () => resolve(tempFilePath));
writer.on("error", reject);
});
}
export async function processDocxToText(filePath: string): Promise<string> {
const content = await extractTextFromDocx(filePath);
return content;
}
async function extractTextFromDocx(filePath: string): Promise<string> {
const result = await mammoth.extractRawText({ path: filePath });
return result.value;
}