From 9d635cb2a3d21041da1cc624251601422b3ff75b Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 16 May 2024 11:48:02 -0700 Subject: [PATCH] Nick: docx support --- apps/api/package.json | 1 + apps/api/pnpm-lock.yaml | 111 ++++++++++++++++-- apps/api/src/scraper/WebScraper/crawler.ts | 2 +- apps/api/src/scraper/WebScraper/index.ts | 27 ++++- .../utils/__tests__/docxProcessor.test.ts | 13 ++ .../scraper/WebScraper/utils/docxProcessor.ts | 41 +++++++ 6 files changed, 182 insertions(+), 13 deletions(-) create mode 100644 apps/api/src/scraper/WebScraper/utils/__tests__/docxProcessor.test.ts create mode 100644 apps/api/src/scraper/WebScraper/utils/docxProcessor.ts diff --git a/apps/api/package.json b/apps/api/package.json index a79e3dc..ad99c5e 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -33,6 +33,7 @@ "express": "^4.18.2", "jest": "^29.6.3", "jest-fetch-mock": "^3.0.3", + "mammoth": "^1.7.2", "nodemon": "^2.0.20", "supabase": "^1.77.9", "supertest": "^6.3.3", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index 7873375..16b2f6c 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -97,7 +97,7 @@ dependencies: version: 0.0.25 langchain: specifier: ^0.1.25 - version: 0.1.25(@supabase/supabase-js@2.39.7)(axios@1.6.7)(cheerio@1.0.0-rc.12)(ioredis@5.3.2)(pdf-parse@1.1.1)(puppeteer@22.6.3)(redis@4.6.13)(typesense@1.7.2) + version: 0.1.25(@supabase/supabase-js@2.39.7)(axios@1.6.7)(cheerio@1.0.0-rc.12)(ioredis@5.3.2)(mammoth@1.7.2)(pdf-parse@1.1.1)(puppeteer@22.6.3)(redis@4.6.13)(typesense@1.7.2) languagedetect: specifier: ^2.0.0 version: 2.0.0 @@ -214,6 +214,9 @@ devDependencies: jest-fetch-mock: specifier: ^3.0.3 version: 3.0.3 + mammoth: + specifier: ^1.7.2 + version: 1.7.2 nodemon: specifier: ^2.0.20 version: 2.0.22 @@ -1765,6 +1768,10 @@ packages: dev: false optional: true + /@xmldom/xmldom@0.8.10: + resolution: {integrity: sha512-2WALfTl4xo2SkGCYRt6rDTFfk9R1czmBvUQy12gK2KuRKIpWEhcbbzy8EZXtz/jkRqHX8bFEc6FC1HjX4TUWYw==} + engines: {node: '>=10.0.0'} + /abbrev@1.1.1: resolution: {integrity: sha512-nne9/IiQ/hzIhY6pdDnbBtz7DjPTKrY00P/zvPSm5pOFkl6xuGrGnXn/VtTNNfNtAfZ9/1RtehkszU9qcTii0Q==} dev: true @@ -1895,7 +1902,6 @@ packages: resolution: {integrity: sha512-o5Roy6tNG4SL/FOkCAN6RzjiakZS25RLYFrcMttJqbdd8BWrnA+fGz57iN5Pb06pvBGvl5gQ0B48dJlslXvoTg==} dependencies: sprintf-js: 1.0.3 - dev: true /argparse@2.0.1: resolution: {integrity: sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==} @@ -2071,7 +2077,6 @@ packages: /base64-js@1.5.1: resolution: {integrity: sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==} - dev: false /basic-ftp@5.0.5: resolution: {integrity: sha512-4Bcg1P8xhUuqcii/S0Z9wiHIrQVPMermM1any+MX5GeGD7faD3/msQUDGLol9wOcz4/jbg/WJnGqoJF6LiBdtg==} @@ -2096,6 +2101,9 @@ packages: resolution: {integrity: sha512-nbE1WxOTTrUWIfsfZ4aHGYu5DOuNkbxGokjV6Z2kxfJK3uaAb8zNK1muzOeipoLHZjInT4Br88BHpzevc681xA==} dev: false + /bluebird@3.4.7: + resolution: {integrity: sha512-iD3898SR7sWVRHbiQv+sHUtHnMvC1o3nW5rAcqnq3uOn07DSAppZYUkIGslDz6gXC7HfunPe7YVBgoEJASPcHA==} + /body-parser@1.20.2: resolution: {integrity: sha512-ml9pReCu3M61kGlqoTm2umSXTlRTuGTx0bfYj+uIUKKYycG5NtSbeetV3faSU6R7ajOPw0g/J1PvK4qNy7s5bA==} engines: {node: '>= 0.8', npm: 1.2.8000 || >= 1.4.16} @@ -2421,6 +2429,9 @@ packages: resolution: {integrity: sha512-LDx6oHrK+PhzLKJU9j5S7/Y3jM/mUHvD/DeI1WQmJn652iPC5Y4TBzC9l+5OMOXlyTTA+SmVUPm0HQUwpD5Jqw==} dev: true + /core-util-is@1.0.3: + resolution: {integrity: sha512-ZQBvi1DcpJ4GDqanjucZ2Hj3wEO5pZDS89BWbkcrvdxksJorwUDDZamX9ldFkp9aw2lmBDLgkObEA4DWNJ9FYQ==} + /cors@2.8.5: resolution: {integrity: sha512-KIHbLJqu73RGr/hnbrO9uBeixNGuvSQjul/jdFvS/KFSIH1hWVd1ng7zOHx+YrEfInLG7q4n6GHQ9cDtxv/P6g==} engines: {node: '>= 0.10'} @@ -2659,6 +2670,9 @@ packages: md5: 2.3.0 dev: false + /dingbat-to-unicode@1.0.1: + resolution: {integrity: sha512-98l0sW87ZT58pU4i61wa2OHwxbiYSbuxsCBozaVnYX2iCnr3bLM3fIes1/ej7h1YdOKuKt/MLs706TVnALA65w==} + /dom-serializer@2.0.0: resolution: {integrity: sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==} dependencies: @@ -2695,6 +2709,11 @@ packages: engines: {node: '>=12'} dev: false + /duck@0.1.12: + resolution: {integrity: sha512-wkctla1O6VfP89gQ+J/yDesM0S7B7XLXjKGzXxMDVFg7uEn706niAtyYovKbyq1oT9YwDcly721/iUWoc8MVRg==} + dependencies: + underscore: 1.13.6 + /eastasianwidth@0.2.0: resolution: {integrity: sha512-I88TYZWc9XiYHRQ4/3c5rjjfgkjhLyW2luGIheGERbNQ6OY7yTybanSpDXZa8y7VUP9YmDcYa+eyq4ca7iLqWA==} dev: false @@ -3332,6 +3351,9 @@ packages: resolution: {integrity: sha512-Ius2VYcGNk7T90CppJqcIkS5ooHUZyIQK+ClZfMfMNFEF9VSE73Fq+906u/CWu92x4gzZMWOwfFYckPObzdEbA==} dev: true + /immediate@3.0.6: + resolution: {integrity: sha512-XXOFtyqDjNDAQxVfYxuF7g9Il/IbWmmlQg2MYKOH8ExIT1qg6xc4zyS3HaEEATgs1btfzxq15ciUiY7gjSXRGQ==} + /import-fresh@3.3.0: resolution: {integrity: sha512-veYYhQa+D1QBKznvhUHxb8faxlrwUnxseDAbAp457E0wLNio2bOSKnjYDhMj+YiAq61xrMGhQk9iXVk5FzgQMw==} engines: {node: '>=6'} @@ -3462,6 +3484,9 @@ packages: engines: {node: '>=8'} dev: true + /isarray@1.0.0: + resolution: {integrity: sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ==} + /isexe@2.0.0: resolution: {integrity: sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==} @@ -4049,6 +4074,14 @@ packages: engines: {node: '>=0.10.0'} dev: false + /jszip@3.10.1: + resolution: {integrity: sha512-xXDvecyTpGLrqFrvkrUSoxxfJI5AH7U8zxxtVclpsUtMCq4JQ290LY8AW5c7Ggnr/Y/oK+bQMbqK2qmtk3pN4g==} + dependencies: + lie: 3.3.0 + pako: 1.0.11 + readable-stream: 2.3.8 + setimmediate: 1.0.5 + /kareem@2.5.1: resolution: {integrity: sha512-7jFxRVm+jD+rkq3kY0iZDJfsO2/t4BBPeEb2qKn2lR/9KhuksYk5hxzfRYWMPV8P/x2d0kHD306YyWLzjjH+uA==} engines: {node: '>=12.0.0'} @@ -4064,7 +4097,7 @@ packages: engines: {node: '>=6'} dev: true - /langchain@0.1.25(@supabase/supabase-js@2.39.7)(axios@1.6.7)(cheerio@1.0.0-rc.12)(ioredis@5.3.2)(pdf-parse@1.1.1)(puppeteer@22.6.3)(redis@4.6.13)(typesense@1.7.2): + /langchain@0.1.25(@supabase/supabase-js@2.39.7)(axios@1.6.7)(cheerio@1.0.0-rc.12)(ioredis@5.3.2)(mammoth@1.7.2)(pdf-parse@1.1.1)(puppeteer@22.6.3)(redis@4.6.13)(typesense@1.7.2): resolution: {integrity: sha512-sfEChvr4H2CklHdSByNBbytwBrFhgtA5kPOnwcBrxuXGg1iOaTzhVxQA0QcNcQucI3hZrsNbZjxGp+Can1ooZQ==} engines: {node: '>=18'} peerDependencies: @@ -4238,6 +4271,7 @@ packages: jsonpointer: 5.0.1 langchainhub: 0.0.8 langsmith: 0.1.13 + mammoth: 1.7.2 ml-distance: 4.0.1 openapi-types: 12.1.3 p-retry: 4.6.2 @@ -4344,6 +4378,11 @@ packages: type-check: 0.3.2 dev: false + /lie@3.3.0: + resolution: {integrity: sha512-UaiMJzeWRlEujzAuw5LokY1L5ecNQYZKfmyZ9L7wDHb/p5etKaxXhohBcrw0EYby+G/NA52vRSN4N39dxHAIwQ==} + dependencies: + immediate: 3.0.6 + /lines-and-columns@1.2.4: resolution: {integrity: sha512-7ylylesZQ/PV29jhEDl3Ufjo6ZX7gCqJr5F7PKrqc93v7fzSymt1BpwEU8nAUXs8qzzvqhbjhK5QZg6Mt/HkBg==} @@ -4380,6 +4419,13 @@ packages: - encoding dev: false + /lop@0.4.1: + resolution: {integrity: sha512-9xyho9why2A2tzm5aIcMWKvzqKsnxrf9B5I+8O30olh6lQU8PH978LqZoI4++37RBgS1Em5i54v1TFs/3wnmXQ==} + dependencies: + duck: 0.1.12 + option: 0.2.4 + underscore: 1.13.6 + /lru-cache@10.2.0: resolution: {integrity: sha512-2bIM8x+VAf6JT4bKAljS1qUWgMsqZRPGJS6FSahIMPVvctcNhyVp7AJu7quxOW9jwkryBReKZY5tY5JYv2n/7Q==} engines: {node: 14 || >=16.14} @@ -4423,6 +4469,22 @@ packages: tmpl: 1.0.5 dev: true + /mammoth@1.7.2: + resolution: {integrity: sha512-MqWU2hcLf1I5QMKyAbfJCvrLxnv5WztrAQyorfZ+WPq7Hk82vZFmvfR2/64ajIPpM4jlq0TXp1xZvp/FFaL1Ug==} + engines: {node: '>=12.0.0'} + hasBin: true + dependencies: + '@xmldom/xmldom': 0.8.10 + argparse: 1.0.10 + base64-js: 1.5.1 + bluebird: 3.4.7 + dingbat-to-unicode: 1.0.1 + jszip: 3.10.1 + lop: 0.4.1 + path-is-absolute: 1.0.1 + underscore: 1.13.6 + xmlbuilder: 10.1.1 + /md5@2.3.0: resolution: {integrity: sha512-T1GITYmFaKuO91vxyoQMFETst+O71VUPEU3ze5GNzDm0OWdP8v1ziTaAEPUr/3kLsY3Sftgz242A1SetQiDL7g==} dependencies: @@ -4867,6 +4929,9 @@ packages: resolution: {integrity: sha512-N4YtSYJqghVu4iek2ZUvcN/0aqH1kRDuNqzcycDxhOUpg7GdvLa2F3DgS6yBNhInhv2r/6I0Flkn7CqL8+nIcw==} dev: false + /option@0.2.4: + resolution: {integrity: sha512-pkEqbDyl8ou5cpq+VsnQbe/WlEy5qS7xPzMS1U55OCG9KPvwFD46zDbxQIj3egJSFc3D+XhYOPUzz49zQAVy7A==} + /optionator@0.8.3: resolution: {integrity: sha512-+IW9pACdk3XWmmTXG8m3upGUJst5XRGzxMRjXzAuJ1XnIFNvfhjjIuYkDvysnPQ7qzqVzLt78BCruntqRhWQbA==} engines: {node: '>= 0.8.0'} @@ -4957,6 +5022,9 @@ packages: netmask: 2.0.2 dev: false + /pako@1.0.11: + resolution: {integrity: sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw==} + /parent-module@1.0.1: resolution: {integrity: sha512-GQ2EWRpQV8/o+Aw8YqtfZZPfNRWZYkbidE9k5rpl/hC3vtHHBfGm2Ifi6qWV+coDGkrUKZAxE3Lot5kcsRlh+g==} engines: {node: '>=6'} @@ -5002,7 +5070,6 @@ packages: /path-is-absolute@1.0.1: resolution: {integrity: sha512-AVbw3UJ2e9bq64vSaS9Am0fje1Pa8pbGqTTsmXfaIiMpnr5DlDhfJOuLj9Sf95ZPVDAUerDfEk88MPmPe7UCQg==} engines: {node: '>=0.10.0'} - dev: true /path-key@3.1.1: resolution: {integrity: sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q==} @@ -5095,6 +5162,9 @@ packages: react-is: 18.2.0 dev: true + /process-nextick-args@2.0.1: + resolution: {integrity: sha512-3ouUOpQhtgrbOa17J7+uxOTpITYWaGP7/AhoR3+A+/1e9skrzelGi/dXzEYyvbxubEF6Wn2ypscTKiKJFFn1ag==} + /progress@2.0.3: resolution: {integrity: sha512-7PiHtLll5LdnKIMw100I+8xJXR5gW2QwWYkT6iJva0bXitZKa/XMrSbdmg3r2Xnaidz9Qumd0VPaMrZlF9V9sA==} engines: {node: '>=0.4.0'} @@ -5251,6 +5321,17 @@ packages: engines: {node: ^14.17.0 || ^16.13.0 || >=18.0.0} dev: true + /readable-stream@2.3.8: + resolution: {integrity: sha512-8p0AUk4XODgIewSi0l8Epjs+EVnWiK7NoDIEGU0HhE7+ZyY8D1IMY7odu5lRrFXGg71L15KG8QrPmum45RTtdA==} + dependencies: + core-util-is: 1.0.3 + inherits: 2.0.4 + isarray: 1.0.0 + process-nextick-args: 2.0.1 + safe-buffer: 5.1.2 + string_decoder: 1.1.1 + util-deprecate: 1.0.2 + /readdirp@3.6.0: resolution: {integrity: sha512-hOS089on8RduqdbhvQ5Z37A0ESjsqz6qnRcffsMU3495FuTdqSm+7bhJ29JvIOsBDEEnan5DPu9t3To9VRlMzA==} engines: {node: '>=8.10.0'} @@ -5347,6 +5428,9 @@ packages: resolution: {integrity: sha512-cLgakCUf6PedEu15t8kbsjnwIFFR2D4RfL+W3iWFJ4iac7z4B0ZI8fxy4R3J956kAI68HclCFGL8MPoUVC3qVA==} dev: false + /safe-buffer@5.1.2: + resolution: {integrity: sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==} + /safe-buffer@5.2.1: resolution: {integrity: sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==} @@ -5460,6 +5544,9 @@ packages: gopd: 1.0.1 has-property-descriptors: 1.0.2 + /setimmediate@1.0.5: + resolution: {integrity: sha512-MATJdZp8sLqDl/68LfQmbP8zKPLQNV6BIZoIgrscFDQ+RsvK/BxeDQOgyxKKoh0y/8h3BqVFnCqQ/gd+reiIXA==} + /setprototypeof@1.2.0: resolution: {integrity: sha512-E5LDX7Wrp85Kil5bhZv46j8jOeboKq5JMmYM3gVGdGH8xFpPWXUMsNrlODCrkoxMEeNi/XZIwuRvY4XNwYMJpw==} @@ -5562,7 +5649,6 @@ packages: /sprintf-js@1.0.3: resolution: {integrity: sha512-D9cPgkvLlV3t3IzL0D0YLvGA9Ahk4PcvVwUbN0dSGr1aP0Nrt4AEnTUbuGvquEC0mA64Gqt1fzirlRs5ibXx8g==} - dev: true /sprintf-js@1.1.3: resolution: {integrity: sha512-Oo+0REFV59/rz3gfJNKQiBlwfHaSESl1pcGyABQsnnIfWOFt6JNj5gCog2U6MLZ//IGYD+nA8nI+mTShREReaA==} @@ -5631,6 +5717,11 @@ packages: strip-ansi: 7.1.0 dev: false + /string_decoder@1.1.1: + resolution: {integrity: sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==} + dependencies: + safe-buffer: 5.1.2 + /strip-ansi@6.0.1: resolution: {integrity: sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==} engines: {node: '>=8'} @@ -5975,7 +6066,6 @@ packages: /underscore@1.13.6: resolution: {integrity: sha512-+A5Sja4HP1M08MaXya7p5LvjuM7K6q/2EaC0+iovj/wOcMsTzMvDFbasi/oSapiwOlt252IqsKqPjCl7huKS0A==} - dev: false /undici-types@5.26.5: resolution: {integrity: sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==} @@ -6022,6 +6112,9 @@ packages: resolution: {integrity: sha512-H/A06tKD7sS1O1X2SshBVeA5FLycRpjqiBeqGKmBwBDBy28EnRjORxTNe269KSSr5un5qyWi1iL61wLxpd+ZOg==} dev: false + /util-deprecate@1.0.2: + resolution: {integrity: sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==} + /utils-merge@1.0.1: resolution: {integrity: sha512-pMZTvIkT1d+TFGvDOqodOclx0QWkkgi6Tdoa8gC8ffGAAqz9pzPTZWAybbsHHoED/ztMtkv/VoYTYyShUn81hA==} engines: {node: '>= 0.4.0'} @@ -6182,6 +6275,10 @@ packages: xmlbuilder: 11.0.1 dev: false + /xmlbuilder@10.1.1: + resolution: {integrity: sha512-OyzrcFLL/nb6fMGHbiRDuPup9ljBycsdCypwuyg5AAHvyWzGfChJpCXMG88AGTIMFhGZ9RccFN1e6lhg3hkwKg==} + engines: {node: '>=4.0'} + /xmlbuilder@11.0.1: resolution: {integrity: sha512-fDlsI/kFEx7gLvbecc0/ohLG50fugQp8ryHzMTuW9vSa1GJ0XYWKnhsUx7oie3G98+r56aTQIUB4kht42R3JvA==} engines: {node: '>=4.0'} diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 9e080d7..f53ef22 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -321,7 +321,7 @@ export class WebCrawler { ".mp4", ".mp3", ".pptx", - ".docx", + // ".docx", ".xlsx", ".xml", ]; diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index a0f719a..d244993 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -17,6 +17,7 @@ import { } from "./utils/replacePaths"; import { generateCompletions } from "../../lib/LLM-extraction"; import { getWebScraperQueue } from "../../../src/services/queue-service"; +import { fetchAndProcessDocx } from "./utils/docxProcessor"; export class WebScraperDataProvider { private bullJobId: string; @@ -157,7 +158,7 @@ export class WebScraperDataProvider { private async handleCrawlMode( inProgress?: (progress: Progress) => void ): Promise { - + const crawler = new WebCrawler({ initialUrl: this.urls[0], includes: this.includes, @@ -237,9 +238,13 @@ export class WebScraperDataProvider { inProgress?: (progress: Progress) => void, allHtmls?: string[] ): Promise { - let pdfLinks = links.filter((link) => link.endsWith(".pdf")); - let pdfDocuments = await this.fetchPdfDocuments(pdfLinks); - links = links.filter((link) => !link.endsWith(".pdf")); + const pdfLinks = links.filter(link => link.endsWith(".pdf")); + const docLinks = links.filter(link => link.endsWith(".doc") || link.endsWith(".docx")); + + const pdfDocuments = await this.fetchPdfDocuments(pdfLinks); + const docxDocuments = await this.fetchDocxDocuments(docLinks); + + links = links.filter(link => !pdfLinks.includes(link) && !docLinks.includes(link)); let documents = await this.convertUrlsToDocuments( links, @@ -257,7 +262,7 @@ export class WebScraperDataProvider { ) { documents = await generateCompletions(documents, this.extractorOptions); } - return documents.concat(pdfDocuments); + return documents.concat(pdfDocuments).concat(docxDocuments); } private async fetchPdfDocuments(pdfLinks: string[]): Promise { @@ -272,6 +277,18 @@ export class WebScraperDataProvider { }) ); } + private async fetchDocxDocuments(docxLinks: string[]): Promise { + return Promise.all( + docxLinks.map(async (p) => { + const docXDocument = await fetchAndProcessDocx(p); + return { + content: docXDocument, + metadata: { sourceURL: p }, + provider: "web-scraper", + }; + }) + ); + } private applyPathReplacements(documents: Document[]): Document[] { return this.replaceAllPathsWithAbsolutePaths diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/docxProcessor.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/docxProcessor.test.ts new file mode 100644 index 0000000..e018ffa --- /dev/null +++ b/apps/api/src/scraper/WebScraper/utils/__tests__/docxProcessor.test.ts @@ -0,0 +1,13 @@ +import * as docxProcessor from "../docxProcessor"; + +describe("DOCX Processing Module - Integration Test", () => { + it("should correctly process a simple DOCX file without the LLAMAPARSE_API_KEY", async () => { + delete process.env.LLAMAPARSE_API_KEY; + const docxContent = await docxProcessor.fetchAndProcessDocx( + "https://nvca.org/wp-content/uploads/2019/06/NVCA-Model-Document-Stock-Purchase-Agreement.docx" + ); + expect(docxContent.trim()).toContain( + "SERIES A PREFERRED STOCK PURCHASE AGREEMENT" + ); + }); +}); diff --git a/apps/api/src/scraper/WebScraper/utils/docxProcessor.ts b/apps/api/src/scraper/WebScraper/utils/docxProcessor.ts new file mode 100644 index 0000000..38759f8 --- /dev/null +++ b/apps/api/src/scraper/WebScraper/utils/docxProcessor.ts @@ -0,0 +1,41 @@ +import axios from "axios"; +import fs from "fs"; +import { createWriteStream } from "node:fs"; +import path from "path"; +import os from "os"; +import mammoth from "mammoth"; + +export async function fetchAndProcessDocx(url: string): Promise { + const tempFilePath = await downloadDocx(url); + const content = await processDocxToText(tempFilePath); + fs.unlinkSync(tempFilePath); // Clean up the temporary file + return content; +} + +async function downloadDocx(url: string): Promise { + const response = await axios({ + url, + method: "GET", + responseType: "stream", + }); + + const tempFilePath = path.join(os.tmpdir(), `tempDocx-${Date.now()}.docx`); + const writer = createWriteStream(tempFilePath); + + response.data.pipe(writer); + + return new Promise((resolve, reject) => { + writer.on("finish", () => resolve(tempFilePath)); + writer.on("error", reject); + }); +} + +export async function processDocxToText(filePath: string): Promise { + const content = await extractTextFromDocx(filePath); + return content; +} + +async function extractTextFromDocx(filePath: string): Promise { + const result = await mammoth.extractRawText({ path: filePath }); + return result.value; +}