From 4f5cacca468727b66ebdd31c0d2870a25377619a Mon Sep 17 00:00:00 2001 From: Hikari Haru Date: Fri, 17 Mar 2023 13:10:07 +0100 Subject: [PATCH] Fix office file indexing (#230) --- models/index_model.py | 7 ++++++- pyproject.toml | 3 ++- requirements.txt | 1 + requirements_base.txt | 3 ++- 4 files changed, 11 insertions(+), 3 deletions(-) diff --git a/models/index_model.py b/models/index_model.py index 855481b..e30e032 100644 --- a/models/index_model.py +++ b/models/index_model.py @@ -381,7 +381,12 @@ class Index_handler: "application/json": ".json", "image/png": ".png", "image/": ".jpg", - "vnd.": ".pptx", + "ms-powerpoint": ".ppt", + "presentationml.presentation": ".pptx", + "ms-excel": ".xls", + "spreadsheetml.sheet": ".xlsx", + "msword": ".doc", + "wordprocessingml.document": ".docx", "audio/": ".mp3", "video/": ".mp4", "epub": ".epub", diff --git a/pyproject.toml b/pyproject.toml index 515307b..b19344a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,7 +40,8 @@ dependencies = [ "python-pptx==0.6.21", "langchain==0.0.105", "unidecode==1.3.6", -"tqdm==4.64.1" +"tqdm==4.64.1", +"docx2txt==0.8" ] dynamic = ["version"] diff --git a/requirements.txt b/requirements.txt index 08a6410..db526f0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,3 +23,4 @@ langchain==0.0.105 openai-whisper unidecode==1.3.6 tqdm==4.64.1 +docx2txt==0.8 diff --git a/requirements_base.txt b/requirements_base.txt index 1e6d9b6..d73968a 100644 --- a/requirements_base.txt +++ b/requirements_base.txt @@ -20,4 +20,5 @@ protobuf==3.20.2 python-pptx==0.6.21 langchain==0.0.105 unidecode==1.3.6 -tqdm==4.64.1 \ No newline at end of file +tqdm==4.64.1 +docx2txt==0.8 \ No newline at end of file