From d43de493a26242098a6d6c0e70fb99c42e6c76e8 Mon Sep 17 00:00:00 2001
From: Rene Teigen
Date: Fri, 24 Feb 2023 19:01:12 +0000
Subject: [PATCH] Add index_to_doc function

---
 models/index_model.py | 31 +++++++++++++++++++++----------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/models/index_model.py b/models/index_model.py
index 870c470..a7eec2b 100644
--- a/models/index_model.py
+++ b/models/index_model.py
@@ -18,6 +18,7 @@
 from langchain import OpenAI
 from gpt_index.readers import YoutubeTranscriptReader
 from gpt_index.readers.schema.base import Document
+from gpt_index.langchain_helpers.text_splitter import TokenTextSplitter
 
 from gpt_index import (
     GPTSimpleVectorIndex,
@@ -445,6 +446,24 @@ class Index_handler:
             traceback.print_exc()
             await ctx.respond(e)
 
+
+    async def index_to_docs(self, old_index) -> List[Document]:
+        documents = []
+        for doc_id in old_index.docstore.docs.keys():
+            extra_info = ""
+            text = ""
+            nodes = old_index.docstore.get_document(doc_id).get_nodes(old_index.docstore.docs[doc_id].id_map)
+            for node in nodes:
+                extra_info = node.extra_info
+                text += f"{node.text} "
+            text_splitter = TokenTextSplitter(separator=" ", chunk_size=2048, chunk_overlap=20)
+            text_chunks = text_splitter.split_text(text)
+            for text in text_chunks:
+                document = Document(text, extra_info=extra_info)
+                documents.append(document)
+        return documents
+
+
     async def compose_indexes(self, user_id, indexes, name, deep_compose):
         # Load all the indexes first
         index_objects = []
@@ -459,11 +478,7 @@ class Index_handler:
         if deep_compose:
             documents = []
             for _index in index_objects:
-                [
-                    documents.append(_index.docstore.get_document(doc_id))
-                    for doc_id in [docmeta for docmeta in _index.docstore.docs.keys()]
-                    if isinstance(_index.docstore.get_document(doc_id), Document)
-                ]
+                documents.extend(await self.index_to_docs(_index))
             llm_predictor = LLMPredictor(
                 llm=OpenAI(model_name="text-davinci-003", max_tokens=-1)
             )
@@ -497,11 +512,7 @@ class Index_handler:
         else:
             documents = []
             for _index in index_objects:
-                [
-                    documents.append(_index.docstore.get_document(doc_id))
-                    for doc_id in [docmeta for docmeta in _index.docstore.docs.keys()]
-                    if isinstance(_index.docstore.get_document(doc_id), Document)
-                ]
+                documents.extend(await self.index_to_docs(_index))
             embedding_model = OpenAIEmbedding()
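
Note (not part of the patch): the new index_to_docs helper flattens each stored
document's nodes back into one text blob, re-splits that blob with
TokenTextSplitter, and wraps each chunk in a Document that carries the node's
extra_info. The sketch below is a hypothetical standalone illustration of that
re-chunking step only, using just the gpt_index imports and calls that appear
in the diff above; the helper name rechunk and its parameters are made up for
illustration and are not part of this change.

    from gpt_index.langchain_helpers.text_splitter import TokenTextSplitter
    from gpt_index.readers.schema.base import Document

    def rechunk(text, extra_info=None, chunk_size=2048, chunk_overlap=20):
        # Mirror the splitting step of index_to_docs: break one long text blob
        # on spaces into token-sized chunks, then wrap each chunk in a Document
        # that keeps the original extra_info.
        splitter = TokenTextSplitter(
            separator=" ", chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
        return [
            Document(chunk, extra_info=extra_info)
            for chunk in splitter.split_text(text)
        ]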