async gpt-index changes

Kaveen Kumarasinghe 2 years ago
parent b06604a1b3
commit e5648eb469

@ -13,3 +13,5 @@ Index Compositions:
Indexes can be combined with other indexes through a composition. To combine indexes, you can run the `/index compose` command and select the indexes that you want to combine together. You should only combine relevant indexes together; combining irrelevant indexes will result in poor results (for example, don't upload a math textbook and then upload a large set of poems and combine them together). When creating a composition, you will be given the option to do a "Deep" composition. Deep compositions are more detailed and will give you better results, but they are incredibly costly and will sometimes take multiple minutes to compose.
You can also compose a singular index with itself with "Deep Compose". This will give you a more detailed version of the index, but it will be costly and will sometimes take multiple minutes to compose. **Deep compositions are useless for very short documents!**
**When doing Deep Compositions, it's highly recommended to keep the document size small, or to only do deep compositions on single documents.** This is because a deep composition reorganizes the simple index into a tree structure and uses GPT-3 to summarize different nodes of the tree, which will lead to high costs. For example, a deep composition of a 300-page lab manual and the contents of my personal website at https://kaveenk.com cost me roughly $2 USD.

@ -1,3 +1,4 @@
import functools
import os import os
import random import random
import tempfile import tempfile
@ -55,9 +56,10 @@ def get_and_query(
if isinstance(index, GPTTreeIndex): if isinstance(index, GPTTreeIndex):
response = index.query( response = index.query(
query, query,
child_branch_factor=2, child_branch_factor=1,
llm_predictor=llm_predictor, llm_predictor=llm_predictor,
embed_model=embed_model, embed_model=embed_model,
use_async=True,
) )
else: else:
response = index.query( response = index.query(
@ -66,6 +68,7 @@ def get_and_query(
llm_predictor=llm_predictor, llm_predictor=llm_predictor,
embed_model=embed_model, embed_model=embed_model,
similarity_top_k=nodes, similarity_top_k=nodes,
use_async=True,
) )
return response return response
@ -166,15 +169,14 @@ class Index_handler:
return pages return pages
# TODO We need to do predictions below for token usage.
def index_file(self, file_path, embed_model) -> GPTSimpleVectorIndex:
    """Build a simple vector index from a local file (or directory path).

    :param file_path: path handed to SimpleDirectoryReader to load documents from.
    :param embed_model: embedding model used to embed the loaded documents.
    :return: a GPTSimpleVectorIndex over the loaded documents.
    """
    document = SimpleDirectoryReader(file_path).load_data()
    # use_async=True lets gpt-index batch the embedding calls asynchronously,
    # which speeds up index construction for multi-chunk documents.
    index = GPTSimpleVectorIndex(document, embed_model=embed_model, use_async=True)
    return index
def index_gdoc(self, doc_id, embed_model) -> GPTSimpleVectorIndex:
    """Build a simple vector index from a Google Doc.

    :param doc_id: Google Docs document ID passed to GoogleDocsReader.
    :param embed_model: embedding model used to embed the loaded document.
    :return: a GPTSimpleVectorIndex over the document's contents.
    """
    document = GoogleDocsReader().load_data(doc_id)
    # use_async=True for asynchronous, batched embedding during construction,
    # consistent with the other index_* helpers in this class.
    index = GPTSimpleVectorIndex(document, embed_model=embed_model, use_async=True)
    return index
def index_youtube_transcript(self, link, embed_model): def index_youtube_transcript(self, link, embed_model):
@ -182,6 +184,7 @@ class Index_handler:
index = GPTSimpleVectorIndex( index = GPTSimpleVectorIndex(
documents, documents,
embed_model=embed_model, embed_model=embed_model,
use_async=True,
) )
return index return index
@ -203,6 +206,7 @@ class Index_handler:
index = GPTSimpleVectorIndex( index = GPTSimpleVectorIndex(
documents, documents,
embed_model=embed_model, embed_model=embed_model,
use_async=True,
) )
return index return index
@ -217,6 +221,7 @@ class Index_handler:
index = GPTSimpleVectorIndex( index = GPTSimpleVectorIndex(
document, document,
embed_model=embed_model, embed_model=embed_model,
use_async=True,
) )
return index return index
@ -253,10 +258,16 @@ class Index_handler:
# Detect if the link is a PDF, if it is, we load it differently # Detect if the link is a PDF, if it is, we load it differently
if response.headers["Content-Type"] == "application/pdf": if response.headers["Content-Type"] == "application/pdf":
documents = await self.index_pdf(url) documents = await self.index_pdf(url)
index = GPTSimpleVectorIndex( index = await self.loop.run_in_executor(
documents, None,
embed_model=embed_model, functools.partial(
GPTSimpleVectorIndex,
documents=documents,
embed_model=embed_model,
use_async=True,
),
) )
return index return index
except: except:
raise ValueError("Could not load webpage") raise ValueError("Could not load webpage")
@ -264,7 +275,16 @@ class Index_handler:
documents = BeautifulSoupWebReader( documents = BeautifulSoupWebReader(
website_extractor=DEFAULT_WEBSITE_EXTRACTOR website_extractor=DEFAULT_WEBSITE_EXTRACTOR
).load_data(urls=[url]) ).load_data(urls=[url])
index = GPTSimpleVectorIndex(documents, embed_model=embed_model) # index = GPTSimpleVectorIndex(documents, embed_model=embed_model, use_async=True)
index = await self.loop.run_in_executor(
None,
functools.partial(
GPTSimpleVectorIndex,
documents=documents,
embed_model=embed_model,
use_async=True,
),
)
return index return index
def reset_indexes(self, user_id): def reset_indexes(self, user_id):
@ -446,13 +466,16 @@ class Index_handler:
traceback.print_exc() traceback.print_exc()
await ctx.respond(e) await ctx.respond(e)
async def index_to_docs(
async def index_to_docs(self, old_index, chunk_size:int = 4000, chunk_overlap:int = 200) -> List[Document]: self, old_index, chunk_size: int = 4000, chunk_overlap: int = 200
) -> List[Document]:
documents = [] documents = []
for doc_id in old_index.docstore.docs.keys(): for doc_id in old_index.docstore.docs.keys():
text = "" text = ""
if isinstance(old_index, GPTSimpleVectorIndex): if isinstance(old_index, GPTSimpleVectorIndex):
nodes = old_index.docstore.get_document(doc_id).get_nodes(old_index.docstore.docs[doc_id].id_map) nodes = old_index.docstore.get_document(doc_id).get_nodes(
old_index.docstore.docs[doc_id].id_map
)
for node in nodes: for node in nodes:
extra_info = node.extra_info extra_info = node.extra_info
text += f"{node.text} " text += f"{node.text} "
@ -461,14 +484,15 @@ class Index_handler:
for node in nodes: for node in nodes:
extra_info = node[1].extra_info extra_info = node[1].extra_info
text += f"{node[1].text} " text += f"{node[1].text} "
text_splitter = TokenTextSplitter(separator=" ", chunk_size=chunk_size, chunk_overlap=chunk_overlap) text_splitter = TokenTextSplitter(
separator=" ", chunk_size=chunk_size, chunk_overlap=chunk_overlap
)
text_chunks = text_splitter.split_text(text) text_chunks = text_splitter.split_text(text)
for text in text_chunks: for text in text_chunks:
document = Document(text, extra_info=extra_info) document = Document(text, extra_info=extra_info)
documents.append(document) documents.append(document)
return documents return documents
async def compose_indexes(self, user_id, indexes, name, deep_compose): async def compose_indexes(self, user_id, indexes, name, deep_compose):
# Load all the indexes first # Load all the indexes first
index_objects = [] index_objects = []
@ -496,6 +520,7 @@ class Index_handler:
documents=documents, documents=documents,
llm_predictor=llm_predictor, llm_predictor=llm_predictor,
embed_model=embedding_model, embed_model=embedding_model,
use_async=True,
), ),
) )
@ -527,6 +552,7 @@ class Index_handler:
GPTSimpleVectorIndex, GPTSimpleVectorIndex,
documents=documents, documents=documents,
embed_model=embedding_model, embed_model=embedding_model,
use_async=True,
), ),
) )

Loading…
Cancel
Save