Show progress for search, fix index loading for PDFs from URLs

Kaveen Kumarasinghe 1 year ago
parent 60e11ebcaa
commit b9e4eae8d3

@@ -50,7 +50,7 @@ class SearchService(discord.Cog, name="SearchService"):
                 first = True
             else:
                 page = discord.Embed(
-                    title=f"Page {count}",
+                    title=f"Search Results",
                     description=chunk,
                 )
                 pages.append(page)
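For context, the pages above are built from chunked model output. A minimal sketch of that chunking, assuming the `EMBED_CUTOFF = 2000` limit the `Search` class defines further down; the `chunk_for_embeds` helper is hypothetical, not part of the commit:

```python
import discord

# Hypothetical helper, not part of the commit: split a long response
# into embed-sized chunks (EMBED_CUTOFF is 2000 in the Search class).
def chunk_for_embeds(text: str, cutoff: int = 2000) -> list[str]:
    return [text[i : i + cutoff] for i in range(0, len(text), cutoff)]

response_text = "..."  # stand-in for the model's answer
pages = [
    discord.Embed(title="Search Results", description=chunk)
    for chunk in chunk_for_embeds(response_text)
]
```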
@@ -73,13 +73,13 @@ class SearchService(discord.Cog, name="SearchService"):
             not EnvService.get_google_search_api_key()
             or not EnvService.get_google_search_engine_id()
         ):
-            await ctx.send("The search service is not enabled.")
+            await ctx.respond("The search service is not enabled.")
             return
         await ctx.defer()
         try:
-            response = await self.model.search(query, user_api_key, search_scope, nodes)
+            response = await self.model.search(ctx, query, user_api_key, search_scope, nodes)
        except ValueError:
            await ctx.respond(
                "The Google Search API returned an error. Check the console for more details.",

@@ -31,7 +31,7 @@ from services.environment_service import EnvService
 from models.openai_model import Model
 
-__version__ = "10.2.3"
+__version__ = "10.2.5"
 
 PID_FILE = Path("bot.pid")

@@ -169,22 +169,6 @@ class Index_handler:
         index = GPTSimpleVectorIndex(document, embed_model=embed_model)
         return index
 
-    async def index_web_pdf(self, url, embed_model) -> GPTSimpleVectorIndex:
-        print("Indexing a WEB PDF")
-        async with aiohttp.ClientSession() as session:
-            async with session.get(url) as response:
-                if response.status == 200:
-                    data = await response.read()
-                    f = tempfile.NamedTemporaryFile(delete=False)
-                    f.write(data)
-                    f.close()
-                else:
-                    return "An error occurred while downloading the PDF."
-        document = SimpleDirectoryReader(input_files=[f.name]).load_data()
-        index = GPTSimpleVectorIndex(document, embed_model=embed_model)
-        return index
-
     def index_gdoc(self, doc_id, embed_model) -> GPTSimpleVectorIndex:
         document = GoogleDocsReader().load_data(doc_id)
         index = GPTSimpleVectorIndex(document, embed_model=embed_model)
@@ -212,7 +196,46 @@ class Index_handler:
         )
         return index
 
-    def index_webpage(self, url, embed_model) -> GPTSimpleVectorIndex:
+    async def index_pdf(self, url) -> list[Document]:
+        # Download the PDF at the given URL into a named temp file
+        async with aiohttp.ClientSession() as session:
+            async with session.get(url) as response:
+                if response.status == 200:
+                    data = await response.read()
+                    f = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
+                    f.write(data)
+                    f.close()
+                else:
+                    return "An error occurred while downloading the PDF."
+        # Read the downloaded PDF back in via its temp file path
+        documents = SimpleDirectoryReader(input_files=[f.name]).load_data()
+        print("Loaded the PDF document data")
+        return documents
+
+    async def index_webpage(self, url, embed_model) -> GPTSimpleVectorIndex:
+        # First try to connect to the URL to see if we can even reach it.
+        try:
+            async with aiohttp.ClientSession() as session:
+                async with session.get(url, timeout=5) as response:
+                    if response.status not in [200, 203, 202, 204]:
+                        raise ValueError(
+                            "Invalid URL or could not connect to the provided URL."
+                        )
+                    else:
+                        # If the link is a PDF, load it through index_pdf instead
+                        if response.headers["Content-Type"] == "application/pdf":
+                            documents = await self.index_pdf(url)
+                            index = GPTSimpleVectorIndex(
+                                documents,
+                                embed_model=embed_model,
+                            )
+                            return index
+        except:
+            raise ValueError("Could not load webpage")
+
         documents = BeautifulSoupWebReader(
             website_extractor=DEFAULT_WEBSITE_EXTRACTOR
         ).load_data(urls=[url])
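Two sharp edges worth noting in the new PDF path: `response.headers["Content-Type"]` raises `KeyError` when the header is absent (the bare `except` then re-raises it as "Could not load webpage"), and the temp file is never removed. A more defensive sketch of the same flow, under the same aiohttp/llama-index assumptions; the helper names here are illustrative, not the commit's API:

```python
import os
import tempfile

import aiohttp

def is_pdf_response(response: aiohttp.ClientResponse) -> bool:
    # Headers may be missing or carry parameters ("application/pdf; qs=0.9"),
    # so normalize before comparing.
    content_type = response.headers.get("Content-Type", "")
    return content_type.split(";")[0].strip().lower() == "application/pdf"

async def download_pdf_to_temp(url: str) -> str:
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            if response.status != 200:
                raise ValueError("An error occurred while downloading the PDF.")
            data = await response.read()
    f = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
    try:
        f.write(data)
    finally:
        f.close()
    return f.name  # caller should os.remove() this path after indexing
```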
@@ -308,12 +331,8 @@ class Index_handler:
                     index = await self.loop.run_in_executor(
                         None, partial(self.index_youtube_transcript, link, embedding_model)
                     )
-                elif "pdf" in content_type:
-                    index = await self.index_web_pdf(link, embedding_model)
                 else:
-                    index = await self.loop.run_in_executor(
-                        None, partial(self.index_webpage, link, embedding_model)
-                    )
+                    index = await self.index_webpage(link, embedding_model)
                 await self.usage_service.update_usage(
                     embedding_model.last_token_usage, embeddings=True
                 )
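The dispatch change is consistent with the signature change above: `index_webpage` is now a coroutine, so it is awaited directly instead of being shipped to the default executor, which remains only for the synchronous readers. A small sketch of that split, with `handler` standing in for the `Index_handler` instance:

```python
import asyncio
from functools import partial

async def build_index(handler, link, embedding_model):
    loop = asyncio.get_running_loop()
    if "youtube" in link:
        # Synchronous reader: keep it off the event loop via the executor.
        return await loop.run_in_executor(
            None, partial(handler.index_youtube_transcript, link, embedding_model)
        )
    # Native coroutine: awaiting it directly keeps aiohttp on the event loop.
    return await handler.index_webpage(link, embedding_model)
```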

@@ -45,6 +45,52 @@ class Search:
         self.openai_key = os.getenv("OPENAI_TOKEN")
         self.EMBED_CUTOFF = 2000
 
+    def build_search_started_embed(self):
+        embed = discord.Embed(
+            title="Searching the web...",
+            description="Refining google search query...",
+            color=0x00FF00,
+        )
+        return embed
+
+    def build_search_refined_embed(self, refined_query):
+        embed = discord.Embed(
+            title="Searching the web...",
+            description="Refined query: " + refined_query + "\n\nRetrieving links from google...",
+            color=0x00FF00,
+        )
+        return embed
+
+    def build_search_links_retrieved_embed(self, refined_query):
+        embed = discord.Embed(
+            title="Searching the web...",
+            description="Refined query: " + refined_query + "\n\nRetrieved links from Google\n\n"
+            "Retrieving webpages...",
+            color=0x00FF00,
+        )
+        return embed
+
+    def build_search_webpages_retrieved_embed(self, refined_query):
+        embed = discord.Embed(
+            title="Searching the web...",
+            description="Refined query: " + refined_query + "\n\nRetrieved links from Google\n\n"
+            "Retrieved webpages\n\n"
+            "Indexing...",
+            color=0x00FF00,
+        )
+        return embed
+
+    def build_search_indexed_embed(self, refined_query):
+        embed = discord.Embed(
+            title="Searching the web...",
+            description="Refined query: " + refined_query + "\n\nRetrieved links from Google\n\n"
+            "Retrieved webpages\n\n"
+            "Indexed\n\n"
+            "Thinking about your question...",
+            color=0x00FF00,
+        )
+        return embed
+
     def index_webpage(self, url) -> list[Document]:
         documents = BeautifulSoupWebReader(
             website_extractor=DEFAULT_WEBSITE_EXTRACTOR
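The five builders above differ only in which steps are finished and which is in flight, so they could be collapsed into one stage-driven builder. A hypothetical consolidation sketch, not part of the commit, that produces the same descriptions:

```python
import discord

STEPS_DONE = ["Retrieved links from Google", "Retrieved webpages", "Indexed"]
STEPS_CURRENT = [
    "Refining google search query...",
    "Retrieving links from google...",
    "Retrieving webpages...",
    "Indexing...",
    "Thinking about your question...",
]

def build_search_progress_embed(stage: int, refined_query: str = "") -> discord.Embed:
    # stage 0 = refining, 1 = retrieving links, ... 4 = answering.
    lines = []
    if stage >= 1:
        lines.append("Refined query: " + refined_query)
    lines.extend(STEPS_DONE[: max(stage - 1, 0)])  # completed steps so far
    lines.append(STEPS_CURRENT[stage])             # the step in progress
    return discord.Embed(
        title="Searching the web...",
        description="\n\n".join(lines),
        color=0x00FF00,
    )
```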
@@ -90,13 +136,30 @@ class Search:
         )
         return ["An error occurred while searching.", None]
 
-    async def search(self, query, user_api_key, search_scope, nodes):
+    async def try_edit(self, message, embed):
+        try:
+            await message.edit(embed=embed)
+        except Exception:
+            traceback.print_exc()
+
+    async def try_delete(self, message):
+        try:
+            await message.delete()
+        except Exception:
+            traceback.print_exc()
+
+    async def search(self, ctx: discord.ApplicationContext, query, user_api_key, search_scope, nodes):
         DEFAULT_SEARCH_NODES = 1
         if not user_api_key:
             os.environ["OPENAI_API_KEY"] = self.openai_key
         else:
             os.environ["OPENAI_API_KEY"] = user_api_key
 
+        if ctx:
+            in_progress_message = await ctx.respond(embed=self.build_search_started_embed())
+
         llm_predictor = LLMPredictor(llm=OpenAI(model_name="text-davinci-003"))
         try:
             llm_predictor_presearch = OpenAI(
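The progress mechanism hinges on `ctx.respond()` returning a message-like handle; `try_edit` and `try_delete` then treat every update as best-effort, so a deleted or expired message can only log a traceback, never abort the search. A compressed sketch of that lifecycle, assuming (as the commit's helpers do) that pycord's return value supports `edit` and `delete`:

```python
import discord

async def run_with_progress(search, ctx: discord.ApplicationContext, refined: str):
    # Create the progress message, update it per stage, then remove it
    # once the final answer is ready. Edits are best-effort: try_edit and
    # try_delete swallow failures so a dead message can't break the search.
    in_progress = await ctx.respond(embed=search.build_search_started_embed())
    await search.try_edit(in_progress, search.build_search_refined_embed(refined))
    # ... links / webpages / indexing stages elided ...
    await search.try_delete(in_progress)
```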
@@ -116,11 +179,18 @@ class Search:
             traceback.print_exc()
             query_refined_text = query
 
+        if ctx:
+            await self.try_edit(in_progress_message, self.build_search_refined_embed(query_refined_text))
+
         # Get the links for the query
         print("The refined search is " + query_refined_text)
         links, all_links = await self.get_links(
             query_refined_text, search_scope=search_scope
         )
 
+        if ctx:
+            await self.try_edit(in_progress_message, self.build_search_links_retrieved_embed(query_refined_text))
+
         if all_links is None:
             raise ValueError("The Google Search API returned an error.")
@@ -181,12 +251,18 @@ class Search:
         except Exception as e:
             traceback.print_exc()
 
+        if ctx:
+            await self.try_edit(in_progress_message, self.build_search_webpages_retrieved_embed(query_refined_text))
+
         embedding_model = OpenAIEmbedding()
         index = await self.loop.run_in_executor(
             None, partial(GPTSimpleVectorIndex, documents, embed_model=embedding_model)
         )
+
+        if ctx:
+            await self.try_edit(in_progress_message, self.build_search_indexed_embed(query_refined_text))
+
         await self.usage_service.update_usage(
             embedding_model.last_token_usage, embeddings=True
         )
@@ -216,4 +292,7 @@ class Search:
             embedding_model.last_token_usage, embeddings=True
         )
 
+        if ctx:
+            await self.try_delete(in_progress_message)
+
         return response
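Since every progress update is guarded by `if ctx:`, the new parameter is effectively optional: non-Discord callers can pass `None` and get the old, silent behavior. A pair of hypothetical call sites (`search_service` stands in for a `Search` instance; the `None` arguments for scope and nodes assume the method's own defaults apply):

```python
async def from_discord(search_service, ctx, query, user_api_key):
    # Slash-command path: progress embeds are shown, edited, and cleaned up.
    return await search_service.search(ctx, query, user_api_key, None, None)

async def headless(search_service, query, user_api_key):
    # No interaction context: every `if ctx:` block is skipped.
    return await search_service.search(None, query, user_api_key, None, None)
```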
