diff --git a/cogs/search_service_cog.py b/cogs/search_service_cog.py
index 977fa81..ca5a244 100644
--- a/cogs/search_service_cog.py
+++ b/cogs/search_service_cog.py
@@ -50,7 +50,7 @@ class SearchService(discord.Cog, name="SearchService"):
                 first = True
             else:
                 page = discord.Embed(
-                    title=f"Page {count}",
+                    title="Search Results",
                     description=chunk,
                 )
             pages.append(page)
@@ -73,13 +73,13 @@ class SearchService(discord.Cog, name="SearchService"):
             not EnvService.get_google_search_api_key()
             or not EnvService.get_google_search_engine_id()
         ):
-            await ctx.send("The search service is not enabled.")
+            await ctx.respond("The search service is not enabled.")
             return
 
         await ctx.defer()
 
         try:
-            response = await self.model.search(query, user_api_key, search_scope, nodes)
+            response = await self.model.search(ctx, query, user_api_key, search_scope, nodes)
         except ValueError:
             await ctx.respond(
                 "The Google Search API returned an error. Check the console for more details.",
diff --git a/gpt3discord.py b/gpt3discord.py
index 64e4c5b..55d53d8 100644
--- a/gpt3discord.py
+++ b/gpt3discord.py
@@ -31,7 +31,7 @@ from services.environment_service import EnvService
 from models.openai_model import Model
 
 
-__version__ = "10.2.3"
+__version__ = "10.2.5"
 
 
 PID_FILE = Path("bot.pid")
diff --git a/models/index_model.py b/models/index_model.py
index dd59a49..c66200f 100644
--- a/models/index_model.py
+++ b/models/index_model.py
@@ -169,22 +169,6 @@ class Index_handler:
         index = GPTSimpleVectorIndex(document, embed_model=embed_model)
         return index
 
-    async def index_web_pdf(self, url, embed_model) -> GPTSimpleVectorIndex:
-        print("Indexing a WEB PDF")
-        async with aiohttp.ClientSession() as session:
-            async with session.get(url) as response:
-                if response.status == 200:
-                    data = await response.read()
-                    f = tempfile.NamedTemporaryFile(delete=False)
-                    f.write(data)
-                    f.close()
-                else:
-                    return "An error occurred while downloading the PDF."
-
-        document = SimpleDirectoryReader(input_files=[f.name]).load_data()
-        index = GPTSimpleVectorIndex(document, embed_model=embed_model)
-        return index
-
     def index_gdoc(self, doc_id, embed_model) -> GPTSimpleVectorIndex:
         document = GoogleDocsReader().load_data(doc_id)
         index = GPTSimpleVectorIndex(document, embed_model=embed_model)
@@ -212,7 +196,46 @@ class Index_handler:
         )
         return index
 
-    def index_webpage(self, url, embed_model) -> GPTSimpleVectorIndex:
+    async def index_pdf(self, url) -> list[Document]:
+        # Download the PDF at the url and save it to a tempfile
+        async with aiohttp.ClientSession() as session:
+            async with session.get(url) as response:
+                if response.status == 200:
+                    data = await response.read()
+                    f = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
+                    f.write(data)
+                    f.close()
+                else:
+                    raise ValueError("An error occurred while downloading the PDF.")
+        # Load the saved tempfile into llama-index documents; the declared
+        # return type is list[Document], so failures raise instead of returning a str
+        documents = SimpleDirectoryReader(input_files=[f.name]).load_data()
+        print("Loaded the PDF document data")
+
+        # NOTE: the tempfile is created with delete=False and is not cleaned up here
+        return documents
+
+    async def index_webpage(self, url, embed_model) -> GPTSimpleVectorIndex:
+
+        # First try to connect to the URL to see if we can even reach it.
+        try:
+            async with aiohttp.ClientSession() as session:
+                async with session.get(url, timeout=5) as response:
+                    # Reject anything that is not a successful response
+                    if response.status not in [200, 203, 202, 204]:
+                        raise ValueError("Invalid URL or could not connect to the provided URL.")
+                    else:
+                        # Detect if the link is a PDF, if it is, we load it differently
+                        if response.headers["Content-Type"] == "application/pdf":
+                            documents = await self.index_pdf(url)
+                            index = GPTSimpleVectorIndex(
+                                documents,
+                                embed_model=embed_model,
+                            )
+                            return index
+        except Exception as e:
+            raise ValueError("Could not load webpage") from e
+
         documents = BeautifulSoupWebReader(
             website_extractor=DEFAULT_WEBSITE_EXTRACTOR
         ).load_data(urls=[url])
@@ -308,12 +331,8 @@ class Index_handler:
             index = await self.loop.run_in_executor(
                 None, partial(self.index_youtube_transcript, link, embedding_model)
             )
-        elif "pdf" in content_type:
-            index = await self.index_web_pdf(link, embedding_model)
         else:
-            index = await self.loop.run_in_executor(
-                None, partial(self.index_webpage, link, embedding_model)
-            )
+            index = await self.index_webpage(link, embedding_model)
         await self.usage_service.update_usage(
             embedding_model.last_token_usage, embeddings=True
         )
diff --git a/models/search_model.py b/models/search_model.py
index 9a36384..f7cc404 100644
--- a/models/search_model.py
+++ b/models/search_model.py
@@ -45,6 +45,52 @@ class Search:
         self.openai_key = os.getenv("OPENAI_TOKEN")
         self.EMBED_CUTOFF = 2000
 
+    def build_search_started_embed(self):
+        embed = discord.Embed(
+            title="Searching the web...",
+            description="Refining google search query...",
+            color=0x00FF00,
+        )
+        return embed
+
+    def build_search_refined_embed(self, refined_query):
+        embed = discord.Embed(
+            title="Searching the web...",
+            description="Refined query: " + refined_query + "\n\nRetrieving links from google...",
+            color=0x00FF00,
+        )
+        return embed
+
+    def build_search_links_retrieved_embed(self, refined_query):
+        embed = discord.Embed(
+            title="Searching the web...",
+            description="Refined query: " + refined_query + "\n\nRetrieved links from Google\n\n"
+            "Retrieving webpages...",
+            color=0x00FF00,
+        )
+        return embed
+
+    def build_search_webpages_retrieved_embed(self, refined_query):
+        embed = discord.Embed(
+            title="Searching the web...",
+            description="Refined query: " + refined_query + "\n\nRetrieved links from Google\n\n"
+            "Retrieved webpages\n\n"
+            "Indexing...",
+            color=0x00FF00,
+        )
+        return embed
+
+    def build_search_indexed_embed(self, refined_query):
+        embed = discord.Embed(
+            title="Searching the web...",
+            description="Refined query: " + refined_query + "\n\nRetrieved links from Google\n\n"
+            "Retrieved webpages\n\n"
+            "Indexed\n\n"
+            "Thinking about your question...",
+            color=0x00FF00,
+        )
+        return embed
+
     def index_webpage(self, url) -> list[Document]:
         documents = BeautifulSoupWebReader(
             website_extractor=DEFAULT_WEBSITE_EXTRACTOR
@@ -90,13 +136,30 @@ class Search:
         )
         return ["An error occurred while searching.", None]
 
-    async def search(self, query, user_api_key, search_scope, nodes):
+    async def try_edit(self, message, embed):
+        try:
+            await message.edit(embed=embed)
+        except Exception:
+            traceback.print_exc()
+            pass
+
+    async def try_delete(self, message):
+        try:
+            await message.delete()
+        except Exception:
+            traceback.print_exc()
+            pass
+
+    async def search(self, ctx: discord.ApplicationContext, query, user_api_key, search_scope, nodes):
         DEFAULT_SEARCH_NODES = 1
         if not user_api_key:
             os.environ["OPENAI_API_KEY"] = self.openai_key
         else:
             os.environ["OPENAI_API_KEY"] = user_api_key
 
+        if ctx:
+            in_progress_message = await ctx.respond(embed=self.build_search_started_embed())
+
         llm_predictor = LLMPredictor(llm=OpenAI(model_name="text-davinci-003"))
         try:
             llm_predictor_presearch = OpenAI(
@@ -116,11 +179,18 @@ class Search:
             traceback.print_exc()
             query_refined_text = query
 
+        if ctx:
+            await self.try_edit(in_progress_message, self.build_search_refined_embed(query_refined_text))
+
+
         # Get the links for the query
-        print("The refined search is " + query_refined_text)
         links, all_links = await self.get_links(
             query_refined_text, search_scope=search_scope
         )
+
+        if ctx:
+            await self.try_edit(in_progress_message, self.build_search_links_retrieved_embed(query_refined_text))
+
         if all_links is None:
             raise ValueError("The Google Search API returned an error.")
 
@@ -181,12 +251,18 @@ class Search:
         except Exception as e:
             traceback.print_exc()
 
+        if ctx:
+            await self.try_edit(in_progress_message, self.build_search_webpages_retrieved_embed(query_refined_text))
+
         embedding_model = OpenAIEmbedding()
 
         index = await self.loop.run_in_executor(
             None, partial(GPTSimpleVectorIndex, documents, embed_model=embedding_model)
        )
 
+        if ctx:
+            await self.try_edit(in_progress_message, self.build_search_indexed_embed(query_refined_text))
+
         await self.usage_service.update_usage(
             embedding_model.last_token_usage, embeddings=True
         )
@@ -216,4 +292,7 @@ class Search:
             embedding_model.last_token_usage, embeddings=True
         )
 
+        if ctx:
+            await self.try_delete(in_progress_message)
+
         return response