Show progress for search, fix index loading for PDFs from URLs

Kaveen Kumarasinghe 1 year ago
parent 60e11ebcaa
commit b9e4eae8d3

@@ -50,7 +50,7 @@ class SearchService(discord.Cog, name="SearchService"):
                 first = True
             else:
                 page = discord.Embed(
-                    title=f"Page {count}",
+                    title=f"Search Results",
                     description=chunk,
                 )
                 pages.append(page)
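For context, the pages above are built from chunked model output. A minimal sketch of that chunking, assuming the `EMBED_CUTOFF = 2000` limit the `Search` class defines further down; the `chunk_for_embeds` helper is hypothetical, not part of the commit:

```python
import discord

# Hypothetical helper, not part of the commit: split a long response
# into embed-sized chunks (EMBED_CUTOFF is 2000 in the Search class).
def chunk_for_embeds(text: str, cutoff: int = 2000) -> list[str]:
    return [text[i : i + cutoff] for i in range(0, len(text), cutoff)]

response_text = "..."  # stand-in for the model's answer
pages = [
    discord.Embed(title="Search Results", description=chunk)
    for chunk in chunk_for_embeds(response_text)
]
```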
@@ -73,13 +73,13 @@ class SearchService(discord.Cog, name="SearchService"):
             not EnvService.get_google_search_api_key()
             or not EnvService.get_google_search_engine_id()
         ):
-            await ctx.send("The search service is not enabled.")
+            await ctx.respond("The search service is not enabled.")
             return
         await ctx.defer()
         try:
-            response = await self.model.search(query, user_api_key, search_scope, nodes)
+            response = await self.model.search(ctx, query, user_api_key, search_scope, nodes)
        except ValueError:
            await ctx.respond(
                "The Google Search API returned an error. Check the console for more details.",

@@ -31,7 +31,7 @@ from services.environment_service import EnvService
 from models.openai_model import Model
 
-__version__ = "10.2.3"
+__version__ = "10.2.5"
 
 PID_FILE = Path("bot.pid")

@@ -169,22 +169,6 @@ class Index_handler:
         index = GPTSimpleVectorIndex(document, embed_model=embed_model)
         return index
 
-    async def index_web_pdf(self, url, embed_model) -> GPTSimpleVectorIndex:
-        print("Indexing a WEB PDF")
-        async with aiohttp.ClientSession() as session:
-            async with session.get(url) as response:
-                if response.status == 200:
-                    data = await response.read()
-                    f = tempfile.NamedTemporaryFile(delete=False)
-                    f.write(data)
-                    f.close()
-                else:
-                    return "An error occurred while downloading the PDF."
-        document = SimpleDirectoryReader(input_files=[f.name]).load_data()
-        index = GPTSimpleVectorIndex(document, embed_model=embed_model)
-        return index
-
     def index_gdoc(self, doc_id, embed_model) -> GPTSimpleVectorIndex:
         document = GoogleDocsReader().load_data(doc_id)
         index = GPTSimpleVectorIndex(document, embed_model=embed_model)
@@ -212,7 +196,46 @@ class Index_handler:
         )
         return index
 
-    def index_webpage(self, url, embed_model) -> GPTSimpleVectorIndex:
+    async def index_pdf(self, url) -> list[Document]:
+        # Download the PDF at the given URL into a named temp file
+        async with aiohttp.ClientSession() as session:
+            async with session.get(url) as response:
+                if response.status == 200:
+                    data = await response.read()
+                    f = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
+                    f.write(data)
+                    f.close()
+                else:
+                    return "An error occurred while downloading the PDF."
+        # Read the downloaded PDF back in via its temp file path
+        documents = SimpleDirectoryReader(input_files=[f.name]).load_data()
+        print("Loaded the PDF document data")
+        return documents
+
+    async def index_webpage(self, url, embed_model) -> GPTSimpleVectorIndex:
+        # First try to connect to the URL to see if we can even reach it.
+        try:
+            async with aiohttp.ClientSession() as session:
+                async with session.get(url, timeout=5) as response:
+                    if response.status not in [200, 203, 202, 204]:
+                        raise ValueError(
+                            "Invalid URL or could not connect to the provided URL."
+                        )
+                    else:
+                        # If the link is a PDF, load it through index_pdf instead
+                        if response.headers["Content-Type"] == "application/pdf":
+                            documents = await self.index_pdf(url)
+                            index = GPTSimpleVectorIndex(
+                                documents,
+                                embed_model=embed_model,
+                            )
+                            return index
+        except:
+            raise ValueError("Could not load webpage")
+
         documents = BeautifulSoupWebReader(
             website_extractor=DEFAULT_WEBSITE_EXTRACTOR
         ).load_data(urls=[url])
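Two sharp edges worth noting in the new PDF path: `response.headers["Content-Type"]` raises `KeyError` when the header is absent (the bare `except` then re-raises it as "Could not load webpage"), and the temp file is never removed. A more defensive sketch of the same flow, under the same aiohttp/llama-index assumptions; the helper names here are illustrative, not the commit's API:

```python
import os
import tempfile

import aiohttp

def is_pdf_response(response: aiohttp.ClientResponse) -> bool:
    # Headers may be missing or carry parameters ("application/pdf; qs=0.9"),
    # so normalize before comparing.
    content_type = response.headers.get("Content-Type", "")
    return content_type.split(";")[0].strip().lower() == "application/pdf"

async def download_pdf_to_temp(url: str) -> str:
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            if response.status != 200:
                raise ValueError("An error occurred while downloading the PDF.")
            data = await response.read()
    f = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
    try:
        f.write(data)
    finally:
        f.close()
    return f.name  # caller should os.remove() this path after indexing
```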
@@ -308,12 +331,8 @@ class Index_handler:
                     index = await self.loop.run_in_executor(
                         None, partial(self.index_youtube_transcript, link, embedding_model)
                     )
-                elif "pdf" in content_type:
-                    index = await self.index_web_pdf(link, embedding_model)
                 else:
-                    index = await self.loop.run_in_executor(
-                        None, partial(self.index_webpage, link, embedding_model)
-                    )
+                    index = await self.index_webpage(link, embedding_model)
                 await self.usage_service.update_usage(
                     embedding_model.last_token_usage, embeddings=True
                 )
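The dispatch change is consistent with the signature change above: `index_webpage` is now a coroutine, so it is awaited directly instead of being shipped to the default executor, which remains only for the synchronous readers. A small sketch of that split, with `handler` standing in for the `Index_handler` instance:

```python
import asyncio
from functools import partial

async def build_index(handler, link, embedding_model):
    loop = asyncio.get_running_loop()
    if "youtube" in link:
        # Synchronous reader: keep it off the event loop via the executor.
        return await loop.run_in_executor(
            None, partial(handler.index_youtube_transcript, link, embedding_model)
        )
    # Native coroutine: awaiting it directly keeps aiohttp on the event loop.
    return await handler.index_webpage(link, embedding_model)
```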

@@ -45,6 +45,52 @@ class Search:
         self.openai_key = os.getenv("OPENAI_TOKEN")
         self.EMBED_CUTOFF = 2000
 
+    def build_search_started_embed(self):
+        embed = discord.Embed(
+            title="Searching the web...",
+            description="Refining google search query...",
+            color=0x00FF00,
+        )
+        return embed
+
+    def build_search_refined_embed(self, refined_query):
+        embed = discord.Embed(
+            title="Searching the web...",
+            description="Refined query: " + refined_query + "\n\nRetrieving links from google...",
+            color=0x00FF00,
+        )
+        return embed
+
+    def build_search_links_retrieved_embed(self, refined_query):
+        embed = discord.Embed(
+            title="Searching the web...",
+            description="Refined query: " + refined_query + "\n\nRetrieved links from Google\n\n"
+            "Retrieving webpages...",
+            color=0x00FF00,
+        )
+        return embed
+
+    def build_search_webpages_retrieved_embed(self, refined_query):
+        embed = discord.Embed(
+            title="Searching the web...",
+            description="Refined query: " + refined_query + "\n\nRetrieved links from Google\n\n"
+            "Retrieved webpages\n\n"
+            "Indexing...",
+            color=0x00FF00,
+        )
+        return embed
+
+    def build_search_indexed_embed(self, refined_query):
+        embed = discord.Embed(
+            title="Searching the web...",
+            description="Refined query: " + refined_query + "\n\nRetrieved links from Google\n\n"
+            "Retrieved webpages\n\n"
+            "Indexed\n\n"
+            "Thinking about your question...",
+            color=0x00FF00,
+        )
+        return embed
+
     def index_webpage(self, url) -> list[Document]:
         documents = BeautifulSoupWebReader(
             website_extractor=DEFAULT_WEBSITE_EXTRACTOR
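The five builders above differ only in which steps are finished and which is in flight, so they could be collapsed into one stage-driven builder. A hypothetical consolidation sketch, not part of the commit, that produces the same descriptions:

```python
import discord

STEPS_DONE = ["Retrieved links from Google", "Retrieved webpages", "Indexed"]
STEPS_CURRENT = [
    "Refining google search query...",
    "Retrieving links from google...",
    "Retrieving webpages...",
    "Indexing...",
    "Thinking about your question...",
]

def build_search_progress_embed(stage: int, refined_query: str = "") -> discord.Embed:
    # stage 0 = refining, 1 = retrieving links, ... 4 = answering.
    lines = []
    if stage >= 1:
        lines.append("Refined query: " + refined_query)
    lines.extend(STEPS_DONE[: max(stage - 1, 0)])  # completed steps so far
    lines.append(STEPS_CURRENT[stage])             # the step in progress
    return discord.Embed(
        title="Searching the web...",
        description="\n\n".join(lines),
        color=0x00FF00,
    )
```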
@@ -90,13 +136,30 @@ class Search:
         )
         return ["An error occurred while searching.", None]
 
-    async def search(self, query, user_api_key, search_scope, nodes):
+    async def try_edit(self, message, embed):
+        try:
+            await message.edit(embed=embed)
+        except Exception:
+            traceback.print_exc()
+
+    async def try_delete(self, message):
+        try:
+            await message.delete()
+        except Exception:
+            traceback.print_exc()
+
+    async def search(self, ctx: discord.ApplicationContext, query, user_api_key, search_scope, nodes):
         DEFAULT_SEARCH_NODES = 1
         if not user_api_key:
             os.environ["OPENAI_API_KEY"] = self.openai_key
         else:
             os.environ["OPENAI_API_KEY"] = user_api_key
 
+        if ctx:
+            in_progress_message = await ctx.respond(embed=self.build_search_started_embed())
+
         llm_predictor = LLMPredictor(llm=OpenAI(model_name="text-davinci-003"))
         try:
             llm_predictor_presearch = OpenAI(
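The progress mechanism hinges on `ctx.respond()` returning a message-like handle; `try_edit` and `try_delete` then treat every update as best-effort, so a deleted or expired message can only log a traceback, never abort the search. A compressed sketch of that lifecycle, assuming (as the commit's helpers do) that pycord's return value supports `edit` and `delete`:

```python
import discord

async def run_with_progress(search, ctx: discord.ApplicationContext, refined: str):
    # Create the progress message, update it per stage, then remove it
    # once the final answer is ready. Edits are best-effort: try_edit and
    # try_delete swallow failures so a dead message can't break the search.
    in_progress = await ctx.respond(embed=search.build_search_started_embed())
    await search.try_edit(in_progress, search.build_search_refined_embed(refined))
    # ... links / webpages / indexing stages elided ...
    await search.try_delete(in_progress)
```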
@@ -116,11 +179,18 @@ class Search:
             traceback.print_exc()
             query_refined_text = query
 
+        if ctx:
+            await self.try_edit(in_progress_message, self.build_search_refined_embed(query_refined_text))
+
         # Get the links for the query
         print("The refined search is " + query_refined_text)
         links, all_links = await self.get_links(
             query_refined_text, search_scope=search_scope
         )
 
+        if ctx:
+            await self.try_edit(in_progress_message, self.build_search_links_retrieved_embed(query_refined_text))
+
         if all_links is None:
             raise ValueError("The Google Search API returned an error.")
@@ -181,12 +251,18 @@ class Search:
         except Exception as e:
             traceback.print_exc()
 
+        if ctx:
+            await self.try_edit(in_progress_message, self.build_search_webpages_retrieved_embed(query_refined_text))
+
         embedding_model = OpenAIEmbedding()
         index = await self.loop.run_in_executor(
             None, partial(GPTSimpleVectorIndex, documents, embed_model=embedding_model)
         )
+
+        if ctx:
+            await self.try_edit(in_progress_message, self.build_search_indexed_embed(query_refined_text))
+
         await self.usage_service.update_usage(
             embedding_model.last_token_usage, embeddings=True
         )
@@ -216,4 +292,7 @@ class Search:
             embedding_model.last_token_usage, embeddings=True
         )
 
+        if ctx:
+            await self.try_delete(in_progress_message)
+
         return response
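Since every progress update is guarded by `if ctx:`, the new parameter is effectively optional: non-Discord callers can pass `None` and get the old, silent behavior. A pair of hypothetical call sites (`search_service` stands in for a `Search` instance; the `None` arguments for scope and nodes assume the method's own defaults apply):

```python
async def from_discord(search_service, ctx, query, user_api_key):
    # Slash-command path: progress embeds are shown, edited, and cleaned up.
    return await search_service.search(ctx, query, user_api_key, None, None)

async def headless(search_service, query, user_api_key):
    # No interaction context: every `if ctx:` block is skipped.
    return await search_service.search(None, query, user_api_key, None, None)
```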
