From a2a79afb18d9089aa3176ac69356f4dd5f29876d Mon Sep 17 00:00:00 2001
From: Kaveen Kumarasinghe <ocfinancesmc@gmail.com>
Date: Sun, 5 Feb 2023 17:57:28 -0500
Subject: [PATCH] search improvements

---
 models/search_model.py | 37 +++++++++++++++++++++++++++++++------
 1 file changed, 31 insertions(+), 6 deletions(-)

diff --git a/models/search_model.py b/models/search_model.py
index 28214ba..ac6c756 100644
--- a/models/search_model.py
+++ b/models/search_model.py
@@ -47,7 +47,7 @@ class Search:
         ).load_data(urls=[url])
         return documents
 
-    async def get_links(self, query, search_scope=2):
+    async def get_links(self, query, search_scope=3):
         """Search the web for a query"""
         async with aiohttp.ClientSession() as session:
             async with session.get(
@@ -56,7 +56,7 @@ class Search:
                 if response.status == 200:
                     data = await response.json()
                     # Return a list of the top 5 links
-                    return [item["link"] for item in data["items"][:search_scope]]
+                    return [item["link"] for item in data["items"][:search_scope]], [item["link"] for item in data["items"]]
                 else:
                     return "An error occurred while searching."
 
@@ -68,7 +68,7 @@ class Search:
             os.environ["OPENAI_API_KEY"] = user_api_key
 
         # Get the links for the query
-        links = await self.get_links(query, search_scope=search_scope)
+        links, all_links = await self.get_links(query, search_scope=search_scope)
 
         # For each link, crawl the page and get all the text that's not HTML garbage.
         # Concatenate all the text for a given website into one string and save it into an array:
@@ -78,10 +78,35 @@ class Search:
             # continue to the document loading.
             try:
                 async with aiohttp.ClientSession() as session:
-                    async with session.get(link, timeout=3) as response:
-                        pass  # Only catch timeout errors, allow for redirects for now..
+                    async with session.get(link, timeout=2) as response:
+                        # On a non-success status, append the first link from all_links that is not already in links, to compensate for the failed request
+                        if response.status not in [200, 203, 202, 204]:
+                            for link2 in all_links:
+                                if link2 not in links:
+                                    print("Found a replacement link")
+                                    links.append(link2)
+                                    break
+                            continue
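+                        # Intended to follow redirects. NOTE(review): unreachable as written, since every 3xx status already matched the `not in` check above and hit `continue`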
+                        elif response.status in [301, 302, 303, 307, 308]:
+                            try:
+                                print("Adding redirect")
+                                links.append(response.url)
+                            except:
+                                pass
+
             except:
                 traceback.print_exc()
+                try:
+                    # The request raised (e.g. timed out): append the first unused link from all_links as a replacement. This duplicates the replacement loop above.
+                    for link2 in all_links:
+                        if link2 not in links:
+                            print("Found a replacement link")
+                            links.append(link2)
+                            break
+                except:
+                    pass
+
                 continue
 
             try:
@@ -96,7 +121,7 @@ class Search:
 
         embedding_model = OpenAIEmbedding()
         index = GPTSimpleVectorIndex(documents, embed_model=embedding_model)
-        await self.usage_service.update_usage(embedding_model.last_token_usage)
+        await self.usage_service.update_usage(embedding_model.last_token_usage, embeddings=True)
 
         llm_predictor = LLMPredictor(llm=OpenAI(model_name="text-davinci-003"))
         # Now we can search the index for a query: