search improvements

2 years ago · a2a79afb18
parent 1d2a2c8eae
commit a2a79afb18
1 changed file with 31 additions and 6 deletions
--- a/models/search_model.py
+++ b/models/search_model.py
@@ -47,7 +47,7 @@ class Search:
        ).load_data(urls=[url])
        return documents

-    async def get_links(self, query, search_scope=2):
+    async def get_links(self, query, search_scope=3):
        """Search the web for a query"""
        async with aiohttp.ClientSession() as session:
            async with session.get(
@@ -56,7 +56,7 @@ class Search:
                if response.status == 200:
                    data = await response.json()
                    # Return a list of the top 5 links
-                    return [item["link"] for item in data["items"][:search_scope]]
+                    return [item["link"] for item in data["items"][:search_scope]], [item["link"] for item in data["items"]]
                else:
                    return "An error occurred while searching."

@@ -68,7 +68,7 @@ class Search:
            os.environ["OPENAI_API_KEY"] = user_api_key

        # Get the links for the query
-        links = await self.get_links(query, search_scope=search_scope)
+        links, all_links = await self.get_links(query, search_scope=search_scope)

        # For each link, crawl the page and get all the text that's not HTML garbage.
        # Concatenate all the text for a given website into one string and save it into an array:
@@ -78,10 +78,35 @@ class Search:
            # continue to the document loading.
            try:
                async with aiohttp.ClientSession() as session:
-                    async with session.get(link, timeout=3) as response:
-                        pass  # Only catch timeout errors, allow for redirects for now..
+                    async with session.get(link, timeout=2) as response:
+                        # Add another entry to links from all_links if the link is not already in it to compensate for the failed request
+                        if response.status not in [200, 203, 202, 204]:
+                            for link2 in all_links:
+                                if link2 not in links:
+                                    print("Found a replacement link")
+                                    links.append(link2)
+                                    break
+                            continue
+                        # Follow redirects
+                        elif response.status in [301, 302, 303, 307, 308]:
+                            try:
+                                print("Adding redirect")
+                                links.append(response.url)
+                            except:
+                                pass
+
            except:
                traceback.print_exc()
+                try:
+                    # Try to add a link from all_links, this is kind of messy.
+                    for link2 in all_links:
+                        if link2 not in links:
+                            print("Found a replacement link")
+                            links.append(link2)
+                            break
+                except:
+                    pass
+
                continue

            try:
@@ -96,7 +121,7 @@ class Search:

        embedding_model = OpenAIEmbedding()
        index = GPTSimpleVectorIndex(documents, embed_model=embedding_model)
-        await self.usage_service.update_usage(embedding_model.last_token_usage)
+        await self.usage_service.update_usage(embedding_model.last_token_usage, embeddings=True)

        llm_predictor = LLMPredictor(llm=OpenAI(model_name="text-davinci-003"))
        # Now we can search the index for a query: