# GPT3Discord/models/search_model.py

import asyncio
import random
import re

import aiohttp
from bs4 import BeautifulSoup

from services.environment_service import EnvService
from services.usage_service import UsageService


class Search:
    """Answer a query from web search results using embeddings and GPT.

    The flow implemented by :meth:`search` is: look up links with the Google
    Custom Search API, download each page and keep the text of its ``<p>``
    tags, chunk and index that text through the pinecone service, retrieve the
    chunks most similar to the query, and send them to the GPT model as
    context for the final answer.
    """

    # Error message returned by get_links() when the search API call fails.
    SEARCH_ERROR_MESSAGE = "An error occurred while searching."
    # Number of search hits returned by get_links().
    MAX_LINKS = 5
    # Size, in characters, of the text chunks that get indexed.
    CHUNK_SIZE = 500
    # Number of similar chunks requested from pinecone for the final prompt.
    TOP_N_RESULTS = 3
    # Total time budget, in seconds, for downloading a single page.
    PAGE_TIMEOUT_SECONDS = 5

    def __init__(self, gpt_model, pinecone_service):
        """Store the collaborators and read the Google search credentials.

        Args:
            gpt_model: Object providing the awaitable ``send_embedding_request``
                and ``send_request`` methods used by :meth:`search`.
            pinecone_service: Object providing ``upsert_conversation_embedding``
                and ``get_n_similar``.
        """
        self.model = gpt_model
        self.pinecone_service = pinecone_service
        self.google_search_api_key = EnvService.get_google_search_api_key()
        self.google_search_engine_id = EnvService.get_google_search_engine_id()

    async def get_links(self, query):
        """Search the web for a query.

        Args:
            query: The search text. It is sent as a URL parameter, so it may
                contain spaces and reserved characters such as ``&`` or ``#``.

        Returns:
            A list with the links of the top 5 results (empty when the API
            reports no results), or the string
            ``"An error occurred while searching."`` when the API does not
            answer with HTTP 200.
        """
        # Passing the values through `params` lets aiohttp percent-encode them;
        # str() keeps the request well-formed even if a credential is unset.
        params = {
            "key": str(self.google_search_api_key),
            "cx": str(self.google_search_engine_id),
            "q": str(query),
        }
        async with aiohttp.ClientSession() as session:
            async with session.get(
                "https://www.googleapis.com/customsearch/v1", params=params
            ) as response:
                if response.status != 200:
                    return self.SEARCH_ERROR_MESSAGE
                data = await response.json()
                # "items" can be missing from the payload when nothing matched.
                items = data.get("items", [])
                return [item["link"] for item in items[: self.MAX_LINKS]]

    async def search(self, query):
        """Run a web search for ``query`` and ask the model to answer from it.

        Args:
            query: The question to research.

        Returns:
            Whatever ``self.model.send_request`` returns for the constructed
            prompt, or ``None`` when the link lookup failed.
        """
        # Get the links for the query
        links = await self.get_links(query)
        if not isinstance(links, list):
            # get_links() signals failure with a message string; iterating it
            # would treat every character as a URL.
            print(links)
            return None

        # For each link, crawl the page and keep the text found in <p> tags,
        # concatenated into one string per website.
        texts = []
        page_timeout = aiohttp.ClientTimeout(total=self.PAGE_TIMEOUT_SECONDS)
        async with aiohttp.ClientSession() as session:
            for link in links:
                try:
                    async with session.get(link, timeout=page_timeout) as response:
                        if response.status != 200:
                            continue
                        soup = BeautifulSoup(await response.read(), "html.parser")
                except (aiohttp.ClientError, asyncio.TimeoutError):
                    # A slow or unreachable page must not abort the whole
                    # search; skip it and use the remaining links.
                    continue
                texts.append(" ".join(p.text for p in soup.find_all("p")))
        print("Finished retrieving text content from the links")

        # The pinecone service uses conversation_id, but we can use it here too
        # to keep track of the "search": each conversation_id represents a
        # unique search.
        conversation_id = random.randint(0, 100000000)
        for text in texts:
            # Split the text into 500 character chunks
            chunks = [
                text[start : start + self.CHUNK_SIZE]
                for start in range(0, len(text), self.CHUNK_SIZE)
            ]
            for chunk in chunks:
                # NOTE(review): this embedding is not passed to the upsert
                # below, which receives the model and the raw chunk instead.
                # The request looks redundant - confirm before removing it.
                embedding = await self.model.send_embedding_request(chunk)
                # Upsert the chunk under this search's conversation ID.
                # NOTE(review): the last argument (0) is presumably a timestamp
                # placeholder, and if upsert_conversation_embedding is a
                # coroutine function this call needs an `await` - confirm
                # against the pinecone service.
                self.pinecone_service.upsert_conversation_embedding(
                    self.model, conversation_id, chunk, 0
                )
        print("Finished creating embeddings for the text")

        # Now that everything for this search is indexed, embed the query and
        # ask pinecone for the top 3 most similar results.
        query_embedding = await self.model.send_embedding_request(query)
        matches = self.pinecone_service.get_n_similar(
            conversation_id, query_embedding, n=self.TOP_N_RESULTS
        )
        # Get only the first element of each result
        results = [match[0] for match in matches]

        # Construct a query for GPT3 to use these results to answer the query
        prompt = f"This is a search query. I want to know the answer to the query: {query}. Here are some results from the web: {[str(result) for result in results]}. \n\n Answer:"
        # Generate the answer; the tokenizer determines the prompt's token count
        response = await self.model.send_request(
            prompt, UsageService.count_tokens_static(prompt)
        )

        print(texts)
        return response