GPT3Discord/models/search_model.py

import random
import re
from bs4 import BeautifulSoup
import aiohttp

from services.environment_service import EnvService
from services.usage_service import UsageService


class Search:
    def __init__(self, gpt_model, pinecone_service):
        self.model = gpt_model
        self.pinecone_service = pinecone_service
        self.google_search_api_key = EnvService.get_google_search_api_key()
        self.google_search_engine_id = EnvService.get_google_search_engine_id()

    async def get_links(self, query):
        """Search the web for a query"""
        async with aiohttp.ClientSession() as session:
            async with session.get(
                f"https://www.googleapis.com/customsearch/v1?key={self.google_search_api_key}&cx={self.google_search_engine_id}&q={query}"
            ) as response:
                if response.status == 200:
                    data = await response.json()
                    # Return a list of the top 5 links
                    return [item["link"] for item in data["items"][:5]]
                else:
                    return "An error occurred while searching."

    async def search(self, query):
        # Get the links for the query
        links = await self.get_links(query)

        # For each link, crawl the page and get all the text that's not HTML garbage.
        # Concatenate all the text for a given website into one string and save it into an array:
        texts = []
        for link in links:
            async with aiohttp.ClientSession() as session:
                async with session.get(link, timeout=5) as response:
                    if response.status == 200:
                        soup = BeautifulSoup(await response.read(), "html.parser")
                        # Find all the content between <p> tags and join them together and then append to texts
                        texts.append(" ".join([p.text for p in soup.find_all("p")]))
                    else:
                        pass
        print("Finished retrieving text content from the links")

        # For each text in texts, split it up into 500 character chunks and create embeddings for it
        # The pinecone service uses conversation_id, but we can use it here too to keep track of the "search", each
        # conversation_id represents a unique search.
        conversation_id = random.randint(0, 100000000)
        for text in texts:
            # Split the text into 150 character chunks without using re
            chunks = [text[i : i + 500] for i in range(0, len(text), 500)]
            # Create embeddings for each chunk
            for chunk in chunks:
                # Create an embedding for the chunk
                embedding = await self.model.send_embedding_request(chunk)
                # Upsert the embedding for the conversation ID
                self.pinecone_service.upsert_conversation_embedding(
                    self.model, conversation_id, chunk, 0
                )
        print("Finished creating embeddings for the text")

        # Now that we have all the embeddings for the search, we can embed the query and then
        # query pinecone for the top 5 results
        query_embedding = await self.model.send_embedding_request(query)
        results = self.pinecone_service.get_n_similar(
            conversation_id, query_embedding, n=3
        )
        # Get only the first elements of each result
        results = [result[0] for result in results]

        # Construct a query for GPT3 to use these results to answer the query
        GPT_QUERY = f"This is a search query. I want to know the answer to the query: {query}. Here are some results from the web: {[str(result) for result in results]}. \n\n Answer:"
        # Generate the answer
        # Use the tokenizer to determine token amount of the query
        await self.model.send_request(
            GPT_QUERY, UsageService.count_tokens_static(GPT_QUERY)
        )

        print(texts)
Add Paraphrase 2 years ago			`import random`
			`import re`
			`from bs4 import BeautifulSoup`
			`import aiohttp`

			`from services.environment_service import EnvService`
			`from services.usage_service import UsageService`


			`class Search:`
			`def __init__(self, gpt_model, pinecone_service):`
			`self.model = gpt_model`
			`self.pinecone_service = pinecone_service`
			`self.google_search_api_key = EnvService.get_google_search_api_key()`
			`self.google_search_engine_id = EnvService.get_google_search_engine_id()`

			`async def get_links(self, query):`
			`"""Search the web for a query"""`
			`async with aiohttp.ClientSession() as session:`
			`async with session.get(`
			`f"https://www.googleapis.com/customsearch/v1?key={self.google_search_api_key}&cx={self.google_search_engine_id}&q={query}"`
			`) as response:`
			`if response.status == 200:`
			`data = await response.json()`
			`# Return a list of the top 5 links`
			`return [item["link"] for item in data["items"][:5]]`
			`else:`
			`return "An error occurred while searching."`

			`async def search(self, query):`
			`# Get the links for the query`
			`links = await self.get_links(query)`

			`# For each link, crawl the page and get all the text that's not HTML garbage.`
			`# Concatenate all the text for a given website into one string and save it into an array:`
			`texts = []`
			`for link in links:`
			`async with aiohttp.ClientSession() as session:`
			`async with session.get(link, timeout=5) as response:`
			`if response.status == 200:`
			`soup = BeautifulSoup(await response.read(), "html.parser")`
			`# Find all the content between <p> tags and join them together and then append to texts`
			`texts.append(" ".join([p.text for p in soup.find_all("p")]))`
			`else:`
			`pass`
			`print("Finished retrieving text content from the links")`

			`# For each text in texts, split it up into 500 character chunks and create embeddings for it`
			`# The pinecone service uses conversation_id, but we can use it here too to keep track of the "search", each`
			`# conversation_id represents a unique search.`
			`conversation_id = random.randint(0, 100000000)`
			`for text in texts:`
			`# Split the text into 150 character chunks without using re`
			`chunks = [text[i : i + 500] for i in range(0, len(text), 500)]`
			`# Create embeddings for each chunk`
			`for chunk in chunks:`
			`# Create an embedding for the chunk`
			`embedding = await self.model.send_embedding_request(chunk)`
			`# Upsert the embedding for the conversation ID`
Format Python code with psf/black push 2 years ago			`self.pinecone_service.upsert_conversation_embedding(`
			`self.model, conversation_id, chunk, 0`
			`)`
Add Paraphrase 2 years ago			`print("Finished creating embeddings for the text")`

			`# Now that we have all the embeddings for the search, we can embed the query and then`
			`# query pinecone for the top 5 results`
			`query_embedding = await self.model.send_embedding_request(query)`
Format Python code with psf/black push 2 years ago			`results = self.pinecone_service.get_n_similar(`
			`conversation_id, query_embedding, n=3`
			`)`
Add Paraphrase 2 years ago			`# Get only the first elements of each result`
			`results = [result[0] for result in results]`

			`# Construct a query for GPT3 to use these results to answer the query`
			`GPT_QUERY = f"This is a search query. I want to know the answer to the query: {query}. Here are some results from the web: {[str(result) for result in results]}. \n\n Answer:"`
			`# Generate the answer`
			`# Use the tokenizer to determine token amount of the query`
Format Python code with psf/black push 2 years ago			`await self.model.send_request(`
			`GPT_QUERY, UsageService.count_tokens_static(GPT_QUERY)`
			`)`
Add Paraphrase 2 years ago
Format Python code with psf/black push 2 years ago			`print(texts)`