You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

83 lines
3.9 KiB

import asyncio
import inspect
import random
import re

import aiohttp
from bs4 import BeautifulSoup

from services.environment_service import EnvService
from services.usage_service import UsageService


class Search:
    """Answer a query from web pages: find links, scrape them, embed, ask the model.

    The flow implemented by :meth:`search` is:

    1. fetch up to five result links from the Google Custom Search API,
    2. download each page and keep the text found inside ``<p>`` tags,
    3. split that text into fixed-size chunks and upsert them through the
       pinecone service under a randomly chosen ``conversation_id``,
    4. embed the query, retrieve the closest stored chunks, and
    5. send a prompt built from those chunks to the model.
    """

    # Endpoint of the Google Custom Search JSON API.
    _SEARCH_URL = "https://www.googleapis.com/customsearch/v1"
    # Number of search results whose pages are scraped.
    _MAX_LINKS = 5
    # Size, in characters, of each text chunk handed to the pinecone service.
    _CHUNK_SIZE = 500
    # Total time budget, in seconds, for downloading a single page.
    _FETCH_TIMEOUT_SECONDS = 5

    def __init__(self, gpt_model, pinecone_service):
        """Store the collaborators and read the Google search credentials.

        Args:
            gpt_model: Object providing the awaitable ``send_embedding_request``
                and ``send_request`` methods used by :meth:`search`.
            pinecone_service: Object providing ``upsert_conversation_embedding``
                and ``get_n_similar``.
        """
        self.model = gpt_model
        self.pinecone_service = pinecone_service
        self.google_search_api_key = EnvService.get_google_search_api_key()
        self.google_search_engine_id = EnvService.get_google_search_engine_id()

    async def get_links(self, query):
        """Search the web for a query.

        Args:
            query: The search text. It is sent as a request parameter, so it is
                URL-encoded by the HTTP client rather than pasted into the URL.

        Returns:
            A list with the links of the first five results (possibly empty)
            when the API answers with status 200, otherwise the string
            ``"An error occurred while searching."``.
        """
        # str() mirrors what interpolating these values into a URL would send,
        # while letting the client take care of the percent-encoding.
        params = {
            "key": str(self.google_search_api_key),
            "cx": str(self.google_search_engine_id),
            "q": str(query),
        }
        async with aiohttp.ClientSession() as session:
            async with session.get(self._SEARCH_URL, params=params) as response:
                if response.status != 200:
                    return "An error occurred while searching."
                data = await response.json()

        # A response without an "items" key is treated as "no results"
        # instead of raising KeyError.
        items = data.get("items", [])
        return [item["link"] for item in items[: self._MAX_LINKS]]

    @staticmethod
    def _chunk_text(text, size=500):
        """Split ``text`` into consecutive pieces of at most ``size`` characters."""
        return [text[start : start + size] for start in range(0, len(text), size)]

    async def _fetch_texts(self, links):
        """Download each link and return the joined ``<p>`` text of every page that loaded.

        Pages that answer with a non-200 status, time out, or fail at the
        connection level are skipped so one bad link cannot abort the search.
        """
        texts = []
        timeout = aiohttp.ClientTimeout(total=self._FETCH_TIMEOUT_SECONDS)
        # One session is shared by all downloads instead of one per link.
        async with aiohttp.ClientSession() as session:
            for link in links:
                try:
                    async with session.get(link, timeout=timeout) as response:
                        if response.status != 200:
                            continue
                        html = await response.read()
                except (aiohttp.ClientError, asyncio.TimeoutError):
                    # Scraping is best-effort: ignore this link and move on.
                    continue
                soup = BeautifulSoup(html, "html.parser")
                # Keep only the content between <p> tags, joined into one string.
                texts.append(" ".join(p.text for p in soup.find_all("p")))
        return texts

    async def search(self, query):
        """Answer ``query`` using text scraped from the top web results.

        Args:
            query: The question to search for and to answer.

        Returns:
            Whatever ``self.model.send_request`` returns for the generated
            prompt, or ``None`` when the link lookup failed.
        """
        links = await self.get_links(query)
        if not isinstance(links, list):
            # get_links reports failure with a message string; iterating it
            # would treat every character as a URL, so stop here instead.
            print(links)
            return None

        texts = await self._fetch_texts(links)
        print("Finished retrieving text content from the links")

        # The pinecone service keys its entries by conversation_id; a random
        # one is used here so each search gets its own namespace of chunks.
        conversation_id = random.randint(0, 100000000)
        for text in texts:
            # Split the text into 500 character chunks and store each of them.
            for chunk in self._chunk_text(text, self._CHUNK_SIZE):
                # NOTE(review): the service is given the model and the raw
                # chunk, so it presumably creates the embedding itself (which
                # would make it a coroutine) - confirm against the service.
                # A separate embedding request whose result was never used has
                # been dropped here for that reason.
                upserted = self.pinecone_service.upsert_conversation_embedding(
                    self.model, conversation_id, chunk, 0
                )
                if inspect.isawaitable(upserted):
                    await upserted
        print("Finished creating embeddings for the text")

        # Embed the query and ask the pinecone service for the closest chunks.
        query_embedding = await self.model.send_embedding_request(query)
        results = self.pinecone_service.get_n_similar(
            conversation_id, query_embedding, n=3
        )
        # Get only the first element of each result.
        results = [result[0] for result in results]

        # Construct a prompt asking the model to answer from these results.
        gpt_query = f"This is a search query. I want to know the answer to the query: {query}. Here are some results from the web: {[str(result) for result in results]}. \n\n Answer:"

        # The token count of the prompt is passed along with the request.
        response = await self.model.send_request(
            gpt_query, UsageService.count_tokens_static(gpt_query)
        )
        print(texts)
        return response