You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
83 lines
3.9 KiB
83 lines
3.9 KiB
import asyncio
import random
import re

import aiohttp
from bs4 import BeautifulSoup

from services.environment_service import EnvService
from services.usage_service import UsageService
|
|
|
|
|
|
class Search:
    """Answer a query using text scraped from Google Custom Search results.

    The flow implemented by :meth:`search` is: look up links for the query,
    download each page and keep its ``<p>`` text, store that text in chunks
    through the pinecone service, fetch the chunks most similar to the query
    and pass them to the GPT model as context.
    """

    # Number of search results that get crawled for a query.
    MAX_LINKS = 5
    # Seconds to wait for a single crawled page before skipping it.
    PAGE_TIMEOUT = 5
    # Size, in characters, of the text slices that are stored individually.
    CHUNK_SIZE = 500
    # Number of similar chunks requested from the pinecone service.
    TOP_RESULTS = 3

    def __init__(self, gpt_model, pinecone_service):
        """Store the collaborators and read the Google search credentials.

        Args:
            gpt_model: Object used for ``send_embedding_request`` and
                ``send_request`` calls.
            pinecone_service: Object used for ``upsert_conversation_embedding``
                and ``get_n_similar`` calls.
        """
        self.model = gpt_model
        self.pinecone_service = pinecone_service
        self.google_search_api_key = EnvService.get_google_search_api_key()
        self.google_search_engine_id = EnvService.get_google_search_engine_id()

    async def get_links(self, query):
        """Search the web for a query.

        Args:
            query: The search terms.

        Returns:
            A list with at most ``MAX_LINKS`` result URLs (empty when the
            response carries no ``"items"``), or the string
            ``"An error occurred while searching."`` when the API does not
            answer with HTTP 200.
        """
        # Hand the values to aiohttp as ``params`` so they are URL-encoded;
        # a query containing "&", "#" or spaces would otherwise corrupt the URL.
        # ``str()`` keeps the old f-string behaviour for unset (None) settings.
        params = {
            "key": str(self.google_search_api_key),
            "cx": str(self.google_search_engine_id),
            "q": str(query),
        }
        async with aiohttp.ClientSession() as session:
            async with session.get(
                "https://www.googleapis.com/customsearch/v1", params=params
            ) as response:
                if response.status != 200:
                    return "An error occurred while searching."
                data = await response.json()

        # Tolerate a response without an "items" key instead of raising KeyError.
        return [item["link"] for item in data.get("items", [])[: self.MAX_LINKS]]

    @staticmethod
    def _split_into_chunks(text, size):
        """Return consecutive slices of ``text`` that are at most ``size`` characters long."""
        return [text[start : start + size] for start in range(0, len(text), size)]

    async def _fetch_page_texts(self, links):
        """Download every link and return the joined ``<p>`` text of each page that loaded."""
        texts = []
        # One session is enough for all pages.
        async with aiohttp.ClientSession() as session:
            for link in links:
                try:
                    async with session.get(
                        link, timeout=self.PAGE_TIMEOUT
                    ) as response:
                        if response.status != 200:
                            # Best effort: pages that do not load are skipped.
                            continue
                        html = await response.read()
                except (aiohttp.ClientError, asyncio.TimeoutError) as error:
                    # A slow or unreachable site must not abort the whole search.
                    print(f"Skipping {link}: {error!r}")
                    continue

                soup = BeautifulSoup(html, "html.parser")
                # Keep only the content between <p> tags, joined into one string.
                texts.append(" ".join(p.text for p in soup.find_all("p")))
        return texts

    async def search(self, query):
        """Answer ``query`` with the GPT model, using crawled web text as context.

        Args:
            query: The question / search terms.

        Returns:
            Whatever ``self.model.send_request`` returns for the constructed
            prompt, or ``None`` when the link lookup failed.
        """
        # Get the links for the query
        links = await self.get_links(query)
        if not isinstance(links, list):
            # get_links reports failure as an error string; iterating over it
            # would treat every single character as a URL to crawl.
            print(links)
            return None

        # For each link, crawl the page and collect its paragraph text.
        texts = await self._fetch_page_texts(links)
        print("Finished retrieving text content from the links")

        # The pinecone service uses conversation_id, but we can use it here too
        # to keep track of the "search"; each conversation_id represents a
        # unique search.
        conversation_id = random.randint(0, 100000000)
        for text in texts:
            # Split the text into CHUNK_SIZE (500) character chunks
            for chunk in self._split_into_chunks(text, self.CHUNK_SIZE):
                # NOTE(review): the result of this request is never used; the
                # upsert below receives the model and presumably embeds the
                # chunk itself -- confirm, then drop this call.
                await self.model.send_embedding_request(chunk)
                # Upsert the chunk for the conversation ID
                self.pinecone_service.upsert_conversation_embedding(
                    self.model, conversation_id, chunk, 0
                )
        print("Finished creating embeddings for the text")

        # Now that everything for the search is stored, embed the query and
        # ask pinecone for the TOP_RESULTS (3) most similar entries.
        query_embedding = await self.model.send_embedding_request(query)
        results = self.pinecone_service.get_n_similar(
            conversation_id, query_embedding, n=self.TOP_RESULTS
        )
        # Get only the first elements of each result
        results = [result[0] for result in results]

        # Construct a query for GPT3 to use these results to answer the query
        gpt_query = f"This is a search query. I want to know the answer to the query: {query}. Here are some results from the web: {[str(result) for result in results]}. \n\n Answer:"
        # Generate the answer; the tokenizer determines the token amount of the query
        response = await self.model.send_request(
            gpt_query, UsageService.count_tokens_static(gpt_query)
        )

        print(texts)
        # Hand the answer back to the caller instead of discarding it.
        return response