parent
4181b2d1a1
commit
11f563e16a
@ -0,0 +1,37 @@
|
||||
import traceback
|
||||
|
||||
import aiohttp
|
||||
import discord
|
||||
|
||||
from models.deepl_model import TranslationModel
|
||||
from models.search_model import Search
|
||||
from services.environment_service import EnvService
|
||||
|
||||
|
||||
# Guild list obtained from EnvService once, when this module is imported.
# NOTE(review): not referenced in the visible part of this file — confirm it
# is still needed here.
ALLOWED_GUILDS = EnvService.get_allowed_guilds()
|
||||
|
||||
class SearchService(discord.Cog, name="SearchService"):
    """Cog containing the search command.

    Wraps a ``Search`` model built from the GPT model and the pinecone
    service handed to the constructor.
    """

    def __init__(
        self,
        bot,
        gpt_model,
        pinecone_service,
    ):
        """Store the bot and build the search model.

        Args:
            bot: The bot instance this cog belongs to.
            gpt_model: Model object forwarded to ``Search``.
            pinecone_service: Pinecone service object forwarded to ``Search``.
        """
        super().__init__()
        self.bot = bot
        self.model = Search(gpt_model, pinecone_service)

    async def search_command(self, ctx, query):
        """Command handler for the search command.

        Defers the interaction, runs the search for *query*, and responds
        with ``"ok"`` once it has finished.

        Args:
            ctx: The command context; must provide awaitable ``defer()`` and
                ``respond()``.
            query: The text to search for.

        Raises:
            Exception: Whatever ``Search.search`` raises is re-raised after
                the user has been told that the search failed.
        """
        await ctx.defer()
        try:
            await self.model.search(query)
        except Exception:
            # The interaction is already deferred: without a reply the user
            # is left waiting. Answer first, then let the error propagate
            # exactly as it did before.
            await ctx.respond("An error occurred while searching.")
            raise
        await ctx.respond("ok")
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -0,0 +1,77 @@
|
||||
import random
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
import aiohttp
|
||||
|
||||
from services.environment_service import EnvService
|
||||
from services.usage_service import UsageService
|
||||
|
||||
|
||||
class Search:
    """Answer a query with the help of web search results.

    Pipeline implemented by ``search``: look up links through the Google
    Custom Search API, collect the ``<p>`` text of every page, hand the text
    to the pinecone service in fixed-size chunks, ask pinecone for the chunks
    most similar to the query, and send those chunks together with the query
    to the GPT model.
    """

    # Number of search hits whose pages are fetched.
    MAX_LINKS = 5
    # Length, in characters, of the chunks handed to the pinecone service.
    CHUNK_SIZE = 500
    # Number of similar chunks requested from pinecone for the prompt.
    TOP_RESULTS = 3
    # Value passed as ``timeout`` when fetching a result page.
    FETCH_TIMEOUT = 5
    # Message returned by ``get_links`` when the search API call fails.
    SEARCH_ERROR_MESSAGE = "An error occurred while searching."

    def __init__(self, gpt_model, pinecone_service):
        """Store the collaborators and read the Google search credentials.

        Args:
            gpt_model: Object providing awaitable ``send_embedding_request``
                and ``send_request``.
            pinecone_service: Object providing
                ``upsert_conversation_embedding`` and ``get_n_similar``.
        """
        self.model = gpt_model
        self.pinecone_service = pinecone_service
        self.google_search_api_key = EnvService.get_google_search_api_key()
        self.google_search_engine_id = EnvService.get_google_search_engine_id()

    @staticmethod
    def _top_links(data, limit=5):
        """Return the ``link`` of the first *limit* entries of ``data["items"]``.

        A response without an ``items`` key yields an empty list instead of
        raising ``KeyError``.
        """
        items = data.get("items") or []
        return [item["link"] for item in items[:limit]]

    @staticmethod
    def _chunk_text(text, size):
        """Split *text* into consecutive pieces of at most *size* characters."""
        return [text[start : start + size] for start in range(0, len(text), size)]

    async def get_links(self, query):
        """Search the web for a query.

        Args:
            query: The text to search for. It is sent as a request parameter
                so that spaces and reserved characters are URL-encoded.

        Returns:
            A list with the links of the first ``MAX_LINKS`` result items
            (empty when the response carries no items), or the string
            ``"An error occurred while searching."`` when the API does not
            answer with HTTP 200.
        """
        params = {
            "key": self.google_search_api_key,
            "cx": self.google_search_engine_id,
            "q": query,
        }
        async with aiohttp.ClientSession() as session:
            async with session.get(
                "https://www.googleapis.com/customsearch/v1", params=params
            ) as response:
                if response.status != 200:
                    return self.SEARCH_ERROR_MESSAGE
                data = await response.json()
        return self._top_links(data, self.MAX_LINKS)

    async def _fetch_page_text(self, session, link):
        """Return the joined ``<p>`` text of *link*, or ``None`` if unavailable.

        A non-200 status, a connection error, or a timeout makes the page
        count as unavailable so that one bad link does not abort the search.
        """
        import asyncio  # local import: asyncio is not among this module's imports

        try:
            async with session.get(link, timeout=self.FETCH_TIMEOUT) as response:
                if response.status != 200:
                    return None
                soup = BeautifulSoup(await response.read(), "html.parser")
        except (aiohttp.ClientError, asyncio.TimeoutError) as error:
            print(f"Skipping {link}: {error!r}")
            return None
        return " ".join(p.text for p in soup.find_all("p"))

    async def search(self, query):
        """Run the full search pipeline for *query*.

        Args:
            query: The question to answer.

        Returns:
            Whatever ``self.model.send_request`` returns for the constructed
            prompt.

        Raises:
            RuntimeError: If ``get_links`` reports a failure instead of links.
        """
        links = await self.get_links(query)
        if isinstance(links, str):
            # get_links signals failure with a message string; iterating it
            # would treat every character as a URL.
            raise RuntimeError(links)

        # Collect the paragraph text of every reachable page, one string per
        # page, reusing a single HTTP session for all fetches.
        texts = []
        async with aiohttp.ClientSession() as session:
            for link in links:
                page_text = await self._fetch_page_text(session, link)
                if page_text is not None:
                    texts.append(page_text)
        print("Finished retrieving text content from the links")

        # The pinecone service keys entries by conversation_id; a random id
        # is used here to group the chunks of this one search.
        conversation_id = random.randint(0, 100000000)
        for text in texts:
            for chunk in self._chunk_text(text, self.CHUNK_SIZE):
                # NOTE(review): the embedding returned here is not passed to
                # the upsert below, which receives the model and the raw
                # chunk instead — confirm whether this request is needed.
                await self.model.send_embedding_request(chunk)
                # NOTE(review): the upsert is not awaited — confirm that
                # upsert_conversation_embedding is a synchronous method.
                self.pinecone_service.upsert_conversation_embedding(
                    self.model, conversation_id, chunk, 0
                )
        print("Finished creating embeddings for the text")

        # Embed the query and fetch the TOP_RESULTS most similar chunks.
        query_embedding = await self.model.send_embedding_request(query)
        results = self.pinecone_service.get_n_similar(
            conversation_id, query_embedding, n=self.TOP_RESULTS
        )
        # Keep only the first element of each result.
        results = [result[0] for result in results]

        # Build the prompt and send it along with its token count.
        gpt_query = f"This is a search query. I want to know the answer to the query: {query}. Here are some results from the web: {[str(result) for result in results]}. \n\n Answer:"
        return await self.model.send_request(
            gpt_query, UsageService.count_tokens_static(gpt_query)
        )
|
Loading…
Reference in new issue