GPT3Discord/models/search_model.py

import asyncio
import os
import random
import re
import tempfile
import traceback
from functools import partial

import discord
from bs4 import BeautifulSoup
import aiohttp
from gpt_index import (
    QuestionAnswerPrompt,
    GPTSimpleVectorIndex,
    BeautifulSoupWebReader,
    Document,
    PromptHelper,
    LLMPredictor,
    OpenAIEmbedding,
    SimpleDirectoryReader,
)
from gpt_index.readers.web import DEFAULT_WEBSITE_EXTRACTOR
from langchain import OpenAI

from services.environment_service import EnvService, app_root_path
from services.usage_service import UsageService


class Search:
    def __init__(self, gpt_model, usage_service):
        self.model = gpt_model
        self.usage_service = usage_service
        self.google_search_api_key = EnvService.get_google_search_api_key()
        self.google_search_engine_id = EnvService.get_google_search_engine_id()
        self.loop = asyncio.get_running_loop()
        self.qaprompt = QuestionAnswerPrompt(
            "You are formulating the response to a search query given the search prompt and the context. Context information is below. The text '<|endofstatement|>' is used to separate chat entries and make it easier for you to understand the context\n"
            "---------------------\n"
            "{context_str}"
            "\n---------------------\n"
            "Never say '<|endofstatement|>'\n"
            "Given the context information and not prior knowledge, "
            "answer the question, say that you were unable to answer the question if there is not sufficient context to formulate a decisive answer. The search query was: {query_str}\n"
        )
        self.openai_key = os.getenv("OPENAI_TOKEN")
        self.EMBED_CUTOFF = 2000

    def index_webpage(self, url) -> list[Document]:
        documents = BeautifulSoupWebReader(
            website_extractor=DEFAULT_WEBSITE_EXTRACTOR
        ).load_data(urls=[url])
        return documents

    async def index_pdf(self, url) -> list[Document]:
        # Download the PDF at the url and save it to a tempfile
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                if response.status == 200:
                    data = await response.read()
                    f = tempfile.NamedTemporaryFile(delete=False)
                    f.write(data)
                    f.close()
                else:
                    return "An error occurred while downloading the PDF."
        # Get the file path of this tempfile.NamedTemporaryFile
        # Save this temp file to an actual file that we can put into something else to read it
        documents = SimpleDirectoryReader(input_files=[f.name]).load_data()
        print("Loaded the PDF document data")

        # Delete the temporary file
        return documents

    async def get_links(self, query, search_scope=2):
        """Search the web for a query"""
        async with aiohttp.ClientSession() as session:
            async with session.get(
                f"https://www.googleapis.com/customsearch/v1?key={self.google_search_api_key}&cx={self.google_search_engine_id}&q={query}"
            ) as response:
                if response.status == 200:
                    data = await response.json()
                    # Return a list of the top 2 links
                    return (
                        [item["link"] for item in data["items"][:search_scope]],
                        [item["link"] for item in data["items"]],
                    )
                else:
                    print(
                        "The Google Search API returned an error: "
                        + str(response.status)
                    )
                    return ["An error occurred while searching.", None]

    async def search(self, query, user_api_key, search_scope, nodes):
        DEFAULT_SEARCH_NODES = 1
        if not user_api_key:
            os.environ["OPENAI_API_KEY"] = self.openai_key
        else:
            os.environ["OPENAI_API_KEY"] = user_api_key

        llm_predictor = LLMPredictor(llm=OpenAI(model_name="text-davinci-003"))
        try:
            llm_predictor_presearch = OpenAI(
                max_tokens=30, temperature=0, model_name="text-davinci-003"
            )

            # Refine a query to send to google custom search API
            query_refined = llm_predictor_presearch.generate(
                prompts=[
                    "You are refining a query to send to the Google Custom Search API. Change the query such that putting it into the Google Custom Search API will return the most relevant websites to assist us in answering the original query. Respond with only the refined query for the original query. The original query is: "
                    + query
                    + "\nRefined Query:"
                ]
            )
            query_refined_text = query_refined.generations[0][0].text
        except Exception as e:
            traceback.print_exc()
            query_refined_text = query

        # Get the links for the query
        links, all_links = await self.get_links(query, search_scope=search_scope)
        if all_links is None:
            raise ValueError("The Google Search API returned an error.")

        # For each link, crawl the page and get all the text that's not HTML garbage.
        # Concatenate all the text for a given website into one string and save it into an array:
        documents = []
        for link in links:
            # First, attempt a connection with a timeout of 3 seconds to the link, if the timeout occurs, don't
            # continue to the document loading.
            pdf = False
            try:
                async with aiohttp.ClientSession() as session:
                    async with session.get(link, timeout=2) as response:
                        # Add another entry to links from all_links if the link is not already in it to compensate for the failed request
                        if response.status not in [200, 203, 202, 204]:
                            for link2 in all_links:
                                if link2 not in links:
                                    print("Found a replacement link")
                                    links.append(link2)
                                    break
                            continue
                        # Follow redirects
                        elif response.status in [301, 302, 303, 307, 308]:
                            try:
                                print("Adding redirect")
                                links.append(response.url)
                                continue
                            except:
                                continue
                        else:
                            # Detect if the link is a PDF, if it is, we load it differently
                            if response.headers["Content-Type"] == "application/pdf":
                                print("Found a PDF at the link " + link)
                                pdf = True

            except:
                traceback.print_exc()
                try:
                    # Try to add a link from all_links, this is kind of messy.
                    for link2 in all_links:
                        if link2 not in links:
                            print("Found a replacement link")
                            links.append(link2)
                            break
                except:
                    pass

                continue

            try:
                if not pdf:
                    document = await self.loop.run_in_executor(
                        None, partial(self.index_webpage, link)
                    )
                else:
                    document = await self.index_pdf(link)
                [documents.append(doc) for doc in document]
            except Exception as e:
                traceback.print_exc()

        embedding_model = OpenAIEmbedding()

        index = await self.loop.run_in_executor(
            None, partial(GPTSimpleVectorIndex, documents, embed_model=embedding_model)
        )

        await self.usage_service.update_usage(
            embedding_model.last_token_usage, embeddings=True
        )

        llm_predictor = LLMPredictor(
            llm=OpenAI(model_name="text-davinci-003", max_tokens=-1)
        )

        # Now we can search the index for a query:
        embedding_model.last_token_usage = 0

        response = await self.loop.run_in_executor(
            None,
            partial(
                index.query,
                query,
                verbose=True,
                embed_model=embedding_model,
                llm_predictor=llm_predictor,
                similarity_top_k=nodes or DEFAULT_SEARCH_NODES,
                text_qa_template=self.qaprompt,
            ),
        )

        await self.usage_service.update_usage(llm_predictor.last_token_usage)
        await self.usage_service.update_usage(
            embedding_model.last_token_usage, embeddings=True
        )

        return response
enable /search alpha 2 years ago			`import asyncio`
			`import os`
Add Paraphrase 2 years ago			`import random`
			`import re`
Pagination for gpt responses, max tokens 4096 2 years ago			`import tempfile`
a bit of error catching for search 2 years ago			`import traceback`
enable /search alpha 2 years ago			`from functools import partial`

Pagination for gpt responses, max tokens 4096 2 years ago			`import discord`
Add Paraphrase 2 years ago			`from bs4 import BeautifulSoup`
			`import aiohttp`
Format Python code with psf/black push 2 years ago			`from gpt_index import (`
			`QuestionAnswerPrompt,`
			`GPTSimpleVectorIndex,`
			`BeautifulSoupWebReader,`
Format Python code with psf/black push 2 years ago			`Document,`
			`PromptHelper,`
			`LLMPredictor,`
			`OpenAIEmbedding,`
Format Python code with psf/black push 2 years ago			`SimpleDirectoryReader,`
Format Python code with psf/black push 2 years ago			`)`
enable /search alpha 2 years ago			`from gpt_index.readers.web import DEFAULT_WEBSITE_EXTRACTOR`
Full usage metrics reporting, nodes param for search, default 2 for search 2 years ago			`from langchain import OpenAI`
Add Paraphrase 2 years ago
Pagination for gpt responses, max tokens 4096 2 years ago			`from services.environment_service import EnvService, app_root_path`
Add Paraphrase 2 years ago			`from services.usage_service import UsageService`


			`class Search:`
Full usage metrics reporting, nodes param for search, default 2 for search 2 years ago			`def __init__(self, gpt_model, usage_service):`
Add Paraphrase 2 years ago			`self.model = gpt_model`
Full usage metrics reporting, nodes param for search, default 2 for search 2 years ago			`self.usage_service = usage_service`
Add Paraphrase 2 years ago			`self.google_search_api_key = EnvService.get_google_search_api_key()`
			`self.google_search_engine_id = EnvService.get_google_search_engine_id()`
enable /search alpha 2 years ago			`self.loop = asyncio.get_running_loop()`
			`self.qaprompt = QuestionAnswerPrompt(`
			`"You are formulating the response to a search query given the search prompt and the context. Context information is below. The text '<\|endofstatement\|>' is used to separate chat entries and make it easier for you to understand the context\n"`
			`"---------------------\n"`
			`"{context_str}"`
			`"\n---------------------\n"`
			`"Never say '<\|endofstatement\|>'\n"`
			`"Given the context information and not prior knowledge, "`
			`"answer the question, say that you were unable to answer the question if there is not sufficient context to formulate a decisive answer. The search query was: {query_str}\n"`
			`)`
			`self.openai_key = os.getenv("OPENAI_TOKEN")`
Pagination for gpt responses, max tokens 4096 2 years ago			`self.EMBED_CUTOFF = 2000`
Format Python code with psf/black push 2 years ago
enable /search alpha 2 years ago			`def index_webpage(self, url) -> list[Document]:`
			`documents = BeautifulSoupWebReader(`
			`website_extractor=DEFAULT_WEBSITE_EXTRACTOR`
			`).load_data(urls=[url])`
			`return documents`
Add Paraphrase 2 years ago
Pagination for gpt responses, max tokens 4096 2 years ago			`async def index_pdf(self, url) -> list[Document]:`
			`# Download the PDF at the url and save it to a tempfile`
			`async with aiohttp.ClientSession() as session:`
			`async with session.get(url) as response:`
			`if response.status == 200:`
			`data = await response.read()`
			`f = tempfile.NamedTemporaryFile(delete=False)`
			`f.write(data)`
			`f.close()`
			`else:`
			`return "An error occurred while downloading the PDF."`
			`# Get the file path of this tempfile.NamedTemporaryFile`
			`# Save this temp file to an actual file that we can put into something else to read it`
			`documents = SimpleDirectoryReader(input_files=[f.name]).load_data()`
			`print("Loaded the PDF document data")`

			`# Delete the temporary file`
			`return documents`

minor fixes 2 years ago			`async def get_links(self, query, search_scope=2):`
Add Paraphrase 2 years ago			`"""Search the web for a query"""`
			`async with aiohttp.ClientSession() as session:`
			`async with session.get(`
			`f"https://www.googleapis.com/customsearch/v1?key={self.google_search_api_key}&cx={self.google_search_engine_id}&q={query}"`
			`) as response:`
			`if response.status == 200:`
			`data = await response.json()`
minor fixes 2 years ago			`# Return a list of the top 2 links`
Format Python code with psf/black push 2 years ago			`return (`
			`[item["link"] for item in data["items"][:search_scope]],`
			`[item["link"] for item in data["items"]],`
			`)`
Add Paraphrase 2 years ago			`else:`
Format Python code with psf/black push 2 years ago			`print(`
			`"The Google Search API returned an error: "`
			`+ str(response.status)`
			`)`
Pagination for gpt responses, max tokens 4096 2 years ago			`return ["An error occurred while searching.", None]`
Add Paraphrase 2 years ago
Full usage metrics reporting, nodes param for search, default 2 for search 2 years ago			`async def search(self, query, user_api_key, search_scope, nodes):`
default back to 1 search node 2 years ago			`DEFAULT_SEARCH_NODES = 1`
enable /search alpha 2 years ago			`if not user_api_key:`
			`os.environ["OPENAI_API_KEY"] = self.openai_key`
			`else:`
			`os.environ["OPENAI_API_KEY"] = user_api_key`

search pre-processor for query refining for google custom search 2 years ago			`llm_predictor = LLMPredictor(llm=OpenAI(model_name="text-davinci-003"))`
			`try:`
Format Python code with psf/black push 2 years ago			`llm_predictor_presearch = OpenAI(`
			`max_tokens=30, temperature=0, model_name="text-davinci-003"`
			`)`
search pre-processor for query refining for google custom search 2 years ago
			`# Refine a query to send to google custom search API`
Format Python code with psf/black push 2 years ago			`query_refined = llm_predictor_presearch.generate(`
			`prompts=[`
			`"You are refining a query to send to the Google Custom Search API. Change the query such that putting it into the Google Custom Search API will return the most relevant websites to assist us in answering the original query. Respond with only the refined query for the original query. The original query is: "`
			`+ query`
			`+ "\nRefined Query:"`
			`]`
			`)`
search pre-processor for query refining for google custom search 2 years ago			`query_refined_text = query_refined.generations[0][0].text`
			`except Exception as e:`
			`traceback.print_exc()`
			`query_refined_text = query`

Add Paraphrase 2 years ago			`# Get the links for the query`
search improvements 2 years ago			`links, all_links = await self.get_links(query, search_scope=search_scope)`
Pagination for gpt responses, max tokens 4096 2 years ago			`if all_links is None:`
			`raise ValueError("The Google Search API returned an error.")`
Add Paraphrase 2 years ago
			`# For each link, crawl the page and get all the text that's not HTML garbage.`
			`# Concatenate all the text for a given website into one string and save it into an array:`
enable /search alpha 2 years ago			`documents = []`
Add Paraphrase 2 years ago			`for link in links:`
bugfixes 2 years ago			`# First, attempt a connection with a timeout of 3 seconds to the link, if the timeout occurs, don't`
			`# continue to the document loading.`
Pagination for gpt responses, max tokens 4096 2 years ago			`pdf = False`
bugfixes 2 years ago			`try:`
			`async with aiohttp.ClientSession() as session:`
search improvements 2 years ago			`async with session.get(link, timeout=2) as response:`
			`# Add another entry to links from all_links if the link is not already in it to compensate for the failed request`
			`if response.status not in [200, 203, 202, 204]:`
			`for link2 in all_links:`
			`if link2 not in links:`
			`print("Found a replacement link")`
			`links.append(link2)`
			`break`
			`continue`
			`# Follow redirects`
			`elif response.status in [301, 302, 303, 307, 308]:`
			`try:`
			`print("Adding redirect")`
			`links.append(response.url)`
Pagination for gpt responses, max tokens 4096 2 years ago			`continue`
search improvements 2 years ago			`except:`
Pagination for gpt responses, max tokens 4096 2 years ago			`continue`
			`else:`
			`# Detect if the link is a PDF, if it is, we load it differently`
			`if response.headers["Content-Type"] == "application/pdf":`
			`print("Found a PDF at the link " + link)`
			`pdf = True`
search improvements 2 years ago
bugfixes 2 years ago			`except:`
			`traceback.print_exc()`
search improvements 2 years ago			`try:`
			`# Try to add a link from all_links, this is kind of messy.`
			`for link2 in all_links:`
			`if link2 not in links:`
			`print("Found a replacement link")`
			`links.append(link2)`
			`break`
			`except:`
			`pass`

bugfixes 2 years ago			`continue`

a bit of error catching for search 2 years ago			`try:`
Pagination for gpt responses, max tokens 4096 2 years ago			`if not pdf:`
			`document = await self.loop.run_in_executor(`
			`None, partial(self.index_webpage, link)`
			`)`
			`else:`
			`document = await self.index_pdf(link)`
a bit of error catching for search 2 years ago			`[documents.append(doc) for doc in document]`
			`except Exception as e:`
			`traceback.print_exc()`

forgot embeddings in one place 2 years ago			`embedding_model = OpenAIEmbedding()`
minor fixes 2 years ago
Format Python code with psf/black push 2 years ago			`index = await self.loop.run_in_executor(`
			`None, partial(GPTSimpleVectorIndex, documents, embed_model=embedding_model)`
			`)`
minor fixes 2 years ago
Format Python code with psf/black push 2 years ago			`await self.usage_service.update_usage(`
			`embedding_model.last_token_usage, embeddings=True`
			`)`
Add Paraphrase 2 years ago
Format Python code with psf/black push 2 years ago			`llm_predictor = LLMPredictor(`
			`llm=OpenAI(model_name="text-davinci-003", max_tokens=-1)`
			`)`
Add Paraphrase 2 years ago
enable /search alpha 2 years ago			`# Now we can search the index for a query:`
account for different token types 2 years ago			`embedding_model.last_token_usage = 0`
minor fixes 2 years ago
Format Python code with psf/black push 2 years ago			`response = await self.loop.run_in_executor(`
			`None,`
			`partial(`
			`index.query,`
			`query,`
			`verbose=True,`
			`embed_model=embedding_model,`
			`llm_predictor=llm_predictor,`
			`similarity_top_k=nodes or DEFAULT_SEARCH_NODES,`
			`text_qa_template=self.qaprompt,`
			`),`
Format Python code with psf/black push 2 years ago			`)`
minor fixes 2 years ago
Format Python code with psf/black push 2 years ago			`await self.usage_service.update_usage(llm_predictor.last_token_usage)`
account for different token types 2 years ago			`await self.usage_service.update_usage(`
			`embedding_model.last_token_usage, embeddings=True`
Format Python code with psf/black push 2 years ago			`)`
Add Paraphrase 2 years ago
enable /search alpha 2 years ago			`return response`