import asyncio
import os
import random
import re
import tempfile
import traceback
from functools import partial
import discord
from bs4 import BeautifulSoup
import aiohttp
from gpt_index import (
QuestionAnswerPrompt,
GPTSimpleVectorIndex,
BeautifulSoupWebReader,
Document,
PromptHelper,
LLMPredictor,
    OpenAIEmbedding,
    SimpleDirectoryReader,
)
from gpt_index.readers.web import DEFAULT_WEBSITE_EXTRACTOR
from langchain import OpenAI
from services.environment_service import EnvService, app_root_path
from services.usage_service import UsageService


class Search:
    def __init__(self, gpt_model, usage_service):
        self.model = gpt_model
        self.usage_service = usage_service
        self.google_search_api_key = EnvService.get_google_search_api_key()
        self.google_search_engine_id = EnvService.get_google_search_engine_id()
        self.loop = asyncio.get_running_loop()
        self.qaprompt = QuestionAnswerPrompt(
            "You are formulating the response to a search query given the search prompt and the context. Context information is below. The text '<|endofstatement|>' is used to separate chat entries and make it easier for you to understand the context\n"
            "---------------------\n"
            "{context_str}"
            "\n---------------------\n"
            "Never say '<|endofstatement|>'\n"
            "Given the context information and not prior knowledge, "
            "answer the question, say that you were unable to answer the question if there is not sufficient context to formulate a decisive answer. The search query was: {query_str}\n"
        )
        self.openai_key = os.getenv("OPENAI_TOKEN")
        self.EMBED_CUTOFF = 2000
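
    # Scrape a single web page into gpt_index Document objects. This is a synchronous
    # helper, so search() runs it in an executor to avoid blocking the event loop.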
    def index_webpage(self, url) -> list[Document]:
        documents = BeautifulSoupWebReader(
            website_extractor=DEFAULT_WEBSITE_EXTRACTOR
        ).load_data(urls=[url])
        return documents
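
    # Download a PDF to a temporary file and parse it with SimpleDirectoryReader so its
    # text can be indexed like any other crawled page.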
    async def index_pdf(self, url) -> list[Document]:
        # Download the PDF at the url and save it to a tempfile
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                if response.status == 200:
                    data = await response.read()
                    f = tempfile.NamedTemporaryFile(delete=False)
                    f.write(data)
                    f.close()
                else:
                    raise ValueError("An error occurred while downloading the PDF.")
        # Load the PDF from the temporary file path
        documents = SimpleDirectoryReader(input_files=[f.name]).load_data()
        print("Loaded the PDF document data")
        # Delete the temporary file
        os.remove(f.name)
        return documents
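
    # Query the Google Custom Search JSON API (keyed by the env-configured API key and
    # engine id) and return the top `search_scope` links plus the full result list.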
    async def get_links(self, query, search_scope=2):
        """Search the web for a query"""
        async with aiohttp.ClientSession() as session:
            async with session.get(
                "https://www.googleapis.com/customsearch/v1",
                params={
                    "key": self.google_search_api_key,
                    "cx": self.google_search_engine_id,
                    "q": query,
                },
            ) as response:
                if response.status == 200:
                    data = await response.json()
                    # Return the top `search_scope` links and the full list of links
                    return (
                        [item["link"] for item in data["items"][:search_scope]],
                        [item["link"] for item in data["items"]],
                    )
                else:
                    print(
                        "The Google Search API returned an error: "
                        + str(response.status)
                    )
                    return ["An error occurred while searching.", None]
    async def search(self, query, user_api_key, search_scope, nodes):
        DEFAULT_SEARCH_NODES = 1
        if not user_api_key:
            os.environ["OPENAI_API_KEY"] = self.openai_key
        else:
            os.environ["OPENAI_API_KEY"] = user_api_key

        # Get the links for the query
        links, all_links = await self.get_links(query, search_scope=search_scope)
        if all_links is None:
            raise ValueError("The Google Search API returned an error.")

        # For each link, crawl the page and get all the text that's not HTML garbage.
        # Concatenate all the text for a given website into one string and save it into an array:
        documents = []
        for link in links:
            # First, attempt a connection to the link with a timeout of 2 seconds; if the
            # request fails, don't continue to the document loading.
            pdf = False
            try:
                async with aiohttp.ClientSession() as session:
                    async with session.get(link, timeout=2) as response:
                        # Follow redirects
                        if response.status in [301, 302, 303, 307, 308]:
                            try:
                                print("Adding redirect")
                                links.append(response.url)
                                continue
                            except:
                                continue
                        # Add another entry to links from all_links if the link is not
                        # already in it, to compensate for the failed request
                        elif response.status not in [200, 202, 203, 204]:
                            for link2 in all_links:
                                if link2 not in links:
                                    print("Found a replacement link")
                                    links.append(link2)
                                    break
                            continue
                        else:
                            # Detect if the link is a PDF; if it is, we load it differently
                            if response.headers.get("Content-Type", "").startswith(
                                "application/pdf"
                            ):
                                print("Found a PDF at the link " + link)
                                pdf = True
            except:
                traceback.print_exc()
                try:
                    # Try to add a link from all_links, this is kind of messy.
                    for link2 in all_links:
                        if link2 not in links:
                            print("Found a replacement link")
                            links.append(link2)
                            break
                except:
                    pass
                continue
            try:
                if not pdf:
                    document = await self.loop.run_in_executor(
                        None, partial(self.index_webpage, link)
                    )
                else:
                    document = await self.index_pdf(link)
                documents.extend(document)
            except Exception:
                traceback.print_exc()

        embedding_model = OpenAIEmbedding()
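        # Build a vector index over everything that was crawled, constructing it in an
        # executor, and record the embedding token usage.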
        index = await self.loop.run_in_executor(
            None, partial(GPTSimpleVectorIndex, documents, embed_model=embedding_model)
        )
        await self.usage_service.update_usage(
            embedding_model.last_token_usage, embeddings=True
        )
        llm_predictor = LLMPredictor(
            llm=OpenAI(model_name="text-davinci-003", max_tokens=-1)
        )
        # Now we can search the index for a query:
        embedding_model.last_token_usage = 0
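        # Query the index in an executor with the custom QA prompt; `nodes` (or the default
        # of 1) controls how many of the most similar chunks are retrieved as context.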
        response = await self.loop.run_in_executor(
            None,
            partial(
                index.query, query, verbose=True, embed_model=embedding_model,
                llm_predictor=llm_predictor, similarity_top_k=nodes or DEFAULT_SEARCH_NODES,
                text_qa_template=self.qaprompt,
            ),
        )
        await self.usage_service.update_usage(llm_predictor.last_token_usage)
        await self.usage_service.update_usage(
            embedding_model.last_token_usage, embeddings=True
        )

        return response