From 9052bc2d8095f11b2b17ce50b00015243d7b4888 Mon Sep 17 00:00:00 2001 From: Rene Teigen Date: Sat, 28 Jan 2023 23:34:42 +0000 Subject: [PATCH 01/23] Added gpt-index commands --- cogs/commands.py | 37 ++++++++++++++++++++++++++ cogs/index_service_cog.py | 42 ++++++++++++++++++++++++++++++ gpt3discord.py | 8 ++++++ models/index_model.py | 55 +++++++++++++++++++++++++++++++++++++++ pyproject.toml | 1 + requirements.txt | 3 ++- 6 files changed, 145 insertions(+), 1 deletion(-) create mode 100644 cogs/index_service_cog.py create mode 100644 models/index_model.py diff --git a/cogs/commands.py b/cogs/commands.py index ef48595..241a7b2 100644 --- a/cogs/commands.py +++ b/cogs/commands.py @@ -26,6 +26,7 @@ class Commands(discord.Cog, name="Commands"): image_draw_cog, image_service_cog, moderations_cog, + index_cog, translations_cog=None, search_cog=None, ): @@ -39,6 +40,7 @@ class Commands(discord.Cog, name="Commands"): self.image_draw_cog = image_draw_cog self.image_service_cog = image_service_cog self.moderations_cog = moderations_cog + self.index_cog = index_cog self.translations_cog = translations_cog self.search_cog = search_cog @@ -67,6 +69,12 @@ class Commands(discord.Cog, name="Commands"): guild_ids=ALLOWED_GUILDS, checks=[Check.check_admin_roles()], ) + index = discord.SlashCommandGroup( + name="index", + description="gpt-index commands", + guild_ids=ALLOWED_GUILDS, + checks=[Check.check_gpt_roles()], + ) # # System commands @@ -489,6 +497,35 @@ class Commands(discord.Cog, name="Commands"): async def end(self, ctx: discord.ApplicationContext): await self.converser_cog.end_command(ctx) + # + # Index commands + # + + @add_to_group("index") + @discord.slash_command( + name="set", + description="Set an index to query from", + guild_ids=ALLOWED_GUILDS + ) + @discord.guild_only() + @discord.option(name="file", description="A file to create the index from", required=True, input_type=discord.Attachment) + async def set(self, ctx:discord.ApplicationContext, file: 
discord.Attachment): + await self.index_cog.set_index_command(ctx, file) + + + @add_to_group("index") + @discord.slash_command( + name="query", + description="Query from your index", + guild_ids=ALLOWED_GUILDS + ) + @discord.guild_only() + @discord.option(name="query", description="What to query the index", required=True) + async def query(self, ctx:discord.ApplicationContext, query: str): + await self.index_cog.query_command(ctx, query) + + + # # DALLE commands # diff --git a/cogs/index_service_cog.py b/cogs/index_service_cog.py new file mode 100644 index 0000000..f3dada1 --- /dev/null +++ b/cogs/index_service_cog.py @@ -0,0 +1,42 @@ +import discord + +from services.environment_service import EnvService +from services.text_service import TextService +from models.index_model import Index_handler + +USER_INPUT_API_KEYS = EnvService.get_user_input_api_keys() +USER_KEY_DB = EnvService.get_api_db() + +class IndexService(discord.Cog, name="IndexService"): + """Cog containing gpt-index commands""" + def __init__( + self, + bot, + ): + super().__init__() + self.bot = bot + self.index_handler = Index_handler() + + async def set_index_command(self, ctx, file: discord.Attachment): + """Command handler to set a file as your personal index""" + + user_api_key = None + if USER_INPUT_API_KEYS: + user_api_key = await TextService.get_user_api_key(ctx.user.id, ctx, USER_KEY_DB) + if not user_api_key: + return + + await ctx.defer(ephemeral=True) + await self.index_handler.set_index(ctx, file, user_api_key=user_api_key) + + + async def query_command(self, ctx, query): + """Command handler to query your index""" + user_api_key = None + if USER_INPUT_API_KEYS: + user_api_key = await TextService.get_user_api_key(ctx.user.id, ctx, USER_KEY_DB) + if not user_api_key: + return + + await ctx.defer() + await self.index_handler.query(ctx, query, user_api_key=user_api_key) diff --git a/gpt3discord.py b/gpt3discord.py index 77356d9..38034a8 100644 --- a/gpt3discord.py +++ b/gpt3discord.py @@ 
-18,6 +18,7 @@ from cogs.prompt_optimizer_cog import ImgPromptOptimizer from cogs.moderations_service_cog import ModerationsService from cogs.commands import Commands from cogs.translation_service_cog import TranslationService +from cogs.index_service_cog import IndexService from models.deepl_model import TranslationModel from services.health_service import HealthService @@ -169,6 +170,12 @@ async def main(): ) ) + bot.add_cog( + IndexService( + bot + ) + ) + if EnvService.get_deepl_token(): bot.add_cog(TranslationService(bot, TranslationModel())) print("The translation service is enabled.") @@ -191,6 +198,7 @@ async def main(): bot.get_cog("DrawDallEService"), bot.get_cog("ImgPromptOptimizer"), bot.get_cog("ModerationsService"), + bot.get_cog("IndexService"), bot.get_cog("TranslationService"), bot.get_cog("SearchService"), ) diff --git a/models/index_model.py b/models/index_model.py new file mode 100644 index 0000000..5577e4a --- /dev/null +++ b/models/index_model.py @@ -0,0 +1,55 @@ +import os +import traceback +import asyncio +import tempfile +from functools import partial +import discord + +from gpt_index import GPTSimpleVectorIndex, SimpleDirectoryReader + + + +class Index_handler: + def __init__(self): + self.openai_key = os.getenv("OPENAI_TOKEN") + self.index_storage = {} + + def index_file(self, file): + document = SimpleDirectoryReader(file).load_data() + index = GPTSimpleVectorIndex(document) + return index + + + async def set_index(self, ctx: discord.ApplicationContext, file: discord.Attachment, user_api_key): + loop = asyncio.get_running_loop() + + if not user_api_key: + os.environ["OPENAI_API_KEY"] = self.openai_key + else: + os.environ["OPENAI_API_KEY"] = user_api_key + + try: + temp_path = tempfile.TemporaryDirectory() + temp_file = tempfile.NamedTemporaryFile(suffix=".txt", dir=temp_path.name, delete=False) + await file.save(temp_file.name) + index = await loop.run_in_executor(None, partial(self.index_file, temp_path.name)) + 
self.index_storage[ctx.user.id] = index + temp_path.cleanup() + await ctx.respond("Index set") + except Exception: + await ctx.respond("Failed to set index") + traceback.print_exc() + + async def query(self, ctx: discord.ApplicationContext, query, user_api_key): + if not user_api_key: + os.environ["OPENAI_API_KEY"] = self.openai_key + else: + os.environ["OPENAI_API_KEY"] = user_api_key + + if not self.index_storage[ctx.user.id]: + await ctx.respond("You need to set an index", ephemeral=True, delete_after=5) + return + + index: GPTSimpleVectorIndex = self.index_storage[ctx.user.id] + response = index.query(query, verbose=True) + await ctx.respond(f"Query response: {response}") \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index a85a823..9213da9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,6 +31,7 @@ dependencies = [ "flask", "flask", "beautifulsoup4", + "gpt-index", ] dynamic = ["version"] [project.scripts] diff --git a/requirements.txt b/requirements.txt index 6a7339c..03aea68 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,4 +9,5 @@ pinecone-client==2.1.0 sqlitedict==2.1.0 backoff==2.2.1 flask==2.2.2 -beautifulsoup4==4.11.1 \ No newline at end of file +beautifulsoup4==4.11.1 +gpt-index==0.2.16 \ No newline at end of file From 0555cf8b332a154592892ce4892deb787ffed8a4 Mon Sep 17 00:00:00 2001 From: Rene Teigen Date: Sat, 28 Jan 2023 23:53:24 +0000 Subject: [PATCH 02/23] Some async changes --- models/index_model.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/models/index_model.py b/models/index_model.py index 5577e4a..e53909d 100644 --- a/models/index_model.py +++ b/models/index_model.py @@ -13,16 +13,15 @@ class Index_handler: def __init__(self): self.openai_key = os.getenv("OPENAI_TOKEN") self.index_storage = {} + self.loop = asyncio.get_running_loop() - def index_file(self, file): - document = SimpleDirectoryReader(file).load_data() + def index_file(self, file_path): + document = 
SimpleDirectoryReader(file_path).load_data() index = GPTSimpleVectorIndex(document) return index async def set_index(self, ctx: discord.ApplicationContext, file: discord.Attachment, user_api_key): - loop = asyncio.get_running_loop() - if not user_api_key: os.environ["OPENAI_API_KEY"] = self.openai_key else: @@ -32,7 +31,7 @@ class Index_handler: temp_path = tempfile.TemporaryDirectory() temp_file = tempfile.NamedTemporaryFile(suffix=".txt", dir=temp_path.name, delete=False) await file.save(temp_file.name) - index = await loop.run_in_executor(None, partial(self.index_file, temp_path.name)) + index = await self.loop.run_in_executor(None, partial(self.index_file, temp_path.name)) self.index_storage[ctx.user.id] = index temp_path.cleanup() await ctx.respond("Index set") @@ -51,5 +50,5 @@ class Index_handler: return index: GPTSimpleVectorIndex = self.index_storage[ctx.user.id] - response = index.query(query, verbose=True) + response = await self.loop.run_in_executor(None, partial(index.query, query, verbose=True)) await ctx.respond(f"Query response: {response}") \ No newline at end of file From 64ce627bb7e2c34c9e65c30b1c72014b11c58b66 Mon Sep 17 00:00:00 2001 From: Rene Teigen Date: Mon, 30 Jan 2023 20:03:05 +0000 Subject: [PATCH 03/23] Update filetypes --- models/index_model.py | 9 ++++++++- pyproject.toml | 1 + requirements.txt | 3 ++- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/models/index_model.py b/models/index_model.py index e53909d..2d6d2af 100644 --- a/models/index_model.py +++ b/models/index_model.py @@ -29,7 +29,14 @@ class Index_handler: try: temp_path = tempfile.TemporaryDirectory() - temp_file = tempfile.NamedTemporaryFile(suffix=".txt", dir=temp_path.name, delete=False) + if file.content_type.startswith("text/plain"): + suffix = ".txt" + elif file.content_type.startswith("application/pdf"): + suffix = ".pdf" + else: + await ctx.respond("Only accepts txt or pdf files") + return + temp_file = tempfile.NamedTemporaryFile(suffix=suffix, 
dir=temp_path.name, delete=False) await file.save(temp_file.name) index = await self.loop.run_in_executor(None, partial(self.index_file, temp_path.name)) self.index_storage[ctx.user.id] = index diff --git a/pyproject.toml b/pyproject.toml index 9213da9..a9a20b0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,6 +32,7 @@ dependencies = [ "flask", "beautifulsoup4", "gpt-index", + "PyPDF2", ] dynamic = ["version"] [project.scripts] diff --git a/requirements.txt b/requirements.txt index 03aea68..91c86b1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,4 +10,5 @@ sqlitedict==2.1.0 backoff==2.2.1 flask==2.2.2 beautifulsoup4==4.11.1 -gpt-index==0.2.16 \ No newline at end of file +gpt-index==0.2.16 +PyPDF2==3.0.1 \ No newline at end of file From 37a20a7e37686205e8f9a0835c5ae1f2c966b0e6 Mon Sep 17 00:00:00 2001 From: Rene Teigen Date: Tue, 31 Jan 2023 13:01:35 +0000 Subject: [PATCH 04/23] Added discord channel and whole server indexing A channel index does 1000 messages Whole server index does 300 messages per channel Might need to add more restrictions since all the index commands are expensive --- cogs/commands.py | 17 ++++- cogs/index_service_cog.py | 18 ++++- models/index_model.py | 150 ++++++++++++++++++++++++++++++++++++-- 3 files changed, 176 insertions(+), 9 deletions(-) diff --git a/cogs/commands.py b/cogs/commands.py index 241a7b2..aae8354 100644 --- a/cogs/commands.py +++ b/cogs/commands.py @@ -503,15 +503,26 @@ class Commands(discord.Cog, name="Commands"): @add_to_group("index") @discord.slash_command( - name="set", + name="set_file", description="Set an index to query from", guild_ids=ALLOWED_GUILDS ) @discord.guild_only() - @discord.option(name="file", description="A file to create the index from", required=True, input_type=discord.Attachment) - async def set(self, ctx:discord.ApplicationContext, file: discord.Attachment): + @discord.option(name="file", description="A file to create the index from", required=True, 
input_type=discord.SlashCommandOptionType.attachment) + async def set_file(self, ctx:discord.ApplicationContext, file: discord.Attachment): await self.index_cog.set_index_command(ctx, file) + @add_to_group("index") + @discord.slash_command( + name="set_discord", + description="Set a index from a discord channel", + guild_ids=ALLOWED_GUILDS + ) + @discord.guild_only() + @discord.option(name="channel", description="A channel to create the index from", required=False, input_type=discord.SlashCommandOptionType.channel) + async def set_discord(self, ctx:discord.ApplicationContext, channel: discord.TextChannel): + await self.index_cog.set_discord_command(ctx, channel) + @add_to_group("index") @discord.slash_command( diff --git a/cogs/index_service_cog.py b/cogs/index_service_cog.py index f3dada1..3e33582 100644 --- a/cogs/index_service_cog.py +++ b/cogs/index_service_cog.py @@ -27,7 +27,23 @@ class IndexService(discord.Cog, name="IndexService"): return await ctx.defer(ephemeral=True) - await self.index_handler.set_index(ctx, file, user_api_key=user_api_key) + await self.index_handler.set_file_index(ctx, file, user_api_key=user_api_key) + + + async def set_discord_command(self, ctx, channel: discord.TextChannel = None): + """Command handler to set a channel as your personal index""" + + user_api_key = None + if USER_INPUT_API_KEYS: + user_api_key = await TextService.get_user_api_key(ctx.user.id, ctx, USER_KEY_DB) + if not user_api_key: + return + + await ctx.defer(ephemeral=True) + if not channel: + await self.index_handler.set_discord_index(ctx, channel, user_api_key=user_api_key, no_channel=True) + return + await self.index_handler.set_discord_index(ctx, channel, user_api_key=user_api_key) async def query_command(self, ctx, query): diff --git a/models/index_model.py b/models/index_model.py index 2d6d2af..5c49781 100644 --- a/models/index_model.py +++ b/models/index_model.py @@ -2,8 +2,14 @@ import os import traceback import asyncio import tempfile -from functools import 
partial import discord +from functools import partial +from typing import List, Optional + + +from gpt_index.readers.base import BaseReader +from gpt_index.readers.schema.base import Document +from gpt_index.response.schema import Response from gpt_index import GPTSimpleVectorIndex, SimpleDirectoryReader @@ -19,9 +25,12 @@ class Index_handler: document = SimpleDirectoryReader(file_path).load_data() index = GPTSimpleVectorIndex(document) return index + def index_discord(self, document): + index = GPTSimpleVectorIndex(document) + return index - async def set_index(self, ctx: discord.ApplicationContext, file: discord.Attachment, user_api_key): + async def set_file_index(self, ctx: discord.ApplicationContext, file: discord.Attachment, user_api_key): if not user_api_key: os.environ["OPENAI_API_KEY"] = self.openai_key else: @@ -46,7 +55,32 @@ class Index_handler: await ctx.respond("Failed to set index") traceback.print_exc() - async def query(self, ctx: discord.ApplicationContext, query, user_api_key): + + async def set_discord_index(self, ctx: discord.ApplicationContext, channel: discord.TextChannel, user_api_key, no_channel=False): + if not user_api_key: + os.environ["OPENAI_API_KEY"] = self.openai_key + else: + os.environ["OPENAI_API_KEY"] = user_api_key + + try: + reader = DiscordReader() + if no_channel: + channel_ids:List[int] = [] + for c in ctx.guild.text_channels: + channel_ids.append(c.id) + document = await reader.load_data(channel_ids=channel_ids, limit=300, oldest_first=False) + else: + document = await reader.load_data(channel_ids=[channel.id], limit=1000, oldest_first=False) + index = await self.loop.run_in_executor(None, partial(self.index_discord, document)) + self.index_storage[ctx.user.id] = index + await ctx.respond("Index set") + except Exception: + await ctx.respond("Failed to set index") + traceback.print_exc() + + + + async def query(self, ctx: discord.ApplicationContext, query:str, user_api_key): if not user_api_key: os.environ["OPENAI_API_KEY"] 
= self.openai_key else: @@ -57,5 +91,111 @@ class Index_handler: return index: GPTSimpleVectorIndex = self.index_storage[ctx.user.id] - response = await self.loop.run_in_executor(None, partial(index.query, query, verbose=True)) - await ctx.respond(f"Query response: {response}") \ No newline at end of file + try: + response: Response = await self.loop.run_in_executor(None, partial(index.query, query, verbose=True)) + except Exception: + ctx.respond("You haven't set and index", delete_after=5) + await ctx.respond(f"**Query:**\n\n{query.strip()}\n\n**Query response:**\n\n{response.response.strip()}") + + +#Set our own version of the DiscordReader class that's async + +class DiscordReader(BaseReader): + """Discord reader. + + Reads conversations from channels. + + Args: + discord_token (Optional[str]): Discord token. If not provided, we + assume the environment variable `DISCORD_TOKEN` is set. + + """ + + def __init__(self, discord_token: Optional[str] = None) -> None: + """Initialize with parameters.""" + if discord_token is None: + discord_token = os.environ["DISCORD_TOKEN"] + if discord_token is None: + raise ValueError( + "Must specify `discord_token` or set environment " + "variable `DISCORD_TOKEN`." + ) + + self.discord_token = discord_token + + async def read_channel(self, channel_id: int, limit: Optional[int], oldest_first: bool) -> str: + """Async read channel.""" + + messages: List[discord.Message] = [] + + class CustomClient(discord.Client): + async def on_ready(self) -> None: + try: + channel = client.get_channel(channel_id) + print(f"Added {channel.name} from {channel.guild.name}") + # only work for text channels for now + if not isinstance(channel, discord.TextChannel): + raise ValueError( + f"Channel {channel_id} is not a text channel. " + "Only text channels are supported for now." 
+ ) + # thread_dict maps thread_id to thread + thread_dict = {} + for thread in channel.threads: + thread_dict[thread.id] = thread + + async for msg in channel.history( + limit=limit, oldest_first=oldest_first + ): + messages.append(msg) + if msg.id in thread_dict: + thread = thread_dict[msg.id] + async for thread_msg in thread.history( + limit=limit, oldest_first=oldest_first + ): + messages.append(thread_msg) + except Exception as e: + print("Encountered error: " + str(e)) + finally: + await self.close() + + intents = discord.Intents.default() + intents.message_content = True + client = CustomClient(intents=intents) + await client.start(self.discord_token) + + msg_txt_list = [f"{m.author.display_name}: {m.content}" for m in messages] + channel = client.get_channel(channel_id) + + return ("\n\n".join(msg_txt_list), channel.name) + + async def load_data( + self, + channel_ids: List[int], + limit: Optional[int] = None, + oldest_first: bool = True, + ) -> List[Document]: + """Load data from the input directory. + + Args: + channel_ids (List[int]): List of channel ids to read. + limit (Optional[int]): Maximum number of messages to read. + oldest_first (bool): Whether to read oldest messages first. + Defaults to `True`. + + Returns: + List[Document]: List of documents. + + """ + results: List[Document] = [] + for channel_id in channel_ids: + if not isinstance(channel_id, int): + raise ValueError( + f"Channel id {channel_id} must be an integer, " + f"not {type(channel_id)}." 
+ ) + (channel_content, channel_name) = await self.read_channel(channel_id, limit=limit, oldest_first=oldest_first) + results.append( + Document(channel_content, extra_info={"channel_id": channel_id, "channel_name": channel_name}) + ) + return results \ No newline at end of file From dd9cb0ce4ccdc3252bdaecb671c11f784abea53e Mon Sep 17 00:00:00 2001 From: Rene Teigen Date: Tue, 31 Jan 2023 16:00:12 +0000 Subject: [PATCH 05/23] Add option for response mode on queries --- cogs/commands.py | 13 ++++++++++--- cogs/index_service_cog.py | 4 ++-- models/index_model.py | 33 ++++++++++++++++----------------- 3 files changed, 28 insertions(+), 22 deletions(-) diff --git a/cogs/commands.py b/cogs/commands.py index aae8354..1075560 100644 --- a/cogs/commands.py +++ b/cogs/commands.py @@ -532,9 +532,16 @@ class Commands(discord.Cog, name="Commands"): ) @discord.guild_only() @discord.option(name="query", description="What to query the index", required=True) - async def query(self, ctx:discord.ApplicationContext, query: str): - await self.index_cog.query_command(ctx, query) - + @discord.option( + name="response_mode", + description="Response mode", + guild_ids=ALLOWED_GUILDS, + required=False, + default="default", + choices=["default", "compact", "tree_summarize"] + ) + async def query(self, ctx:discord.ApplicationContext, query: str, response_mode: str): + await self.index_cog.query_command(ctx, query, response_mode) # diff --git a/cogs/index_service_cog.py b/cogs/index_service_cog.py index 3e33582..41515c5 100644 --- a/cogs/index_service_cog.py +++ b/cogs/index_service_cog.py @@ -46,7 +46,7 @@ class IndexService(discord.Cog, name="IndexService"): await self.index_handler.set_discord_index(ctx, channel, user_api_key=user_api_key) - async def query_command(self, ctx, query): + async def query_command(self, ctx, query, response_mode): """Command handler to query your index""" user_api_key = None if USER_INPUT_API_KEYS: @@ -55,4 +55,4 @@ class IndexService(discord.Cog, 
name="IndexService"): return await ctx.defer() - await self.index_handler.query(ctx, query, user_api_key=user_api_key) + await self.index_handler.query(ctx, query, response_mode, user_api_key) diff --git a/models/index_model.py b/models/index_model.py index 5c49781..bcd0fec 100644 --- a/models/index_model.py +++ b/models/index_model.py @@ -80,22 +80,18 @@ class Index_handler: - async def query(self, ctx: discord.ApplicationContext, query:str, user_api_key): + async def query(self, ctx: discord.ApplicationContext, query:str, response_mode, user_api_key): if not user_api_key: os.environ["OPENAI_API_KEY"] = self.openai_key else: os.environ["OPENAI_API_KEY"] = user_api_key - if not self.index_storage[ctx.user.id]: - await ctx.respond("You need to set an index", ephemeral=True, delete_after=5) - return - - index: GPTSimpleVectorIndex = self.index_storage[ctx.user.id] try: - response: Response = await self.loop.run_in_executor(None, partial(index.query, query, verbose=True)) + index: GPTSimpleVectorIndex = self.index_storage[ctx.user.id] + response: Response = await self.loop.run_in_executor(None, partial(index.query, query, verbose=True, response_mode=response_mode)) + await ctx.respond(f"**Query:**\n\n{query.strip()}\n\n**Query response:**\n\n{response.response.strip()}") except Exception: - ctx.respond("You haven't set and index", delete_after=5) - await ctx.respond(f"**Query:**\n\n{query.strip()}\n\n**Query response:**\n\n{response.response.strip()}") + await ctx.respond("You haven't set and index", delete_after=10) #Set our own version of the DiscordReader class that's async @@ -147,13 +143,16 @@ class DiscordReader(BaseReader): async for msg in channel.history( limit=limit, oldest_first=oldest_first ): - messages.append(msg) - if msg.id in thread_dict: - thread = thread_dict[msg.id] - async for thread_msg in thread.history( - limit=limit, oldest_first=oldest_first - ): - messages.append(thread_msg) + if msg.author.bot: + pass + else: + messages.append(msg) + if 
msg.id in thread_dict: + thread = thread_dict[msg.id] + async for thread_msg in thread.history( + limit=limit, oldest_first=oldest_first + ): + messages.append(thread_msg) except Exception as e: print("Encountered error: " + str(e)) finally: @@ -164,8 +163,8 @@ class DiscordReader(BaseReader): client = CustomClient(intents=intents) await client.start(self.discord_token) - msg_txt_list = [f"{m.author.display_name}: {m.content}" for m in messages] channel = client.get_channel(channel_id) + msg_txt_list = [f"user:{m.author.display_name}, content:{m.content}" for m in messages] return ("\n\n".join(msg_txt_list), channel.name) From c6a6245e00bbfca18bdd0f69f1c1330612f2ec8a Mon Sep 17 00:00:00 2001 From: Rene Teigen Date: Tue, 31 Jan 2023 20:32:23 +0000 Subject: [PATCH 06/23] Added server index backup and loading --- .gitignore | 3 +- cogs/commands.py | 23 +++++ cogs/index_service_cog.py | 29 +++++- models/autocomplete_model.py | 13 +++ models/index_model.py | 185 +++++++++++++++++++---------------- 5 files changed, 161 insertions(+), 92 deletions(-) diff --git a/.gitignore b/.gitignore index a95b25f..3411cda 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,5 @@ __pycache__ *.sqlite bot.pid usage.txt -/dalleimages \ No newline at end of file +/dalleimages +/indexes \ No newline at end of file diff --git a/cogs/commands.py b/cogs/commands.py index 1075560..93af420 100644 --- a/cogs/commands.py +++ b/cogs/commands.py @@ -501,6 +501,18 @@ class Commands(discord.Cog, name="Commands"): # Index commands # + @add_to_group("index") + @discord.slash_command( + name="load_file", + description="Set an index to query from", + guild_ids=ALLOWED_GUILDS + ) + @discord.guild_only() + @discord.option(name="index", description="Which file to load the index from", required=True, autocomplete=File_autocompleter.get_indexes) + async def load_index(self, ctx:discord.ApplicationContext, index: str): + await self.index_cog.load_index_command(ctx, index) + + @add_to_group("index") 
@discord.slash_command( name="set_file", @@ -524,6 +536,17 @@ class Commands(discord.Cog, name="Commands"): await self.index_cog.set_discord_command(ctx, channel) + @add_to_group("index") + @discord.slash_command( + name="discord_backup", + description="Save an index made from the whole server", + guild_ids=ALLOWED_GUILDS + ) + @discord.guild_only() + async def discord_backup(self, ctx:discord.ApplicationContext): + await self.index_cog.discord_backup_command(ctx) + + @add_to_group("index") @discord.slash_command( name="query", diff --git a/cogs/index_service_cog.py b/cogs/index_service_cog.py index 41515c5..f57cb6a 100644 --- a/cogs/index_service_cog.py +++ b/cogs/index_service_cog.py @@ -15,7 +15,7 @@ class IndexService(discord.Cog, name="IndexService"): ): super().__init__() self.bot = bot - self.index_handler = Index_handler() + self.index_handler = Index_handler(bot) async def set_index_command(self, ctx, file: discord.Attachment): """Command handler to set a file as your personal index""" @@ -40,11 +40,32 @@ class IndexService(discord.Cog, name="IndexService"): return await ctx.defer(ephemeral=True) - if not channel: - await self.index_handler.set_discord_index(ctx, channel, user_api_key=user_api_key, no_channel=True) - return await self.index_handler.set_discord_index(ctx, channel, user_api_key=user_api_key) + async def discord_backup_command(self, ctx): + """Command handler to backup the entire server""" + + user_api_key = None + if USER_INPUT_API_KEYS: + user_api_key = await TextService.get_user_api_key(ctx.user.id, ctx, USER_KEY_DB) + if not user_api_key: + return + + await ctx.defer(ephemeral=True) + await self.index_handler.backup_discord(ctx, user_api_key=user_api_key) + + + async def load_index_command(self, ctx, index): + """Command handler to backup the entire server""" + user_api_key = None + if USER_INPUT_API_KEYS: + user_api_key = await TextService.get_user_api_key(ctx.user.id, ctx, USER_KEY_DB) + if not user_api_key: + return + + await 
ctx.defer(ephemeral=True) + await self.index_handler.load_index(ctx, index, user_api_key) + async def query_command(self, ctx, query, response_mode): """Command handler to query your index""" diff --git a/models/autocomplete_model.py b/models/autocomplete_model.py index 8525cb8..682a13b 100644 --- a/models/autocomplete_model.py +++ b/models/autocomplete_model.py @@ -141,3 +141,16 @@ class File_autocompleter: ] # returns the 25 first files from your current input except Exception: return ["No 'openers' folder"] + + async def get_indexes(ctx: discord.AutocompleteContext): + """get all files in the openers folder""" + try: + return [ + file + for file in os.listdir(EnvService.find_shared_file("indexes")) + if file.startswith(ctx.value.lower()) + ][ + :25 + ] # returns the 25 first files from your current input + except Exception: + return ["No 'indexes' folder"] diff --git a/models/index_model.py b/models/index_model.py index bcd0fec..86afe44 100644 --- a/models/index_model.py +++ b/models/index_model.py @@ -1,30 +1,44 @@ import os import traceback import asyncio -import tempfile import discord +import aiofiles from functools import partial from typing import List, Optional +from pathlib import Path +from datetime import date, datetime - -from gpt_index.readers.base import BaseReader from gpt_index.readers.schema.base import Document from gpt_index.response.schema import Response +from gpt_index import GPTSimpleVectorIndex, SimpleDirectoryReader, QuestionAnswerPrompt, GPTPineconeIndex -from gpt_index import GPTSimpleVectorIndex, SimpleDirectoryReader +from services.environment_service import EnvService, app_root_path class Index_handler: - def __init__(self): + def __init__(self, bot): + self.bot = bot self.openai_key = os.getenv("OPENAI_TOKEN") self.index_storage = {} self.loop = asyncio.get_running_loop() + self.qaprompt = QuestionAnswerPrompt( + "Context information is below. 
The text '<|endofstatement|>' is used to separate chat entries and make it easier for you to understand the context\n" + "---------------------\n" + "{context_str}" + "\n---------------------\n" + "Never say '<|endofstatement|>'\n" + "Given the context information and not prior knowledge, " + "answer the question: {query_str}\n" + ) def index_file(self, file_path): document = SimpleDirectoryReader(file_path).load_data() index = GPTSimpleVectorIndex(document) return index + def index_load_file(self, file_path): + index = GPTSimpleVectorIndex.load_from_disk(file_path) + return index def index_discord(self, document): index = GPTSimpleVectorIndex(document) return index @@ -37,7 +51,6 @@ class Index_handler: os.environ["OPENAI_API_KEY"] = user_api_key try: - temp_path = tempfile.TemporaryDirectory() if file.content_type.startswith("text/plain"): suffix = ".txt" elif file.content_type.startswith("application/pdf"): @@ -45,8 +58,9 @@ class Index_handler: else: await ctx.respond("Only accepts txt or pdf files") return - temp_file = tempfile.NamedTemporaryFile(suffix=suffix, dir=temp_path.name, delete=False) - await file.save(temp_file.name) + async with aiofiles.tempfile.TemporaryDirectory() as temp_path: + async with aiofiles.tempfile.NamedTemporaryFile(suffix=suffix, dir=temp_path.name, delete=False) as temp_file: + await file.save(temp_file.name) index = await self.loop.run_in_executor(None, partial(self.index_file, temp_path.name)) self.index_storage[ctx.user.id] = index temp_path.cleanup() @@ -56,21 +70,14 @@ class Index_handler: traceback.print_exc() - async def set_discord_index(self, ctx: discord.ApplicationContext, channel: discord.TextChannel, user_api_key, no_channel=False): + async def set_discord_index(self, ctx: discord.ApplicationContext, channel: discord.TextChannel, user_api_key): if not user_api_key: os.environ["OPENAI_API_KEY"] = self.openai_key else: os.environ["OPENAI_API_KEY"] = user_api_key try: - reader = DiscordReader() - if no_channel: - 
channel_ids:List[int] = [] - for c in ctx.guild.text_channels: - channel_ids.append(c.id) - document = await reader.load_data(channel_ids=channel_ids, limit=300, oldest_first=False) - else: - document = await reader.load_data(channel_ids=[channel.id], limit=1000, oldest_first=False) + document = await self.load_data(channel_ids=[channel.id], limit=1000, oldest_first=False) index = await self.loop.run_in_executor(None, partial(self.index_discord, document)) self.index_storage[ctx.user.id] = index await ctx.respond("Index set") @@ -78,6 +85,42 @@ class Index_handler: await ctx.respond("Failed to set index") traceback.print_exc() + + async def load_index(self, ctx:discord.ApplicationContext, index, user_api_key): + if not user_api_key: + os.environ["OPENAI_API_KEY"] = self.openai_key + else: + os.environ["OPENAI_API_KEY"] = user_api_key + + try: + index_file = EnvService.find_shared_file(f"indexes/{index}") + index = await self.loop.run_in_executor(None, partial(self.index_load_file, index_file)) + self.index_storage[ctx.user.id] = index + await ctx.respond("Loaded index") + except Exception as e: + await ctx.respond(e) + + + async def backup_discord(self, ctx: discord.ApplicationContext, user_api_key): + if not user_api_key: + os.environ["OPENAI_API_KEY"] = self.openai_key + else: + os.environ["OPENAI_API_KEY"] = user_api_key + + try: + channel_ids:List[int] = [] + for c in ctx.guild.text_channels: + channel_ids.append(c.id) + document = await self.load_data(channel_ids=channel_ids, limit=1000, oldest_first=False) + index = await self.loop.run_in_executor(None, partial(self.index_discord, document)) + Path(app_root_path() / "indexes").mkdir(parents = True, exist_ok=True) + index.save_to_disk(app_root_path() / "indexes" / f"{ctx.guild.name.replace(' ', '-')}_{date.today()}-H{datetime.now().hour}.json") + + await ctx.respond("Backup saved") + except Exception: + await ctx.respond("Failed to save backup") + traceback.print_exc() + async def query(self, ctx: 
discord.ApplicationContext, query:str, response_mode, user_api_key): @@ -85,88 +128,56 @@ class Index_handler: os.environ["OPENAI_API_KEY"] = self.openai_key else: os.environ["OPENAI_API_KEY"] = user_api_key - + try: index: GPTSimpleVectorIndex = self.index_storage[ctx.user.id] - response: Response = await self.loop.run_in_executor(None, partial(index.query, query, verbose=True, response_mode=response_mode)) + response: Response = await self.loop.run_in_executor(None, partial(index.query, query, verbose=True, response_mode=response_mode, text_qa_template=self.qaprompt)) await ctx.respond(f"**Query:**\n\n{query.strip()}\n\n**Query response:**\n\n{response.response.strip()}") except Exception: - await ctx.respond("You haven't set and index", delete_after=10) - - -#Set our own version of the DiscordReader class that's async - -class DiscordReader(BaseReader): - """Discord reader. - - Reads conversations from channels. + await ctx.respond("Failed to send query", delete_after=10) - Args: - discord_token (Optional[str]): Discord token. If not provided, we - assume the environment variable `DISCORD_TOKEN` is set. - - """ - - def __init__(self, discord_token: Optional[str] = None) -> None: - """Initialize with parameters.""" - if discord_token is None: - discord_token = os.environ["DISCORD_TOKEN"] - if discord_token is None: - raise ValueError( - "Must specify `discord_token` or set environment " - "variable `DISCORD_TOKEN`." 
- ) - - self.discord_token = discord_token + # Extracted functions from DiscordReader async def read_channel(self, channel_id: int, limit: Optional[int], oldest_first: bool) -> str: """Async read channel.""" messages: List[discord.Message] = [] - class CustomClient(discord.Client): - async def on_ready(self) -> None: - try: - channel = client.get_channel(channel_id) - print(f"Added {channel.name} from {channel.guild.name}") - # only work for text channels for now - if not isinstance(channel, discord.TextChannel): - raise ValueError( - f"Channel {channel_id} is not a text channel. " - "Only text channels are supported for now." - ) - # thread_dict maps thread_id to thread - thread_dict = {} - for thread in channel.threads: - thread_dict[thread.id] = thread - - async for msg in channel.history( - limit=limit, oldest_first=oldest_first - ): - if msg.author.bot: - pass - else: - messages.append(msg) - if msg.id in thread_dict: - thread = thread_dict[msg.id] - async for thread_msg in thread.history( - limit=limit, oldest_first=oldest_first - ): - messages.append(thread_msg) - except Exception as e: - print("Encountered error: " + str(e)) - finally: - await self.close() - - intents = discord.Intents.default() - intents.message_content = True - client = CustomClient(intents=intents) - await client.start(self.discord_token) - - channel = client.get_channel(channel_id) + + try: + channel = self.bot.get_channel(channel_id) + print(f"Added {channel.name} from {channel.guild.name}") + # only work for text channels for now + if not isinstance(channel, discord.TextChannel): + raise ValueError( + f"Channel {channel_id} is not a text channel. " + "Only text channels are supported for now." 
+ ) + # thread_dict maps thread_id to thread + thread_dict = {} + for thread in channel.threads: + thread_dict[thread.id] = thread + + async for msg in channel.history( + limit=limit, oldest_first=oldest_first + ): + if msg.author.bot: + pass + else: + messages.append(msg) + if msg.id in thread_dict: + thread = thread_dict[msg.id] + async for thread_msg in thread.history( + limit=limit, oldest_first=oldest_first + ): + messages.append(thread_msg) + except Exception as e: + print("Encountered error: " + str(e)) + + channel = self.bot.get_channel(channel_id) msg_txt_list = [f"user:{m.author.display_name}, content:{m.content}" for m in messages] - return ("\n\n".join(msg_txt_list), channel.name) + return ("<|endofstatement|>\n\n".join(msg_txt_list), channel.name) async def load_data( self, @@ -195,6 +206,6 @@ class DiscordReader(BaseReader): ) (channel_content, channel_name) = await self.read_channel(channel_id, limit=limit, oldest_first=oldest_first) results.append( - Document(channel_content, extra_info={"channel_id": channel_id, "channel_name": channel_name}) + Document(channel_content, extra_info={"channel_name": channel_name}) ) return results \ No newline at end of file From 842c7241a34ad8587bbacf358ef3890a96a73e53 Mon Sep 17 00:00:00 2001 From: Rene Teigen Date: Tue, 31 Jan 2023 21:08:04 +0000 Subject: [PATCH 07/23] Removed some imports --- models/index_model.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/models/index_model.py b/models/index_model.py index 86afe44..dff686b 100644 --- a/models/index_model.py +++ b/models/index_model.py @@ -9,8 +9,7 @@ from pathlib import Path from datetime import date, datetime from gpt_index.readers.schema.base import Document -from gpt_index.response.schema import Response -from gpt_index import GPTSimpleVectorIndex, SimpleDirectoryReader, QuestionAnswerPrompt, GPTPineconeIndex +from gpt_index import GPTSimpleVectorIndex, SimpleDirectoryReader, QuestionAnswerPrompt from services.environment_service 
import EnvService, app_root_path @@ -61,7 +60,7 @@ class Index_handler: async with aiofiles.tempfile.TemporaryDirectory() as temp_path: async with aiofiles.tempfile.NamedTemporaryFile(suffix=suffix, dir=temp_path.name, delete=False) as temp_file: await file.save(temp_file.name) - index = await self.loop.run_in_executor(None, partial(self.index_file, temp_path.name)) + index = await self.loop.run_in_executor(None, partial(self.index_file, temp_path.name)) self.index_storage[ctx.user.id] = index temp_path.cleanup() await ctx.respond("Index set") @@ -131,7 +130,7 @@ class Index_handler: try: index: GPTSimpleVectorIndex = self.index_storage[ctx.user.id] - response: Response = await self.loop.run_in_executor(None, partial(index.query, query, verbose=True, response_mode=response_mode, text_qa_template=self.qaprompt)) + response = await self.loop.run_in_executor(None, partial(index.query, query, verbose=True, response_mode=response_mode, text_qa_template=self.qaprompt)) await ctx.respond(f"**Query:**\n\n{query.strip()}\n\n**Query response:**\n\n{response.response.strip()}") except Exception: await ctx.respond("Failed to send query", delete_after=10) From 5d406b666ba3e13b10c25b26c538d52d11784286 Mon Sep 17 00:00:00 2001 From: Kaveen Kumarasinghe Date: Tue, 31 Jan 2023 22:40:03 -0500 Subject: [PATCH 08/23] remove .cleanup() for temp_path (it gets auto cleaned up its ok --- models/index_model.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/models/index_model.py b/models/index_model.py index dff686b..0ac6682 100644 --- a/models/index_model.py +++ b/models/index_model.py @@ -58,17 +58,15 @@ class Index_handler: await ctx.respond("Only accepts txt or pdf files") return async with aiofiles.tempfile.TemporaryDirectory() as temp_path: - async with aiofiles.tempfile.NamedTemporaryFile(suffix=suffix, dir=temp_path.name, delete=False) as temp_file: + async with aiofiles.tempfile.NamedTemporaryFile(suffix=suffix, dir=temp_path, delete=False) as temp_file: 
await file.save(temp_file.name) - index = await self.loop.run_in_executor(None, partial(self.index_file, temp_path.name)) + index = await self.loop.run_in_executor(None, partial(self.index_file, temp_path)) self.index_storage[ctx.user.id] = index - temp_path.cleanup() await ctx.respond("Index set") except Exception: await ctx.respond("Failed to set index") traceback.print_exc() - async def set_discord_index(self, ctx: discord.ApplicationContext, channel: discord.TextChannel, user_api_key): if not user_api_key: os.environ["OPENAI_API_KEY"] = self.openai_key From e63c6227ddaed264a68b89074cc05c6386c0c152 Mon Sep 17 00:00:00 2001 From: Kaveen Kumarasinghe Date: Tue, 31 Jan 2023 23:19:26 -0500 Subject: [PATCH 09/23] initial link loading --- cogs/commands.py | 9 +++++---- cogs/index_service_cog.py | 14 ++++++++++++-- models/index_model.py | 31 +++++++++++++++++++++++++++++-- 3 files changed, 46 insertions(+), 8 deletions(-) diff --git a/cogs/commands.py b/cogs/commands.py index 93af420..0b00dd0 100644 --- a/cogs/commands.py +++ b/cogs/commands.py @@ -515,14 +515,15 @@ class Commands(discord.Cog, name="Commands"): @add_to_group("index") @discord.slash_command( - name="set_file", + name="set", description="Set an index to query from", guild_ids=ALLOWED_GUILDS ) @discord.guild_only() - @discord.option(name="file", description="A file to create the index from", required=True, input_type=discord.SlashCommandOptionType.attachment) - async def set_file(self, ctx:discord.ApplicationContext, file: discord.Attachment): - await self.index_cog.set_index_command(ctx, file) + @discord.option(name="file", description="A file to create the index from", required=False, input_type=discord.SlashCommandOptionType.attachment) + @discord.option(name="link", description="A link to a file to a webpage ", required=False, input_type=str) + async def set_file(self, ctx:discord.ApplicationContext, file: discord.Attachment, link: str): + await self.index_cog.set_index_command(ctx, file, link) 
@add_to_group("index") @discord.slash_command( diff --git a/cogs/index_service_cog.py b/cogs/index_service_cog.py index f57cb6a..6841739 100644 --- a/cogs/index_service_cog.py +++ b/cogs/index_service_cog.py @@ -17,8 +17,15 @@ class IndexService(discord.Cog, name="IndexService"): self.bot = bot self.index_handler = Index_handler(bot) - async def set_index_command(self, ctx, file: discord.Attachment): + async def set_index_command(self, ctx, file: discord.Attachment = None, link: str = None): """Command handler to set a file as your personal index""" + if not file and not link: + await ctx.respond("Please provide a file or a link") + return + + if file and link: + await ctx.respond("Please provide only one file or link. Only one or the other.") + return user_api_key = None if USER_INPUT_API_KEYS: @@ -27,7 +34,10 @@ class IndexService(discord.Cog, name="IndexService"): return await ctx.defer(ephemeral=True) - await self.index_handler.set_file_index(ctx, file, user_api_key=user_api_key) + if file: + await self.index_handler.set_file_index(ctx, file, user_api_key=user_api_key) + elif link: + await self.index_handler.set_link_index(ctx, link, user_api_key=user_api_key) async def set_discord_command(self, ctx, channel: discord.TextChannel = None): diff --git a/models/index_model.py b/models/index_model.py index 0ac6682..8c3880e 100644 --- a/models/index_model.py +++ b/models/index_model.py @@ -9,7 +9,9 @@ from pathlib import Path from datetime import date, datetime from gpt_index.readers.schema.base import Document -from gpt_index import GPTSimpleVectorIndex, SimpleDirectoryReader, QuestionAnswerPrompt +from gpt_index import GPTSimpleVectorIndex, SimpleDirectoryReader, QuestionAnswerPrompt, BeautifulSoupWebReader, \ + GPTFaissIndex +from gpt_index.readers.web import DEFAULT_WEBSITE_EXTRACTOR from services.environment_service import EnvService, app_root_path @@ -41,6 +43,11 @@ class Index_handler: def index_discord(self, document): index = GPTSimpleVectorIndex(document) 
return index + + def index_webpage(self, url): + documents = BeautifulSoupWebReader(website_extractor=DEFAULT_WEBSITE_EXTRACTOR).load_data(urls=[url]) + index = GPTSimpleVectorIndex(documents) + return index async def set_file_index(self, ctx: discord.ApplicationContext, file: discord.Attachment, user_api_key): @@ -67,6 +74,26 @@ class Index_handler: await ctx.respond("Failed to set index") traceback.print_exc() + async def set_link_index(self, ctx: discord.ApplicationContext, link: str, user_api_key): + if not user_api_key: + os.environ["OPENAI_API_KEY"] = self.openai_key + else: + os.environ["OPENAI_API_KEY"] = user_api_key + + # TODO Link validation + try: + + index = await self.loop.run_in_executor(None, partial(self.index_webpage, link)) + + self.index_storage[ctx.user.id] = index + + except Exception: + await ctx.respond("Failed to set index") + traceback.print_exc() + + await ctx.respond("Index set") + + async def set_discord_index(self, ctx: discord.ApplicationContext, channel: discord.TextChannel, user_api_key): if not user_api_key: os.environ["OPENAI_API_KEY"] = self.openai_key @@ -205,4 +232,4 @@ class Index_handler: results.append( Document(channel_content, extra_info={"channel_name": channel_name}) ) - return results \ No newline at end of file + return results From c7e6a447d3db4994028248c37913febdba902bd9 Mon Sep 17 00:00:00 2001 From: Kaveen Kumarasinghe Date: Tue, 31 Jan 2023 23:27:51 -0500 Subject: [PATCH 10/23] Role checks --- README.md | 24 +++++++++++------------- cogs/commands.py | 4 ++-- gpt3discord.py | 2 +- models/check_model.py | 18 ++++++++++++++++++ models/index_model.py | 3 +-- services/environment_service.py | 26 ++++++++++++++++++++++++++ 6 files changed, 59 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 8054d5c..35c802c 100644 --- a/README.md +++ b/README.md @@ -23,26 +23,16 @@ SUPPORT SERVER FOR BOT SETUP: https://discord.gg/WvAHXDMS7Q (You can try out the

# Recent Notable Updates -- **Translations with DeepL** - DeepL integration for translations. `/translate` - - -- **Context menu commands** - Allow people to prompt GPT and DALL-E directly by right clicking a message: -
-
- - -- **AI-BASED SERVER MODERATION** - GPT3Discord now has a built-in AI-based moderation system that can automatically detect and remove toxic messages from your server. This is a great way to keep your server safe and clean, and it's completely automatic and **free**! Check out the commands section to learn how to enable it! - - -- **Permanent memory with embeddings and Pinecone finished!** - An initial alpha version of permanent memory is now done! This allows you to chat with GPT3 infinitely and accurately, and save tokens, by using embeddings. *Please read the Permanent Memory section for more information!* - +- **CUSTOM INDEXES** - This is a huge update. You can now upload files to your server and use them as custom context when asking GPT3 questions. You can also use links to use webpages as context, and you can even use discord channels, or your entire discord server's messages as context! Read more in the 'Custom Indexes' section below. # Features - **Directly prompt GPT3 with `/gpt ask `** - **Have long term, permanent conversations with the bot, just like chatgpt, with `/gpt converse`** - Conversations happen in threads that get automatically cleaned up! +- **Custom Indexes** - Use your own files, pdfs, txt files, websites, discord channel content as context when asking GPT3 questions! + - **DALL-E Image Generation** - Generate DALL-E AI images right in discord with `/dalle draw `! It even supports multiple image qualities, multiple images, creating image variants, retrying, and saving images. - **DALL-E Image Prompt Optimization** - Given some text that you're trying to generate an image for, the bot will automatically optimize the text to be more DALL-E friendly! `/dalle optimize ` @@ -105,6 +95,10 @@ These commands are grouped, so each group has a prefix but you can easily tab co `/dalle optimize ` Optimize a given prompt text for DALL-E image generation. 
+### Custom Indexes Commands + +TODO + ### System and Settings `/system settings` - Display settings for the model (temperature, top_p, etc) @@ -121,6 +115,10 @@ These commands are grouped, so each group has a prefix but you can easily tab co `/system clear-local` - Clear all the local dalleimages. +### Custom Indexes + +TODO + ### Automatic AI Moderation `/mod set status:on` - Turn on automatic chat moderations. diff --git a/cogs/commands.py b/cogs/commands.py index 0b00dd0..6650b6b 100644 --- a/cogs/commands.py +++ b/cogs/commands.py @@ -71,9 +71,9 @@ class Commands(discord.Cog, name="Commands"): ) index = discord.SlashCommandGroup( name="index", - description="gpt-index commands", + description="Custom index commands for the bot", guild_ids=ALLOWED_GUILDS, - checks=[Check.check_gpt_roles()], + checks=[Check.check_index_roles()], ) # diff --git a/gpt3discord.py b/gpt3discord.py index 38034a8..41e99d5 100644 --- a/gpt3discord.py +++ b/gpt3discord.py @@ -69,7 +69,7 @@ if PINECONE_TOKEN: and EnvService.get_google_search_engine_id() ): if PINECONE_INDEX_SEARCH not in pinecone.list_indexes(): - print("Creating pinecone index for seraches. Please wait...") + print("Creating pinecone index for searches. 
Please wait...") pinecone.create_index( PINECONE_INDEX_SEARCH, dimension=1536, diff --git a/models/check_model.py b/models/check_model.py index b7ebd77..de63e3e 100644 --- a/models/check_model.py +++ b/models/check_model.py @@ -6,6 +6,7 @@ from typing import Callable ADMIN_ROLES = EnvService.get_admin_roles() DALLE_ROLES = EnvService.get_dalle_roles() GPT_ROLES = EnvService.get_gpt_roles() +INDEX_ROLES = EnvService.get_index_roles() TRANSLATOR_ROLES = EnvService.get_translator_roles() ALLOWED_GUILDS = EnvService.get_allowed_guilds() @@ -63,6 +64,23 @@ class Check: return inner + @staticmethod + def check_index_roles() -> Callable: + async def inner(ctx: discord.ApplicationContext): + if INDEX_ROLES == [None]: + return True + if not any(role.name.lower() in INDEX_ROLES for role in ctx.user.roles): + await ctx.defer(ephemeral=True) + await ctx.respond( + f"You don't have permission, list of roles is {INDEX_ROLES}", + ephemeral=True, + delete_after=10, + ) + return False + return True + + return inner + @staticmethod def check_translator_roles() -> Callable: async def inner(ctx: discord.ApplicationContext): diff --git a/models/index_model.py b/models/index_model.py index 8c3880e..0f5bae0 100644 --- a/models/index_model.py +++ b/models/index_model.py @@ -48,8 +48,7 @@ class Index_handler: documents = BeautifulSoupWebReader(website_extractor=DEFAULT_WEBSITE_EXTRACTOR).load_data(urls=[url]) index = GPTSimpleVectorIndex(documents) return index - - + async def set_file_index(self, ctx: discord.ApplicationContext, file: discord.Attachment, user_api_key): if not user_api_key: os.environ["OPENAI_API_KEY"] = self.openai_key diff --git a/services/environment_service.py b/services/environment_service.py index ad098fd..e60a315 100644 --- a/services/environment_service.py +++ b/services/environment_service.py @@ -191,6 +191,32 @@ class EnvService: ) return gpt_roles + @staticmethod + def get_index_roles(): + # GPT_ROLES is a comma separated list of string roles + # It can also 
just be one role + # Read these allowed roles and return as a list of strings + try: + index_roles = os.getenv("INDEX_ROLES") + except Exception: + index_roles = None + + if index_roles is None: + print( + "INDEX_ROLES is not defined properly in the environment file!" + "Please copy your server's role and put it into INDEX_ROLES in the .env file." + 'For example a line should look like: `INDEX_ROLES="Gpt"`' + ) + print("Defaulting to allowing all users to use Index commands...") + return [None] + + index_roles = ( + index_roles.lower().strip().split(",") + if "," in index_roles + else [index_roles.lower()] + ) + return index_roles + @staticmethod def get_welcome_message(): # WELCOME_MESSAGE is a default string used to welcome new members to the server if GPT3 is not available. From c10a2f0237a485a4cc22a9b41969d9d6cd4bd976 Mon Sep 17 00:00:00 2001 From: Kaveen Kumarasinghe Date: Wed, 1 Feb 2023 10:25:17 -0500 Subject: [PATCH 11/23] hotfix for docker --- Dockerfile | 2 ++ gpt3discord.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 34316ab..94c28f6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,6 +4,7 @@ ARG PY_VERSION=3.9 FROM python:${PY_VERSION} as base FROM base as builder ARG PY_VERSION +COPY . . RUN mkdir /install /src WORKDIR /install @@ -23,6 +24,7 @@ RUN pip install --target="/install" /src # Copy minimal to main image (to keep as small as possible) FROM python:${PY_VERSION}-slim ARG PY_VERSION +COPY . . 
COPY --from=builder /install /usr/local/lib/python${PY_VERSION}/site-packages RUN mkdir -p /opt/gpt3discord/etc COPY gpt3discord.py /opt/gpt3discord/bin/ diff --git a/gpt3discord.py b/gpt3discord.py index 41e99d5..2708bdc 100644 --- a/gpt3discord.py +++ b/gpt3discord.py @@ -31,7 +31,7 @@ from services.environment_service import EnvService from models.openai_model import Model -__version__ = "9.0.1" +__version__ = "9.0.2" PID_FILE = Path("bot.pid") From 22ba80c469df00101020c4ed16edbe44f176ca6b Mon Sep 17 00:00:00 2001 From: Kaveen Kumarasinghe Date: Sat, 4 Feb 2023 20:41:53 -0500 Subject: [PATCH 12/23] Multi-index support --- models/autocomplete_model.py | 2 +- models/index_model.py | 59 ++++++++++++++++++++++++++++++------ requirements.txt | 2 +- 3 files changed, 51 insertions(+), 12 deletions(-) diff --git a/models/autocomplete_model.py b/models/autocomplete_model.py index 682a13b..81654f6 100644 --- a/models/autocomplete_model.py +++ b/models/autocomplete_model.py @@ -147,7 +147,7 @@ class File_autocompleter: try: return [ file - for file in os.listdir(EnvService.find_shared_file("indexes")) + for file in os.listdir(EnvService.find_shared_file(f"indexes/{str(ctx.interaction.user.id)}/")) if file.startswith(ctx.value.lower()) ][ :25 diff --git a/models/index_model.py b/models/index_model.py index 0f5bae0..586728d 100644 --- a/models/index_model.py +++ b/models/index_model.py @@ -1,6 +1,8 @@ import os import traceback import asyncio +from collections import defaultdict + import discord import aiofiles from functools import partial @@ -10,18 +12,44 @@ from datetime import date, datetime from gpt_index.readers.schema.base import Document from gpt_index import GPTSimpleVectorIndex, SimpleDirectoryReader, QuestionAnswerPrompt, BeautifulSoupWebReader, \ - GPTFaissIndex + GPTFaissIndex, GPTListIndex, QueryMode, GPTTreeIndex from gpt_index.readers.web import DEFAULT_WEBSITE_EXTRACTOR +from gpt_index.composability import ComposableGraph + from services.environment_service 
import EnvService, app_root_path +class IndexData: + def __init__(self): + self.queryable_index = None + self.individual_indexes = [] + + # A safety check for the future + def get_index_or_throw(self): + if not self.queryable(): + raise Exception("An index access was attempted before an index was created. This is a programmer error, please report this to the maintainers.") + return self.queryable_index + def queryable(self): + return self.queryable_index is not None + + def add_index(self, index, user_id, file_name): + self.individual_indexes.append(index) + self.queryable_index = index + + # Create a folder called "indexes/{USER_ID}" if it doesn't exist already + Path(f"{app_root_path()}/indexes/{user_id}").mkdir(parents=True, exist_ok=True) + print(f"{app_root_path()}/indexes/{user_id}") + # Save the index to file under the user id + index.save_to_disk(app_root_path() / "indexes" / f"{str(user_id)}"/f"{file_name}_{date.today()}-H{datetime.now().hour}.json") + + class Index_handler: def __init__(self, bot): self.bot = bot self.openai_key = os.getenv("OPENAI_TOKEN") - self.index_storage = {} + self.index_storage = defaultdict(IndexData) self.loop = asyncio.get_running_loop() self.qaprompt = QuestionAnswerPrompt( "Context information is below. 
The text '<|endofstatement|>' is used to separate chat entries and make it easier for you to understand the context\n" @@ -67,8 +95,11 @@ class Index_handler: async with aiofiles.tempfile.NamedTemporaryFile(suffix=suffix, dir=temp_path, delete=False) as temp_file: await file.save(temp_file.name) index = await self.loop.run_in_executor(None, partial(self.index_file, temp_path)) - self.index_storage[ctx.user.id] = index - await ctx.respond("Index set") + + file_name = file.filename + self.index_storage[ctx.user.id].add_index(index, ctx.user.id, file_name) + + await ctx.respond("Index added to your indexes") except Exception: await ctx.respond("Failed to set index") traceback.print_exc() @@ -84,7 +115,10 @@ class Index_handler: index = await self.loop.run_in_executor(None, partial(self.index_webpage, link)) - self.index_storage[ctx.user.id] = index + # Make the url look nice, remove https, useless stuff, random characters + file_name = link.replace("https://", "").replace("http://", "").replace("www.", "").replace("/", "_").replace("?", "_").replace("&", "_").replace("=", "_").replace("-", "_").replace(".", "_") + + self.index_storage[ctx.user.id].add_index(index, ctx.user.id, file_name) except Exception: await ctx.respond("Failed to set index") @@ -102,7 +136,7 @@ class Index_handler: try: document = await self.load_data(channel_ids=[channel.id], limit=1000, oldest_first=False) index = await self.loop.run_in_executor(None, partial(self.index_discord, document)) - self.index_storage[ctx.user.id] = index + self.index_storage[ctx.user.id].add_index(index, ctx.user.id, channel.name) await ctx.respond("Index set") except Exception: await ctx.respond("Failed to set index") @@ -116,9 +150,9 @@ class Index_handler: os.environ["OPENAI_API_KEY"] = user_api_key try: - index_file = EnvService.find_shared_file(f"indexes/{index}") + index_file = EnvService.find_shared_file(f"indexes/{ctx.user.id}/{index}") index = await self.loop.run_in_executor(None, partial(self.index_load_file, 
index_file)) - self.index_storage[ctx.user.id] = index + self.index_storage[ctx.user.id].queryable_index = index await ctx.respond("Loaded index") except Exception as e: await ctx.respond(e) @@ -153,10 +187,15 @@ class Index_handler: os.environ["OPENAI_API_KEY"] = user_api_key try: - index: GPTSimpleVectorIndex = self.index_storage[ctx.user.id] - response = await self.loop.run_in_executor(None, partial(index.query, query, verbose=True, response_mode=response_mode, text_qa_template=self.qaprompt)) + index: [GPTSimpleVectorIndex, ComposableGraph] = self.index_storage[ctx.user.id].get_index_or_throw() + if isinstance(index, GPTSimpleVectorIndex): + response = await self.loop.run_in_executor(None, partial(index.query, query, verbose=True, text_qa_template=self.qaprompt)) + else: + response = await self.loop.run_in_executor(None, + partial(index.query, query, query_configs=[], verbose=True)) await ctx.respond(f"**Query:**\n\n{query.strip()}\n\n**Query response:**\n\n{response.response.strip()}") except Exception: + traceback.print_exc() await ctx.respond("Failed to send query", delete_after=10) # Extracted functions from DiscordReader diff --git a/requirements.txt b/requirements.txt index 91c86b1..d73e518 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,5 +10,5 @@ sqlitedict==2.1.0 backoff==2.2.1 flask==2.2.2 beautifulsoup4==4.11.1 -gpt-index==0.2.16 +gpt-index==0.3.4 PyPDF2==3.0.1 \ No newline at end of file From b60bcfd68ddbf1938698a4f8372caf89aa60e0e0 Mon Sep 17 00:00:00 2001 From: Kaveen Kumarasinghe Date: Sat, 4 Feb 2023 20:50:46 -0500 Subject: [PATCH 13/23] reset indexes command --- cogs/commands.py | 20 +++++++++++++++----- cogs/index_service_cog.py | 11 +++++++++++ models/index_model.py | 20 ++++++++++++++++++-- 3 files changed, 44 insertions(+), 7 deletions(-) diff --git a/cogs/commands.py b/cogs/commands.py index 6650b6b..591a96f 100644 --- a/cogs/commands.py +++ b/cogs/commands.py @@ -503,8 +503,8 @@ class Commands(discord.Cog, name="Commands"): 
@add_to_group("index") @discord.slash_command( - name="load_file", - description="Set an index to query from", + name="load", + description="Select one of your saved indexes to query from", guild_ids=ALLOWED_GUILDS ) @discord.guild_only() @@ -515,8 +515,8 @@ class Commands(discord.Cog, name="Commands"): @add_to_group("index") @discord.slash_command( - name="set", - description="Set an index to query from", + name="add", + description="Add an index to query from", guild_ids=ALLOWED_GUILDS ) @discord.guild_only() @@ -527,7 +527,17 @@ class Commands(discord.Cog, name="Commands"): @add_to_group("index") @discord.slash_command( - name="set_discord", + name="reset", + description="Reset (delete) all of your saved indexes", + guild_ids=ALLOWED_GUILDS + ) + @discord.guild_only() + async def reset(self, ctx:discord.ApplicationContext): + await self.index_cog.reset_command(ctx) + + @add_to_group("index") + @discord.slash_command( + name="add_discord", description="Set a index from a discord channel", guild_ids=ALLOWED_GUILDS ) diff --git a/cogs/index_service_cog.py b/cogs/index_service_cog.py index 6841739..4cb0d84 100644 --- a/cogs/index_service_cog.py +++ b/cogs/index_service_cog.py @@ -1,3 +1,5 @@ +import traceback + import discord from services.environment_service import EnvService @@ -52,6 +54,15 @@ class IndexService(discord.Cog, name="IndexService"): await ctx.defer(ephemeral=True) await self.index_handler.set_discord_index(ctx, channel, user_api_key=user_api_key) + async def reset_command(self, ctx): + await ctx.defer(ephemeral=True) + try: + self.index_handler.reset_indexes(ctx.user.id) + await ctx.respond("Your indexes have been reset") + except: + traceback.print_exc() + await ctx.respond("Something went wrong while resetting your indexes. 
Contact the server admin.") + async def discord_backup_command(self, ctx): """Command handler to backup the entire server""" diff --git a/models/index_model.py b/models/index_model.py index 586728d..a927db3 100644 --- a/models/index_model.py +++ b/models/index_model.py @@ -44,6 +44,19 @@ class IndexData: # Save the index to file under the user id index.save_to_disk(app_root_path() / "indexes" / f"{str(user_id)}"/f"{file_name}_{date.today()}-H{datetime.now().hour}.json") + def reset_indexes(self, user_id): + self.individual_indexes = [] + self.queryable_index = None + + # Delete the user indexes + try: + # First, clear all the files inside it + for file in os.listdir(f"{app_root_path()}/indexes/{user_id}"): + os.remove(f"{app_root_path()}/indexes/{user_id}/{file}") + + except: + traceback.print_exc() + pass class Index_handler: def __init__(self, bot): @@ -77,6 +90,9 @@ class Index_handler: index = GPTSimpleVectorIndex(documents) return index + def reset_indexes(self, user_id): + self.index_storage[user_id].reset_indexes(user_id) + async def set_file_index(self, ctx: discord.ApplicationContext, file: discord.Attachment, user_api_key): if not user_api_key: os.environ["OPENAI_API_KEY"] = self.openai_key @@ -168,7 +184,7 @@ class Index_handler: channel_ids:List[int] = [] for c in ctx.guild.text_channels: channel_ids.append(c.id) - document = await self.load_data(channel_ids=channel_ids, limit=1000, oldest_first=False) + document = await self.load_data(channel_ids=channel_ids, limit=3000, oldest_first=False) index = await self.loop.run_in_executor(None, partial(self.index_discord, document)) Path(app_root_path() / "indexes").mkdir(parents = True, exist_ok=True) index.save_to_disk(app_root_path() / "indexes" / f"{ctx.guild.name.replace(' ', '-')}_{date.today()}-H{datetime.now().hour}.json") @@ -196,7 +212,7 @@ class Index_handler: await ctx.respond(f"**Query:**\n\n{query.strip()}\n\n**Query response:**\n\n{response.response.strip()}") except Exception: 
traceback.print_exc() - await ctx.respond("Failed to send query", delete_after=10) + await ctx.respond("Failed to send query. You may not have an index set, load an index with /index load", delete_after=10) # Extracted functions from DiscordReader From 3ed55b556ffa1f9b8855d8f75edd4cb8509d471f Mon Sep 17 00:00:00 2001 From: Kaveen Kumarasinghe Date: Sat, 4 Feb 2023 21:06:30 -0500 Subject: [PATCH 14/23] Support youtube videos --- models/index_model.py | 22 +++++++++++++++++++--- pyproject.toml | 1 + requirements.txt | 3 ++- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/models/index_model.py b/models/index_model.py index a927db3..418beda 100644 --- a/models/index_model.py +++ b/models/index_model.py @@ -10,9 +10,10 @@ from typing import List, Optional from pathlib import Path from datetime import date, datetime +from gpt_index.readers import YoutubeTranscriptReader from gpt_index.readers.schema.base import Document from gpt_index import GPTSimpleVectorIndex, SimpleDirectoryReader, QuestionAnswerPrompt, BeautifulSoupWebReader, \ - GPTFaissIndex, GPTListIndex, QueryMode, GPTTreeIndex + GPTFaissIndex, GPTListIndex, QueryMode, GPTTreeIndex, GoogleDocsReader from gpt_index.readers.web import DEFAULT_WEBSITE_EXTRACTOR from gpt_index.composability import ComposableGraph @@ -78,9 +79,21 @@ class Index_handler: document = SimpleDirectoryReader(file_path).load_data() index = GPTSimpleVectorIndex(document) return index + + def index_gdoc(self, doc_id): + document = GoogleDocsReader().load_data(doc_id) + index = GPTSimpleVectorIndex(document) + return index + + def index_youtube_transcript(self, link): + documents = YoutubeTranscriptReader().load_data(ytlinks=[link]) + index = GPTSimpleVectorIndex(documents) + return index + def index_load_file(self, file_path): index = GPTSimpleVectorIndex.load_from_disk(file_path) return index + def index_discord(self, document): index = GPTSimpleVectorIndex(document) return index @@ -128,8 +141,11 @@ class Index_handler: # 
TODO Link validation try: - - index = await self.loop.run_in_executor(None, partial(self.index_webpage, link)) + # Check if the link contains youtube in it + if "youtube" in link: + index = await self.loop.run_in_executor(None, partial(self.index_youtube_transcript, link)) + else: + index = await self.loop.run_in_executor(None, partial(self.index_webpage, link)) # Make the url look nice, remove https, useless stuff, random characters file_name = link.replace("https://", "").replace("http://", "").replace("www.", "").replace("/", "_").replace("?", "_").replace("&", "_").replace("=", "_").replace("-", "_").replace(".", "_") diff --git a/pyproject.toml b/pyproject.toml index a9a20b0..5bf61cd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,7 @@ dependencies = [ "beautifulsoup4", "gpt-index", "PyPDF2", + "youtube_transcript_api", ] dynamic = ["version"] [project.scripts] diff --git a/requirements.txt b/requirements.txt index d73e518..2e5c533 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,4 +11,5 @@ backoff==2.2.1 flask==2.2.2 beautifulsoup4==4.11.1 gpt-index==0.3.4 -PyPDF2==3.0.1 \ No newline at end of file +PyPDF2==3.0.1 +youtube_transcript_api==0.5.0 \ No newline at end of file From 464f3329153a4298e0725b5eb727cd27f4fa8465 Mon Sep 17 00:00:00 2001 From: Kaveen Kumarasinghe Date: Sat, 4 Feb 2023 21:48:00 -0500 Subject: [PATCH 15/23] csv, powerpoint, image, mp3 support --- README.md | 26 ++++++++++++++++++++++++-- models/index_model.py | 15 ++++++++++++++- pyproject.toml | 5 +++++ requirements.txt | 6 +++++- 4 files changed, 48 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 35c802c..4c789c5 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ SUPPORT SERVER FOR BOT SETUP: https://discord.gg/WvAHXDMS7Q (You can try out the # Recent Notable Updates -- **CUSTOM INDEXES** - This is a huge update. You can now upload files to your server and use them as custom context when asking GPT3 questions. 
You can also use links to use webpages as context, and you can even use discord channels, or your entire discord server's messages as context! Read more in the 'Custom Indexes' section below. +- **CUSTOM INDEXES** - This is a huge update. You can now upload files to your server and use them as custom context when asking GPT3 questions. You can also use webpage links as context, images, full documents, csvs, powerpoints, audio files, and even **youtube videos**! Read more in the 'Custom Indexes' section below. # Features - **Directly prompt GPT3 with `/gpt ask `** @@ -97,7 +97,18 @@ These commands are grouped, so each group has a prefix but you can easily tab co ### Custom Indexes Commands -TODO +`/index add file: or link:` - Use a document or use a link to create/add to your indexes. If you provide a youtube link, the transcript of the video will be used. If you provide a web url, the contents of the webpage will be used, if you provide an image, the image text will be extracted and used! + +`/index query query:` - Query your current index for a given prompt. 
GPT will answer based on your current document/indedx + +`/index load index:` - Load a previously created index to query + +`/index reset` - Reset and delete all of your saved indexes + +`/index add_discord channel:` - Create an add an index based on a discord channel + +`/index discord_backup` - Use the last 3000 messages of every channel on your discord server as an index + ### System and Settings @@ -238,6 +249,15 @@ For example, if I wanted to change the number of images generated by DALL-E by d # Requirements +**For OCR, and document functionalities**: +`pip3 install torch==1.9.1+cpu torchvision==0.10.1+cpu -f https://download.pytorch.org/whl/torch_stable.html` +or +`python3.9 -m pip install torch==1.9.1+cpu torchvision==0.10.1+cpu -f https://download.pytorch.org/whl/torch_stable.html` + +**For audio extraction for indexing from .mp3 and .mp4 files**: +`python3.9 -m pip install git+https://github.com/openai/whisper.git` + +**All other dependencies**: `python3.9 -m pip install -r requirements.txt` **I recommend using python 3.9!** @@ -319,6 +339,7 @@ python3.9 get-pip.py # Install project dependencies python3.9 -m pip install --ignore-installed PyYAML +python3.9 -m pip install torch==1.9.1+cpu torchvision==0.10.1+cpu -f https://download.pytorch.org/whl/torch_stable.html python3.9 -m pip install -r requirements.txt python3.9 -m pip install . @@ -375,6 +396,7 @@ python3.9 -m pip install . With python3.9 installed and the requirements installed, you can run this bot anywhere. 
Install the dependencies with: +`pip3 install torch==1.9.1+cpu torchvision==0.10.1+cpu -f https://download.pytorch.org/whl/torch_stable.html` `python3.9 -m pip install -r requirements.txt` Then, run the bot with: diff --git a/models/index_model.py b/models/index_model.py index 418beda..88501e6 100644 --- a/models/index_model.py +++ b/models/index_model.py @@ -113,10 +113,23 @@ class Index_handler: os.environ["OPENAI_API_KEY"] = user_api_key try: + print(file.content_type) if file.content_type.startswith("text/plain"): suffix = ".txt" elif file.content_type.startswith("application/pdf"): suffix = ".pdf" + # Allow for images too + elif file.content_type.startswith("image/png"): + suffix = ".png" + elif file.content_type.startswith("image/"): + suffix = ".jpg" + elif "csv" in file.content_type: + suffix = ".csv" + elif "vnd." in file.content_type: + suffix = ".pptx" + # Catch all audio files and suffix with "mp3" + elif file.content_type.startswith("audio/"): + suffix = ".mp3" else: await ctx.respond("Only accepts txt or pdf files") return @@ -128,7 +141,7 @@ class Index_handler: file_name = file.filename self.index_storage[ctx.user.id].add_index(index, ctx.user.id, file_name) - await ctx.respond("Index added to your indexes") + await ctx.respond("Index added to your indexes.") except Exception: await ctx.respond("Failed to set index") traceback.print_exc() diff --git a/pyproject.toml b/pyproject.toml index 5bf61cd..04bc610 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,7 +34,12 @@ dependencies = [ "gpt-index", "PyPDF2", "youtube_transcript_api", + "sentence-transformers", + "sentencepiece", + "protobuf", + "python-pptx", ] + dynamic = ["version"] [project.scripts] gpt3discord = "gpt3discord:init" diff --git a/requirements.txt b/requirements.txt index 2e5c533..d0ac6f3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,4 +12,8 @@ flask==2.2.2 beautifulsoup4==4.11.1 gpt-index==0.3.4 PyPDF2==3.0.1 -youtube_transcript_api==0.5.0 \ No newline at end of 
file +youtube_transcript_api==0.5.0 +sentencepiece==0.1.97 +sentence-transformers==2.2.2 +protobuf==3.20.0 +python-pptx==0.6.21 \ No newline at end of file From c93bcfa59a17d3311622c5b6c4f80ed3261de6a7 Mon Sep 17 00:00:00 2001 From: Kaveen Kumarasinghe Date: Sat, 4 Feb 2023 22:24:14 -0500 Subject: [PATCH 16/23] update readme --- README.md | 36 +++++++++++++++++++++--------------- models/index_model.py | 5 ++++- 2 files changed, 25 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 4c789c5..e4150bc 100644 --- a/README.md +++ b/README.md @@ -97,6 +97,8 @@ These commands are grouped, so each group has a prefix but you can easily tab co ### Custom Indexes Commands +This bot supports per-user custom indexes. This means that users can upload files of their choosing, such as PDFs and ask GPT to answer questions based on those files. + `/index add file: or link:` - Use a document or use a link to create/add to your indexes. If you provide a youtube link, the transcript of the video will be used. If you provide a web url, the contents of the webpage will be used, if you provide an image, the image text will be extracted and used! `/index query query:` - Query your current index for a given prompt. GPT will answer based on your current document/indedx @@ -126,10 +128,6 @@ These commands are grouped, so each group has a prefix but you can easily tab co `/system clear-local` - Clear all the local dalleimages. -### Custom Indexes - -TODO - ### Automatic AI Moderation `/mod set status:on` - Turn on automatic chat moderations. @@ -180,17 +178,14 @@ Then, name the index `conversation-embeddings`, set the dimensions to `1536`, an Permanent memory using pinecone is still in alpha, I will be working on cleaning up this work, adding auto-clearing, and optimizing for stability and reliability, any help and feedback is appreciated (**add me on Discord Kaveen#0001 for pinecone help**)! 
If at any time you're having too many issues with pinecone, simply remove the `PINECONE_TOKEN` line in your `.env` file and the bot will revert to using conversation summarizations. -### Permanent overrides in threads -This bot now supports having overrides be permanent in an entire conversation if you use an opener file which includes them. The new opener files should be .json files formatted like this. `text` corresponds to what you want the conversational opener to be and the rest map 1:1 to the appropriate model settings. An example .json file is included by the name of `english_translator.json` in the `openers` folder -```json -{ - "text": "your prompt", - "temp":0, - "top_p":0, - "frequency_penalty":0, - "presence_penalty":0 -} -``` +# Custom Indexes / Knowledgebase +This bot supports per-user custom indexes. This means that users can upload files of their choosing, such as PDFs and ask GPT to answer questions based on those files. We also support using URLs for indexes. + +Supported filetypes: +- All text and data based files (PDF, TXT, DOCX, PPTX, CSV etc) +- Images (JPG, PNG, etc) (Note: The bot will do OCR on the images to extract the text, this requires a lot of processing power sometimes) +- Videos/Audio (MP4, MP3, etc) (Note: The bot will use OpenAI on the audio to extract the text, this requires a lot of processing power sometimes) +- **Youtube Videos** - For all youtube videos that are transcribable, the bot will index the entire transcription of the given youtube video URL! # Translations with DeepL This bot supports and uses DeepL for translations (optionally). If you want to enable the translations service, you can add a line in your `.env` file as follows: @@ -390,6 +385,17 @@ git pull python3.9 -m pip install -r requirements.txt python3.9 -m pip install . ``` +### Permanent overrides in threads +This bot now supports having overrides be permanent in an entire conversation if you use an opener file which includes them. 
The new opener files should be .json files formatted like this. `text` corresponds to what you want the conversational opener to be and the rest map 1:1 to the appropriate model settings. An example .json file is included by the name of `english_translator.json` in the `openers` folder +```json +{ + "text": "your prompt", + "temp":0, + "top_p":0, + "frequency_penalty":0, + "presence_penalty":0 +} +``` # Non-Server, Non-Docker usage diff --git a/models/index_model.py b/models/index_model.py index 88501e6..fb45a5f 100644 --- a/models/index_model.py +++ b/models/index_model.py @@ -130,8 +130,11 @@ class Index_handler: # Catch all audio files and suffix with "mp3" elif file.content_type.startswith("audio/"): suffix = ".mp3" + # Catch video files + elif file.content_type.startswith("video/"): + pass # No suffix change else: - await ctx.respond("Only accepts txt or pdf files") + await ctx.respond("Only accepts text, pdf, images, spreadheets, powerpoint, and audio/video files.") return async with aiofiles.tempfile.TemporaryDirectory() as temp_path: async with aiofiles.tempfile.NamedTemporaryFile(suffix=suffix, dir=temp_path, delete=False) as temp_file: From 258a87945c0daaf252c10332441162434d5550d6 Mon Sep 17 00:00:00 2001 From: Kaveen Kumarasinghe Date: Sun, 5 Feb 2023 00:35:49 -0500 Subject: [PATCH 17/23] composability --- cogs/commands.py | 11 +++ cogs/index_service_cog.py | 13 ++- gpt3discord.py | 3 +- models/index_model.py | 163 +++++++++++++++++++++++++++++++++----- 4 files changed, 167 insertions(+), 23 deletions(-) diff --git a/cogs/commands.py b/cogs/commands.py index 591a96f..c97aa9a 100644 --- a/cogs/commands.py +++ b/cogs/commands.py @@ -535,6 +535,17 @@ class Commands(discord.Cog, name="Commands"): async def reset(self, ctx:discord.ApplicationContext): await self.index_cog.reset_command(ctx) + @add_to_group("index") + @discord.slash_command( + name="compose", + description="Combine multiple indexes together", + guild_ids=ALLOWED_GUILDS + ) + 
@discord.option(name="name", description="The name of the new index", required=False, input_type=discord.SlashCommandOptionType.string) + @discord.guild_only() + async def compose(self, ctx:discord.ApplicationContext, name : str): + await self.index_cog.compose_command(ctx,name) + @add_to_group("index") @discord.slash_command( name="add_discord", diff --git a/cogs/index_service_cog.py b/cogs/index_service_cog.py index 4cb0d84..b7fd70a 100644 --- a/cogs/index_service_cog.py +++ b/cogs/index_service_cog.py @@ -14,10 +14,11 @@ class IndexService(discord.Cog, name="IndexService"): def __init__( self, bot, + usage_service, ): super().__init__() self.bot = bot - self.index_handler = Index_handler(bot) + self.index_handler = Index_handler(bot, usage_service) async def set_index_command(self, ctx, file: discord.Attachment = None, link: str = None): """Command handler to set a file as your personal index""" @@ -98,3 +99,13 @@ class IndexService(discord.Cog, name="IndexService"): await ctx.defer() await self.index_handler.query(ctx, query, response_mode, user_api_key) + + async def compose_command(self, ctx, name): + """Command handler to compose from your index""" + user_api_key = None + if USER_INPUT_API_KEYS: + user_api_key = await TextService.get_user_api_key(ctx.user.id, ctx, USER_KEY_DB) + if not user_api_key: + return + + await self.index_handler.compose(ctx, name, user_api_key) diff --git a/gpt3discord.py b/gpt3discord.py index 09efeec..2a8c790 100644 --- a/gpt3discord.py +++ b/gpt3discord.py @@ -172,7 +172,8 @@ async def main(): bot.add_cog( IndexService( - bot + bot, + usage_service, ) ) diff --git a/models/index_model.py b/models/index_model.py index fb45a5f..1b4ae62 100644 --- a/models/index_model.py +++ b/models/index_model.py @@ -13,7 +13,8 @@ from datetime import date, datetime from gpt_index.readers import YoutubeTranscriptReader from gpt_index.readers.schema.base import Document from gpt_index import GPTSimpleVectorIndex, SimpleDirectoryReader, 
QuestionAnswerPrompt, BeautifulSoupWebReader, \ - GPTFaissIndex, GPTListIndex, QueryMode, GPTTreeIndex, GoogleDocsReader + GPTFaissIndex, GPTListIndex, QueryMode, GPTTreeIndex, GoogleDocsReader, MockLLMPredictor, QueryConfig, \ + IndexStructType from gpt_index.readers.web import DEFAULT_WEBSITE_EXTRACTOR from gpt_index.composability import ComposableGraph @@ -21,6 +22,24 @@ from gpt_index.composability import ComposableGraph from services.environment_service import EnvService, app_root_path +def get_and_query(user_id, index_storage, query, llm_predictor): + # TODO Do prediction here for token usage + index: [GPTSimpleVectorIndex, ComposableGraph] = index_storage[user_id].get_index_or_throw() + if isinstance(index, GPTSimpleVectorIndex): + response = index.query(query,verbose=True) + else: + query_configs = [ + QueryConfig( + index_struct_type=IndexStructType.TREE, + query_mode=QueryMode.RECURSIVE, + query_kwargs={ + "child_branch_factor": 4 + } + ) + ] + response = index.query(query, verbose=True, query_configs=[]) + + return response class IndexData: def __init__(self): @@ -41,7 +60,6 @@ class IndexData: # Create a folder called "indexes/{USER_ID}" if it doesn't exist already Path(f"{app_root_path()}/indexes/{user_id}").mkdir(parents=True, exist_ok=True) - print(f"{app_root_path()}/indexes/{user_id}") # Save the index to file under the user id index.save_to_disk(app_root_path() / "indexes" / f"{str(user_id)}"/f"{file_name}_{date.today()}-H{datetime.now().hour}.json") @@ -60,11 +78,12 @@ class IndexData: pass class Index_handler: - def __init__(self, bot): + def __init__(self, bot, usage_service): self.bot = bot self.openai_key = os.getenv("OPENAI_TOKEN") self.index_storage = defaultdict(IndexData) self.loop = asyncio.get_running_loop() + self.usage_service = usage_service self.qaprompt = QuestionAnswerPrompt( "Context information is below. 
The text '<|endofstatement|>' is used to separate chat entries and make it easier for you to understand the context\n" "---------------------\n" @@ -74,31 +93,35 @@ class Index_handler: "Given the context information and not prior knowledge, " "answer the question: {query_str}\n" ) - - def index_file(self, file_path): + + # TODO We need to do predictions below for token usage. + def index_file(self, file_path) -> GPTSimpleVectorIndex: document = SimpleDirectoryReader(file_path).load_data() index = GPTSimpleVectorIndex(document) return index - def index_gdoc(self, doc_id): + def index_gdoc(self, doc_id) -> GPTSimpleVectorIndex: document = GoogleDocsReader().load_data(doc_id) index = GPTSimpleVectorIndex(document) return index def index_youtube_transcript(self, link): documents = YoutubeTranscriptReader().load_data(ytlinks=[link]) - index = GPTSimpleVectorIndex(documents) + index = GPTSimpleVectorIndex(documents,) return index - def index_load_file(self, file_path): - index = GPTSimpleVectorIndex.load_from_disk(file_path) + def index_load_file(self, file_path) -> [GPTSimpleVectorIndex, ComposableGraph]: + if not "composed" in str(file_path): + index = GPTSimpleVectorIndex.load_from_disk(file_path) + else: + index = ComposableGraph.load_from_disk(file_path) return index - def index_discord(self, document): - index = GPTSimpleVectorIndex(document) + def index_discord(self, document) -> GPTSimpleVectorIndex: + index = GPTSimpleVectorIndex(document,) return index - def index_webpage(self, url): + def index_webpage(self, url) -> GPTSimpleVectorIndex: documents = BeautifulSoupWebReader(website_extractor=DEFAULT_WEBSITE_EXTRACTOR).load_data(urls=[url]) index = GPTSimpleVectorIndex(documents) return index @@ -143,7 +166,6 @@ class Index_handler: file_name = file.filename self.index_storage[ctx.user.id].add_index(index, ctx.user.id, file_name) - await ctx.respond("Index added to your indexes.") except Exception: await ctx.respond("Failed to set index") @@ -204,8 +226,42 @@ 
class Index_handler: await ctx.respond("Loaded index") except Exception as e: await ctx.respond(e) - - + + async def compose_indexes(self, user_id, indexes, name): + # Load all the indexes first + index_objects = [] + for _index in indexes: + index_file = EnvService.find_shared_file(f"indexes/{user_id}/{_index}") + index = await self.loop.run_in_executor(None, partial(self.index_load_file, index_file)) + index_objects.append(index) + + # For each index object, add its documents to a GPTTreeIndex + tree_indexes = [] + for _index in index_objects: + # Get all the document objects out of _index.docstore.docs + document_ids = [docmeta for docmeta in _index.docstore.docs.keys()] + documents = list([_index.docstore.get_document(doc_id) for doc_id in document_ids if isinstance(_index.docstore.get_document(doc_id), Document)]) + tree_index = GPTTreeIndex(documents=documents) + + summary = tree_index.query( + "What is a summary of this document?", mode="summarize" + ) + tree_index.set_text(str(summary)) + tree_indexes.append(tree_index) + + # Now we have a list of tree indexes, we can compose them + list_index = GPTListIndex(tree_indexes) + graph = ComposableGraph.build_from_index(list_index) + + if not name: + name = f"composed_index_{date.today()}-H{datetime.now().hour}.json" + + # Save the composed index + graph.save_to_disk(f"indexes/{user_id}/{name}.json") + + self.index_storage[user_id].queryable_index = graph + + async def backup_discord(self, ctx: discord.ApplicationContext, user_api_key): if not user_api_key: os.environ["OPENAI_API_KEY"] = self.openai_key @@ -235,12 +291,10 @@ class Index_handler: os.environ["OPENAI_API_KEY"] = user_api_key try: - index: [GPTSimpleVectorIndex, ComposableGraph] = self.index_storage[ctx.user.id].get_index_or_throw() - if isinstance(index, GPTSimpleVectorIndex): - response = await self.loop.run_in_executor(None, partial(index.query, query, verbose=True, text_qa_template=self.qaprompt)) - else: - response = await 
self.loop.run_in_executor(None, - partial(index.query, query, query_configs=[], verbose=True)) + llm_predictor = MockLLMPredictor(max_tokens=256) + response = await self.loop.run_in_executor(None, partial(get_and_query, ctx.user.id, self.index_storage, query, llm_predictor)) + print("The last token usage was ", llm_predictor.last_token_usage) + await self.usage_service.update_usage(llm_predictor.last_token_usage) await ctx.respond(f"**Query:**\n\n{query.strip()}\n\n**Query response:**\n\n{response.response.strip()}") except Exception: traceback.print_exc() @@ -319,3 +373,70 @@ class Index_handler: Document(channel_content, extra_info={"channel_name": channel_name}) ) return results + + async def compose(self, ctx: discord.ApplicationContext, name, user_api_key): + # Send the ComposeModal + if not user_api_key: + os.environ["OPENAI_API_KEY"] = self.openai_key + else: + os.environ["OPENAI_API_KEY"] = user_api_key + + if not self.index_storage[ctx.user.id].queryable(): + await ctx.respond("You must load at least two indexes before composing") + return + + await ctx.respond("Select the indexes to compose.", view=ComposeModal(self, ctx.user.id, name)) + + +class ComposeModal(discord.ui.View): + def __init__(self, index_cog, user_id, name=None) -> None: + super().__init__() + # Get the argument named "user_key_db" and save it as USER_KEY_DB + self.index_cog = index_cog + self.user_id = user_id + + # Get all the indexes for the user + self.indexes = [ + file + for file in os.listdir(EnvService.find_shared_file(f"indexes/{str(user_id)}/")) + ] + + # A text entry field for the name of the composed index + self.name = name + + # A discord UI select menu with all the indexes + self.index_select = discord.ui.Select( + placeholder="Select an index", + options=[ + discord.SelectOption(label=index, value=index) + for index in self.indexes + ], + max_values=len(self.indexes), + min_values=1, + + ) + # Add the select menu to the modal + self.add_item(self.index_select) + + # Add a 
button to the modal called "Compose" + self.add_item(discord.ui.Button(label="Compose", style=discord.ButtonStyle.green, custom_id="compose")) + + # The callback for the button + async def interaction_check(self, interaction: discord.Interaction) -> bool: + # Check that the interaction was for custom_id "compose" + if interaction.data["custom_id"] == "compose": + # Check that the user selected at least one index + if len(self.index_select.values) < 2: + await interaction.response.send_message("You must select at least two indexes") + else: + composing_message = await interaction.response.send_message("Composing indexes, this may take a long time...", ephemeral=True, delete_after=120) + # Compose the indexes + await self.index_cog.compose_indexes(self.user_id,self.index_select.values,self.name) + await interaction.followup.send("Composed indexes", ephemeral=True, delete_after=10) + + try: + await composing_message.delete() + except: + pass + else: + await interaction.response.defer() \ No newline at end of file From fd2ab04d933e928614a52dbb9fb95a28c8532183 Mon Sep 17 00:00:00 2001 From: Kaveen Kumarasinghe Date: Sun, 5 Feb 2023 01:52:43 -0500 Subject: [PATCH 18/23] More composability --- README.md | 12 +++- models/index_model.py | 135 ++++++++++++++++++++++++++++-------------- 2 files changed, 103 insertions(+), 44 deletions(-) diff --git a/README.md b/README.md index e4150bc..5068cec 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ [![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg?style=flat-square)](http://makeapullrequest.com) # Overview -A robust, all-in-one GPT3 interface for Discord. Chat just like ChatGPT right inside Discord! Generate beautiful AI art using DALL-E 2! Automatically moderate your server using AI! A thorough integration with permanent conversation memory, automatic request retry, fault tolerance and reliability for servers of any scale, and much more. +A robust, all-in-one GPT3 interface for Discord. 
Chat just like ChatGPT right inside Discord! Generate beautiful AI art using DALL-E 2! Automatically moderate your server using AI! Upload documents, videos, and files to get AI-assisted insights! A thorough integration with permanent conversation memory, automatic request retry, fault tolerance and reliability for servers of any scale, and much more. SUPPORT SERVER FOR BOT SETUP: https://discord.gg/WvAHXDMS7Q (You can try out the bot here also in a limited fashion) # Screenshots @@ -25,6 +25,10 @@ SUPPORT SERVER FOR BOT SETUP: https://discord.gg/WvAHXDMS7Q (You can try out the # Recent Notable Updates - **CUSTOM INDEXES** - This is a huge update. You can now upload files to your server and use them as custom context when asking GPT3 questions. You can also use webpage links as context, images, full documents, csvs, powerpoints, audio files, and even **youtube videos**! Read more in the 'Custom Indexes' section below. +

+ +

+ # Features - **Directly prompt GPT3 with `/gpt ask `** @@ -181,12 +185,18 @@ Permanent memory using pinecone is still in alpha, I will be working on cleaning # Custom Indexes / Knowledgebase This bot supports per-user custom indexes. This means that users can upload files of their choosing, such as PDFs and ask GPT to answer questions based on those files. We also support using URLs for indexes. +**This feature uses a large amount of tokens and money, and you should restrict it to trusted users.** + Supported filetypes: - All text and data based files (PDF, TXT, DOCX, PPTX, CSV etc) - Images (JPG, PNG, etc) (Note: The bot will do OCR on the images to extract the text, this requires a lot of processing power sometimes) - Videos/Audio (MP4, MP3, etc) (Note: The bot will use OpenAI on the audio to extract the text, this requires a lot of processing power sometimes) - **Youtube Videos** - For all youtube videos that are transcribable, the bot will index the entire transcription of the given youtube video URL! +Index Compositions: +Indexes can be combined with other indexes through a composition. To combine indexes, you can run the `/index compose` command, and select the indexes that you want to combine together. You should only combine relevant indexes together, combining irrelevant indexes together will result in poor results (for example, don't upload a math textbook and then upload a large set of poems and combine them together). When creating a composition, you will be given the option to do a "Deep" composition, deep compositions are more detailed and will give you better results, but are incredibly costly and will sometimes take multiple minutes to compose. + +You can also compose a singular index with itself with "Deep Compose", this will give you a more detailed version of the index, but will be costly and will sometimes take multiple minutes to compose. # Translations with DeepL This bot supports and uses DeepL for translations (optionally). 
If you want to enable the translations service, you can add a line in your `.env` file as follows: diff --git a/models/index_model.py b/models/index_model.py index 1b4ae62..088ba60 100644 --- a/models/index_model.py +++ b/models/index_model.py @@ -28,15 +28,6 @@ def get_and_query(user_id, index_storage, query, llm_predictor): if isinstance(index, GPTSimpleVectorIndex): response = index.query(query,verbose=True) else: - query_configs = [ - QueryConfig( - index_struct_type=IndexStructType.TREE, - query_mode=QueryMode.RECURSIVE, - query_kwargs={ - "child_branch_factor": 4 - } - ) - ] response = index.query(query, verbose=True, query_configs=[]) return response @@ -54,6 +45,12 @@ class IndexData: def queryable(self): return self.queryable_index is not None + def has_indexes(self, user_id): + try: + return len(os.listdir(f"{app_root_path()}/indexes/{user_id}")) > 1 + except: + return False + def add_index(self, index, user_id, file_name): self.individual_indexes.append(index) self.queryable_index = index @@ -61,7 +58,7 @@ class IndexData: # Create a folder called "indexes/{USER_ID}" if it doesn't exist already Path(f"{app_root_path()}/indexes/{user_id}").mkdir(parents=True, exist_ok=True) # Save the index to file under the user id - index.save_to_disk(app_root_path() / "indexes" / f"{str(user_id)}"/f"{file_name}_{date.today()}-H{datetime.now().hour}.json") + index.save_to_disk(app_root_path() / "indexes" / f"{str(user_id)}"/f"{file_name}_{date.today()}.json") def reset_indexes(self, user_id): self.individual_indexes = [] @@ -227,7 +224,7 @@ class Index_handler: except Exception as e: await ctx.respond(e) - async def compose_indexes(self, user_id, indexes, name): + async def compose_indexes(self, user_id, indexes, name, deep_compose): # Load all the indexes first index_objects = [] for _index in indexes: @@ -236,30 +233,46 @@ class Index_handler: index_objects.append(index) # For each index object, add its documents to a GPTTreeIndex - tree_indexes = [] - for _index in 
index_objects: - # Get all the document objects out of _index.docstore.docs - document_ids = [docmeta for docmeta in _index.docstore.docs.keys()] - documents = list([_index.docstore.get_document(doc_id) for doc_id in document_ids if isinstance(_index.docstore.get_document(doc_id), Document)]) - tree_index = GPTTreeIndex(documents=documents) - - summary = tree_index.query( - "What is a summary of this document?", mode="summarize" - ) - tree_index.set_text(str(summary)) - tree_indexes.append(tree_index) + if deep_compose: + tree_indexes = [] + for _index in index_objects: + # Get all the document objects out of _index.docstore.docs + document_ids = [docmeta for docmeta in _index.docstore.docs.keys()] + documents = list([_index.docstore.get_document(doc_id) for doc_id in document_ids if isinstance(_index.docstore.get_document(doc_id), Document)]) + tree_index = GPTTreeIndex(documents=documents) + + summary = tree_index.query( + "What is a summary of this document?", mode="summarize" + ) + + tree_index.set_text(str(summary)) + tree_indexes.append(tree_index) + + # Now we have a list of tree indexes, we can compose them + list_index = GPTListIndex(tree_indexes) + graph = ComposableGraph.build_from_index(list_index) - # Now we have a list of tree indexes, we can compose them - list_index = GPTListIndex(tree_indexes) - graph = ComposableGraph.build_from_index(list_index) + if not name: + name = f"composed_deep_index_{date.today()}.json" + + # Save the composed index + graph.save_to_disk(f"indexes/{user_id}/{name}.json") + + self.index_storage[user_id].queryable_index = graph + else: + documents = [] + for _index in index_objects: + [documents.append(_index.docstore.get_document(doc_id)) for doc_id in [docmeta for docmeta in _index.docstore.docs.keys()] if isinstance(_index.docstore.get_document(doc_id), Document)] - if not name: - name = f"composed_index_{date.today()}-H{datetime.now().hour}.json" + # Add everything into a simple vector index + simple_index = 
GPTSimpleVectorIndex(documents=documents) - # Save the composed index - graph.save_to_disk(f"indexes/{user_id}/{name}.json") + if not name: + name = f"composed_index_{date.today()}.json" - self.index_storage[user_id].queryable_index = graph + # Save the composed index + simple_index.save_to_disk(f"indexes/{user_id}/{name}.json") + self.index_storage[user_id].queryable_index = simple_index async def backup_discord(self, ctx: discord.ApplicationContext, user_api_key): @@ -275,7 +288,7 @@ class Index_handler: document = await self.load_data(channel_ids=channel_ids, limit=3000, oldest_first=False) index = await self.loop.run_in_executor(None, partial(self.index_discord, document)) Path(app_root_path() / "indexes").mkdir(parents = True, exist_ok=True) - index.save_to_disk(app_root_path() / "indexes" / f"{ctx.guild.name.replace(' ', '-')}_{date.today()}-H{datetime.now().hour}.json") + index.save_to_disk(app_root_path() / "indexes" / f"{ctx.guild.name.replace(' ', '-')}_{date.today()}.json") await ctx.respond("Backup saved") except Exception: @@ -381,19 +394,20 @@ class Index_handler: else: os.environ["OPENAI_API_KEY"] = user_api_key - if not self.index_storage[ctx.user.id].queryable(): + if not self.index_storage[ctx.user.id].has_indexes(ctx.user.id): await ctx.respond("You must load at least two indexes before composing") return - await ctx.respond("Select the indexes to compose.", view=ComposeModal(self, ctx.user.id, name)) + await ctx.respond("Select the indexes to compose.", view=ComposeModal(self, ctx.user.id, name), ephemeral=True) class ComposeModal(discord.ui.View): - def __init__(self, index_cog, user_id, name=None) -> None: + def __init__(self, index_cog, user_id, name=None, deep=None) -> None: super().__init__() # Get the argument named "user_key_db" and save it as USER_KEY_DB self.index_cog = index_cog self.user_id = user_id + self.deep = deep # Get all the indexes for the user self.indexes = [ @@ -404,20 +418,51 @@ class ComposeModal(discord.ui.View): # A 
text entry field for the name of the composed index self.name = name - # A discord UI select menu with all the indexes + # A discord UI select menu with all the indexes. Limited to 25 entries self.index_select = discord.ui.Select( - placeholder="Select an index", + placeholder="Select multiple indexes to query", options=[ discord.SelectOption(label=index, value=index) for index in self.indexes - ], - max_values=len(self.indexes), + ][0:25], + max_values=len(self.indexes) if len(self.indexes) < 25 else 25, min_values=1, ) # Add the select menu to the modal self.add_item(self.index_select) + # If we have more than 25 entries, add more Select fields as neccessary + self.extra_index_selects = [] + if len(self.indexes) > 25: + for i in range(25, len(self.indexes), 25): + self.extra_index_selects.append(discord.ui.Select( + placeholder="Select multiple indexes to query", + options=[ + discord.SelectOption(label=index, value=index) + for index in self.indexes + ][i:i+25], + max_values=len(self.indexes[i:i+25]), + min_values=1, + )) + self.add_item(self.extra_index_selects[-1]) + + + + + # Add an input field for "Deep", a "yes" or "no" option, default no + self.deep_select = discord.ui.Select( + placeholder="Deep Compose", + options=[ + discord.SelectOption(label="Yes", value="yes"), + discord.SelectOption(label="No", value="no") + ], + max_values=1, + min_values=1 + ) + self.add_item(self.deep_select) + + # Add a button to the modal called "Compose" self.add_item(discord.ui.Button(label="Compose", style=discord.ButtonStyle.green, custom_id="compose")) @@ -426,12 +471,16 @@ class ComposeModal(discord.ui.View): # Check that the interaction was for custom_id "compose" if interaction.data["custom_id"] == "compose": # Check that the user selected at least one index - if len(self.index_select.values) < 2: - await interaction.response.send_message("You must select at least two indexes") + + # The total list of indexes is the union of the values of all the select menus + indexes 
= self.index_select.values + [select.values[0] for select in self.extra_index_selects] + + if len(indexes) < 1: + await interaction.response.send_message("You must select at least 1 index", ephemeral=True) else: composing_message = await interaction.response.send_message("Composing indexes, this may take a long time...", ephemeral=True, delete_after=120) # Compose the indexes - await self.index_cog.compose_indexes(self.user_id,self.index_select.values,self.name) + await self.index_cog.compose_indexes(self.user_id,indexes,self.name, False if not self.deep_select.values or self.deep_select.values[0] == "no" else True) await interaction.followup.send("Composed indexes", ephemeral=True, delete_after=10) try: @@ -439,4 +488,4 @@ class ComposeModal(discord.ui.View): except: pass else: - await interaction.response.defer() \ No newline at end of file + await interaction.response.defer(ephemeral=True) \ No newline at end of file From 319ae3431826d4c4b1af29f74e4a0958f73d18ef Mon Sep 17 00:00:00 2001 From: Kaveen Kumarasinghe Date: Sun, 5 Feb 2023 03:05:53 -0500 Subject: [PATCH 19/23] composability improvements --- README.md | 2 +- models/index_model.py | 69 +++++++++++++++++++++---------------------- 2 files changed, 34 insertions(+), 37 deletions(-) diff --git a/README.md b/README.md index 5068cec..80676a7 100644 --- a/README.md +++ b/README.md @@ -196,7 +196,7 @@ Supported filetypes: Index Compositions: Indexes can be combined with other indexes through a composition. To combine indexes, you can run the `/index compose` command, and select the indexes that you want to combine together. You should only combine relevant indexes together, combining irrelevant indexes together will result in poor results (for example, don't upload a math textbook and then upload a large set of poems and combine them together). 
When creating a composition, you will be given the option to do a "Deep" composition, deep compositions are more detailed and will give you better results, but are incredibly costly and will sometimes take multiple minutes to compose. -You can also compose a singular index with itself with "Deep Compose", this will give you a more detailed version of the index, but will be costly and will sometimes take multiple minutes to compose. +You can also compose a singular index with itself with "Deep Compose", this will give you a more detailed version of the index, but will be costly and will sometimes take multiple minutes to compose. **Deep compositions are useless for very short documents!** # Translations with DeepL This bot supports and uses DeepL for translations (optionally). If you want to enable the translations service, you can add a line in your `.env` file as follows: diff --git a/models/index_model.py b/models/index_model.py index 088ba60..f6947b2 100644 --- a/models/index_model.py +++ b/models/index_model.py @@ -21,15 +21,15 @@ from gpt_index.composability import ComposableGraph from services.environment_service import EnvService, app_root_path +SHORT_TO_LONG_CACHE = {} def get_and_query(user_id, index_storage, query, llm_predictor): # TODO Do prediction here for token usage index: [GPTSimpleVectorIndex, ComposableGraph] = index_storage[user_id].get_index_or_throw() - if isinstance(index, GPTSimpleVectorIndex): - response = index.query(query,verbose=True) + if isinstance(index, GPTTreeIndex): + response = index.query(query, verbose=True, child_branch_factor=2) else: - response = index.query(query, verbose=True, query_configs=[]) - + response = index.query(query,verbose=True) return response class IndexData: @@ -47,7 +47,7 @@ class IndexData: def has_indexes(self, user_id): try: - return len(os.listdir(f"{app_root_path()}/indexes/{user_id}")) > 1 + return len(os.listdir(f"{app_root_path()}/indexes/{user_id}")) > 0 except: return False @@ -58,7 +58,7 @@ class 
IndexData: # Create a folder called "indexes/{USER_ID}" if it doesn't exist already Path(f"{app_root_path()}/indexes/{user_id}").mkdir(parents=True, exist_ok=True) # Save the index to file under the user id - index.save_to_disk(app_root_path() / "indexes" / f"{str(user_id)}"/f"{file_name}_{date.today()}.json") + index.save_to_disk(app_root_path() / "indexes" / f"{str(user_id)}"/f"{file_name}_{date.today().month}_{date.today().day}.json") def reset_indexes(self, user_id): self.individual_indexes = [] @@ -108,10 +108,10 @@ class Index_handler: return index def index_load_file(self, file_path) -> [GPTSimpleVectorIndex, ComposableGraph]: - if not "composed" in str(file_path): - index = GPTSimpleVectorIndex.load_from_disk(file_path) + if "composed_deep" in str(file_path): + index = GPTTreeIndex.load_from_disk(file_path) else: - index = ComposableGraph.load_from_disk(file_path) + index = GPTSimpleVectorIndex.load_from_disk(file_path) return index def index_discord(self, document) -> GPTSimpleVectorIndex: @@ -234,31 +234,19 @@ class Index_handler: # For each index object, add its documents to a GPTTreeIndex if deep_compose: - tree_indexes = [] + documents = [] for _index in index_objects: - # Get all the document objects out of _index.docstore.docs - document_ids = [docmeta for docmeta in _index.docstore.docs.keys()] - documents = list([_index.docstore.get_document(doc_id) for doc_id in document_ids if isinstance(_index.docstore.get_document(doc_id), Document)]) - tree_index = GPTTreeIndex(documents=documents) - - summary = tree_index.query( - "What is a summary of this document?", mode="summarize" - ) - - tree_index.set_text(str(summary)) - tree_indexes.append(tree_index) + [documents.append(_index.docstore.get_document(doc_id)) for doc_id in [docmeta for docmeta in _index.docstore.docs.keys()] if isinstance(_index.docstore.get_document(doc_id), Document)] + tree_index = GPTTreeIndex(documents=documents) # Now we have a list of tree indexes, we can compose them - 
list_index = GPTListIndex(tree_indexes) - graph = ComposableGraph.build_from_index(list_index) - if not name: - name = f"composed_deep_index_{date.today()}.json" + name = f"composed_deep_index_{date.today().month}_{date.today().day}.json" # Save the composed index - graph.save_to_disk(f"indexes/{user_id}/{name}.json") + tree_index.save_to_disk(f"indexes/{user_id}/{name}.json") - self.index_storage[user_id].queryable_index = graph + self.index_storage[user_id].queryable_index = tree_index else: documents = [] for _index in index_objects: @@ -268,7 +256,7 @@ class Index_handler: simple_index = GPTSimpleVectorIndex(documents=documents) if not name: - name = f"composed_index_{date.today()}.json" + name = f"composed_index_{date.today().month}_{date.today().day}.json" # Save the composed index simple_index.save_to_disk(f"indexes/{user_id}/{name}.json") @@ -288,7 +276,7 @@ class Index_handler: document = await self.load_data(channel_ids=channel_ids, limit=3000, oldest_first=False) index = await self.loop.run_in_executor(None, partial(self.index_discord, document)) Path(app_root_path() / "indexes").mkdir(parents = True, exist_ok=True) - index.save_to_disk(app_root_path() / "indexes" / f"{ctx.guild.name.replace(' ', '-')}_{date.today()}.json") + index.save_to_disk(app_root_path() / "indexes" / f"{ctx.guild.name.replace(' ', '-')}_{date.today().month}_{date.today().day}.json") await ctx.respond("Backup saved") except Exception: @@ -395,10 +383,10 @@ class Index_handler: os.environ["OPENAI_API_KEY"] = user_api_key if not self.index_storage[ctx.user.id].has_indexes(ctx.user.id): - await ctx.respond("You must load at least two indexes before composing") + await ctx.respond("You must load at least one indexes before composing") return - await ctx.respond("Select the indexes to compose.", view=ComposeModal(self, ctx.user.id, name), ephemeral=True) + await ctx.respond("Select the index(es) to compose. 
You can compose multiple indexes together, you can also Deep Compose a single index.", view=ComposeModal(self, ctx.user.id, name), ephemeral=True) class ComposeModal(discord.ui.View): @@ -415,14 +403,20 @@ class ComposeModal(discord.ui.View): for file in os.listdir(EnvService.find_shared_file(f"indexes/{str(user_id)}/")) ] + # Map everything into the short to long cache + for index in self.indexes: + SHORT_TO_LONG_CACHE[index[:99]] = index + # A text entry field for the name of the composed index self.name = name - # A discord UI select menu with all the indexes. Limited to 25 entries + # A discord UI select menu with all the indexes. Limited to 25 entries. For the label field in the SelectOption, + # cut it off at 100 characters to prevent the message from being too long + self.index_select = discord.ui.Select( - placeholder="Select multiple indexes to query", + placeholder="Select index(es) to compose", options=[ - discord.SelectOption(label=index, value=index) + discord.SelectOption(label=str(index)[:99], value=index[:99]) for index in self.indexes ][0:25], max_values=len(self.indexes) if len(self.indexes) < 25 else 25, @@ -437,9 +431,9 @@ class ComposeModal(discord.ui.View): if len(self.indexes) > 25: for i in range(25, len(self.indexes), 25): self.extra_index_selects.append(discord.ui.Select( - placeholder="Select multiple indexes to query", + placeholder="Select index(es) to compose", options=[ - discord.SelectOption(label=index, value=index) + discord.SelectOption(label=index[:99], value=index[:99]) for index in self.indexes ][i:i+25], max_values=len(self.indexes[i:i+25]), @@ -475,6 +469,9 @@ class ComposeModal(discord.ui.View): # The total list of indexes is the union of the values of all the select menus indexes = self.index_select.values + [select.values[0] for select in self.extra_index_selects] + # Remap them from the SHORT_TO_LONG_CACHE + indexes = [SHORT_TO_LONG_CACHE[index] for index in indexes] + if len(indexes) < 1: await 
interaction.response.send_message("You must select at least 1 index", ephemeral=True) else: From 9a46f5c37ae2d2d223c84621bbd592759b76d610 Mon Sep 17 00:00:00 2001 From: Kaveen Kumarasinghe Date: Sun, 5 Feb 2023 03:09:59 -0500 Subject: [PATCH 20/23] Update readme --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 80676a7..ca7d943 100644 --- a/README.md +++ b/README.md @@ -109,6 +109,8 @@ This bot supports per-user custom indexes. This means that users can upload file `/index load index:` - Load a previously created index to query +`/index compose` - Combine multiple saved indexes into one, or upgrade existing indexes into Deep Compositions. + `/index reset` - Reset and delete all of your saved indexes `/index add_discord channel:` - Create an add an index based on a discord channel From 093c223fae9f3150aa7257f315cadf30ac049f23 Mon Sep 17 00:00:00 2001 From: Kaveen Kumarasinghe Date: Sun, 5 Feb 2023 03:10:42 -0500 Subject: [PATCH 21/23] bump version --- gpt3discord.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpt3discord.py b/gpt3discord.py index 2a8c790..7ac09c6 100644 --- a/gpt3discord.py +++ b/gpt3discord.py @@ -31,7 +31,7 @@ from services.environment_service import EnvService from models.openai_model import Model -__version__ = "9.0.4" +__version__ = "10.0.0" PID_FILE = Path("bot.pid") From 749e660e3712906cbc702a80ffb39007e93ca369 Mon Sep 17 00:00:00 2001 From: Kaveen Kumarasinghe Date: Sun, 5 Feb 2023 03:11:55 -0500 Subject: [PATCH 22/23] update readme --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index ca7d943..109d046 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,6 @@ SUPPORT SERVER FOR BOT SETUP: https://discord.gg/WvAHXDMS7Q (You can try out the -

From acc29b3e57f2c802dacf7cf7cf047fe32f11f8d6 Mon Sep 17 00:00:00 2001 From: Kaveen Kumarasinghe Date: Sun, 5 Feb 2023 03:16:07 -0500 Subject: [PATCH 23/23] update readme --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 109d046..71bb218 100644 --- a/README.md +++ b/README.md @@ -18,12 +18,13 @@ SUPPORT SERVER FOR BOT SETUP: https://discord.gg/WvAHXDMS7Q (You can try out the +

# Recent Notable Updates -- **CUSTOM INDEXES** - This is a huge update. You can now upload files to your server and use them as custom context when asking GPT3 questions. You can also use webpage links as context, images, full documents, csvs, powerpoints, audio files, and even **youtube videos**! Read more in the 'Custom Indexes' section below. +- **CUSTOM INDEXES** - This is a huge update. You can now upload files to your Discord server and use them as a source of knowledge when asking GPT3 questions. You can also use webpage links as context, images, full documents, CSVs, PowerPoints, audio files, and even **YouTube videos**! Read more in the 'Custom Indexes' section below.