diff --git a/.gitignore b/.gitignore index a95b25f..3411cda 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,5 @@ __pycache__ *.sqlite bot.pid usage.txt -/dalleimages \ No newline at end of file +/dalleimages +/indexes \ No newline at end of file diff --git a/README.md b/README.md index 8054d5c..71bb218 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ [![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg?style=flat-square)](http://makeapullrequest.com) # Overview -A robust, all-in-one GPT3 interface for Discord. Chat just like ChatGPT right inside Discord! Generate beautiful AI art using DALL-E 2! Automatically moderate your server using AI! A thorough integration with permanent conversation memory, automatic request retry, fault tolerance and reliability for servers of any scale, and much more. +A robust, all-in-one GPT3 interface for Discord. Chat just like ChatGPT right inside Discord! Generate beautiful AI art using DALL-E 2! Automatically moderate your server using AI! Upload documents, videos, and files to get AI-assisted insights! A thorough integration with permanent conversation memory, automatic request retry, fault tolerance and reliability for servers of any scale, and much more. SUPPORT SERVER FOR BOT SETUP: https://discord.gg/WvAHXDMS7Q (You can try out the bot here also in a limited fashion) # Screenshots @@ -18,24 +18,16 @@ SUPPORT SERVER FOR BOT SETUP: https://discord.gg/WvAHXDMS7Q (You can try out the - +

# Recent Notable Updates -- **Translations with DeepL** - DeepL integration for translations. `/translate` - - -- **Context menu commands** - Allow people to prompt GPT and DALL-E directly by right clicking a message: -
-
- - -- **AI-BASED SERVER MODERATION** - GPT3Discord now has a built-in AI-based moderation system that can automatically detect and remove toxic messages from your server. This is a great way to keep your server safe and clean, and it's completely automatic and **free**! Check out the commands section to learn how to enable it! - - -- **Permanent memory with embeddings and Pinecone finished!** - An initial alpha version of permanent memory is now done! This allows you to chat with GPT3 infinitely and accurately, and save tokens, by using embeddings. *Please read the Permanent Memory section for more information!* +- **CUSTOM INDEXES** - This is a huge update. You can now upload files to your discord server and use them as a source of knowledge when asking GPT3 questions. You can also use webpage links as context, images, full documents, csvs, powerpoints, audio files, and even **youtube videos**! Read more in the 'Custom Indexes' section below. +

+ +

# Features @@ -43,6 +35,8 @@ SUPPORT SERVER FOR BOT SETUP: https://discord.gg/WvAHXDMS7Q (You can try out the - **Have long term, permanent conversations with the bot, just like chatgpt, with `/gpt converse`** - Conversations happen in threads that get automatically cleaned up! +- **Custom Indexes** - Use your own files, PDFs, txt files, websites, and Discord channel content as context when asking GPT3 questions! + - **DALL-E Image Generation** - Generate DALL-E AI images right in discord with `/dalle draw `! It even supports multiple image qualities, multiple images, creating image variants, retrying, and saving images. - **DALL-E Image Prompt Optimization** - Given some text that you're trying to generate an image for, the bot will automatically optimize the text to be more DALL-E friendly! `/dalle optimize ` @@ -105,6 +99,25 @@ These commands are grouped, so each group has a prefix but you can easily tab co `/dalle optimize ` Optimize a given prompt text for DALL-E image generation. +### Custom Indexes Commands + +This bot supports per-user custom indexes. This means that users can upload files of their choosing, such as PDFs, and ask GPT to answer questions based on those files. + +`/index add file: or link:` - Use a document or a link to create/add to your indexes. If you provide a YouTube link, the transcript of the video will be used. If you provide a web URL, the contents of the webpage will be used; if you provide an image, the text will be extracted from the image and used! + +`/index query query:` - Query your current index for a given prompt. GPT will answer based on your current document/index. + +`/index load index:` - Load a previously created index to query. + +`/index compose` - Combine multiple saved indexes into one, or upgrade existing indexes into Deep Compositions. + +`/index reset` - Reset and delete all of your saved indexes. + +`/index add_discord channel:` - Create and add an index based on a Discord channel. + +`/index discord_backup` - Use the last 3000 messages of every channel on your Discord server as an index. + + ### System and Settings `/system settings` - Display settings for the model (temperature, top_p, etc) @@ -171,18 +184,21 @@ Then, name the index `conversation-embeddings`, set the dimensions to `1536`, an Permanent memory using pinecone is still in alpha, I will be working on cleaning up this work, adding auto-clearing, and optimizing for stability and reliability, any help and feedback is appreciated (**add me on Discord Kaveen#0001 for pinecone help**)! If at any time you're having too many issues with pinecone, simply remove the `PINECONE_TOKEN` line in your `.env` file and the bot will revert to using conversation summarizations. -### Permanent overrides in threads -This bot now supports having overrides be permanent in an entire conversation if you use an opener file which includes them. The new opener files should be .json files formatted like this. `text` corresponds to what you want the conversational opener to be and the rest map 1:1 to the appropriate model settings. An example .json file is included by the name of `english_translator.json` in the `openers` folder -```json -{ - "text": "your prompt", - "temp":0, - "top_p":0, - "frequency_penalty":0, - "presence_penalty":0 -} -``` +# Custom Indexes / Knowledgebase +This bot supports per-user custom indexes. This means that users can upload files of their choosing, such as PDFs, and ask GPT to answer questions based on those files. We also support using URLs for indexes.
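Under the hood, indexes are built and queried with the `gpt-index` library (pinned to `0.3.4` in `requirements.txt`). The sketch below is a rough, simplified illustration of the `/index add`, `/index load`, and `/index query` flow, using the same calls the bot itself makes (`SimpleDirectoryReader`, `GPTSimpleVectorIndex`, `save_to_disk`, `load_from_disk`, `query`); the paths, filenames, and query text are made up for the example.

```python
from gpt_index import GPTSimpleVectorIndex, SimpleDirectoryReader

# /index add: read the uploaded file(s) from a temporary directory, build a
# per-user vector index, and persist it under indexes/{user_id}/
documents = SimpleDirectoryReader("path/to/uploaded_files").load_data()
index = GPTSimpleVectorIndex(documents)
index.save_to_disk("indexes/1234567890/my_document_2_4.json")

# /index load + /index query: reload a saved index and answer a question from it
index = GPTSimpleVectorIndex.load_from_disk("indexes/1234567890/my_document_2_4.json")
response = index.query("What is this document about?", verbose=True)
print(response.response)
```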
+**This feature uses a large amount of tokens and money, and you should restrict it to trusted users.** + +Supported filetypes: +- All text- and data-based files (PDF, TXT, DOCX, PPTX, CSV, etc.) +- Images (JPG, PNG, etc.) (Note: the bot will run OCR on the images to extract the text, which can require significant processing power) +- Videos/Audio (MP4, MP3, etc.) (Note: the bot will use OpenAI's Whisper to transcribe the audio, which can require significant processing power) +- **YouTube Videos** - For all YouTube videos that are transcribable, the bot will index the entire transcript of the given YouTube video URL! + +Index Compositions: +Indexes can be combined with other indexes through a composition. To combine indexes, you can run the `/index compose` command and select the indexes that you want to combine. You should only combine relevant indexes; combining irrelevant indexes will result in poor results (for example, don't upload a math textbook and then upload a large set of poems and combine them together). When creating a composition, you will be given the option to do a "Deep" composition. Deep compositions are more detailed and will give you better results, but they are incredibly costly and will sometimes take multiple minutes to compose. + +You can also compose a single index with itself with "Deep Compose"; this will give you a more detailed version of the index, but it will be costly and will sometimes take multiple minutes to compose. **Deep compositions are useless for very short documents!** # Translations with DeepL This bot supports and uses DeepL for translations (optionally). If you want to enable the translations service, you can add a line in your `.env` file as follows: @@ -240,6 +256,15 @@ For example, if I wanted to change the number of images generated by DALL-E by d # Requirements +**For OCR and document functionality**: +`pip3 install torch==1.9.1+cpu torchvision==0.10.1+cpu -f https://download.pytorch.org/whl/torch_stable.html` +or +`python3.9 -m pip install torch==1.9.1+cpu torchvision==0.10.1+cpu -f https://download.pytorch.org/whl/torch_stable.html` + +**For audio extraction for indexing from .mp3 and .mp4 files**: +`python3.9 -m pip install git+https://github.com/openai/whisper.git` + +**All other dependencies**: `python3.9 -m pip install -r requirements.txt` **I recommend using python 3.9!** @@ -321,6 +346,7 @@ python3.9 get-pip.py # Install project dependencies python3.9 -m pip install --ignore-installed PyYAML +python3.9 -m pip install torch==1.9.1+cpu torchvision==0.10.1+cpu -f https://download.pytorch.org/whl/torch_stable.html python3.9 -m pip install -r requirements.txt python3.9 -m pip install . @@ -371,12 +397,24 @@ git pull python3.9 -m pip install -r requirements.txt python3.9 -m pip install . ``` +### Permanent overrides in threads +This bot now supports having overrides be permanent in an entire conversation if you use an opener file which includes them. The new opener files should be .json files formatted like this: `text` corresponds to what you want the conversational opener to be, and the rest map 1:1 to the appropriate model settings. An example .json file named `english_translator.json` is included in the `openers` folder. +```json +{ + "text": "your prompt", + "temp":0, + "top_p":0, + "frequency_penalty":0, + "presence_penalty":0 +} +``` # Non-Server, Non-Docker usage With python3.9 installed and the requirements installed, you can run this bot anywhere.
Install the dependencies with: +`pip3 install torch==1.9.1+cpu torchvision==0.10.1+cpu -f https://download.pytorch.org/whl/torch_stable.html` `python3.9 -m pip install -r requirements.txt` Then, run the bot with: diff --git a/cogs/commands.py b/cogs/commands.py index ebc7dc1..e2e7288 100644 --- a/cogs/commands.py +++ b/cogs/commands.py @@ -26,6 +26,7 @@ class Commands(discord.Cog, name="Commands"): image_draw_cog, image_service_cog, moderations_cog, + index_cog, translations_cog=None, search_cog=None, ): @@ -39,6 +40,7 @@ class Commands(discord.Cog, name="Commands"): self.image_draw_cog = image_draw_cog self.image_service_cog = image_service_cog self.moderations_cog = moderations_cog + self.index_cog = index_cog self.translations_cog = translations_cog self.search_cog = search_cog @@ -67,6 +69,12 @@ class Commands(discord.Cog, name="Commands"): guild_ids=ALLOWED_GUILDS, checks=[Check.check_admin_roles()], ) + index = discord.SlashCommandGroup( + name="index", + description="Custom index commands for the bot", + guild_ids=ALLOWED_GUILDS, + checks=[Check.check_index_roles()], + ) # # System commands @@ -512,6 +520,98 @@ class Commands(discord.Cog, name="Commands"): async def end(self, ctx: discord.ApplicationContext): await self.converser_cog.end_command(ctx) + # + # Index commands + # + + @add_to_group("index") + @discord.slash_command( + name="load", + description="Select one of your saved indexes to query from", + guild_ids=ALLOWED_GUILDS + ) + @discord.guild_only() + @discord.option(name="index", description="Which file to load the index from", required=True, autocomplete=File_autocompleter.get_indexes) + async def load_index(self, ctx:discord.ApplicationContext, index: str): + await self.index_cog.load_index_command(ctx, index) + + + @add_to_group("index") + @discord.slash_command( + name="add", + description="Add an index to query from", + guild_ids=ALLOWED_GUILDS + ) + @discord.guild_only() + @discord.option(name="file", description="A file to create the index from", required=False, input_type=discord.SlashCommandOptionType.attachment) + @discord.option(name="link", description="A link to a file to a webpage ", required=False, input_type=str) + async def set_file(self, ctx:discord.ApplicationContext, file: discord.Attachment, link: str): + await self.index_cog.set_index_command(ctx, file, link) + + @add_to_group("index") + @discord.slash_command( + name="reset", + description="Reset (delete) all of your saved indexes", + guild_ids=ALLOWED_GUILDS + ) + @discord.guild_only() + async def reset(self, ctx:discord.ApplicationContext): + await self.index_cog.reset_command(ctx) + + @add_to_group("index") + @discord.slash_command( + name="compose", + description="Combine multiple indexes together", + guild_ids=ALLOWED_GUILDS + ) + @discord.option(name="name", description="The name of the new index", required=False, input_type=discord.SlashCommandOptionType.string) + @discord.guild_only() + async def compose(self, ctx:discord.ApplicationContext, name : str): + await self.index_cog.compose_command(ctx,name) + + @add_to_group("index") + @discord.slash_command( + name="add_discord", + description="Set a index from a discord channel", + guild_ids=ALLOWED_GUILDS + ) + @discord.guild_only() + @discord.option(name="channel", description="A channel to create the index from", required=False, input_type=discord.SlashCommandOptionType.channel) + async def set_discord(self, ctx:discord.ApplicationContext, channel: discord.TextChannel): + await self.index_cog.set_discord_command(ctx, channel) + + + 
@add_to_group("index") + @discord.slash_command( + name="discord_backup", + description="Save an index made from the whole server", + guild_ids=ALLOWED_GUILDS + ) + @discord.guild_only() + async def discord_backup(self, ctx:discord.ApplicationContext): + await self.index_cog.discord_backup_command(ctx) + + + @add_to_group("index") + @discord.slash_command( + name="query", + description="Query from your index", + guild_ids=ALLOWED_GUILDS + ) + @discord.guild_only() + @discord.option(name="query", description="What to query the index", required=True) + @discord.option( + name="response_mode", + description="Response mode", + guild_ids=ALLOWED_GUILDS, + required=False, + default="default", + choices=["default", "compact", "tree_summarize"] + ) + async def query(self, ctx:discord.ApplicationContext, query: str, response_mode: str): + await self.index_cog.query_command(ctx, query, response_mode) + + # # DALLE commands # diff --git a/cogs/index_service_cog.py b/cogs/index_service_cog.py new file mode 100644 index 0000000..b7fd70a --- /dev/null +++ b/cogs/index_service_cog.py @@ -0,0 +1,111 @@ +import traceback + +import discord + +from services.environment_service import EnvService +from services.text_service import TextService +from models.index_model import Index_handler + +USER_INPUT_API_KEYS = EnvService.get_user_input_api_keys() +USER_KEY_DB = EnvService.get_api_db() + +class IndexService(discord.Cog, name="IndexService"): + """Cog containing gpt-index commands""" + def __init__( + self, + bot, + usage_service, + ): + super().__init__() + self.bot = bot + self.index_handler = Index_handler(bot, usage_service) + + async def set_index_command(self, ctx, file: discord.Attachment = None, link: str = None): + """Command handler to set a file as your personal index""" + if not file and not link: + await ctx.respond("Please provide a file or a link") + return + + if file and link: + await ctx.respond("Please provide only one file or link. Only one or the other.") + return + + user_api_key = None + if USER_INPUT_API_KEYS: + user_api_key = await TextService.get_user_api_key(ctx.user.id, ctx, USER_KEY_DB) + if not user_api_key: + return + + await ctx.defer(ephemeral=True) + if file: + await self.index_handler.set_file_index(ctx, file, user_api_key=user_api_key) + elif link: + await self.index_handler.set_link_index(ctx, link, user_api_key=user_api_key) + + + async def set_discord_command(self, ctx, channel: discord.TextChannel = None): + """Command handler to set a channel as your personal index""" + + user_api_key = None + if USER_INPUT_API_KEYS: + user_api_key = await TextService.get_user_api_key(ctx.user.id, ctx, USER_KEY_DB) + if not user_api_key: + return + + await ctx.defer(ephemeral=True) + await self.index_handler.set_discord_index(ctx, channel, user_api_key=user_api_key) + + async def reset_command(self, ctx): + await ctx.defer(ephemeral=True) + try: + self.index_handler.reset_indexes(ctx.user.id) + await ctx.respond("Your indexes have been reset") + except: + traceback.print_exc() + await ctx.respond("Something went wrong while resetting your indexes. 
Contact the server admin.") + + async def discord_backup_command(self, ctx): + """Command handler to backup the entire server""" + + user_api_key = None + if USER_INPUT_API_KEYS: + user_api_key = await TextService.get_user_api_key(ctx.user.id, ctx, USER_KEY_DB) + if not user_api_key: + return + + await ctx.defer(ephemeral=True) + await self.index_handler.backup_discord(ctx, user_api_key=user_api_key) + + + async def load_index_command(self, ctx, index): + """Command handler to backup the entire server""" + user_api_key = None + if USER_INPUT_API_KEYS: + user_api_key = await TextService.get_user_api_key(ctx.user.id, ctx, USER_KEY_DB) + if not user_api_key: + return + + await ctx.defer(ephemeral=True) + await self.index_handler.load_index(ctx, index, user_api_key) + + + async def query_command(self, ctx, query, response_mode): + """Command handler to query your index""" + user_api_key = None + if USER_INPUT_API_KEYS: + user_api_key = await TextService.get_user_api_key(ctx.user.id, ctx, USER_KEY_DB) + if not user_api_key: + return + + await ctx.defer() + await self.index_handler.query(ctx, query, response_mode, user_api_key) + + async def compose_command(self, ctx, name): + """Command handler to compose from your index""" + user_api_key = None + if USER_INPUT_API_KEYS: + user_api_key = await TextService.get_user_api_key(ctx.user.id, ctx, USER_KEY_DB) + if not user_api_key: + return + + await self.index_handler.compose(ctx, name, user_api_key) diff --git a/gpt3discord.py b/gpt3discord.py index 9db8afb..7ac09c6 100644 --- a/gpt3discord.py +++ b/gpt3discord.py @@ -18,6 +18,7 @@ from cogs.prompt_optimizer_cog import ImgPromptOptimizer from cogs.moderations_service_cog import ModerationsService from cogs.commands import Commands from cogs.translation_service_cog import TranslationService +from cogs.index_service_cog import IndexService from models.deepl_model import TranslationModel from services.health_service import HealthService @@ -30,7 +31,7 @@ from services.environment_service import EnvService from models.openai_model import Model -__version__ = "9.1" +__version__ = "10.0.0" PID_FILE = Path("bot.pid") @@ -68,7 +69,7 @@ if PINECONE_TOKEN: and EnvService.get_google_search_engine_id() ): if PINECONE_INDEX_SEARCH not in pinecone.list_indexes(): - print("Creating pinecone index for seraches. Please wait...") + print("Creating pinecone index for searches. 
Please wait...") pinecone.create_index( PINECONE_INDEX_SEARCH, dimension=1536, @@ -169,6 +170,13 @@ async def main(): ) ) + bot.add_cog( + IndexService( + bot, + usage_service, + ) + ) + if EnvService.get_deepl_token(): bot.add_cog(TranslationService(bot, TranslationModel())) print("The translation service is enabled.") @@ -191,6 +199,7 @@ async def main(): bot.get_cog("DrawDallEService"), bot.get_cog("ImgPromptOptimizer"), bot.get_cog("ModerationsService"), + bot.get_cog("IndexService"), bot.get_cog("TranslationService"), bot.get_cog("SearchService"), ) diff --git a/models/autocomplete_model.py b/models/autocomplete_model.py index 017e8c0..6ace513 100644 --- a/models/autocomplete_model.py +++ b/models/autocomplete_model.py @@ -149,3 +149,16 @@ class File_autocompleter: ] # returns the 25 first files from your current input except Exception: return ["No 'openers' folder"] + + async def get_indexes(ctx: discord.AutocompleteContext): + """get all files in the openers folder""" + try: + return [ + file + for file in os.listdir(EnvService.find_shared_file(f"indexes/{str(ctx.interaction.user.id)}/")) + if file.startswith(ctx.value.lower()) + ][ + :25 + ] # returns the 25 first files from your current input + except Exception: + return ["No 'indexes' folder"] diff --git a/models/check_model.py b/models/check_model.py index b7ebd77..de63e3e 100644 --- a/models/check_model.py +++ b/models/check_model.py @@ -6,6 +6,7 @@ from typing import Callable ADMIN_ROLES = EnvService.get_admin_roles() DALLE_ROLES = EnvService.get_dalle_roles() GPT_ROLES = EnvService.get_gpt_roles() +INDEX_ROLES = EnvService.get_index_roles() TRANSLATOR_ROLES = EnvService.get_translator_roles() ALLOWED_GUILDS = EnvService.get_allowed_guilds() @@ -63,6 +64,23 @@ class Check: return inner + @staticmethod + def check_index_roles() -> Callable: + async def inner(ctx: discord.ApplicationContext): + if INDEX_ROLES == [None]: + return True + if not any(role.name.lower() in INDEX_ROLES for role in ctx.user.roles): + await ctx.defer(ephemeral=True) + await ctx.respond( + f"You don't have permission, list of roles is {INDEX_ROLES}", + ephemeral=True, + delete_after=10, + ) + return False + return True + + return inner + @staticmethod def check_translator_roles() -> Callable: async def inner(ctx: discord.ApplicationContext): diff --git a/models/index_model.py b/models/index_model.py new file mode 100644 index 0000000..f6947b2 --- /dev/null +++ b/models/index_model.py @@ -0,0 +1,488 @@ +import os +import traceback +import asyncio +from collections import defaultdict + +import discord +import aiofiles +from functools import partial +from typing import List, Optional +from pathlib import Path +from datetime import date, datetime + +from gpt_index.readers import YoutubeTranscriptReader +from gpt_index.readers.schema.base import Document +from gpt_index import GPTSimpleVectorIndex, SimpleDirectoryReader, QuestionAnswerPrompt, BeautifulSoupWebReader, \ + GPTFaissIndex, GPTListIndex, QueryMode, GPTTreeIndex, GoogleDocsReader, MockLLMPredictor, QueryConfig, \ + IndexStructType +from gpt_index.readers.web import DEFAULT_WEBSITE_EXTRACTOR + +from gpt_index.composability import ComposableGraph + +from services.environment_service import EnvService, app_root_path + +SHORT_TO_LONG_CACHE = {} + +def get_and_query(user_id, index_storage, query, llm_predictor): + # TODO Do prediction here for token usage + index: [GPTSimpleVectorIndex, ComposableGraph] = index_storage[user_id].get_index_or_throw() + if isinstance(index, GPTTreeIndex): + response = 
index.query(query, verbose=True, child_branch_factor=2) + else: + response = index.query(query,verbose=True) + return response + +class IndexData: + def __init__(self): + self.queryable_index = None + self.individual_indexes = [] + + # A safety check for the future + def get_index_or_throw(self): + if not self.queryable(): + raise Exception("An index access was attempted before an index was created. This is a programmer error, please report this to the maintainers.") + return self.queryable_index + def queryable(self): + return self.queryable_index is not None + + def has_indexes(self, user_id): + try: + return len(os.listdir(f"{app_root_path()}/indexes/{user_id}")) > 0 + except: + return False + + def add_index(self, index, user_id, file_name): + self.individual_indexes.append(index) + self.queryable_index = index + + # Create a folder called "indexes/{USER_ID}" if it doesn't exist already + Path(f"{app_root_path()}/indexes/{user_id}").mkdir(parents=True, exist_ok=True) + # Save the index to file under the user id + index.save_to_disk(app_root_path() / "indexes" / f"{str(user_id)}"/f"{file_name}_{date.today().month}_{date.today().day}.json") + + def reset_indexes(self, user_id): + self.individual_indexes = [] + self.queryable_index = None + + # Delete the user indexes + try: + # First, clear all the files inside it + for file in os.listdir(f"{app_root_path()}/indexes/{user_id}"): + os.remove(f"{app_root_path()}/indexes/{user_id}/{file}") + + except: + traceback.print_exc() + pass + +class Index_handler: + def __init__(self, bot, usage_service): + self.bot = bot + self.openai_key = os.getenv("OPENAI_TOKEN") + self.index_storage = defaultdict(IndexData) + self.loop = asyncio.get_running_loop() + self.usage_service = usage_service + self.qaprompt = QuestionAnswerPrompt( + "Context information is below. The text '<|endofstatement|>' is used to separate chat entries and make it easier for you to understand the context\n" + "---------------------\n" + "{context_str}" + "\n---------------------\n" + "Never say '<|endofstatement|>'\n" + "Given the context information and not prior knowledge, " + "answer the question: {query_str}\n" + ) + + # TODO We need to do predictions below for token usage. 
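+    # The helper methods below each wrap a gpt_index reader: they take documents from a particular source (a local file or directory, a Google Doc, a YouTube transcript, a webpage, or previously fetched Discord messages) and build a GPTSimpleVectorIndex over them, while index_load_file reloads a previously saved index from disk (a GPTTreeIndex when the file is a deep composition).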
+ def index_file(self, file_path) -> GPTSimpleVectorIndex: + document = SimpleDirectoryReader(file_path).load_data() + index = GPTSimpleVectorIndex(document) + return index + + def index_gdoc(self, doc_id) -> GPTSimpleVectorIndex: + document = GoogleDocsReader().load_data(doc_id) + index = GPTSimpleVectorIndex(document) + return index + + def index_youtube_transcript(self, link): + documents = YoutubeTranscriptReader().load_data(ytlinks=[link]) + index = GPTSimpleVectorIndex(documents,) + return index + + def index_load_file(self, file_path) -> [GPTSimpleVectorIndex, ComposableGraph]: + if "composed_deep" in str(file_path): + index = GPTTreeIndex.load_from_disk(file_path) + else: + index = GPTSimpleVectorIndex.load_from_disk(file_path) + return index + + def index_discord(self, document) -> GPTSimpleVectorIndex: + index = GPTSimpleVectorIndex(document,) + return index + + def index_webpage(self, url) -> GPTSimpleVectorIndex: + documents = BeautifulSoupWebReader(website_extractor=DEFAULT_WEBSITE_EXTRACTOR).load_data(urls=[url]) + index = GPTSimpleVectorIndex(documents) + return index + + def reset_indexes(self, user_id): + self.index_storage[user_id].reset_indexes(user_id) + + async def set_file_index(self, ctx: discord.ApplicationContext, file: discord.Attachment, user_api_key): + if not user_api_key: + os.environ["OPENAI_API_KEY"] = self.openai_key + else: + os.environ["OPENAI_API_KEY"] = user_api_key + + try: + print(file.content_type) + if file.content_type.startswith("text/plain"): + suffix = ".txt" + elif file.content_type.startswith("application/pdf"): + suffix = ".pdf" + # Allow for images too + elif file.content_type.startswith("image/png"): + suffix = ".png" + elif file.content_type.startswith("image/"): + suffix = ".jpg" + elif "csv" in file.content_type: + suffix = ".csv" + elif "vnd." 
in file.content_type: + suffix = ".pptx" + # Catch all audio files and suffix with "mp3" + elif file.content_type.startswith("audio/"): + suffix = ".mp3" + # Catch video files + elif file.content_type.startswith("video/"): + pass # No suffix change + else: + await ctx.respond("Only accepts text, pdf, images, spreadheets, powerpoint, and audio/video files.") + return + async with aiofiles.tempfile.TemporaryDirectory() as temp_path: + async with aiofiles.tempfile.NamedTemporaryFile(suffix=suffix, dir=temp_path, delete=False) as temp_file: + await file.save(temp_file.name) + index = await self.loop.run_in_executor(None, partial(self.index_file, temp_path)) + + file_name = file.filename + self.index_storage[ctx.user.id].add_index(index, ctx.user.id, file_name) + await ctx.respond("Index added to your indexes.") + except Exception: + await ctx.respond("Failed to set index") + traceback.print_exc() + + async def set_link_index(self, ctx: discord.ApplicationContext, link: str, user_api_key): + if not user_api_key: + os.environ["OPENAI_API_KEY"] = self.openai_key + else: + os.environ["OPENAI_API_KEY"] = user_api_key + + # TODO Link validation + try: + # Check if the link contains youtube in it + if "youtube" in link: + index = await self.loop.run_in_executor(None, partial(self.index_youtube_transcript, link)) + else: + index = await self.loop.run_in_executor(None, partial(self.index_webpage, link)) + + # Make the url look nice, remove https, useless stuff, random characters + file_name = link.replace("https://", "").replace("http://", "").replace("www.", "").replace("/", "_").replace("?", "_").replace("&", "_").replace("=", "_").replace("-", "_").replace(".", "_") + + self.index_storage[ctx.user.id].add_index(index, ctx.user.id, file_name) + + except Exception: + await ctx.respond("Failed to set index") + traceback.print_exc() + + await ctx.respond("Index set") + + + async def set_discord_index(self, ctx: discord.ApplicationContext, channel: discord.TextChannel, user_api_key): + if not user_api_key: + os.environ["OPENAI_API_KEY"] = self.openai_key + else: + os.environ["OPENAI_API_KEY"] = user_api_key + + try: + document = await self.load_data(channel_ids=[channel.id], limit=1000, oldest_first=False) + index = await self.loop.run_in_executor(None, partial(self.index_discord, document)) + self.index_storage[ctx.user.id].add_index(index, ctx.user.id, channel.name) + await ctx.respond("Index set") + except Exception: + await ctx.respond("Failed to set index") + traceback.print_exc() + + + async def load_index(self, ctx:discord.ApplicationContext, index, user_api_key): + if not user_api_key: + os.environ["OPENAI_API_KEY"] = self.openai_key + else: + os.environ["OPENAI_API_KEY"] = user_api_key + + try: + index_file = EnvService.find_shared_file(f"indexes/{ctx.user.id}/{index}") + index = await self.loop.run_in_executor(None, partial(self.index_load_file, index_file)) + self.index_storage[ctx.user.id].queryable_index = index + await ctx.respond("Loaded index") + except Exception as e: + await ctx.respond(e) + + async def compose_indexes(self, user_id, indexes, name, deep_compose): + # Load all the indexes first + index_objects = [] + for _index in indexes: + index_file = EnvService.find_shared_file(f"indexes/{user_id}/{_index}") + index = await self.loop.run_in_executor(None, partial(self.index_load_file, index_file)) + index_objects.append(index) + + # For each index object, add its documents to a GPTTreeIndex + if deep_compose: + documents = [] + for _index in index_objects: + 
[documents.append(_index.docstore.get_document(doc_id)) for doc_id in [docmeta for docmeta in _index.docstore.docs.keys()] if isinstance(_index.docstore.get_document(doc_id), Document)] + tree_index = GPTTreeIndex(documents=documents) + + # Now we have a list of tree indexes, we can compose them + if not name: + name = f"composed_deep_index_{date.today().month}_{date.today().day}.json" + + # Save the composed index + tree_index.save_to_disk(f"indexes/{user_id}/{name}.json") + + self.index_storage[user_id].queryable_index = tree_index + else: + documents = [] + for _index in index_objects: + [documents.append(_index.docstore.get_document(doc_id)) for doc_id in [docmeta for docmeta in _index.docstore.docs.keys()] if isinstance(_index.docstore.get_document(doc_id), Document)] + + # Add everything into a simple vector index + simple_index = GPTSimpleVectorIndex(documents=documents) + + if not name: + name = f"composed_index_{date.today().month}_{date.today().day}.json" + + # Save the composed index + simple_index.save_to_disk(f"indexes/{user_id}/{name}.json") + self.index_storage[user_id].queryable_index = simple_index + + + async def backup_discord(self, ctx: discord.ApplicationContext, user_api_key): + if not user_api_key: + os.environ["OPENAI_API_KEY"] = self.openai_key + else: + os.environ["OPENAI_API_KEY"] = user_api_key + + try: + channel_ids:List[int] = [] + for c in ctx.guild.text_channels: + channel_ids.append(c.id) + document = await self.load_data(channel_ids=channel_ids, limit=3000, oldest_first=False) + index = await self.loop.run_in_executor(None, partial(self.index_discord, document)) + Path(app_root_path() / "indexes").mkdir(parents = True, exist_ok=True) + index.save_to_disk(app_root_path() / "indexes" / f"{ctx.guild.name.replace(' ', '-')}_{date.today().month}_{date.today().day}.json") + + await ctx.respond("Backup saved") + except Exception: + await ctx.respond("Failed to save backup") + traceback.print_exc() + + + + async def query(self, ctx: discord.ApplicationContext, query:str, response_mode, user_api_key): + if not user_api_key: + os.environ["OPENAI_API_KEY"] = self.openai_key + else: + os.environ["OPENAI_API_KEY"] = user_api_key + + try: + llm_predictor = MockLLMPredictor(max_tokens=256) + response = await self.loop.run_in_executor(None, partial(get_and_query, ctx.user.id, self.index_storage, query, llm_predictor)) + print("The last token usage was ", llm_predictor.last_token_usage) + await self.usage_service.update_usage(llm_predictor.last_token_usage) + await ctx.respond(f"**Query:**\n\n{query.strip()}\n\n**Query response:**\n\n{response.response.strip()}") + except Exception: + traceback.print_exc() + await ctx.respond("Failed to send query. You may not have an index set, load an index with /index load", delete_after=10) + + # Extracted functions from DiscordReader + + async def read_channel(self, channel_id: int, limit: Optional[int], oldest_first: bool) -> str: + """Async read channel.""" + + messages: List[discord.Message] = [] + + + try: + channel = self.bot.get_channel(channel_id) + print(f"Added {channel.name} from {channel.guild.name}") + # only work for text channels for now + if not isinstance(channel, discord.TextChannel): + raise ValueError( + f"Channel {channel_id} is not a text channel. " + "Only text channels are supported for now." 
+ ) + # thread_dict maps thread_id to thread + thread_dict = {} + for thread in channel.threads: + thread_dict[thread.id] = thread + + async for msg in channel.history( + limit=limit, oldest_first=oldest_first + ): + if msg.author.bot: + pass + else: + messages.append(msg) + if msg.id in thread_dict: + thread = thread_dict[msg.id] + async for thread_msg in thread.history( + limit=limit, oldest_first=oldest_first + ): + messages.append(thread_msg) + except Exception as e: + print("Encountered error: " + str(e)) + + channel = self.bot.get_channel(channel_id) + msg_txt_list = [f"user:{m.author.display_name}, content:{m.content}" for m in messages] + + return ("<|endofstatement|>\n\n".join(msg_txt_list), channel.name) + + async def load_data( + self, + channel_ids: List[int], + limit: Optional[int] = None, + oldest_first: bool = True, + ) -> List[Document]: + """Load data from the input directory. + + Args: + channel_ids (List[int]): List of channel ids to read. + limit (Optional[int]): Maximum number of messages to read. + oldest_first (bool): Whether to read oldest messages first. + Defaults to `True`. + + Returns: + List[Document]: List of documents. + + """ + results: List[Document] = [] + for channel_id in channel_ids: + if not isinstance(channel_id, int): + raise ValueError( + f"Channel id {channel_id} must be an integer, " + f"not {type(channel_id)}." + ) + (channel_content, channel_name) = await self.read_channel(channel_id, limit=limit, oldest_first=oldest_first) + results.append( + Document(channel_content, extra_info={"channel_name": channel_name}) + ) + return results + + async def compose(self, ctx: discord.ApplicationContext, name, user_api_key): + # Send the ComposeModal + if not user_api_key: + os.environ["OPENAI_API_KEY"] = self.openai_key + else: + os.environ["OPENAI_API_KEY"] = user_api_key + + if not self.index_storage[ctx.user.id].has_indexes(ctx.user.id): + await ctx.respond("You must load at least one indexes before composing") + return + + await ctx.respond("Select the index(es) to compose. You can compose multiple indexes together, you can also Deep Compose a single index.", view=ComposeModal(self, ctx.user.id, name), ephemeral=True) + + +class ComposeModal(discord.ui.View): + def __init__(self, index_cog, user_id, name=None, deep=None) -> None: + super().__init__() + # Get the argument named "user_key_db" and save it as USER_KEY_DB + self.index_cog = index_cog + self.user_id = user_id + self.deep = deep + + # Get all the indexes for the user + self.indexes = [ + file + for file in os.listdir(EnvService.find_shared_file(f"indexes/{str(user_id)}/")) + ] + + # Map everything into the short to long cache + for index in self.indexes: + SHORT_TO_LONG_CACHE[index[:99]] = index + + # A text entry field for the name of the composed index + self.name = name + + # A discord UI select menu with all the indexes. Limited to 25 entries. 
For the label field in the SelectOption, + # cut it off at 100 characters to prevent the message from being too long + + self.index_select = discord.ui.Select( + placeholder="Select index(es) to compose", + options=[ + discord.SelectOption(label=str(index)[:99], value=index[:99]) + for index in self.indexes + ][0:25], + max_values=len(self.indexes) if len(self.indexes) < 25 else 25, + min_values=1, + + ) + # Add the select menu to the modal + self.add_item(self.index_select) + + # If we have more than 25 entries, add more Select fields as neccessary + self.extra_index_selects = [] + if len(self.indexes) > 25: + for i in range(25, len(self.indexes), 25): + self.extra_index_selects.append(discord.ui.Select( + placeholder="Select index(es) to compose", + options=[ + discord.SelectOption(label=index[:99], value=index[:99]) + for index in self.indexes + ][i:i+25], + max_values=len(self.indexes[i:i+25]), + min_values=1, + )) + self.add_item(self.extra_index_selects[-1]) + + + + + # Add an input field for "Deep", a "yes" or "no" option, default no + self.deep_select = discord.ui.Select( + placeholder="Deep Compose", + options=[ + discord.SelectOption(label="Yes", value="yes"), + discord.SelectOption(label="No", value="no") + ], + max_values=1, + min_values=1 + ) + self.add_item(self.deep_select) + + + # Add a button to the modal called "Compose" + self.add_item(discord.ui.Button(label="Compose", style=discord.ButtonStyle.green, custom_id="compose")) + + # The callback for the button + async def interaction_check(self, interaction: discord.Interaction) -> bool: + # Check that the interaction was for custom_id "compose" + if interaction.data["custom_id"] == "compose": + # Check that the user selected at least one index + + # The total list of indexes is the union of the values of all the select menus + indexes = self.index_select.values + [select.values[0] for select in self.extra_index_selects] + + # Remap them from the SHORT_TO_LONG_CACHE + indexes = [SHORT_TO_LONG_CACHE[index] for index in indexes] + + if len(indexes) < 1: + await interaction.response.send_message("You must select at least 1 index", ephemeral=True) + else: + composing_message = await interaction.response.send_message("Composing indexes, this may take a long time...", ephemeral=True, delete_after=120) + # Compose the indexes + await self.index_cog.compose_indexes(self.user_id,indexes,self.name, False if not self.deep_select.values or self.deep_select.values[0] == "no" else True) + await interaction.followup.send("Composed indexes", ephemeral=True, delete_after=10) + + try: + await composing_message.delete() + except: + pass + else: + await interaction.response.defer(ephemeral=True) \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index a85a823..04bc610 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,15 @@ dependencies = [ "flask", "flask", "beautifulsoup4", + "gpt-index", + "PyPDF2", + "youtube_transcript_api", + "sentence-transformers", + "sentencepiece", + "protobuf", + "python-pptx", ] + dynamic = ["version"] [project.scripts] gpt3discord = "gpt3discord:init" diff --git a/requirements.txt b/requirements.txt index 6a7339c..d0ac6f3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,4 +9,11 @@ pinecone-client==2.1.0 sqlitedict==2.1.0 backoff==2.2.1 flask==2.2.2 -beautifulsoup4==4.11.1 \ No newline at end of file +beautifulsoup4==4.11.1 +gpt-index==0.3.4 +PyPDF2==3.0.1 +youtube_transcript_api==0.5.0 +sentencepiece==0.1.97 +sentence-transformers==2.2.2 +protobuf==3.20.0 
+python-pptx==0.6.21 \ No newline at end of file diff --git a/services/environment_service.py b/services/environment_service.py index ad098fd..e60a315 100644 --- a/services/environment_service.py +++ b/services/environment_service.py @@ -191,6 +191,32 @@ class EnvService: ) return gpt_roles + @staticmethod + def get_index_roles(): + # GPT_ROLES is a comma separated list of string roles + # It can also just be one role + # Read these allowed roles and return as a list of strings + try: + index_roles = os.getenv("INDEX_ROLES") + except Exception: + index_roles = None + + if index_roles is None: + print( + "INDEX_ROLES is not defined properly in the environment file!" + "Please copy your server's role and put it into INDEX_ROLES in the .env file." + 'For example a line should look like: `INDEX_ROLES="Gpt"`' + ) + print("Defaulting to allowing all users to use Index commands...") + return [None] + + index_roles = ( + index_roles.lower().strip().split(",") + if "," in index_roles + else [index_roles.lower()] + ) + return index_roles + @staticmethod def get_welcome_message(): # WELCOME_MESSAGE is a default string used to welcome new members to the server if GPT3 is not available.
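A note on permissions for the new `/index` commands: the `INDEX_ROLES` variable added above follows the same convention as the existing `ADMIN_ROLES`, `DALLE_ROLES`, `GPT_ROLES`, and `TRANSLATOR_ROLES` variables. It is a comma-separated list of Discord role names allowed to use the commands, matched case-insensitively, and all users are allowed when the variable is unset. For example, a hypothetical `.env` entry (the role names are only illustrative) could look like `INDEX_ROLES="Admin,Gpt"`.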