From 6941cc30b9ab9a0fa92f7c8579759149969b4643 Mon Sep 17 00:00:00 2001
From: Kaveen Kumarasinghe
Date: Thu, 2 Mar 2023 12:37:25 -0500
Subject: [PATCH] add some more gpt index readers

---
 cogs/commands.py          |  24 +++++
 cogs/index_service_cog.py |  22 +++++
 models/index_model.py     | 192 ++++++++++++++++++++++++++++++--------
 models/search_model.py    |  10 +-
 pyproject.toml            |   2 +-
 requirements.txt          |   3 +-
 requirements_base.txt     |   3 +-
 7 files changed, 211 insertions(+), 45 deletions(-)

diff --git a/cogs/commands.py b/cogs/commands.py
index f62796c..99e50e5 100644
--- a/cogs/commands.py
+++ b/cogs/commands.py
@@ -682,6 +682,30 @@ class Commands(discord.Cog, name="Commands"):
     ):
         await self.index_cog.set_index_command(ctx, file, link)
 
+    @add_to_group("index")
+    @discord.slash_command(
+        name="recurse-link", description="Recursively index a link", guild_ids=ALLOWED_GUILDS
+    )
+    @discord.guild_only()
+    @discord.option(
+        name="link",
+        description="A link to create the index from",
+        required=True,
+        input_type=discord.SlashCommandOptionType.string,
+    )
+    @discord.option(
+        name="depth",
+        description="How deep to recurse",
+        required=False,
+        input_type=discord.SlashCommandOptionType.integer,
+        min_value=1,
+        max_value=5,
+    )
+    async def set_recurse_link(
+        self, ctx: discord.ApplicationContext, link: str, depth: int
+    ):
+        await self.index_cog.set_index_link_recurse_command(ctx, link, depth)
+
     @add_to_group("index")
     @discord.slash_command(
         name="reset",
diff --git a/cogs/index_service_cog.py b/cogs/index_service_cog.py
index e3a35f2..414b9fb 100644
--- a/cogs/index_service_cog.py
+++ b/cogs/index_service_cog.py
@@ -120,6 +120,28 @@ class IndexService(discord.Cog, name="IndexService"):
                 )
             )
 
+    async def set_index_link_recurse_command(
+        self, ctx, link: str = None, depth: int = 1
+    ):
+        """Command handler to recursively index a link as your personal index"""
+        await ctx.defer()
+        if not link:
+            await ctx.respond("Please provide a link")
+            return
+
+
+        user_api_key = None
+        if USER_INPUT_API_KEYS:
+            user_api_key = await TextService.get_user_api_key(
+                ctx.user.id, ctx, USER_KEY_DB
+            )
+            if not user_api_key:
+                return
+
+        await self.index_handler.set_link_index_recurse(
+            ctx, link, depth, user_api_key=user_api_key
+        )
+
     async def set_index_command(
         self, ctx, file: discord.Attachment = None, link: str = None
     ):
diff --git a/models/index_model.py b/models/index_model.py
index ad29ebf..082c49f 100644
--- a/models/index_model.py
+++ b/models/index_model.py
@@ -17,14 +17,14 @@ from datetime import date
 from discord import InteractionResponse, Interaction
 from discord.ext import pages
 
-from gpt_index.langchain_helpers.chatgpt import ChatGPTLLMPredictor
+from llama_index.langchain_helpers.chatgpt import ChatGPTLLMPredictor
 from langchain import OpenAI
 
-from gpt_index.readers import YoutubeTranscriptReader
-from gpt_index.readers.schema.base import Document
-from gpt_index.langchain_helpers.text_splitter import TokenTextSplitter
+from llama_index.readers import YoutubeTranscriptReader
+from llama_index.readers.schema.base import Document
+from llama_index.langchain_helpers.text_splitter import TokenTextSplitter
 
-from gpt_index import (
+from llama_index import (
     GPTSimpleVectorIndex,
     SimpleDirectoryReader,
     QuestionAnswerPrompt,
@@ -40,11 +40,11 @@ from gpt_index import (
     IndexStructType,
     OpenAIEmbedding,
     GithubRepositoryReader,
-    MockEmbedding,
+    MockEmbedding, download_loader,
 )
-from gpt_index.readers.web import DEFAULT_WEBSITE_EXTRACTOR
+from llama_index.readers.web import DEFAULT_WEBSITE_EXTRACTOR
 
-from gpt_index.composability import ComposableGraph
+from llama_index.composability import ComposableGraph
 
 from models.embed_statics_model import EmbedStatics
 from services.environment_service import EnvService, app_root_path
@@ -52,6 +52,10 @@ from services.environment_service import EnvService, app_root_path
 SHORT_TO_LONG_CACHE = {}
 MAX_DEEP_COMPOSE_PRICE = EnvService.get_max_deep_compose_price()
 llm_predictor = ChatGPTLLMPredictor()
+EpubReader = download_loader("EpubReader")
+MarkdownReader = download_loader("MarkdownReader")
+RemoteReader = download_loader("RemoteReader")
+RemoteDepthReader = download_loader("RemoteDepthReader")
 
 
 def get_and_query(
@@ -207,8 +211,18 @@ class Index_handler:
 
         return pages
 
-    def index_file(self, file_path, embed_model) -> GPTSimpleVectorIndex:
-        document = SimpleDirectoryReader(file_path).load_data()
+    def index_file(self, file_path, embed_model, suffix=None) -> GPTSimpleVectorIndex:
+        if suffix and suffix == ".md":
+            print("Loading a markdown file")
+            loader = MarkdownReader()
+            document = loader.load_data(file_path)
+        elif suffix and suffix == ".epub":
+            print("Loading an epub")
+            epub_loader = EpubReader()
+            print("The file path is ", file_path)
+            document = epub_loader.load_data(file_path)
+        else:
+            document = SimpleDirectoryReader(input_files=[file_path]).load_data()
         index = GPTSimpleVectorIndex(document, embed_model=embed_model, use_async=True)
         return index
 
@@ -345,48 +359,64 @@ class Index_handler:
         else:
             os.environ["OPENAI_API_KEY"] = user_api_key
 
+        type_to_suffix_mappings = {
+            "text/plain": ".txt",
+            "text/csv": ".csv",
+            "application/pdf": ".pdf",
+            "application/json": ".json",
+            "image/png": ".png",
+            "image/": ".jpg",
+            "vnd.": ".pptx",
+            "audio/": ".mp3",
+            "video/": ".mp4",
+            "epub": ".epub",
+            "markdown": ".md",
+            "html": ".html",
+        }
+
+        # For when the content type doesn't get picked up by Discord.
+        secondary_mappings = {
+            ".epub": ".epub",
+        }
+
         try:
-            print(file.content_type)
-            if file.content_type.startswith("text/plain"):
-                suffix = ".txt"
-            elif file.content_type.startswith("application/pdf"):
-                suffix = ".pdf"
-            # Allow for images too
-            elif file.content_type.startswith("image/png"):
-                suffix = ".png"
-            elif file.content_type.startswith("image/"):
-                suffix = ".jpg"
-            elif "csv" in file.content_type:
-                suffix = ".csv"
-            elif "vnd." in file.content_type:
-                suffix = ".pptx"
-            # Catch all audio files and suffix with "mp3"
-            elif file.content_type.startswith("audio/"):
-                suffix = ".mp3"
-            # Catch video files
-            elif file.content_type.startswith("video/"):
-                pass  # No suffix change
+            # Determine the suffix from the attachment's content type
+            suffix = None
+            if file.content_type:
+                # Apply the suffix mappings to the file
+                for key, value in type_to_suffix_mappings.items():
+                    if key in file.content_type:
+                        suffix = value
+                        break
+
+                if not suffix:
+                    await ctx.send("This file type is not supported.")
+                    return
+
             else:
-                await ctx.respond(
-                    embed=EmbedStatics.get_index_set_failure_embed(
-                        "Only accepts text, pdf, images, spreadheets, powerpoint, and audio/video files."
-                    )
-                )
-                return
+                for key, value in secondary_mappings.items():
+                    if key in file.filename:
+                        suffix = value
+                        break
+                if not suffix:
+                    await ctx.send("Could not determine the file type of the attachment.")
+                    return
 
             # Send indexing message
             response = await ctx.respond(
                 embed=EmbedStatics.build_index_progress_embed()
             )
+            print("The suffix is " + suffix)
 
             async with aiofiles.tempfile.TemporaryDirectory() as temp_path:
                 async with aiofiles.tempfile.NamedTemporaryFile(
                     suffix=suffix, dir=temp_path, delete=False
                 ) as temp_file:
+
                     await file.save(temp_file.name)
                     embedding_model = OpenAIEmbedding()
                     index = await self.loop.run_in_executor(
-                        None, partial(self.index_file, temp_path, embedding_model)
+                        None, partial(self.index_file, Path(temp_file.name), embedding_model, suffix)
                     )
                     await self.usage_service.update_usage(
                         embedding_model.last_token_usage, embeddings=True
@@ -411,6 +441,94 @@ class Index_handler:
             )
             traceback.print_exc()
 
+    async def set_link_index_recurse(
+        self, ctx: discord.ApplicationContext, link: str, depth, user_api_key
+    ):
+        if not user_api_key:
+            os.environ["OPENAI_API_KEY"] = self.openai_key
+        else:
+            os.environ["OPENAI_API_KEY"] = user_api_key
+
+        response = await ctx.respond(embed=EmbedStatics.build_index_progress_embed())
+        try:
+            embedding_model = OpenAIEmbedding()
+
+            # Pre-emptively connect and get the content-type of the response
+            try:
+                async with aiohttp.ClientSession() as session:
+                    async with session.get(link, timeout=2) as _response:
+                        print(_response.status)
+                        if _response.status == 200:
+                            content_type = _response.headers.get("content-type")
+                        else:
+                            await response.edit(
+                                embed=EmbedStatics.get_index_set_failure_embed(
+                                    "Invalid URL or could not connect to the provided URL."
+                                )
+                            )
+                            return
+            except Exception as e:
+                traceback.print_exc()
+                await response.edit(
+                    embed=EmbedStatics.get_index_set_failure_embed(
+                        "Invalid URL or could not connect to the provided URL. "
+                        + str(e)
+                    )
+                )
+                return
+
+            # Recursively crawl the link to the requested depth and index the documents
+            loader = RemoteDepthReader(depth=depth)
+            documents = await self.loop.run_in_executor(None, partial(loader.load_data, [link]))
+            index = await self.loop.run_in_executor(
+                None,
+                functools.partial(
+                    GPTSimpleVectorIndex,
+                    documents=documents,
+                    embed_model=embedding_model,
+                    use_async=True,
+                ),
+            )
+
+            await self.usage_service.update_usage(
+                embedding_model.last_token_usage, embeddings=True
+            )
+
+            try:
+                price = await self.usage_service.get_price(
+                    embedding_model.last_token_usage, embeddings=True
+                )
+            except:
+                traceback.print_exc()
+                price = "Unknown"
+
+            # Make the url look nice, remove https, useless stuff, random characters
+            file_name = (
+                link.replace("https://", "")
+                .replace("http://", "")
+                .replace("www.", "")
+                .replace("/", "_")
+                .replace("?", "_")
+                .replace("&", "_")
+                .replace("=", "_")
+                .replace("-", "_")
+                .replace(".", "_")
+            )
+
+            self.index_storage[ctx.user.id].add_index(index, ctx.user.id, file_name)
+
+        except ValueError as e:
+            await response.edit(embed=EmbedStatics.get_index_set_failure_embed(str(e)))
+            traceback.print_exc()
+            return
+
+        except Exception as e:
+            await response.edit(embed=EmbedStatics.get_index_set_failure_embed(str(e)))
+            traceback.print_exc()
+            return
+
+        await response.edit(embed=EmbedStatics.get_index_set_success_embed(price))
+
     async def set_link_index(
         self, ctx: discord.ApplicationContext, link: str, user_api_key
     ):
diff --git a/models/search_model.py b/models/search_model.py
index 05f8ab8..47facb6 100644
--- a/models/search_model.py
+++ b/models/search_model.py
@@ -11,7 +11,7 @@ from pathlib import Path
 import discord
 from bs4 import BeautifulSoup
 import aiohttp
-from gpt_index import (
+from llama_index import (
     QuestionAnswerPrompt,
     GPTSimpleVectorIndex,
     BeautifulSoupWebReader,
@@ -24,10 +24,10 @@ from gpt_index import (
     MockLLMPredictor,
     MockEmbedding,
 )
-from gpt_index.indices.knowledge_graph import GPTKnowledgeGraphIndex
-from gpt_index.langchain_helpers.chatgpt import ChatGPTLLMPredictor
-from gpt_index.prompts.prompt_type import PromptType
-from gpt_index.readers.web import DEFAULT_WEBSITE_EXTRACTOR
+from llama_index.indices.knowledge_graph import GPTKnowledgeGraphIndex
+from llama_index.langchain_helpers.chatgpt import ChatGPTLLMPredictor
+from llama_index.prompts.prompt_type import PromptType
+from llama_index.readers.web import DEFAULT_WEBSITE_EXTRACTOR
 from langchain import OpenAI
 
 from services.environment_service import EnvService, app_root_path
diff --git a/pyproject.toml b/pyproject.toml
index 9d91b1d..861ab45 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,7 +32,7 @@ dependencies = [
 "backoff==2.2.1",
 "flask==2.2.3",
 "beautifulsoup4==4.11.1",
-"gpt-index==0.4.18",
+"llama-index==0.4.18",
 "PyPDF2==3.0.1",
 "youtube_transcript_api==0.5.0",
 "sentencepiece==0.1.97",
diff --git a/requirements.txt b/requirements.txt
index 34960e9..f5da4db 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,6 +2,7 @@ Pillow==9.3.0
 openai==0.27.0
 pytube==12.1.2
 py-cord==2.3.2
+beautifulsoup4==4.11.2
 python-dotenv==0.21.0
 requests==2.28.1
 transformers==4.25.1
@@ -12,7 +13,7 @@ sqlitedict==2.1.0
 backoff==2.2.1
 flask==2.2.3
 beautifulsoup4==4.11.1
-gpt-index==0.4.18
+llama-index==0.4.18
 PyPDF2==3.0.1
 youtube_transcript_api==0.5.0
 sentencepiece==0.1.97
diff --git a/requirements_base.txt b/requirements_base.txt
index c10c958..187af41 100644
--- a/requirements_base.txt
+++ b/requirements_base.txt
@@ -2,6 +2,7 @@ Pillow==9.3.0
 openai==0.27.0
 pytube==12.1.2
 py-cord==2.3.2
+beautifulsoup4==4.11.2
 python-dotenv==0.21.0
 requests==2.28.1
 transformers==4.25.1
@@ -12,7 +13,7 @@ sqlitedict==2.1.0
 backoff==2.2.1
 flask==2.2.3
 beautifulsoup4==4.11.1
-gpt-index==0.4.18
+llama-index==0.4.18
 PyPDF2==3.0.1
 youtube_transcript_api==0.5.0
 sentencepiece==0.1.97
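
Illustrative note, not part of the patch above: the content-type routing that
set_file_index now uses matches mapping keys as substrings of the attachment's
content type, in dictionary order, so a more specific key such as "image/png"
has to come before a broader one such as "image/". The snippet below is a
minimal, self-contained sketch of that lookup; the resolve_suffix helper is
hypothetical and exists only for this example, while the mapping shape mirrors
the type_to_suffix_mappings dict added in models/index_model.py.

    # Substring-based content-type to file-suffix lookup, as in the patch.
    type_to_suffix_mappings = {
        "text/plain": ".txt",
        "text/csv": ".csv",
        "application/pdf": ".pdf",
        "image/png": ".png",  # must precede the broader "image/" key
        "image/": ".jpg",
        "audio/": ".mp3",
        "epub": ".epub",
        "markdown": ".md",
    }

    def resolve_suffix(content_type):
        # Return the first suffix whose key appears in the content type,
        # or None when nothing matches (the cog then tells the user the
        # type is unsupported).
        for key, value in type_to_suffix_mappings.items():
            if key in content_type:
                return value
        return None

    print(resolve_suffix("text/plain; charset=utf-8"))   # .txt
    print(resolve_suffix("application/epub+zip"))        # .epub
    print(resolve_suffix("application/octet-stream"))    # None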