From 6941cc30b9ab9a0fa92f7c8579759149969b4643 Mon Sep 17 00:00:00 2001
From: Kaveen Kumarasinghe
Date: Thu, 2 Mar 2023 12:37:25 -0500
Subject: [PATCH] add some more gpt index readers

---
 cogs/commands.py          |  24 +++++
 cogs/index_service_cog.py |  22 +++++
 models/index_model.py     | 192 ++++++++++++++++++++++++++++++--------
 models/search_model.py    |  10 +-
 pyproject.toml            |   2 +-
 requirements.txt          |   3 +-
 requirements_base.txt     |   3 +-
 7 files changed, 211 insertions(+), 45 deletions(-)

diff --git a/cogs/commands.py b/cogs/commands.py
index f62796c..99e50e5 100644
--- a/cogs/commands.py
+++ b/cogs/commands.py
@@ -682,6 +682,30 @@ class Commands(discord.Cog, name="Commands"):
     ):
         await self.index_cog.set_index_command(ctx, file, link)
 
+    @add_to_group("index")
+    @discord.slash_command(
+        name="recurse-link", description="Recursively index a link", guild_ids=ALLOWED_GUILDS
+    )
+    @discord.guild_only()
+    @discord.option(
+        name="link",
+        description="A link to create the index from",
+        required=True,
+        input_type=discord.SlashCommandOptionType.string,
+    )
+    @discord.option(
+        name="depth",
+        description="How deep to recurse",
+        required=False,
+        input_type=discord.SlashCommandOptionType.integer,
+        min_value=1,
+        max_value=5,
+    )
+    async def set_recurse_link(
+        self, ctx: discord.ApplicationContext, link: str, depth: int
+    ):
+        await self.index_cog.set_index_link_recurse_command(ctx, link, depth)
+
     @add_to_group("index")
     @discord.slash_command(
         name="reset",
diff --git a/cogs/index_service_cog.py b/cogs/index_service_cog.py
index e3a35f2..414b9fb 100644
--- a/cogs/index_service_cog.py
+++ b/cogs/index_service_cog.py
@@ -120,6 +120,28 @@ class IndexService(discord.Cog, name="IndexService"):
                 )
             )
 
+    async def set_index_link_recurse_command(
+        self, ctx, link: str = None, depth: int = 1
+    ):
+        """Command handler to recursively index a link as your personal index"""
+        await ctx.defer()
+        if not link:
+            await ctx.respond("Please provide a link")
+            return
+
+
+        user_api_key = None
+        if USER_INPUT_API_KEYS:
+            user_api_key = await TextService.get_user_api_key(
+                ctx.user.id, ctx, USER_KEY_DB
+            )
+            if not user_api_key:
+                return
+
+        await self.index_handler.set_link_index_recurse(
+            ctx, link, depth, user_api_key=user_api_key
+        )
+
     async def set_index_command(
         self, ctx, file: discord.Attachment = None, link: str = None
     ):
diff --git a/models/index_model.py b/models/index_model.py
index ad29ebf..082c49f 100644
--- a/models/index_model.py
+++ b/models/index_model.py
@@ -17,14 +17,14 @@ from datetime import date
 from discord import InteractionResponse, Interaction
 from discord.ext import pages
 
-from gpt_index.langchain_helpers.chatgpt import ChatGPTLLMPredictor
+from llama_index.langchain_helpers.chatgpt import ChatGPTLLMPredictor
 from langchain import OpenAI
 
-from gpt_index.readers import YoutubeTranscriptReader
-from gpt_index.readers.schema.base import Document
-from gpt_index.langchain_helpers.text_splitter import TokenTextSplitter
+from llama_index.readers import YoutubeTranscriptReader
+from llama_index.readers.schema.base import Document
+from llama_index.langchain_helpers.text_splitter import TokenTextSplitter
 
-from gpt_index import (
+from llama_index import (
     GPTSimpleVectorIndex,
     SimpleDirectoryReader,
     QuestionAnswerPrompt,
@@ -40,11 +40,11 @@ from gpt_index import (
     IndexStructType,
     OpenAIEmbedding,
     GithubRepositoryReader,
-    MockEmbedding,
+    MockEmbedding, download_loader,
 )
-from gpt_index.readers.web import DEFAULT_WEBSITE_EXTRACTOR
+from llama_index.readers.web import DEFAULT_WEBSITE_EXTRACTOR
 
-from gpt_index.composability import ComposableGraph
+from llama_index.composability import ComposableGraph
 
 from models.embed_statics_model import EmbedStatics
 from services.environment_service import EnvService, app_root_path
@@ -52,6 +52,10 @@ from services.environment_service import EnvService, app_root_path
 SHORT_TO_LONG_CACHE = {}
 MAX_DEEP_COMPOSE_PRICE = EnvService.get_max_deep_compose_price()
 llm_predictor = ChatGPTLLMPredictor()
+EpubReader = download_loader("EpubReader")
+MarkdownReader = download_loader("MarkdownReader")
+RemoteReader = download_loader("RemoteReader")
+RemoteDepthReader = download_loader("RemoteDepthReader")
 
 
 def get_and_query(
@@ -207,8 +211,18 @@ class Index_handler:
 
         return pages
 
-    def index_file(self, file_path, embed_model) -> GPTSimpleVectorIndex:
-        document = SimpleDirectoryReader(file_path).load_data()
+    def index_file(self, file_path, embed_model, suffix=None) -> GPTSimpleVectorIndex:
+        if suffix and suffix == ".md":
+            print("Loading a markdown file")
+            loader = MarkdownReader()
+            document = loader.load_data(file_path)
+        elif suffix and suffix == ".epub":
+            print("Loading an epub")
+            epub_loader = EpubReader()
+            print("The file path is ", file_path)
+            document = epub_loader.load_data(file_path)
+        else:
+            document = SimpleDirectoryReader(input_files=[file_path]).load_data()
         index = GPTSimpleVectorIndex(document, embed_model=embed_model, use_async=True)
         return index
 
@@ -345,48 +359,64 @@ class Index_handler:
         else:
             os.environ["OPENAI_API_KEY"] = user_api_key
 
+        type_to_suffix_mappings = {
+            "text/plain": ".txt",
+            "text/csv": ".csv",
+            "application/pdf": ".pdf",
+            "application/json": ".json",
+            "image/png": ".png",
+            "image/": ".jpg",
+            "vnd.": ".pptx",
+            "audio/": ".mp3",
+            "video/": ".mp4",
+            "epub": ".epub",
+            "markdown": ".md",
+            "html": ".html",
+        }
+
+        # For when the content type doesn't get picked up by Discord.
+        secondary_mappings = {
+            ".epub": ".epub",
+        }
+
         try:
-            print(file.content_type)
-            if file.content_type.startswith("text/plain"):
-                suffix = ".txt"
-            elif file.content_type.startswith("application/pdf"):
-                suffix = ".pdf"
-            # Allow for images too
-            elif file.content_type.startswith("image/png"):
-                suffix = ".png"
-            elif file.content_type.startswith("image/"):
-                suffix = ".jpg"
-            elif "csv" in file.content_type:
-                suffix = ".csv"
-            elif "vnd." in file.content_type:
-                suffix = ".pptx"
-            # Catch all audio files and suffix with "mp3"
-            elif file.content_type.startswith("audio/"):
-                suffix = ".mp3"
-            # Catch video files
-            elif file.content_type.startswith("video/"):
-                pass  # No suffix change
+            # Determine the suffix from the attachment's content type
+            suffix = None
+            if file.content_type:
+                # Apply the suffix mappings to the file
+                for key, value in type_to_suffix_mappings.items():
+                    if key in file.content_type:
+                        suffix = value
+                        break
+
+                if not suffix:
+                    await ctx.send("This file type is not supported.")
+                    return
+
             else:
-                await ctx.respond(
-                    embed=EmbedStatics.get_index_set_failure_embed(
-                        "Only accepts text, pdf, images, spreadheets, powerpoint, and audio/video files."
-                    )
-                )
-                return
+                for key, value in secondary_mappings.items():
+                    if key in file.filename:
+                        suffix = value
+                        break
+                if not suffix:
+                    await ctx.send("Could not determine the file type of the attachment.")
+                    return
 
             # Send indexing message
             response = await ctx.respond(
                 embed=EmbedStatics.build_index_progress_embed()
             )
+            print("The suffix is " + suffix)
 
             async with aiofiles.tempfile.TemporaryDirectory() as temp_path:
                 async with aiofiles.tempfile.NamedTemporaryFile(
                     suffix=suffix, dir=temp_path, delete=False
                 ) as temp_file:
+
                     await file.save(temp_file.name)
                     embedding_model = OpenAIEmbedding()
                     index = await self.loop.run_in_executor(
-                        None, partial(self.index_file, temp_path, embedding_model)
+                        None, partial(self.index_file, Path(temp_file.name), embedding_model, suffix)
                     )
                     await self.usage_service.update_usage(
                         embedding_model.last_token_usage, embeddings=True
@@ -411,6 +441,94 @@ class Index_handler:
             )
             traceback.print_exc()
 
+    async def set_link_index_recurse(
+        self, ctx: discord.ApplicationContext, link: str, depth, user_api_key
+    ):
+        if not user_api_key:
+            os.environ["OPENAI_API_KEY"] = self.openai_key
+        else:
+            os.environ["OPENAI_API_KEY"] = user_api_key
+
+        response = await ctx.respond(embed=EmbedStatics.build_index_progress_embed())
+        try:
+            embedding_model = OpenAIEmbedding()
+
+            # Pre-emptively connect and get the content-type of the response
+            try:
+                async with aiohttp.ClientSession() as session:
+                    async with session.get(link, timeout=2) as _response:
+                        print(_response.status)
+                        if _response.status == 200:
+                            content_type = _response.headers.get("content-type")
+                        else:
+                            await response.edit(
+                                embed=EmbedStatics.get_index_set_failure_embed(
+                                    "Invalid URL or could not connect to the provided URL."
+                                )
+                            )
+                            return
+            except Exception as e:
+                traceback.print_exc()
+                await response.edit(
+                    embed=EmbedStatics.get_index_set_failure_embed(
+                        "Invalid URL or could not connect to the provided URL. "
+                        + str(e)
+                    )
+                )
+                return
+
+            # Recursively crawl the link to the requested depth and index the documents
+            loader = RemoteDepthReader(depth=depth)
+            documents = await self.loop.run_in_executor(None, partial(loader.load_data, [link]))
+            index = await self.loop.run_in_executor(
+                None,
+                functools.partial(
+                    GPTSimpleVectorIndex,
+                    documents=documents,
+                    embed_model=embedding_model,
+                    use_async=True,
+                ),
+            )
+
+            await self.usage_service.update_usage(
+                embedding_model.last_token_usage, embeddings=True
+            )
+
+            try:
+                price = await self.usage_service.get_price(
+                    embedding_model.last_token_usage, embeddings=True
+                )
+            except:
+                traceback.print_exc()
+                price = "Unknown"
+
+            # Make the url look nice, remove https, useless stuff, random characters
+            file_name = (
+                link.replace("https://", "")
+                .replace("http://", "")
+                .replace("www.", "")
+                .replace("/", "_")
+                .replace("?", "_")
+                .replace("&", "_")
+                .replace("=", "_")
+                .replace("-", "_")
+                .replace(".", "_")
+            )
+
+            self.index_storage[ctx.user.id].add_index(index, ctx.user.id, file_name)
+
+        except ValueError as e:
+            await response.edit(embed=EmbedStatics.get_index_set_failure_embed(str(e)))
+            traceback.print_exc()
+            return
+
+        except Exception as e:
+            await response.edit(embed=EmbedStatics.get_index_set_failure_embed(str(e)))
+            traceback.print_exc()
+            return
+
+        await response.edit(embed=EmbedStatics.get_index_set_success_embed(price))
+
     async def set_link_index(
         self, ctx: discord.ApplicationContext, link: str, user_api_key
     ):
diff --git a/models/search_model.py b/models/search_model.py
index 05f8ab8..47facb6 100644
--- a/models/search_model.py
+++ b/models/search_model.py
@@ -11,7 +11,7 @@ from pathlib import Path
 import discord
 from bs4 import BeautifulSoup
 import aiohttp
-from gpt_index import (
+from llama_index import (
     QuestionAnswerPrompt,
     GPTSimpleVectorIndex,
     BeautifulSoupWebReader,
@@ -24,10 +24,10 @@ from gpt_index import (
     MockLLMPredictor,
     MockEmbedding,
 )
-from gpt_index.indices.knowledge_graph import GPTKnowledgeGraphIndex
-from gpt_index.langchain_helpers.chatgpt import ChatGPTLLMPredictor
-from gpt_index.prompts.prompt_type import PromptType
-from gpt_index.readers.web import DEFAULT_WEBSITE_EXTRACTOR
+from llama_index.indices.knowledge_graph import GPTKnowledgeGraphIndex
+from llama_index.langchain_helpers.chatgpt import ChatGPTLLMPredictor
+from llama_index.prompts.prompt_type import PromptType
+from llama_index.readers.web import DEFAULT_WEBSITE_EXTRACTOR
 from langchain import OpenAI
 
 from services.environment_service import EnvService, app_root_path
diff --git a/pyproject.toml b/pyproject.toml
index 9d91b1d..861ab45 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,7 +32,7 @@ dependencies = [
 "backoff==2.2.1",
 "flask==2.2.3",
 "beautifulsoup4==4.11.1",
-"gpt-index==0.4.18",
+"llama-index==0.4.18",
 "PyPDF2==3.0.1",
 "youtube_transcript_api==0.5.0",
 "sentencepiece==0.1.97",
diff --git a/requirements.txt b/requirements.txt
index 34960e9..f5da4db 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,6 +2,7 @@ Pillow==9.3.0
 openai==0.27.0
 pytube==12.1.2
 py-cord==2.3.2
+beautifulsoup4==4.11.2
 python-dotenv==0.21.0
 requests==2.28.1
 transformers==4.25.1
@@ -12,7 +13,7 @@ sqlitedict==2.1.0
 backoff==2.2.1
 flask==2.2.3
 beautifulsoup4==4.11.1
-gpt-index==0.4.18
+llama-index==0.4.18
 PyPDF2==3.0.1
 youtube_transcript_api==0.5.0
 sentencepiece==0.1.97
diff --git a/requirements_base.txt b/requirements_base.txt
index c10c958..187af41 100644
--- a/requirements_base.txt
+++ b/requirements_base.txt
@@ -2,6 +2,7 @@ Pillow==9.3.0
 openai==0.27.0
 pytube==12.1.2
 py-cord==2.3.2
+beautifulsoup4==4.11.2
 python-dotenv==0.21.0
 requests==2.28.1
 transformers==4.25.1
@@ -12,7 +13,7 @@ sqlitedict==2.1.0
 backoff==2.2.1
 flask==2.2.3
 beautifulsoup4==4.11.1
-gpt-index==0.4.18
+llama-index==0.4.18
 PyPDF2==3.0.1
 youtube_transcript_api==0.5.0
 sentencepiece==0.1.97
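
Illustrative note, not part of the patch above: the content-type routing that
set_file_index now uses matches mapping keys as substrings of the attachment's
content type, in dictionary order, so a more specific key such as "image/png"
has to come before a broader one such as "image/". The snippet below is a
minimal, self-contained sketch of that lookup; the resolve_suffix helper is
hypothetical and exists only for this example, while the mapping shape mirrors
the type_to_suffix_mappings dict added in models/index_model.py.

    # Substring-based content-type to file-suffix lookup, as in the patch.
    type_to_suffix_mappings = {
        "text/plain": ".txt",
        "text/csv": ".csv",
        "application/pdf": ".pdf",
        "image/png": ".png",  # must precede the broader "image/" key
        "image/": ".jpg",
        "audio/": ".mp3",
        "epub": ".epub",
        "markdown": ".md",
    }

    def resolve_suffix(content_type):
        # Return the first suffix whose key appears in the content type,
        # or None when nothing matches (the cog then tells the user the
        # type is unsupported).
        for key, value in type_to_suffix_mappings.items():
            if key in content_type:
                return value
        return None

    print(resolve_suffix("text/plain; charset=utf-8"))   # .txt
    print(resolve_suffix("application/epub+zip"))        # .epub
    print(resolve_suffix("application/octet-stream"))    # None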