Upgrade moderations service, double thresholds, admin options

Kaveen Kumarasinghe 2 years ago
parent 5c556c9952
commit 7b42bc4afe

@@ -16,6 +16,7 @@
<img src="https://i.imgur.com/KeLpDgj.png"/>
<img src="https://i.imgur.com/jLp1T0h.png"/>
<img src="https://i.imgur.com/9XC95Lu.png"/>
<img src="https://i.imgur.com/HqFSFcc.png"/>
</p>
@@ -118,6 +119,7 @@ These commands are grouped, so each group has a prefix but you can easily tab co
- The bot needs Administrator permissions for this, and you need to set `MODERATIONS_ALERT_CHANNEL` in your .env file to the ID of the channel where you want to receive alerts about moderated messages.
- This uses the OpenAI Moderations endpoint to check messages. Requests are sent to the endpoint with a minimum gap of 0.5 seconds between them, to avoid being rate limited and to ensure reliability.
- The bot uses numerical thresholds to determine whether a message is toxic. I have manually tested and fine-tuned these thresholds to a point that I think is good; please open an issue if you have any suggestions for them!
- There are two threshold sets: one at which the bot will outright delete a message, and a lower one at which the bot will send a message to the alert channel, notifying admins and giving them quick options to delete the message and time out the user (see the sketch below, and the screenshots at the beginning of the README).
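In rough outline, the flow looks like this. This is a minimal sketch, not the bot's actual code: the threshold values shown, the `alert_admins` helper, and the queue wiring are illustrative, and the `openai` calls use the 0.x-era Python API current at the time of this commit.

```python
import asyncio

import openai  # openai-python 0.x style API, matching the era of this commit

# Illustrative values only; the tuned sets live in ThresholdSet in the diff below.
WARN_THRESHOLDS = {"hate": 0.005, "violence": 0.08}
DELETE_THRESHOLDS = {"hate": 0.1, "violence": 0.6}


def exceeds(scores, thresholds):
    # A message trips a set if any category score crosses its threshold.
    return any(scores[category] > t for category, t in thresholds.items())


async def alert_admins(message):
    ...  # hypothetical helper: post an alert embed to MODERATIONS_ALERT_CHANNEL


async def moderation_loop(queue: asyncio.Queue):
    while True:
        message = await queue.get()
        response = openai.Moderation.create(input=message.content)
        scores = response["results"][0]["category_scores"]
        if exceeds(scores, DELETE_THRESHOLDS):
            await message.delete()       # stricter set: delete outright
        elif exceeds(scores, WARN_THRESHOLDS):
            await alert_admins(message)  # lower set: warn the admins instead
        await asyncio.sleep(0.5)         # enforce the minimum 0.5s request gap
```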
# Permanent Memory
Permanent memory has now been implemented in the bot, using the OpenAI Ada embeddings endpoint and Pinecone DB.
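A rough sketch of that flow, using the `openai` 0.x and `pinecone-client` 2.x APIs of the era (the index name, environment, IDs, and metadata here are illustrative, not the bot's actual values):

```python
import openai
import pinecone

pinecone.init(api_key="YOUR_PINECONE_KEY", environment="us-east1-gcp")
index = pinecone.Index("conversation-embeddings")  # illustrative index name


def embed(text):
    # Ada embeddings endpoint; text-embedding-ada-002 returns a 1536-dim vector.
    response = openai.Embedding.create(input=text, model="text-embedding-ada-002")
    return response["data"][0]["embedding"]


# Store a conversation turn...
index.upsert([("message-id-123", embed("Hello there!"), {"text": "Hello there!"})])

# ...and later pull the most similar past turns back into the prompt as context.
matches = index.query(vector=embed("Hi again"), top_k=5, include_metadata=True)
```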

@@ -24,7 +24,7 @@ from models.openai_model import Model
from models.usage_service_model import UsageService
from models.env_service_model import EnvService
__version__ = "4.1"
__version__ = "4.2"
"""
The pinecone service is used to store and retrieve conversation embeddings.

@@ -1,7 +1,7 @@
import asyncio
import os
import traceback
from datetime import datetime
from datetime import datetime, timedelta
from pathlib import Path
import discord
@@ -12,6 +12,42 @@ from models.usage_service_model import UsageService
usage_service = UsageService(Path(os.environ.get("DATA_DIR", os.getcwd())))
model = Model(usage_service)
class ModerationResult:
WARN = "warn"
DELETE = "delete"
NONE = "none"
class ThresholdSet:
    # A threshold set bundles one cutoff per moderation category, in the same
    # order as self.keys: hate, hate/threatening, self-harm, sexual,
    # sexual/minors, violence, violence/graphic.
    def __init__(self, h_t, hv_t, sh_t, s_t, sm_t, v_t, vg_t):
        self.keys = [
            "hate",
            "hate/threatening",
            "self-harm",
            "sexual",
            "sexual/minors",
            "violence",
            "violence/graphic",
        ]
        self.thresholds = [
            h_t,
            hv_t,
            sh_t,
            s_t,
            sm_t,
            v_t,
            vg_t,
        ]

    # Returns (over_threshold, flagged_by_openai). `text` is currently unused:
    # the decision is based purely on the endpoint's per-category scores.
    def moderate(self, text, response_message):
        category_scores = response_message["results"][0]["category_scores"]
        flagged = response_message["results"][0]["flagged"]
        for category, threshold in zip(self.keys, self.thresholds):
            if category_scores[category] > threshold:
                return (True, flagged)
        return (False, flagged)
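# Illustrative usage of ThresholdSet (editor's sketch, not part of this commit);
# the response shape below follows the OpenAI Moderations API:
#
#     ts = ThresholdSet(0.1, 0.1, 0.1, 0.95, 0.2, 0.6, 0.4)
#     response = {"results": [{"flagged": False,
#                              "category_scores": {"hate": 0.02, "hate/threatening": 0.0,
#                                                  "self-harm": 0.0, "sexual": 0.97,
#                                                  "sexual/minors": 0.0, "violence": 0.0,
#                                                  "violence/graphic": 0.0}}]}
#     ts.moderate("some text", response)  # -> (True, False): 0.97 > the 0.95 "sexual" cutoff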
class Moderation:
def __init__(self, message, timestamp):
@@ -33,6 +69,24 @@ class Moderation:
)
return embed
@staticmethod
    def build_admin_warning_message(
        moderated_message, deleted_message=None, timed_out=None
    ):
        embed = discord.Embed(
            title=f"Potentially unwanted message in the {moderated_message.guild.name} server",
            description=f"**Message from {moderated_message.author.mention}:** {moderated_message.content}",
            colour=discord.Colour.yellow(),
        )
        link = f"https://discord.com/channels/{moderated_message.guild.id}/{moderated_message.channel.id}/{moderated_message.id}"
        embed.add_field(name="Message link", value=link, inline=False)
        if deleted_message:
            embed.add_field(
                name="Message deleted by", value=deleted_message, inline=False
            )
        if timed_out:
            embed.add_field(name="User timed out by", value=timed_out, inline=False)
        return embed
@staticmethod
def build_admin_moderated_message(moderated_message, response_message):
@@ -52,57 +106,20 @@ class Moderation:
@staticmethod
def determine_moderation_result(text, response):
        HATE_THRESHOLD = 0.005
        HATE_VIOLENCE_THRESHOLD = 0.05
        SELF_HARM_THRESHOLD = 0.05
        SEXUAL_THRESHOLD = 0.91
        SEXUAL_MINORS_THRESHOLD = 0.1
        VIOLENCE_THRESHOLD = 0.08
        VIOLENCE_GRAPHIC_THRESHOLD = 0.1
        extreme_hatred_qualifiers = [
            "i fucking hate",
            "fucking hate",
            "i fucking despise",
        ]
        thresholds = [
            HATE_THRESHOLD,
            HATE_VIOLENCE_THRESHOLD,
            SELF_HARM_THRESHOLD,
            SEXUAL_THRESHOLD,
            SEXUAL_MINORS_THRESHOLD,
            VIOLENCE_THRESHOLD,
            VIOLENCE_GRAPHIC_THRESHOLD,
        ]
        threshold_iterator = [
            "hate",
            "hate/threatening",
            "self-harm",
            "sexual",
            "sexual/minors",
            "violence",
            "violence/graphic",
        ]
        category_scores = response["results"][0]["category_scores"]
        flagged = response["results"][0]["flagged"]
        # Iterate the category scores using the threshold_iterator and compare the values to thresholds
        for category, threshold in zip(threshold_iterator, thresholds):
            if category == "hate":
                if "hate" in text.lower():
                    # The word "hate" makes the model oversensitive. This is a (bad) workaround.
                    threshold = 0.1
                if any(word in text.lower() for word in extreme_hatred_qualifiers):
                    threshold = 0.6
            if category_scores[category] > threshold:
                return True
        return False

        warn_set = ThresholdSet(0.005, 0.05, 0.05, 0.91, 0.1, 0.08, 0.1)
        delete_set = ThresholdSet(0.1, 0.1, 0.1, 0.95, 0.2, 0.6, 0.4)
        warn_result, flagged_warn = warn_set.moderate(text, response)
        delete_result, flagged_delete = delete_set.moderate(text, response)
        if delete_result:
            return ModerationResult.DELETE
        elif warn_result:
            return ModerationResult.WARN
        else:
            return ModerationResult.NONE
# This function will be called by the bot to process the message queue
@staticmethod
@@ -128,7 +145,7 @@ class Moderation:
to_moderate.message.content, response
)
if moderation_result:
if moderation_result == ModerationResult.DELETE:
# Take care of the flagged message
response_message = await to_moderate.message.reply(
embed=Moderation.build_moderation_embed()
@@ -143,6 +160,11 @@ class Moderation:
to_moderate, response_message
)
)
elif moderation_result == ModerationResult.WARN:
response_message = await moderations_alert_channel.send(
embed=Moderation.build_admin_warning_message(to_moderate.message),
)
await response_message.edit(view=ModerationAdminView(to_moderate.message, response_message))
else:
await moderation_queue.put(to_moderate)
@@ -152,3 +174,63 @@ class Moderation:
            except Exception:
                traceback.print_exc()
class ModerationAdminView(discord.ui.View):
    def __init__(self, message, moderation_message, nodelete=False):
        super().__init__(timeout=None)  # Persistent view: the buttons never expire.
        self.message = message
        self.moderation_message = moderation_message
        if not nodelete:
            self.add_item(DeleteMessageButton(self.message, self.moderation_message))
        for hours in (1, 6, 12, 24):
            self.add_item(
                TimeoutUserButton(self.message, self.moderation_message, hours, nodelete)
            )
class DeleteMessageButton(discord.ui.Button["ModerationAdminView"]):
    def __init__(self, message, moderation_message):
        super().__init__(style=discord.ButtonStyle.danger, label="Delete Message")
        self.message = message
        self.moderation_message = moderation_message

    async def callback(self, interaction: discord.Interaction):
        # Delete the offending message and confirm to the admin who clicked.
        await self.message.delete()
        await interaction.response.send_message(
            "This message was deleted", ephemeral=True, delete_after=10
        )
        # Record who deleted the message in the alert embed and drop the delete button.
        await self.moderation_message.edit(
            embed=Moderation.build_admin_warning_message(
                self.message, deleted_message=interaction.user.mention
            ),
            view=ModerationAdminView(self.message, self.moderation_message, nodelete=True),
        )
class TimeoutUserButton(discord.ui.Button["ModerationAdminView"]):
    def __init__(self, message, moderation_message, hours, nodelete):
        super().__init__(style=discord.ButtonStyle.danger, label=f"Timeout {hours}h")
        self.message = message
        self.moderation_message = moderation_message
        self.hours = hours
        self.nodelete = nodelete

    async def callback(self, interaction: discord.Interaction):
        # Delete the message if it still exists, then time out its author.
        try:
            await self.message.delete()
        except Exception:
            pass  # The message may already have been deleted.
        try:
            await self.message.author.timeout(
                discord.utils.utcnow() + timedelta(hours=self.hours),
                reason="Breaking the server chat rules",
            )
        except Exception:
            traceback.print_exc()
        await interaction.response.send_message(
            f"This user was timed out for {self.hours} hour(s)",
            ephemeral=True,
            delete_after=10,
        )
        # Record who acted in the alert embed and drop the delete button.
        await self.moderation_message.edit(
            embed=Moderation.build_admin_warning_message(
                self.message,
                deleted_message=interaction.user.mention,
                timed_out=interaction.user.mention,
            ),
            view=ModerationAdminView(self.message, self.moderation_message, nodelete=True),
        )
