Fine-tune moderation thresholds a bit, fix Redo

2 years ago · 9896051a7b
parent 602742eb59
commit 9896051a7b
3 changed files with 13 additions and 5 deletions
--- a/cogs/gpt_3_commands_and_converser.py
+++ b/cogs/gpt_3_commands_and_converser.py
@ -1219,7 +1219,7 @@ class RedoButton(discord.ui.Button["RedoView"]):
            )

            await self.converser_cog.encapsulated_send(
-                user_id, prompt, ctx, response_message
+                user_id=user_id, prompt=prompt, ctx=ctx, response_message=response_message
            )
        else:
            await interaction.response.send_message(
--- a/cogs/image_prompt_optimizer.py
+++ b/cogs/image_prompt_optimizer.py
@ -215,7 +215,7 @@ class RedoButton(discord.ui.Button["OptimizeView"]):
                "Redoing your original request...", ephemeral=True, delete_after=20
            )
            await self.converser_cog.encapsulated_send(
-                user_id, prompt, ctx, response_message
+                user_id=user_id, prompt=prompt, ctx=ctx, response_message=response_message
            )
        else:
            await interaction.response.send_message(
--- a/models/moderations_service_model.py
+++ b/models/moderations_service_model.py
@ -51,15 +51,17 @@ class Moderation:
        return embed

    @staticmethod
-    def determine_moderation_result(response):
+    def determine_moderation_result(text,response):
        HATE_THRESHOLD = 0.005
        HATE_VIOLENCE_THRESHOLD = 0.05
        SELF_HARM_THRESHOLD = 0.05
        SEXUAL_THRESHOLD = 0.75
        SEXUAL_MINORS_THRESHOLD = 0.1
-        VIOLENCE_THRESHOLD = 0.01
+        VIOLENCE_THRESHOLD = 0.08
        VIOLENCE_GRAPHIC_THRESHOLD = 0.1

+        extreme_hatred_qualifiers = ["i fucking hate", "fucking hate", "i fucking despise"]
+
        thresholds = [
            HATE_THRESHOLD,
            HATE_VIOLENCE_THRESHOLD,
@ -85,6 +87,12 @@ class Moderation:

        # Iterate the category scores using the threshold_iterator and compare the values to thresholds
        for category, threshold in zip(threshold_iterator, thresholds):
+            if category == "hate":
+                if "hate" in text.lower(): # The word "hate" makes the model oversensitive. This is a (bad) workaround.
+                    threshold = 0.1
+                if any(word in text.lower() for word in extreme_hatred_qualifiers):
+                    threshold = 0.6
+
            if category_scores[category] > threshold:
                return True

@ -110,7 +118,7 @@ class Moderation:
                    response = await model.send_moderations_request(
                        to_moderate.message.content
                    )
-                    moderation_result = Moderation.determine_moderation_result(response)
+                    moderation_result = Moderation.determine_moderation_result(to_moderate.message.content,response)

                    if moderation_result:
                        # Take care of the flagged message