From 8825b9e7f31c9ee291d9a028303120c360b87f43 Mon Sep 17 00:00:00 2001 From: Kaveen Kumarasinghe Date: Sun, 15 Jan 2023 00:35:31 -0500 Subject: [PATCH 1/2] parameterize moderations --- README.md | 15 +++- cogs/commands.py | 53 +++++++++++ cogs/moderations_service_cog.py | 152 ++++++++++++++++++++++++++++---- cogs/text_service_cog.py | 2 +- services/moderations_service.py | 15 ++-- 5 files changed, 209 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index bf827e9..c98df5e 100644 --- a/README.md +++ b/README.md @@ -127,10 +127,17 @@ These commands are grouped, so each group has a prefix but you can easily tab co `/mod set status:off alert_channel_id:` - Turn on moderations and set the alert channel to the channel ID you specify in the command. -- The bot needs Administrative permissions for this, and you need to set `MODERATIONS_ALERT_CHANNEL` to the channel ID of a desired channel in your .env file if you want to receive alerts about moderated messages. -- This uses the OpenAI Moderations endpoint to check for messages, requests are only sent to the moderations endpoint at a MINIMUM request gap of 0.5 seconds, to ensure you don't get blocked and to ensure reliability. -- The bot uses numerical thresholds to determine whether a message is toxic or not, and I have manually tested and fine tuned these thresholds to a point that I think is good, please open an issue if you have any suggestions for the thresholds! -- There are two thresholds for the bot, there are instances in which the bot will outright delete a message and an instance where the bot will send a message to the alert channel notifying admins and giving them quick options to delete and timeout the user (check out the screenshots at the beginning of the README to see this). +`/mod config type: hate:# hate_threatening:# self_harm:# sexual:# sexual_minors:# violence:# violence_graphic:#` +- Set the moderation thresholds of the bot for the specific type of moderation (`warn` or `delete`). 
You can view the thresholds by typing just `/mod config type:<warn/delete>` without any other parameters. You don't have to set all of them; you can set just one or two items if you want. For example, to set the hate threshold for warns, you can type `/mod config type:warn hate:0.2`. +- Lower values are stricter; higher values are more lenient. The default values are ones that I've fine-tuned the service with for a general server. + +The bot needs Administrator permissions for this, and you need to set `MODERATIONS_ALERT_CHANNEL` to the channel ID of a desired channel in your .env file if you want to receive alerts about moderated messages. + +This uses the OpenAI Moderations endpoint to check messages. Requests are only sent to the moderations endpoint at a MINIMUM request gap of 0.5 seconds, to ensure you don't get blocked and to ensure reliability. + +The bot uses numerical thresholds to determine whether a message is toxic or not, and I have manually tested and fine-tuned these thresholds to a point that I think is good. Please open an issue if you have any suggestions for the thresholds! + +There are two sets of thresholds for the bot: one at which the bot will outright delete a message, and one at which the bot will send a message to the alert channel, notifying admins and giving them quick options to delete the message and time out the user (check out the screenshots at the beginning of the README to see this). If you'd like to help us test and fine tune our thresholds for the moderation service, please join this test server: https://discord.gg/CWhsSgNdrP. 
You can let off some steam in a controlled environment ;) diff --git a/cogs/commands.py b/cogs/commands.py index ec03909..67f7cae 100644 --- a/cogs/commands.py +++ b/cogs/commands.py @@ -180,6 +180,59 @@ class Commands(discord.Cog, name="Commands"): ): await self.moderations_cog.moderations_command(ctx, status, alert_channel_id) + @add_to_group("mod") + @discord.slash_command( + name="config", + description="Configure the moderations service for the current guild. Lower # = more strict", + guild_ids=ALLOWED_GUILDS, + ) + @discord.option( + name="type", + description="The type of moderation to configure ('warn' or 'delete')", + required=True, + ) + @discord.option( + name="hate", + description="The threshold for hate speech", + required=False, + ) + @discord.option( + name="hate_threatening", + description="The threshold for hate/threatening speech", + required=False, + ) + @discord.option( + name="self_harm", + description="The threshold for self_harm speech", + required=False, + ) + @discord.option( + name="sexual", + description="The threshold for sexual speech", + required=False, + ) + @discord.option( + name="sexual_minors", + description="The threshold for sexual speech with minors in context", + required=False, + ) + @discord.option( + name="violence", + description="The threshold for violent speech", + required=False, + ) + @discord.option( + name="violence_graphic", + description="The threshold for violent and graphic speech", + required=False, + ) + @discord.guild_only() + async def config( + self, ctx: discord.ApplicationContext, type: str, hate: str, hate_threatening: str, self_harm: str, sexual: str, sexual_minors: str, violence: str, violence_graphic: str + ): + await self.moderations_cog.config_command(ctx, type, hate, hate_threatening, self_harm, sexual, sexual_minors, violence, violence_graphic) + + """ GPT commands """ diff --git a/cogs/moderations_service_cog.py b/cogs/moderations_service_cog.py index a65f074..9fe9288 100644 --- 
a/cogs/moderations_service_cog.py +++ b/cogs/moderations_service_cog.py @@ -4,7 +4,7 @@ import discord from sqlitedict import SqliteDict from services.environment_service import EnvService -from services.moderations_service import Moderation +from services.moderations_service import Moderation, ThresholdSet MOD_DB = None try: @@ -34,11 +34,18 @@ class ModerationsService(discord.Cog, name="ModerationsService"): self.moderation_tasks = {} self.moderations_launched = [] + # Defaults + self.default_warn_set = ThresholdSet(0.01, 0.05, 0.05, 0.91, 0.1, 0.45, 0.1) + self.default_delete_set = ThresholdSet(0.26, 0.26, 0.1, 0.95, 0.03, 0.85, 0.4) + @discord.Cog.listener() async def on_ready(self): # Check moderation service for each guild for guild in self.bot.guilds: + self.get_or_set_warn_set(guild.id) + self.get_or_set_delete_set(guild.id) await self.check_and_launch_moderations(guild.id) + print("The moderation service is ready.") def check_guild_moderated(self, guild_id): return guild_id in MOD_DB and MOD_DB[guild_id]["moderated"] @@ -50,6 +57,35 @@ class ModerationsService(discord.Cog, name="ModerationsService"): MOD_DB[guild_id] = {"moderated": True, "alert_channel": channel_id} MOD_DB.commit() + def get_or_set_warn_set(self, guild_id): + guild_id = str(guild_id) + key = guild_id + "_warn_set" + if key not in MOD_DB: + MOD_DB[key] = zip(self.default_warn_set.keys, self.default_warn_set.thresholds) + MOD_DB.commit() + return dict(MOD_DB[key]) + + def get_or_set_delete_set(self, guild_id): + guild_id=str(guild_id) + key = guild_id + "_delete_set" + if key not in MOD_DB: + MOD_DB[key] = zip(self.default_delete_set.keys, self.default_delete_set.thresholds) + MOD_DB.commit() + return dict(MOD_DB[key]) + + def set_warn_set(self, guild_id, threshold_set): + guild_id = str(guild_id) + key = guild_id + "_warn_set" + MOD_DB[key] = zip(threshold_set.keys, threshold_set.thresholds) + MOD_DB.commit() + + def set_delete_set(self, guild_id, threshold_set): + guild_id = str(guild_id) 
+ key = guild_id + "_delete_set" + MOD_DB[key] = zip(threshold_set.keys, threshold_set.thresholds) + MOD_DB.commit() + + def set_guild_moderated(self, guild_id, status=True): if guild_id not in MOD_DB: MOD_DB[guild_id] = {"moderated": status, "alert_channel": 0} @@ -72,10 +108,15 @@ class ModerationsService(discord.Cog, name="ModerationsService"): if not alert_channel_override else alert_channel_override ) + warn_set_nums = self.get_or_set_warn_set(guild_id).values() + delete_set_nums = self.get_or_set_delete_set(guild_id).values() + warn_set = ThresholdSet(*warn_set_nums) + delete_set = ThresholdSet(*delete_set_nums) Moderation.moderation_tasks[guild_id] = asyncio.ensure_future( Moderation.process_moderation_queue( - Moderation.moderation_queues[guild_id], 1, 1, moderations_channel + Moderation.moderation_queues[guild_id], 1, 1, moderations_channel, + warn_set, delete_set ) ) print("Launched the moderations service for guild " + str(guild_id)) @@ -101,25 +142,100 @@ class ModerationsService(discord.Cog, name="ModerationsService"): return # Create the moderations service. - self.set_guild_moderated(ctx.guild_id) - moderations_channel = await self.check_and_launch_moderations( - ctx.guild_id, - Moderation.moderation_alerts_channel - if not alert_channel_id - else alert_channel_id, - ) - self.set_moderated_alert_channel(ctx.guild_id, moderations_channel.id) - - await ctx.respond("Moderations service enabled") + await self.start_moderations_service(guild_id=ctx.guild_id, alert_channel_id=alert_channel_id) + await ctx.respond("Moderations is now enabled for this guild") elif status == "off": # Cancel the moderations service. 
- self.set_guild_moderated(ctx.guild_id, False) - Moderation.moderation_tasks[ctx.guild_id].cancel() - Moderation.moderation_tasks[ctx.guild_id] = None - Moderation.moderation_queues[ctx.guild_id] = None - Moderation.moderations_launched.remove(ctx.guild_id) - await ctx.respond("Moderations service disabled") + await self.stop_moderations_service(ctx.guild_id) + await ctx.respond("Moderations is now disabled for this guild", ephemeral=True) + + async def stop_moderations_service(self, guild_id): + self.set_guild_moderated(guild_id, False) + Moderation.moderation_tasks[guild_id].cancel() + Moderation.moderation_tasks[guild_id] = None + Moderation.moderation_queues[guild_id] = None + Moderation.moderations_launched.remove(guild_id) + + + async def start_moderations_service(self, guild_id, alert_channel_id=None): + self.set_guild_moderated(guild_id) + moderations_channel = await self.check_and_launch_moderations( + guild_id, + Moderation.moderation_alerts_channel + if not alert_channel_id + else alert_channel_id, + ) + self.set_moderated_alert_channel(guild_id, moderations_channel.id) + + async def restart_moderations_service(self, ctx): + await ctx.respond(f"The moderations service is being restarted...", ephemeral=True, delete_after=30) + await self.stop_moderations_service(ctx.guild_id) + await ctx.send_followup("The moderations service was stopped..", ephemeral=True, delete_after=30) + await self.start_moderations_service(ctx.guild_id, self.get_moderated_alert_channel(ctx.guild_id)) + await ctx.send_followup("The moderations service was restarted successfully.", ephemeral=True, delete_after=30) + + async def build_moderation_settings_embed(self,type, mod_set): + + embed = discord.Embed( + title="Moderation Settings", + description="The moderation settings for this guild for the type: " + type, + color=discord.Color.yellow() if type=="warn" else discord.Color.red(), + ) + + # Add each key_value pair in the mod_set to the embed, make them fairly small + for key, 
value in mod_set.items(): + embed.add_field(name=key, value=value, inline=False) + + return embed + + + async def config_command(self, ctx: discord.ApplicationContext, config_type: str, hate, hate_threatening, self_harm, sexual, sexual_minors, violence, violence_graphic): + config_type = config_type.lower().strip() + if config_type not in ["warn", "delete"]: + await ctx.respond("Invalid config type, please use `warn` or `delete`") + return + + all_args = [hate, hate_threatening, self_harm, sexual, sexual_minors, violence, violence_graphic] + await ctx.defer(ephemeral=True) + + # Case for printing the current config + if not any(all_args): + await ctx.respond(ephemeral=True, embed=await self.build_moderation_settings_embed(config_type, self.get_or_set_warn_set(ctx.guild_id) if config_type=="warn" else self.get_or_set_delete_set(ctx.guild_id))) + return + + if config_type == "warn": + # Check if no args were + warn_set = self.get_or_set_warn_set(ctx.guild_id) + + new_warn_set = ThresholdSet( + hate if hate else warn_set["hate"], + hate_threatening if hate_threatening else warn_set["hate/threatening"], + self_harm if self_harm else warn_set["self-harm"], + sexual if sexual else warn_set["sexual"], + sexual_minors if sexual_minors else warn_set["sexual/minors"], + violence if violence else warn_set["violence"], + violence_graphic if violence_graphic else warn_set["violence/graphic"], + ) + self.set_warn_set(ctx.guild_id, new_warn_set) + await self.restart_moderations_service(ctx) + + elif config_type == "delete": + delete_set = self.get_or_set_delete_set(ctx.guild_id) + + new_delete_set = ThresholdSet( + hate if hate else delete_set["hate"], + hate_threatening if hate_threatening else delete_set["hate/threatening"], + self_harm if self_harm else delete_set["self-harm"], + sexual if sexual else delete_set["sexual"], + sexual_minors if sexual_minors else delete_set["sexual/minors"], + violence if violence else delete_set["violence"], + violence_graphic if 
violence_graphic else delete_set["violence/graphic"], + ) + self.set_delete_set(ctx.guild_id, new_delete_set) + await self.restart_moderations_service(ctx) + + async def moderations_test_command( self, ctx: discord.ApplicationContext, prompt: str diff --git a/cogs/text_service_cog.py b/cogs/text_service_cog.py index 907fee3..b882923 100644 --- a/cogs/text_service_cog.py +++ b/cogs/text_service_cog.py @@ -530,7 +530,7 @@ class GPT3ComCon(discord.Cog, name="GPT3ComCon"): # Moderations service is done here. if ( - message.guild.id in Moderation.moderation_queues + hasattr(message, "guild") and message.guild.id in Moderation.moderation_queues and Moderation.moderation_queues[message.guild.id] is not None ): # Create a timestamp that is 0.5 seconds from now diff --git a/services/moderations_service.py b/services/moderations_service.py index 8420652..ab6e163 100644 --- a/services/moderations_service.py +++ b/services/moderations_service.py @@ -40,12 +40,18 @@ class ThresholdSet: v_t, vg_t, ] + # The string representation is just the keys alongside the threshold values + + def __str__(self): + # "key": value format + return ", ".join([f"{k}: {v}" for k, v in zip(self.keys, self.thresholds)]) def moderate(self, text, response_message): category_scores = response_message["results"][0]["category_scores"] flagged = response_message["results"][0]["flagged"] for category, threshold in zip(self.keys, self.thresholds): + threshold = float(threshold) if category_scores[category] > threshold: return (True, flagged) return (False, flagged) @@ -127,11 +133,9 @@ class Moderation: return embed @staticmethod - def determine_moderation_result(text, response): + def determine_moderation_result(text, response, warn_set, delete_set): # warn_set = ThresholdSet(0.005, 0.05, 0.05, 0.91, 0.1, 0.04, 0.1) # delete_set = ThresholdSet(0.26, 0.26, 0.1, 0.95, 0.03, 0.85, 0.4) - warn_set = ThresholdSet(0.01, 0.05, 0.05, 0.91, 0.1, 0.45, 0.1) - delete_set = ThresholdSet(0.26, 0.26, 0.1, 0.95, 0.03, 
0.85, 0.4) warn_result, flagged_warn = warn_set.moderate(text, response) delete_result, flagged_delete = delete_set.moderate(text, response) @@ -146,8 +150,9 @@ class Moderation: # This function will be called by the bot to process the message queue @staticmethod async def process_moderation_queue( - moderation_queue, PROCESS_WAIT_TIME, EMPTY_WAIT_TIME, moderations_alert_channel + moderation_queue, PROCESS_WAIT_TIME, EMPTY_WAIT_TIME, moderations_alert_channel, warn_set, delete_set ): + print("The warn and delete sets are ", warn_set, delete_set) while True: try: # If the queue is empty, sleep for a short time before checking again @@ -164,7 +169,7 @@ class Moderation: to_moderate.message.content ) moderation_result = Moderation.determine_moderation_result( - to_moderate.message.content, response + to_moderate.message.content, response, warn_set, delete_set ) if moderation_result == ModerationResult.DELETE: From 4238c098340396211dd03d9d477edc5251a09d3c Mon Sep 17 00:00:00 2001 From: Kaveen Kumarasinghe Date: Sun, 15 Jan 2023 00:42:46 -0500 Subject: [PATCH 2/2] bump version --- gpt3discord.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpt3discord.py b/gpt3discord.py index f8a1dc6..ceb366a 100644 --- a/gpt3discord.py +++ b/gpt3discord.py @@ -27,7 +27,7 @@ from services.usage_service import UsageService from services.environment_service import EnvService -__version__ = "6.1" +__version__ = "6.5" """