parameterize moderations

Kaveen Kumarasinghe 1 year ago
parent ab362b97cd
commit 8825b9e7f3

@ -127,10 +127,17 @@ These commands are grouped, so each group has a prefix but you can easily tab co
`/mod set status:on alert_channel_id:<CHANNEL ID>` - Turn on moderations and set the alert channel to the channel ID you specify in the command.
- The bot needs Administrative permissions for this, and you need to set `MODERATIONS_ALERT_CHANNEL` to the channel ID of a desired channel in your .env file if you want to receive alerts about moderated messages.
- This uses the OpenAI Moderations endpoint to check messages. Requests are sent to the moderations endpoint with a MINIMUM gap of 0.5 seconds between them, to ensure you don't get blocked and to ensure reliability.
- The bot uses numerical thresholds to determine whether a message is toxic or not. I have manually tested and fine-tuned these thresholds to a point that I think is good; please open an issue if you have any suggestions for the thresholds!
- There are two sets of thresholds for the bot: one at which the bot will outright delete a message, and one at which the bot will send a message to the alert channel notifying admins and giving them quick options to delete the message and time out the user (check out the screenshots at the beginning of the README to see this).
`/mod config type:<warn/delete> hate:# hate_threatening:# self_harm:# sexual:# sexual_minors:# violence:# violence_graphic:#`
- Set the moderation thresholds of the bot for the specific type of moderation (`warn` or `delete`). You can view the thresholds by typing just `/mod config type:<warn/delete>` without any other parameters. You don't have to set all of them, you can just set one or two items if you want. For example, to set the hate threshold for warns, you can type `/mod config type:warn hate:0.2`
- Lower values are more strict, higher values are more lenient. There are default values that I've fine tuned the service with for a general server.
The bot needs Administrative permissions for this, and you need to set `MODERATIONS_ALERT_CHANNEL` to the channel ID of a desired channel in your .env file if you want to receive alerts about moderated messages.
This uses the OpenAI Moderations endpoint to check for messages, requests are only sent to the moderations endpoint at a MINIMUM request gap of 0.5 seconds, to ensure you don't get blocked and to ensure reliability.
The bot uses numerical thresholds to determine whether a message is toxic or not, and I have manually tested and fine tuned these thresholds to a point that I think is good, please open an issue if you have any suggestions for the thresholds!
There are two thresholds for the bot, there are instances in which the bot will outright delete a message and an instance where the bot will send a message to the alert channel notifying admins and giving them quick options to delete and timeout the user (check out the screenshots at the beginning of the README to see this).
If you'd like to help us test and fine tune our thresholds for the moderation service, please join this test server: https://discord.gg/CWhsSgNdrP. You can let off some steam in a controlled environment ;)

@ -180,6 +180,59 @@ class Commands(discord.Cog, name="Commands"):
):
await self.moderations_cog.moderations_command(ctx, status, alert_channel_id)
@add_to_group("mod")
@discord.slash_command(
    name="config",
    description="Configure the moderations service for the current guild. Lower # = more strict",
    guild_ids=ALLOWED_GUILDS,
)
@discord.option(
    name="type",
    description="The type of moderation to configure ('warn' or 'delete')",
    required=True,
)
@discord.option(
    name="hate",
    description="The threshold for hate speech",
    required=False,
)
@discord.option(
    name="hate_threatening",
    description="The threshold for hate/threatening speech",
    required=False,
)
@discord.option(
    name="self_harm",
    description="The threshold for self_harm speech",
    required=False,
)
@discord.option(
    name="sexual",
    description="The threshold for sexual speech",
    required=False,
)
@discord.option(
    name="sexual_minors",
    description="The threshold for sexual speech with minors in context",
    required=False,
)
@discord.option(
    name="violence",
    description="The threshold for violent speech",
    required=False,
)
@discord.option(
    name="violence_graphic",
    description="The threshold for violent and graphic speech",
    required=False,
)
@discord.guild_only()
async def config(
    self,
    ctx: discord.ApplicationContext,
    type: str,
    hate: str,
    hate_threatening: str,
    self_harm: str,
    sexual: str,
    sexual_minors: str,
    violence: str,
    violence_graphic: str,
):
    """Slash command `/mod config`: forward every option to the moderations cog.

    `type` selects which threshold set is addressed; the remaining arguments
    are the per-category thresholds, all declared as optional slash options.
    The parameter names must match the option names declared above, which is
    why `type` shadows the builtin here.
    """
    await self.moderations_cog.config_command(
        ctx,
        type,
        hate,
        hate_threatening,
        self_harm,
        sexual,
        sexual_minors,
        violence,
        violence_graphic,
    )
"""
GPT commands
"""

@ -4,7 +4,7 @@ import discord
from sqlitedict import SqliteDict
from services.environment_service import EnvService
from services.moderations_service import Moderation
from services.moderations_service import Moderation, ThresholdSet
MOD_DB = None
try:
@ -34,11 +34,18 @@ class ModerationsService(discord.Cog, name="ModerationsService"):
self.moderation_tasks = {}
self.moderations_launched = []
# Defaults
self.default_warn_set = ThresholdSet(0.01, 0.05, 0.05, 0.91, 0.1, 0.45, 0.1)
self.default_delete_set = ThresholdSet(0.26, 0.26, 0.1, 0.95, 0.03, 0.85, 0.4)
@discord.Cog.listener()
async def on_ready(self):
    """Prepare per-guild moderation state once the bot reports ready.

    For every guild the bot is in, make sure both threshold sets exist
    (seeding defaults where missing) and then check whether the moderation
    service should be launched for that guild.
    """
    for guild in self.bot.guilds:
        current_id = guild.id
        # Seed/read the thresholds first so they exist before a launch.
        self.get_or_set_warn_set(current_id)
        self.get_or_set_delete_set(current_id)
        await self.check_and_launch_moderations(current_id)
    print("The moderation service is ready.")
def check_guild_moderated(self, guild_id):
    """Return the stored "moderated" flag for a guild, or False if the guild has no entry."""
    if guild_id not in MOD_DB:
        return False
    return MOD_DB[guild_id]["moderated"]
@ -50,6 +57,35 @@ class ModerationsService(discord.Cog, name="ModerationsService"):
MOD_DB[guild_id] = {"moderated": True, "alert_channel": channel_id}
MOD_DB.commit()
def get_or_set_warn_set(self, guild_id):
    """Return the guild's warn thresholds as a dict, seeding defaults on first use.

    The thresholds live in MOD_DB under the key "<guild_id>_warn_set".
    When the key is absent, the cog's default warn set is written and
    committed first.

    Returns:
        dict mapping each category key to its threshold.
    """
    key = f"{guild_id}_warn_set"
    if key not in MOD_DB:
        defaults = self.default_warn_set
        # Persist a plain dict rather than a one-shot zip iterator: the
        # stored value is then a stable mapping that does not depend on
        # iterator pickling and can be read any number of times.
        MOD_DB[key] = dict(zip(defaults.keys, defaults.thresholds))
        MOD_DB.commit()
    # dict() also accepts an iterable of pairs, so entries written by the
    # previous zip-based format remain readable.
    return dict(MOD_DB[key])
def get_or_set_delete_set(self, guild_id):
    """Return the guild's delete thresholds as a dict, seeding defaults on first use.

    The thresholds live in MOD_DB under the key "<guild_id>_delete_set".
    When the key is absent, the cog's default delete set is written and
    committed first.

    Returns:
        dict mapping each category key to its threshold.
    """
    key = f"{guild_id}_delete_set"
    if key not in MOD_DB:
        defaults = self.default_delete_set
        # Persist a plain dict rather than a one-shot zip iterator: the
        # stored value is then a stable mapping that does not depend on
        # iterator pickling and can be read any number of times.
        MOD_DB[key] = dict(zip(defaults.keys, defaults.thresholds))
        MOD_DB.commit()
    # dict() also accepts an iterable of pairs, so entries written by the
    # previous zip-based format remain readable.
    return dict(MOD_DB[key])
def set_warn_set(self, guild_id, threshold_set):
    """Persist `threshold_set` as the guild's warn thresholds and commit.

    Args:
        guild_id: guild whose thresholds are replaced; stringified to
            build the "<guild_id>_warn_set" key.
        threshold_set: object exposing parallel `keys` and `thresholds`
            sequences (a ThresholdSet).
    """
    key = f"{guild_id}_warn_set"
    # Store a plain dict instead of a zip iterator so the persisted value
    # is a stable mapping; readers wrap it in dict(), which accepts both.
    MOD_DB[key] = dict(zip(threshold_set.keys, threshold_set.thresholds))
    MOD_DB.commit()
def set_delete_set(self, guild_id, threshold_set):
    """Persist `threshold_set` as the guild's delete thresholds and commit.

    Args:
        guild_id: guild whose thresholds are replaced; stringified to
            build the "<guild_id>_delete_set" key.
        threshold_set: object exposing parallel `keys` and `thresholds`
            sequences (a ThresholdSet).
    """
    key = f"{guild_id}_delete_set"
    # Store a plain dict instead of a zip iterator so the persisted value
    # is a stable mapping; readers wrap it in dict(), which accepts both.
    MOD_DB[key] = dict(zip(threshold_set.keys, threshold_set.thresholds))
    MOD_DB.commit()
def set_guild_moderated(self, guild_id, status=True):
if guild_id not in MOD_DB:
MOD_DB[guild_id] = {"moderated": status, "alert_channel": 0}
@ -72,10 +108,15 @@ class ModerationsService(discord.Cog, name="ModerationsService"):
if not alert_channel_override
else alert_channel_override
)
warn_set_nums = self.get_or_set_warn_set(guild_id).values()
delete_set_nums = self.get_or_set_delete_set(guild_id).values()
warn_set = ThresholdSet(*warn_set_nums)
delete_set = ThresholdSet(*delete_set_nums)
Moderation.moderation_tasks[guild_id] = asyncio.ensure_future(
Moderation.process_moderation_queue(
Moderation.moderation_queues[guild_id], 1, 1, moderations_channel
Moderation.moderation_queues[guild_id], 1, 1, moderations_channel,
warn_set, delete_set
)
)
print("Launched the moderations service for guild " + str(guild_id))
@ -101,25 +142,100 @@ class ModerationsService(discord.Cog, name="ModerationsService"):
return
# Create the moderations service.
self.set_guild_moderated(ctx.guild_id)
moderations_channel = await self.check_and_launch_moderations(
ctx.guild_id,
Moderation.moderation_alerts_channel
if not alert_channel_id
else alert_channel_id,
)
self.set_moderated_alert_channel(ctx.guild_id, moderations_channel.id)
await ctx.respond("Moderations service enabled")
await self.start_moderations_service(guild_id=ctx.guild_id, alert_channel_id=alert_channel_id)
await ctx.respond("Moderations is now enabled for this guild")
elif status == "off":
# Cancel the moderations service.
self.set_guild_moderated(ctx.guild_id, False)
Moderation.moderation_tasks[ctx.guild_id].cancel()
Moderation.moderation_tasks[ctx.guild_id] = None
Moderation.moderation_queues[ctx.guild_id] = None
Moderation.moderations_launched.remove(ctx.guild_id)
await ctx.respond("Moderations service disabled")
await self.stop_moderations_service(ctx.guild_id)
await ctx.respond("Moderations is now disabled for this guild", ephemeral=True)
async def stop_moderations_service(self, guild_id):
    """Mark the guild as unmoderated and tear down its moderation task state.

    Safe to call when the service is not running for the guild: this
    function itself leaves `None` in the task table, so a second stop (or
    a restart issued while moderation is off) must not call `.cancel()`
    on `None` or remove a guild that is not in the launched list.
    """
    self.set_guild_moderated(guild_id, False)

    # NOTE(review): assumes Moderation.moderation_tasks is a dict (it is
    # indexed and assigned by guild id elsewhere) - confirm.
    running_task = Moderation.moderation_tasks.get(guild_id)
    if running_task is not None:
        running_task.cancel()

    Moderation.moderation_tasks[guild_id] = None
    Moderation.moderation_queues[guild_id] = None

    # remove() raises ValueError for a missing element, so check first.
    if guild_id in Moderation.moderations_launched:
        Moderation.moderations_launched.remove(guild_id)
async def start_moderations_service(self, guild_id, alert_channel_id=None):
    """Flag the guild as moderated, launch its moderation task, and record the alert channel.

    Args:
        guild_id: guild to start moderating.
        alert_channel_id: channel to send alerts to; when falsy, the
            default `Moderation.moderation_alerts_channel` is used.
    """
    self.set_guild_moderated(guild_id)

    # Fall back to the globally configured alert channel when none is given.
    channel_choice = alert_channel_id or Moderation.moderation_alerts_channel
    launched_channel = await self.check_and_launch_moderations(
        guild_id, channel_choice
    )

    self.set_moderated_alert_channel(guild_id, launched_channel.id)
async def restart_moderations_service(self, ctx):
    """Stop and re-start the guild's moderation service, reporting each step to the invoker.

    The service is restarted with the alert channel currently stored for
    the guild. All progress messages are ephemeral and expire after 30s.
    """
    guild_id = ctx.guild_id
    notice_options = {"ephemeral": True, "delete_after": 30}

    await ctx.respond("The moderations service is being restarted...", **notice_options)

    await self.stop_moderations_service(guild_id)
    await ctx.send_followup("The moderations service was stopped..", **notice_options)

    stored_channel = self.get_moderated_alert_channel(guild_id)
    await self.start_moderations_service(guild_id, stored_channel)
    await ctx.send_followup(
        "The moderations service was restarted successfully.", **notice_options
    )
async def build_moderation_settings_embed(self, type, mod_set):
    """Build an embed listing every threshold in `mod_set`.

    Args:
        type: the moderation type label; "warn" gives a yellow embed,
            anything else a red one.
        mod_set: mapping of category key to threshold value.

    Returns:
        A discord.Embed with one non-inline field per category.
    """
    summary = "The moderation settings for this guild for the type: " + type
    if type == "warn":
        accent = discord.Color.yellow()
    else:
        accent = discord.Color.red()

    settings_embed = discord.Embed(
        title="Moderation Settings",
        description=summary,
        color=accent,
    )

    # One stacked field per category/threshold pair.
    for category in mod_set:
        settings_embed.add_field(name=category, value=mod_set[category], inline=False)

    return settings_embed
async def config_command(
    self,
    ctx: discord.ApplicationContext,
    config_type: str,
    hate,
    hate_threatening,
    self_harm,
    sexual,
    sexual_minors,
    violence,
    violence_graphic,
):
    """Show or update the guild's moderation thresholds for one moderation type.

    With no threshold arguments, replies with an embed of the current
    thresholds. Otherwise each supplied threshold replaces the stored one,
    omitted ones keep their current value, the merged set is persisted, and
    the moderation service is restarted.

    Args:
        ctx: the invoking application context.
        config_type: "warn" or "delete" (case and surrounding whitespace
            are ignored).
        hate, hate_threatening, self_harm, sexual, sexual_minors, violence,
            violence_graphic: optional new threshold for the category; a
            falsy value means "leave unchanged".
    """
    config_type = config_type.lower().strip()
    if config_type not in ("warn", "delete"):
        await ctx.respond("Invalid config type, please use `warn` or `delete`")
        return

    # Keys are the category names used in the stored threshold dicts; the
    # insertion order matches ThresholdSet's positional argument order.
    requested = {
        "hate": hate,
        "hate/threatening": hate_threatening,
        "self-harm": self_harm,
        "sexual": sexual,
        "sexual/minors": sexual_minors,
        "violence": violence,
        "violence/graphic": violence_graphic,
    }

    await ctx.defer(ephemeral=True)

    # Select the reader/writer pair once instead of duplicating the merge
    # logic per moderation type.
    if config_type == "warn":
        current = self.get_or_set_warn_set(ctx.guild_id)
        persist = self.set_warn_set
    else:
        current = self.get_or_set_delete_set(ctx.guild_id)
        persist = self.set_delete_set

    # Case for printing the current config
    if not any(requested.values()):
        await ctx.respond(
            ephemeral=True,
            embed=await self.build_moderation_settings_embed(config_type, current),
        )
        return

    merged = []
    for category, supplied in requested.items():
        if not supplied:
            merged.append(current[category])
            continue
        # Validate here: an unparsable value would otherwise be stored and
        # only fail later, when the moderation loop calls float() on it.
        try:
            merged.append(float(supplied))
        except (TypeError, ValueError):
            await ctx.respond(
                f"Invalid threshold for `{category}`: `{supplied}` is not a number",
                ephemeral=True,
            )
            return

    persist(ctx.guild_id, ThresholdSet(*merged))
    await self.restart_moderations_service(ctx)
async def moderations_test_command(
self, ctx: discord.ApplicationContext, prompt: str

@ -530,7 +530,7 @@ class GPT3ComCon(discord.Cog, name="GPT3ComCon"):
# Moderations service is done here.
if (
message.guild.id in Moderation.moderation_queues
hasattr(message, "guild") and message.guild.id in Moderation.moderation_queues
and Moderation.moderation_queues[message.guild.id] is not None
):
# Create a timestamp that is 0.5 seconds from now

@ -40,12 +40,18 @@ class ThresholdSet:
v_t,
vg_t,
]
# The string representation is just the keys alongside the threshold values
def __str__(self):
    """Render the set as comma-separated "key: threshold" pairs, in key order."""
    pairs = zip(self.keys, self.thresholds)
    return ", ".join(f"{name}: {limit}" for name, limit in pairs)
def moderate(self, text, response_message):
    """Check a moderation response against this set's thresholds.

    Args:
        text: the moderated text (not used by the check itself).
        response_message: response mapping; the first entry of "results"
            supplies "category_scores" and "flagged".

    Returns:
        (exceeded, flagged): `exceeded` is True when any category score is
        strictly greater than its threshold; `flagged` is passed through
        from the response.
    """
    first_result = response_message["results"][0]
    scores = first_result["category_scores"]
    flagged = first_result["flagged"]

    # Thresholds may be stored as strings, hence the float() conversion.
    exceeded = any(
        float(limit) < scores[name]
        for name, limit in zip(self.keys, self.thresholds)
    )
    return (exceeded, flagged)
@ -127,11 +133,9 @@ class Moderation:
return embed
@staticmethod
def determine_moderation_result(text, response):
def determine_moderation_result(text, response, warn_set, delete_set):
# warn_set = ThresholdSet(0.005, 0.05, 0.05, 0.91, 0.1, 0.04, 0.1)
# delete_set = ThresholdSet(0.26, 0.26, 0.1, 0.95, 0.03, 0.85, 0.4)
warn_set = ThresholdSet(0.01, 0.05, 0.05, 0.91, 0.1, 0.45, 0.1)
delete_set = ThresholdSet(0.26, 0.26, 0.1, 0.95, 0.03, 0.85, 0.4)
warn_result, flagged_warn = warn_set.moderate(text, response)
delete_result, flagged_delete = delete_set.moderate(text, response)
@ -146,8 +150,9 @@ class Moderation:
# This function will be called by the bot to process the message queue
@staticmethod
async def process_moderation_queue(
moderation_queue, PROCESS_WAIT_TIME, EMPTY_WAIT_TIME, moderations_alert_channel
moderation_queue, PROCESS_WAIT_TIME, EMPTY_WAIT_TIME, moderations_alert_channel, warn_set, delete_set
):
print("The warn and delete sets are ", warn_set, delete_set)
while True:
try:
# If the queue is empty, sleep for a short time before checking again
@ -164,7 +169,7 @@ class Moderation:
to_moderate.message.content
)
moderation_result = Moderation.determine_moderation_result(
to_moderate.message.content, response
to_moderate.message.content, response, warn_set, delete_set
)
if moderation_result == ModerationResult.DELETE:

Loading…
Cancel
Save