Upgrade moderations service, double thresholds, admin options

Kaveen Kumarasinghe 2 years ago
parent 5c556c9952
commit 7b42bc4afe

@@ -16,6 +16,7 @@
<img src="https://i.imgur.com/KeLpDgj.png"/>
<img src="https://i.imgur.com/jLp1T0h.png"/>
<img src="https://i.imgur.com/9XC95Lu.png"/>
<img src="https://i.imgur.com/HqFSFcc.png"/>
</p>
@@ -118,6 +119,7 @@ These commands are grouped, so each group has a prefix but you can easily tab co
- The bot needs Administrator permissions for this, and you need to set `MODERATIONS_ALERT_CHANNEL` in your .env file to the ID of the channel where you want to receive alerts about moderated messages.
- This uses the OpenAI Moderations endpoint to check messages. Requests are sent to the endpoint with a minimum gap of 0.5 seconds between them, to avoid being rate limited and to ensure reliability.
- The bot uses numerical thresholds to determine whether a message is toxic. I have manually tested and fine-tuned these thresholds to a point that I think is good; please open an issue if you have any suggestions for them!
- There are two threshold sets: one at which the bot will outright delete a message, and a lower one at which the bot will send a message to the alert channel, notifying admins and giving them quick options to delete the message and time out the user (see the sketch below, and the screenshots at the beginning of the README).
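In rough outline, the flow looks like this. This is a minimal sketch, not the bot's actual code: the threshold values shown, the `alert_admins` helper, and the queue wiring are illustrative, and the `openai` calls use the 0.x-era Python API current at the time of this commit.

```python
import asyncio

import openai  # openai-python 0.x style API, matching the era of this commit

# Illustrative values only; the tuned sets live in ThresholdSet in the diff below.
WARN_THRESHOLDS = {"hate": 0.005, "violence": 0.08}
DELETE_THRESHOLDS = {"hate": 0.1, "violence": 0.6}


def exceeds(scores, thresholds):
    # A message trips a set if any category score crosses its threshold.
    return any(scores[category] > t for category, t in thresholds.items())


async def alert_admins(message):
    ...  # hypothetical helper: post an alert embed to MODERATIONS_ALERT_CHANNEL


async def moderation_loop(queue: asyncio.Queue):
    while True:
        message = await queue.get()
        response = openai.Moderation.create(input=message.content)
        scores = response["results"][0]["category_scores"]
        if exceeds(scores, DELETE_THRESHOLDS):
            await message.delete()       # stricter set: delete outright
        elif exceeds(scores, WARN_THRESHOLDS):
            await alert_admins(message)  # lower set: warn the admins instead
        await asyncio.sleep(0.5)         # enforce the minimum 0.5s request gap
```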
# Permanent Memory
Permanent memory has now been implemented in the bot, using the OpenAI Ada embeddings endpoint and Pinecone DB.
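A rough sketch of that flow, using the `openai` 0.x and `pinecone-client` 2.x APIs of the era (the index name, environment, IDs, and metadata here are illustrative, not the bot's actual values):

```python
import openai
import pinecone

pinecone.init(api_key="YOUR_PINECONE_KEY", environment="us-east1-gcp")
index = pinecone.Index("conversation-embeddings")  # illustrative index name


def embed(text):
    # Ada embeddings endpoint; text-embedding-ada-002 returns a 1536-dim vector.
    response = openai.Embedding.create(input=text, model="text-embedding-ada-002")
    return response["data"][0]["embedding"]


# Store a conversation turn...
index.upsert([("message-id-123", embed("Hello there!"), {"text": "Hello there!"})])

# ...and later pull the most similar past turns back into the prompt as context.
matches = index.query(vector=embed("Hi again"), top_k=5, include_metadata=True)
```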

@@ -24,7 +24,7 @@ from models.openai_model import Model
from models.usage_service_model import UsageService
from models.env_service_model import EnvService
__version__ = "4.1"
__version__ = "4.2"
"""
The pinecone service is used to store and retrieve conversation embeddings.

@@ -1,7 +1,7 @@
import asyncio
import os
import traceback
from datetime import datetime
from datetime import datetime, timedelta
from pathlib import Path
import discord
@@ -12,6 +12,42 @@ from models.usage_service_model import UsageService
usage_service = UsageService(Path(os.environ.get("DATA_DIR", os.getcwd())))
model = Model(usage_service)
class ModerationResult:
WARN = "warn"
DELETE = "delete"
NONE = "none"
class ThresholdSet:
    # A threshold set bundles one cutoff per moderation category, in the same
    # order as self.keys: hate, hate/threatening, self-harm, sexual,
    # sexual/minors, violence, violence/graphic.
    def __init__(self, h_t, hv_t, sh_t, s_t, sm_t, v_t, vg_t):
        self.keys = [
            "hate",
            "hate/threatening",
            "self-harm",
            "sexual",
            "sexual/minors",
            "violence",
            "violence/graphic",
        ]
        self.thresholds = [
            h_t,
            hv_t,
            sh_t,
            s_t,
            sm_t,
            v_t,
            vg_t,
        ]

    # Returns (over_threshold, flagged_by_openai). `text` is currently unused:
    # the decision is based purely on the endpoint's per-category scores.
    def moderate(self, text, response_message):
        category_scores = response_message["results"][0]["category_scores"]
        flagged = response_message["results"][0]["flagged"]
        for category, threshold in zip(self.keys, self.thresholds):
            if category_scores[category] > threshold:
                return (True, flagged)
        return (False, flagged)
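# Illustrative usage of ThresholdSet (editor's sketch, not part of this commit);
# the response shape below follows the OpenAI Moderations API:
#
#     ts = ThresholdSet(0.1, 0.1, 0.1, 0.95, 0.2, 0.6, 0.4)
#     response = {"results": [{"flagged": False,
#                              "category_scores": {"hate": 0.02, "hate/threatening": 0.0,
#                                                  "self-harm": 0.0, "sexual": 0.97,
#                                                  "sexual/minors": 0.0, "violence": 0.0,
#                                                  "violence/graphic": 0.0}}]}
#     ts.moderate("some text", response)  # -> (True, False): 0.97 > the 0.95 "sexual" cutoff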
class Moderation:
def __init__(self, message, timestamp):
@@ -33,6 +69,24 @@ class Moderation:
)
return embed
@staticmethod
    def build_admin_warning_message(
        moderated_message, deleted_message=None, timed_out=None
    ):
        embed = discord.Embed(
            title=f"Potentially unwanted message in the {moderated_message.guild.name} server",
            description=f"**Message from {moderated_message.author.mention}:** {moderated_message.content}",
            colour=discord.Colour.yellow(),
        )
        link = f"https://discord.com/channels/{moderated_message.guild.id}/{moderated_message.channel.id}/{moderated_message.id}"
        embed.add_field(name="Message link", value=link, inline=False)
        if deleted_message:
            embed.add_field(
                name="Message deleted by", value=deleted_message, inline=False
            )
        if timed_out:
            embed.add_field(name="User timed out by", value=timed_out, inline=False)
        return embed
@staticmethod
def build_admin_moderated_message(moderated_message, response_message):
@@ -52,57 +106,20 @@ class Moderation:
@staticmethod
def determine_moderation_result(text, response):
        HATE_THRESHOLD = 0.005
        HATE_VIOLENCE_THRESHOLD = 0.05
        SELF_HARM_THRESHOLD = 0.05
        SEXUAL_THRESHOLD = 0.91
        SEXUAL_MINORS_THRESHOLD = 0.1
        VIOLENCE_THRESHOLD = 0.08
        VIOLENCE_GRAPHIC_THRESHOLD = 0.1
        extreme_hatred_qualifiers = [
            "i fucking hate",
            "fucking hate",
            "i fucking despise",
        ]
        thresholds = [
            HATE_THRESHOLD,
            HATE_VIOLENCE_THRESHOLD,
            SELF_HARM_THRESHOLD,
            SEXUAL_THRESHOLD,
            SEXUAL_MINORS_THRESHOLD,
            VIOLENCE_THRESHOLD,
            VIOLENCE_GRAPHIC_THRESHOLD,
        ]
        threshold_iterator = [
            "hate",
            "hate/threatening",
            "self-harm",
            "sexual",
            "sexual/minors",
            "violence",
            "violence/graphic",
        ]
        category_scores = response["results"][0]["category_scores"]
        flagged = response["results"][0]["flagged"]
        # Iterate the category scores using the threshold_iterator and compare the values to thresholds
        for category, threshold in zip(threshold_iterator, thresholds):
            if category == "hate":
                if "hate" in text.lower():
                    # The word "hate" makes the model oversensitive. This is a (bad) workaround.
                    threshold = 0.1
                if any(word in text.lower() for word in extreme_hatred_qualifiers):
                    threshold = 0.6
            if category_scores[category] > threshold:
                return True
        return False

        warn_set = ThresholdSet(0.005, 0.05, 0.05, 0.91, 0.1, 0.08, 0.1)
        delete_set = ThresholdSet(0.1, 0.1, 0.1, 0.95, 0.2, 0.6, 0.4)
        warn_result, flagged_warn = warn_set.moderate(text, response)
        delete_result, flagged_delete = delete_set.moderate(text, response)
        if delete_result:
            return ModerationResult.DELETE
        elif warn_result:
            return ModerationResult.WARN
        else:
            return ModerationResult.NONE
# This function will be called by the bot to process the message queue
@staticmethod
@@ -128,7 +145,7 @@ class Moderation:
to_moderate.message.content, response
)
if moderation_result:
if moderation_result == ModerationResult.DELETE:
# Take care of the flagged message
response_message = await to_moderate.message.reply(
embed=Moderation.build_moderation_embed()
@@ -143,6 +160,11 @@ class Moderation:
to_moderate, response_message
)
)
elif moderation_result == ModerationResult.WARN:
response_message = await moderations_alert_channel.send(
embed=Moderation.build_admin_warning_message(to_moderate.message),
)
await response_message.edit(view=ModerationAdminView(to_moderate.message, response_message))
else:
await moderation_queue.put(to_moderate)
@@ -152,3 +174,63 @@ class Moderation:
            except Exception:
                traceback.print_exc()
class ModerationAdminView(discord.ui.View):
    def __init__(self, message, moderation_message, nodelete=False):
        super().__init__(timeout=None)  # Persistent view: the buttons never expire.
        self.message = message
        self.moderation_message = moderation_message
        if not nodelete:
            self.add_item(DeleteMessageButton(self.message, self.moderation_message))
        for hours in (1, 6, 12, 24):
            self.add_item(
                TimeoutUserButton(self.message, self.moderation_message, hours, nodelete)
            )
class DeleteMessageButton(discord.ui.Button["ModerationAdminView"]):
    def __init__(self, message, moderation_message):
        super().__init__(style=discord.ButtonStyle.danger, label="Delete Message")
        self.message = message
        self.moderation_message = moderation_message

    async def callback(self, interaction: discord.Interaction):
        # Delete the offending message and confirm to the admin who clicked.
        await self.message.delete()
        await interaction.response.send_message(
            "This message was deleted", ephemeral=True, delete_after=10
        )
        # Record who deleted the message in the alert embed and drop the delete button.
        await self.moderation_message.edit(
            embed=Moderation.build_admin_warning_message(
                self.message, deleted_message=interaction.user.mention
            ),
            view=ModerationAdminView(self.message, self.moderation_message, nodelete=True),
        )
class TimeoutUserButton(discord.ui.Button["ModerationAdminView"]):
    def __init__(self, message, moderation_message, hours, nodelete):
        super().__init__(style=discord.ButtonStyle.danger, label=f"Timeout {hours}h")
        self.message = message
        self.moderation_message = moderation_message
        self.hours = hours
        self.nodelete = nodelete

    async def callback(self, interaction: discord.Interaction):
        # Delete the message if it still exists, then time out its author.
        try:
            await self.message.delete()
        except Exception:
            pass  # The message may already have been deleted.
        try:
            await self.message.author.timeout(
                discord.utils.utcnow() + timedelta(hours=self.hours),
                reason="Breaking the server chat rules",
            )
        except Exception:
            traceback.print_exc()
        await interaction.response.send_message(
            f"This user was timed out for {self.hours} hour(s)",
            ephemeral=True,
            delete_after=10,
        )
        # Record who acted in the alert embed and drop the delete button.
        await self.moderation_message.edit(
            embed=Moderation.build_admin_warning_message(
                self.message,
                deleted_message=interaction.user.mention,
                timed_out=interaction.user.mention,
            ),
            view=ModerationAdminView(self.message, self.moderation_message, nodelete=True),
        )
