Merge branch 'main' into rekey-conversation

Signed-off-by: Kaveen Kumarasinghe <k5kumara@uwaterloo.ca>
Kaveen Kumarasinghe 2 years ago committed by GitHub
commit 79c44fb0a0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -215,7 +215,10 @@ class RedoButton(discord.ui.Button["OptimizeView"]):
"Redoing your original request...", ephemeral=True, delete_after=20 "Redoing your original request...", ephemeral=True, delete_after=20
) )
await self.converser_cog.encapsulated_send( await self.converser_cog.encapsulated_send(
user_id, prompt, ctx, response_message user_id=user_id,
prompt=prompt,
ctx=ctx,
response_message=response_message,
) )
else: else:
await interaction.response.send_message( await interaction.response.send_message(

@ -51,15 +51,21 @@ class Moderation:
return embed return embed
@staticmethod @staticmethod
def determine_moderation_result(response): def determine_moderation_result(text, response):
HATE_THRESHOLD = 0.005 HATE_THRESHOLD = 0.005
HATE_VIOLENCE_THRESHOLD = 0.05 HATE_VIOLENCE_THRESHOLD = 0.05
SELF_HARM_THRESHOLD = 0.05 SELF_HARM_THRESHOLD = 0.05
SEXUAL_THRESHOLD = 0.75 SEXUAL_THRESHOLD = 0.75
SEXUAL_MINORS_THRESHOLD = 0.1 SEXUAL_MINORS_THRESHOLD = 0.1
VIOLENCE_THRESHOLD = 0.01 VIOLENCE_THRESHOLD = 0.08
VIOLENCE_GRAPHIC_THRESHOLD = 0.1 VIOLENCE_GRAPHIC_THRESHOLD = 0.1
extreme_hatred_qualifiers = [
"i fucking hate",
"fucking hate",
"i fucking despise",
]
thresholds = [ thresholds = [
HATE_THRESHOLD, HATE_THRESHOLD,
HATE_VIOLENCE_THRESHOLD, HATE_VIOLENCE_THRESHOLD,
@ -85,6 +91,14 @@ class Moderation:
# Iterate the category scores using the threshold_iterator and compare the values to thresholds # Iterate the category scores using the threshold_iterator and compare the values to thresholds
for category, threshold in zip(threshold_iterator, thresholds): for category, threshold in zip(threshold_iterator, thresholds):
if category == "hate":
if (
"hate" in text.lower()
): # The word "hate" makes the model oversensitive. This is a (bad) workaround.
threshold = 0.1
if any(word in text.lower() for word in extreme_hatred_qualifiers):
threshold = 0.6
if category_scores[category] > threshold: if category_scores[category] > threshold:
return True return True
@ -110,7 +124,9 @@ class Moderation:
response = await model.send_moderations_request( response = await model.send_moderations_request(
to_moderate.message.content to_moderate.message.content
) )
moderation_result = Moderation.determine_moderation_result(response) moderation_result = Moderation.determine_moderation_result(
to_moderate.message.content, response
)
if moderation_result: if moderation_result:
# Take care of the flagged message # Take care of the flagged message

Loading…
Cancel
Save