|
|
|
@ -1,70 +1,70 @@
|
|
|
|
|
using System.Collections.Generic;
|
|
|
|
|
using System.Linq;
|
|
|
|
|
using System.Text.RegularExpressions;
|
|
|
|
|
using DiscordChatExporter.Core.Markdown.Ast;
|
|
|
|
|
using DiscordChatExporter.Core.Markdown.Internal;
|
|
|
|
|
using DiscordChatExporter.Domain.Markdown.Ast;
|
|
|
|
|
using DiscordChatExporter.Domain.Markdown.Matching;
|
|
|
|
|
|
|
|
|
|
namespace DiscordChatExporter.Core.Markdown
|
|
|
|
|
namespace DiscordChatExporter.Domain.Markdown
|
|
|
|
|
{
|
|
|
|
|
// The following parsing logic is meant to replicate Discord's markdown grammar as closely as possible
|
|
|
|
|
public static class MarkdownParser
|
|
|
|
|
internal static partial class MarkdownParser
|
|
|
|
|
{
|
|
|
|
|
private const RegexOptions DefaultRegexOptions = RegexOptions.Compiled | RegexOptions.CultureInvariant | RegexOptions.Multiline;
|
|
|
|
|
|
|
|
|
|
/* Formatting */
|
|
|
|
|
|
|
|
|
|
// Capture any character until the earliest double asterisk not followed by an asterisk
|
|
|
|
|
private static readonly IMatcher<Node> BoldFormattedNodeMatcher = new RegexMatcher<Node>(
|
|
|
|
|
private static readonly IMatcher<MarkdownNode> BoldFormattedNodeMatcher = new RegexMatcher<MarkdownNode>(
|
|
|
|
|
new Regex("\\*\\*(.+?)\\*\\*(?!\\*)", DefaultRegexOptions | RegexOptions.Singleline),
|
|
|
|
|
(p, m) => new FormattedNode(TextFormatting.Bold, Parse(p.Slice(m.Groups[1]))));
|
|
|
|
|
|
|
|
|
|
// Capture any character until the earliest single asterisk not preceded or followed by an asterisk
|
|
|
|
|
// Opening asterisk must not be followed by whitespace
|
|
|
|
|
// Closing asterisk must not be preceded by whitespace
|
|
|
|
|
private static readonly IMatcher<Node> ItalicFormattedNodeMatcher = new RegexMatcher<Node>(
|
|
|
|
|
private static readonly IMatcher<MarkdownNode> ItalicFormattedNodeMatcher = new RegexMatcher<MarkdownNode>(
|
|
|
|
|
new Regex("\\*(?!\\s)(.+?)(?<!\\s|\\*)\\*(?!\\*)", DefaultRegexOptions | RegexOptions.Singleline),
|
|
|
|
|
(p, m) => new FormattedNode(TextFormatting.Italic, Parse(p.Slice(m.Groups[1]))));
|
|
|
|
|
|
|
|
|
|
// Capture any character until the earliest triple asterisk not followed by an asterisk
|
|
|
|
|
private static readonly IMatcher<Node> ItalicBoldFormattedNodeMatcher = new RegexMatcher<Node>(
|
|
|
|
|
private static readonly IMatcher<MarkdownNode> ItalicBoldFormattedNodeMatcher = new RegexMatcher<MarkdownNode>(
|
|
|
|
|
new Regex("\\*(\\*\\*.+?\\*\\*)\\*(?!\\*)", DefaultRegexOptions | RegexOptions.Singleline),
|
|
|
|
|
(p, m) => new FormattedNode(TextFormatting.Italic, Parse(p.Slice(m.Groups[1]), BoldFormattedNodeMatcher)));
|
|
|
|
|
|
|
|
|
|
// Capture any character except underscore until an underscore
|
|
|
|
|
// Closing underscore must not be followed by a word character
|
|
|
|
|
private static readonly IMatcher<Node> ItalicAltFormattedNodeMatcher = new RegexMatcher<Node>(
|
|
|
|
|
private static readonly IMatcher<MarkdownNode> ItalicAltFormattedNodeMatcher = new RegexMatcher<MarkdownNode>(
|
|
|
|
|
new Regex("_([^_]+)_(?!\\w)", DefaultRegexOptions | RegexOptions.Singleline),
|
|
|
|
|
(p, m) => new FormattedNode(TextFormatting.Italic, Parse(p.Slice(m.Groups[1]))));
|
|
|
|
|
|
|
|
|
|
// Capture any character until the earliest double underscore not followed by an underscore
|
|
|
|
|
private static readonly IMatcher<Node> UnderlineFormattedNodeMatcher = new RegexMatcher<Node>(
|
|
|
|
|
private static readonly IMatcher<MarkdownNode> UnderlineFormattedNodeMatcher = new RegexMatcher<MarkdownNode>(
|
|
|
|
|
new Regex("__(.+?)__(?!_)", DefaultRegexOptions | RegexOptions.Singleline),
|
|
|
|
|
(p, m) => new FormattedNode(TextFormatting.Underline, Parse(p.Slice(m.Groups[1]))));
|
|
|
|
|
|
|
|
|
|
// Capture any character until the earliest triple underscore not followed by an underscore
|
|
|
|
|
private static readonly IMatcher<Node> ItalicUnderlineFormattedNodeMatcher = new RegexMatcher<Node>(
|
|
|
|
|
private static readonly IMatcher<MarkdownNode> ItalicUnderlineFormattedNodeMatcher = new RegexMatcher<MarkdownNode>(
|
|
|
|
|
new Regex("_(__.+?__)_(?!_)", DefaultRegexOptions | RegexOptions.Singleline),
|
|
|
|
|
(p, m) => new FormattedNode(TextFormatting.Italic, Parse(p.Slice(m.Groups[1]), UnderlineFormattedNodeMatcher)));
|
|
|
|
|
|
|
|
|
|
// Capture any character until the earliest double tilde
|
|
|
|
|
private static readonly IMatcher<Node> StrikethroughFormattedNodeMatcher = new RegexMatcher<Node>(
|
|
|
|
|
private static readonly IMatcher<MarkdownNode> StrikethroughFormattedNodeMatcher = new RegexMatcher<MarkdownNode>(
|
|
|
|
|
new Regex("~~(.+?)~~", DefaultRegexOptions | RegexOptions.Singleline),
|
|
|
|
|
(p, m) => new FormattedNode(TextFormatting.Strikethrough, Parse(p.Slice(m.Groups[1]))));
|
|
|
|
|
|
|
|
|
|
// Capture any character until the earliest double pipe
|
|
|
|
|
private static readonly IMatcher<Node> SpoilerFormattedNodeMatcher = new RegexMatcher<Node>(
|
|
|
|
|
private static readonly IMatcher<MarkdownNode> SpoilerFormattedNodeMatcher = new RegexMatcher<MarkdownNode>(
|
|
|
|
|
new Regex("\\|\\|(.+?)\\|\\|", DefaultRegexOptions | RegexOptions.Singleline),
|
|
|
|
|
(p, m) => new FormattedNode(TextFormatting.Spoiler, Parse(p.Slice(m.Groups[1]))));
|
|
|
|
|
|
|
|
|
|
// Capture any character until the end of the line
|
|
|
|
|
// Opening 'greater than' character must be followed by whitespace
|
|
|
|
|
private static readonly IMatcher<Node> SingleLineQuoteNodeMatcher = new RegexMatcher<Node>(
|
|
|
|
|
private static readonly IMatcher<MarkdownNode> SingleLineQuoteNodeMatcher = new RegexMatcher<MarkdownNode>(
|
|
|
|
|
new Regex("^>\\s(.+\n?)", DefaultRegexOptions),
|
|
|
|
|
(p, m) => new FormattedNode(TextFormatting.Quote, Parse(p.Slice(m.Groups[1]))));
|
|
|
|
|
|
|
|
|
|
// Repeatedly capture any character until the end of the line
|
|
|
|
|
// This one is tricky as it ends up producing multiple separate captures which need to be joined
|
|
|
|
|
private static readonly IMatcher<Node> RepeatedSingleLineQuoteNodeMatcher = new RegexMatcher<Node>(
|
|
|
|
|
private static readonly IMatcher<MarkdownNode> RepeatedSingleLineQuoteNodeMatcher = new RegexMatcher<MarkdownNode>(
|
|
|
|
|
new Regex("(?:^>\\s(.+\n?)){2,}", DefaultRegexOptions),
|
|
|
|
|
(p, m) =>
|
|
|
|
|
{
|
|
|
|
@ -74,7 +74,7 @@ namespace DiscordChatExporter.Core.Markdown
|
|
|
|
|
|
|
|
|
|
// Capture any character until the end of the input
|
|
|
|
|
// Opening 'greater than' characters must be followed by whitespace
|
|
|
|
|
private static readonly IMatcher<Node> MultiLineQuoteNodeMatcher = new RegexMatcher<Node>(
|
|
|
|
|
private static readonly IMatcher<MarkdownNode> MultiLineQuoteNodeMatcher = new RegexMatcher<MarkdownNode>(
|
|
|
|
|
new Regex("^>>>\\s(.+)", DefaultRegexOptions | RegexOptions.Singleline),
|
|
|
|
|
(p, m) => new FormattedNode(TextFormatting.Quote, Parse(p.Slice(m.Groups[1]))));
|
|
|
|
|
|
|
|
|
@ -82,41 +82,42 @@ namespace DiscordChatExporter.Core.Markdown
|
|
|
|
|
|
|
|
|
|
// Capture any character except backtick until a backtick
|
|
|
|
|
// Blank lines at the beginning and end of content are trimmed
|
|
|
|
|
private static readonly IMatcher<Node> InlineCodeBlockNodeMatcher = new RegexMatcher<Node>(
|
|
|
|
|
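// There can be either one or two backticks, but an equal number on both sides
// NOTE(review): the pattern used by this matcher only matches single backticks - confirm whether double backticks are handled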
|
|
|
|
|
private static readonly IMatcher<MarkdownNode> InlineCodeBlockNodeMatcher = new RegexMatcher<MarkdownNode>(
|
|
|
|
|
new Regex("`([^`]+)`", DefaultRegexOptions | RegexOptions.Singleline),
|
|
|
|
|
m => new InlineCodeBlockNode(m.Groups[1].Value.Trim('\r', '\n')));
|
|
|
|
|
|
|
|
|
|
// Capture language identifier and then any character until the earliest triple backtick
|
|
|
|
|
// Language identifier is one word immediately after opening backticks, followed immediately by newline
|
|
|
|
|
// Blank lines at the beginning and end of content are trimmed
|
|
|
|
|
private static readonly IMatcher<Node> MultiLineCodeBlockNodeMatcher = new RegexMatcher<Node>(
|
|
|
|
|
private static readonly IMatcher<MarkdownNode> MultiLineCodeBlockNodeMatcher = new RegexMatcher<MarkdownNode>(
|
|
|
|
|
new Regex("```(?:(\\w*)\\n)?(.+?)```", DefaultRegexOptions | RegexOptions.Singleline),
|
|
|
|
|
m => new MultiLineCodeBlockNode(m.Groups[1].Value, m.Groups[2].Value.Trim('\r', '\n')));
|
|
|
|
|
|
|
|
|
|
/* Mentions */
|
|
|
|
|
|
|
|
|
|
// Capture @everyone
|
|
|
|
|
private static readonly IMatcher<Node> EveryoneMentionNodeMatcher = new StringMatcher<Node>(
|
|
|
|
|
private static readonly IMatcher<MarkdownNode> EveryoneMentionNodeMatcher = new StringMatcher<MarkdownNode>(
|
|
|
|
|
"@everyone",
|
|
|
|
|
p => new MentionNode("everyone", MentionType.Meta));
|
|
|
|
|
|
|
|
|
|
// Capture @here
|
|
|
|
|
private static readonly IMatcher<Node> HereMentionNodeMatcher = new StringMatcher<Node>(
|
|
|
|
|
private static readonly IMatcher<MarkdownNode> HereMentionNodeMatcher = new StringMatcher<MarkdownNode>(
|
|
|
|
|
"@here",
|
|
|
|
|
p => new MentionNode("here", MentionType.Meta));
|
|
|
|
|
|
|
|
|
|
// Capture <@123456> or <@!123456>
|
|
|
|
|
private static readonly IMatcher<Node> UserMentionNodeMatcher = new RegexMatcher<Node>(
|
|
|
|
|
private static readonly IMatcher<MarkdownNode> UserMentionNodeMatcher = new RegexMatcher<MarkdownNode>(
|
|
|
|
|
new Regex("<@!?(\\d+)>", DefaultRegexOptions),
|
|
|
|
|
m => new MentionNode(m.Groups[1].Value, MentionType.User));
|
|
|
|
|
|
|
|
|
|
// Capture <#123456>
|
|
|
|
|
private static readonly IMatcher<Node> ChannelMentionNodeMatcher = new RegexMatcher<Node>(
|
|
|
|
|
private static readonly IMatcher<MarkdownNode> ChannelMentionNodeMatcher = new RegexMatcher<MarkdownNode>(
|
|
|
|
|
new Regex("<#(\\d+)>", DefaultRegexOptions),
|
|
|
|
|
m => new MentionNode(m.Groups[1].Value, MentionType.Channel));
|
|
|
|
|
|
|
|
|
|
// Capture <@&123456>
|
|
|
|
|
private static readonly IMatcher<Node> RoleMentionNodeMatcher = new RegexMatcher<Node>(
|
|
|
|
|
private static readonly IMatcher<MarkdownNode> RoleMentionNodeMatcher = new RegexMatcher<MarkdownNode>(
|
|
|
|
|
new Regex("<@&(\\d+)>", DefaultRegexOptions),
|
|
|
|
|
m => new MentionNode(m.Groups[1].Value, MentionType.Role));
|
|
|
|
|
|
|
|
|
@ -127,29 +128,29 @@ namespace DiscordChatExporter.Core.Markdown
|
|
|
|
|
// ... or surrogate pair
|
|
|
|
|
// ... or digit followed by enclosing mark
|
|
|
|
|
// (this does not match all emojis in Discord but it's reasonably accurate)
|
|
|
|
|
private static readonly IMatcher<Node> StandardEmojiNodeMatcher = new RegexMatcher<Node>(
|
|
|
|
|
private static readonly IMatcher<MarkdownNode> StandardEmojiNodeMatcher = new RegexMatcher<MarkdownNode>(
|
|
|
|
|
new Regex("((?:[\\uD83C][\\uDDE6-\\uDDFF]){2}|[\\u2600-\\u26FF]|\\p{Cs}{2}|\\d\\p{Me})", DefaultRegexOptions),
|
|
|
|
|
m => new EmojiNode(m.Groups[1].Value));
|
|
|
|
|
|
|
|
|
|
// Capture <:lul:123456> or <a:lul:123456>
|
|
|
|
|
private static readonly IMatcher<Node> CustomEmojiNodeMatcher = new RegexMatcher<Node>(
|
|
|
|
|
private static readonly IMatcher<MarkdownNode> CustomEmojiNodeMatcher = new RegexMatcher<MarkdownNode>(
|
|
|
|
|
new Regex("<(a)?:(.+?):(\\d+?)>", DefaultRegexOptions),
|
|
|
|
|
m => new EmojiNode(m.Groups[3].Value, m.Groups[2].Value, !string.IsNullOrWhiteSpace(m.Groups[1].Value)));
|
|
|
|
|
|
|
|
|
|
/* Links */
|
|
|
|
|
|
|
|
|
|
// Capture [title](link)
|
|
|
|
|
private static readonly IMatcher<Node> TitledLinkNodeMatcher = new RegexMatcher<Node>(
|
|
|
|
|
private static readonly IMatcher<MarkdownNode> TitledLinkNodeMatcher = new RegexMatcher<MarkdownNode>(
|
|
|
|
|
new Regex("\\[(.+?)\\]\\((.+?)\\)", DefaultRegexOptions),
|
|
|
|
|
m => new LinkNode(m.Groups[2].Value, m.Groups[1].Value));
|
|
|
|
|
|
|
|
|
|
// Capture any non-whitespace characters after http:// or https://, ending on the last character that is not punctuation (. , : ; " ') or whitespace
|
|
|
|
|
private static readonly IMatcher<Node> AutoLinkNodeMatcher = new RegexMatcher<Node>(
|
|
|
|
|
private static readonly IMatcher<MarkdownNode> AutoLinkNodeMatcher = new RegexMatcher<MarkdownNode>(
|
|
|
|
|
new Regex("(https?://\\S*[^\\.,:;\"\'\\s])", DefaultRegexOptions),
|
|
|
|
|
m => new LinkNode(m.Groups[1].Value));
|
|
|
|
|
|
|
|
|
|
// Same as auto link but also surrounded by angle brackets
|
|
|
|
|
private static readonly IMatcher<Node> HiddenLinkNodeMatcher = new RegexMatcher<Node>(
|
|
|
|
|
private static readonly IMatcher<MarkdownNode> HiddenLinkNodeMatcher = new RegexMatcher<MarkdownNode>(
|
|
|
|
|
new Regex("<(https?://\\S*[^\\.,:;\"\'\\s])>", DefaultRegexOptions),
|
|
|
|
|
m => new LinkNode(m.Groups[1].Value));
|
|
|
|
|
|
|
|
|
@ -157,31 +158,31 @@ namespace DiscordChatExporter.Core.Markdown
|
|
|
|
|
|
|
|
|
|
// Capture the shrug emoticon
|
|
|
|
|
// This escapes it from matching for formatting
|
|
|
|
|
private static readonly IMatcher<Node> ShrugTextNodeMatcher = new StringMatcher<Node>(
|
|
|
|
|
private static readonly IMatcher<MarkdownNode> ShrugTextNodeMatcher = new StringMatcher<MarkdownNode>(
|
|
|
|
|
@"¯\_(ツ)_/¯",
|
|
|
|
|
p => new TextNode(p.ToString()));
|
|
|
|
|
|
|
|
|
|
// Capture some specific emojis that don't get rendered
|
|
|
|
|
// This escapes it from matching for emoji
|
|
|
|
|
private static readonly IMatcher<Node> IgnoredEmojiTextNodeMatcher = new RegexMatcher<Node>(
|
|
|
|
|
private static readonly IMatcher<MarkdownNode> IgnoredEmojiTextNodeMatcher = new RegexMatcher<MarkdownNode>(
|
|
|
|
|
new Regex("(\\u26A7|\\u2640|\\u2642|\\u2695|\\u267E|\\u00A9|\\u00AE|\\u2122)", DefaultRegexOptions),
|
|
|
|
|
m => new TextNode(m.Groups[1].Value));
|
|
|
|
|
|
|
|
|
|
// Capture any "symbol/other" character or surrogate pair preceded by a backslash
|
|
|
|
|
// This escapes it from matching for emoji
|
|
|
|
|
private static readonly IMatcher<Node> EscapedSymbolTextNodeMatcher = new RegexMatcher<Node>(
|
|
|
|
|
private static readonly IMatcher<MarkdownNode> EscapedSymbolTextNodeMatcher = new RegexMatcher<MarkdownNode>(
|
|
|
|
|
new Regex("\\\\(\\p{So}|\\p{Cs}{2})", DefaultRegexOptions),
|
|
|
|
|
m => new TextNode(m.Groups[1].Value));
|
|
|
|
|
|
|
|
|
|
// Capture any character that is neither whitespace nor a Latin alphanumeric, preceded by a backslash
|
|
|
|
|
// This escapes it from matching for formatting or other tokens
|
|
|
|
|
private static readonly IMatcher<Node> EscapedCharacterTextNodeMatcher = new RegexMatcher<Node>(
|
|
|
|
|
private static readonly IMatcher<MarkdownNode> EscapedCharacterTextNodeMatcher = new RegexMatcher<MarkdownNode>(
|
|
|
|
|
new Regex("\\\\([^a-zA-Z0-9\\s])", DefaultRegexOptions),
|
|
|
|
|
m => new TextNode(m.Groups[1].Value));
|
|
|
|
|
|
|
|
|
|
// Combine all matchers into one
|
|
|
|
|
// Matchers that have similar patterns are ordered from most specific to least specific
|
|
|
|
|
private static readonly IMatcher<Node> AggregateNodeMatcher = new AggregateMatcher<Node>(
|
|
|
|
|
private static readonly IMatcher<MarkdownNode> AggregateNodeMatcher = new AggregateMatcher<MarkdownNode>(
|
|
|
|
|
// Escaped text
|
|
|
|
|
ShrugTextNodeMatcher,
|
|
|
|
|
IgnoredEmojiTextNodeMatcher,
|
|
|
|
@ -223,7 +224,7 @@ namespace DiscordChatExporter.Core.Markdown
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
// Minimal set of matchers for non-multimedia formats (e.g. plain text)
|
|
|
|
|
private static readonly IMatcher<Node> MinimalAggregateNodeMatcher = new AggregateMatcher<Node>(
|
|
|
|
|
private static readonly IMatcher<MarkdownNode> MinimalAggregateNodeMatcher = new AggregateMatcher<MarkdownNode>(
|
|
|
|
|
// Mentions
|
|
|
|
|
EveryoneMentionNodeMatcher,
|
|
|
|
|
HereMentionNodeMatcher,
|
|
|
|
@ -235,15 +236,21 @@ namespace DiscordChatExporter.Core.Markdown
|
|
|
|
|
CustomEmojiNodeMatcher
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
private static IReadOnlyList<Node> Parse(StringPart stringPart, IMatcher<Node> matcher) =>
|
|
|
|
|
matcher.MatchAll(stringPart, p => new TextNode(p.ToString())).Select(r => r.Value).ToArray();
|
|
|
|
|
private static IReadOnlyList<MarkdownNode> Parse(StringPart stringPart, IMatcher<MarkdownNode> matcher) =>
|
|
|
|
|
matcher
|
|
|
|
|
.MatchAll(stringPart, p => new TextNode(p.ToString()))
|
|
|
|
|
.Select(r => r.Value)
|
|
|
|
|
.ToArray();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private static IReadOnlyList<Node> Parse(StringPart stringPart) => Parse(stringPart, AggregateNodeMatcher);
|
|
|
|
|
internal static partial class MarkdownParser
|
|
|
|
|
{
|
|
|
|
|
private static IReadOnlyList<MarkdownNode> Parse(StringPart stringPart) => Parse(stringPart, AggregateNodeMatcher);
|
|
|
|
|
|
|
|
|
|
private static IReadOnlyList<Node> ParseMinimal(StringPart stringPart) => Parse(stringPart, MinimalAggregateNodeMatcher);
|
|
|
|
|
private static IReadOnlyList<MarkdownNode> ParseMinimal(StringPart stringPart) => Parse(stringPart, MinimalAggregateNodeMatcher);
|
|
|
|
|
|
|
|
|
|
public static IReadOnlyList<Node> Parse(string input) => Parse(new StringPart(input));
|
|
|
|
|
public static IReadOnlyList<MarkdownNode> Parse(string input) => Parse(new StringPart(input));
|
|
|
|
|
|
|
|
|
|
public static IReadOnlyList<Node> ParseMinimal(string input) => ParseMinimal(new StringPart(input));
|
|
|
|
|
public static IReadOnlyList<MarkdownNode> ParseMinimal(string input) => ParseMinimal(new StringPart(input));
|
|
|
|
|
}
|
|
|
|
|
}
|