From 0cf6281c1950c48fdc3f08d2cbfefd1c1a37e199 Mon Sep 17 00:00:00 2001 From: Tyrrrz <1935960+Tyrrrz@users.noreply.github.com> Date: Mon, 30 Jan 2023 07:25:32 +0200 Subject: [PATCH] Clean up the markdown parser --- .../Markdown/Parsing/MarkdownParser.cs | 216 ++++++++++-------- 1 file changed, 127 insertions(+), 89 deletions(-) diff --git a/DiscordChatExporter.Core/Markdown/Parsing/MarkdownParser.cs b/DiscordChatExporter.Core/Markdown/Parsing/MarkdownParser.cs index 610aa89..612443f 100644 --- a/DiscordChatExporter.Core/Markdown/Parsing/MarkdownParser.cs +++ b/DiscordChatExporter.Core/Markdown/Parsing/MarkdownParser.cs @@ -5,6 +5,7 @@ using System.Linq; using System.Text.RegularExpressions; using DiscordChatExporter.Core.Discord; using DiscordChatExporter.Core.Utils; +using DiscordChatExporter.Core.Utils.Extensions; namespace DiscordChatExporter.Core.Markdown.Parsing; @@ -16,169 +17,206 @@ internal static partial class MarkdownParser { private const RegexOptions DefaultRegexOptions = RegexOptions.Compiled | + RegexOptions.IgnorePatternWhitespace | RegexOptions.CultureInvariant | RegexOptions.Multiline; /* Formatting */ - // Capture any character until the earliest double asterisk not followed by an asterisk private static readonly IMatcher BoldFormattingNodeMatcher = new RegexMatcher( - new Regex("\\*\\*(.+?)\\*\\*(?!\\*)", DefaultRegexOptions | RegexOptions.Singleline), + // Capture any character until the earliest double asterisk not followed by an asterisk + new Regex(@"\*\*(.+?)\*\*(?!\*)", DefaultRegexOptions | RegexOptions.Singleline), (s, m) => new FormattingNode(FormattingKind.Bold, Parse(s.Relocate(m.Groups[1]))) ); - // Capture any character until the earliest single asterisk not preceded or followed by an asterisk - // Opening asterisk must not be followed by whitespace - // Closing asterisk must not be preceded by whitespace private static readonly IMatcher ItalicFormattingNodeMatcher = new RegexMatcher( - new Regex("\\*(?!\\s)(.+?)(?<!\\s|\\*)\\*(?!\\*)", DefaultRegexOptions | RegexOptions.Singleline), + // Capture any character until the earliest single asterisk not preceded or followed by an asterisk + // Opening asterisk must not be followed by whitespace + // Closing asterisk must not be preceded by whitespace + new Regex(@"\*(?!\s)(.+?)(?<!\s|\*)\*(?!\*)", DefaultRegexOptions | RegexOptions.Singleline), (s, m) => 
new FormattingNode(FormattingKind.Italic, Parse(s.Relocate(m.Groups[1]))) ); - // Capture any character until the earliest triple asterisk not followed by an asterisk private static readonly IMatcher ItalicBoldFormattingNodeMatcher = new RegexMatcher( - new Regex("\\*(\\*\\*.+?\\*\\*)\\*(?!\\*)", DefaultRegexOptions | RegexOptions.Singleline), + // Capture any character until the earliest triple asterisk not followed by an asterisk + new Regex(@"\*(\*\*.+?\*\*)\*(?!\*)", DefaultRegexOptions | RegexOptions.Singleline), (s, m) => new FormattingNode(FormattingKind.Italic, Parse(s.Relocate(m.Groups[1]), BoldFormattingNodeMatcher)) ); - // Capture any character except underscore until an underscore - // Closing underscore must not be followed by a word character private static readonly IMatcher ItalicAltFormattingNodeMatcher = new RegexMatcher( - new Regex("_([^_]+)_(?!\\w)", DefaultRegexOptions | RegexOptions.Singleline), + // Capture any character except underscore until an underscore + // Closing underscore must not be followed by a word character + new Regex(@"_([^_]+)_(?!\w)", DefaultRegexOptions | RegexOptions.Singleline), (s, m) => new FormattingNode(FormattingKind.Italic, Parse(s.Relocate(m.Groups[1]))) ); - // Capture any character until the earliest double underscore not followed by an underscore private static readonly IMatcher UnderlineFormattingNodeMatcher = new RegexMatcher( - new Regex("__(.+?)__(?!_)", DefaultRegexOptions | RegexOptions.Singleline), + // Capture any character until the earliest double underscore not followed by an underscore + new Regex(@"__(.+?)__(?!_)", DefaultRegexOptions | RegexOptions.Singleline), (s, m) => new FormattingNode(FormattingKind.Underline, Parse(s.Relocate(m.Groups[1]))) ); - // Capture any character until the earliest triple underscore not followed by an underscore private static readonly IMatcher ItalicUnderlineFormattingNodeMatcher = new RegexMatcher( - new Regex("_(__.+?__)_(?!_)", DefaultRegexOptions | 
RegexOptions.Singleline), - (s, m) => new FormattingNode(FormattingKind.Italic, - Parse(s.Relocate(m.Groups[1]), UnderlineFormattingNodeMatcher)) + // Capture any character until the earliest triple underscore not followed by an underscore + new Regex(@"_(__.+?__)_(?!_)", DefaultRegexOptions | RegexOptions.Singleline), + (s, m) => new FormattingNode( + FormattingKind.Italic, + Parse(s.Relocate(m.Groups[1]), UnderlineFormattingNodeMatcher) + ) ); - // Capture any character until the earliest double tilde private static readonly IMatcher StrikethroughFormattingNodeMatcher = new RegexMatcher( - new Regex("~~(.+?)~~", DefaultRegexOptions | RegexOptions.Singleline), + // Capture any character until the earliest double tilde + new Regex(@"~~(.+?)~~", DefaultRegexOptions | RegexOptions.Singleline), (s, m) => new FormattingNode(FormattingKind.Strikethrough, Parse(s.Relocate(m.Groups[1]))) ); - // Capture any character until the earliest double pipe private static readonly IMatcher SpoilerFormattingNodeMatcher = new RegexMatcher( - new Regex("\\|\\|(.+?)\\|\\|", DefaultRegexOptions | RegexOptions.Singleline), + // Capture any character until the earliest double pipe + new Regex(@"\|\|(.+?)\|\|", DefaultRegexOptions | RegexOptions.Singleline), (s, m) => new FormattingNode(FormattingKind.Spoiler, Parse(s.Relocate(m.Groups[1]))) ); - // Capture any character until the end of the line - // Opening 'greater than' character must be followed by whitespace - // Text content is optional private static readonly IMatcher SingleLineQuoteNodeMatcher = new RegexMatcher( - new Regex("^>\\s(.*\n?)", DefaultRegexOptions), + // Capture any character until the end of the line + // Opening 'greater than' character must be followed by whitespace + // Text content is optional + new Regex(@"^>\s(.*\n?)", DefaultRegexOptions), (s, m) => new FormattingNode(FormattingKind.Quote, Parse(s.Relocate(m.Groups[1]))) ); - // Repeatedly capture any character until the end of the line - // This one is tricky 
as it ends up producing multiple separate captures which need to be joined private static readonly IMatcher RepeatedSingleLineQuoteNodeMatcher = new RegexMatcher( - new Regex("(?:^>\\s(.*\n?)){2,}", DefaultRegexOptions), - (_, m) => - { - var content = string.Concat(m.Groups[1].Captures.Select(c => c.Value)); - return new FormattingNode(FormattingKind.Quote, Parse(content)); - } + // Repeatedly capture any character until the end of the line + // This one is tricky as it ends up producing multiple separate captures which need to be joined + new Regex(@"(?:^>\s(.*\n?)){2,}", DefaultRegexOptions), + (_, m) => new FormattingNode( + FormattingKind.Quote, + Parse( + // Combine all captures into a single string + string.Concat(m.Groups[1].Captures.Select(c => c.Value)) + ) + ) ); - // Capture any character until the end of the input - // Opening 'greater than' characters must be followed by whitespace private static readonly IMatcher MultiLineQuoteNodeMatcher = new RegexMatcher( - new Regex("^>>>\\s(.+)", DefaultRegexOptions | RegexOptions.Singleline), + // Capture any character until the end of the input + // Opening 'greater than' characters must be followed by whitespace + new Regex(@"^>>>\s(.+)", DefaultRegexOptions | RegexOptions.Singleline), (s, m) => new FormattingNode(FormattingKind.Quote, Parse(s.Relocate(m.Groups[1]))) ); /* Code blocks */ - // Capture any character except backtick until a backtick - // Blank lines at the beginning and end of content are trimmed - // There can be either one or two backticks, but equal number on both sides private static readonly IMatcher InlineCodeBlockNodeMatcher = new RegexMatcher( - new Regex("(`{1,2})([^`]+)\\1", DefaultRegexOptions | RegexOptions.Singleline), + // Capture any character except backtick until a backtick + // Blank lines at the beginning and end of content are trimmed + // There can be either one or two backticks, but equal number on both sides + new Regex(@"(`{1,2})([^`]+)\1", DefaultRegexOptions | 
RegexOptions.Singleline), (_, m) => new InlineCodeBlockNode(m.Groups[2].Value.Trim('\r', '\n')) ); - // Capture language identifier and then any character until the earliest triple backtick - // Language identifier is one word immediately after opening backticks, followed immediately by newline - // Blank lines at the beginning and end of content are trimmed private static readonly IMatcher MultiLineCodeBlockNodeMatcher = new RegexMatcher( - new Regex("```(?:(\\w*)\\n)?(.+?)```", DefaultRegexOptions | RegexOptions.Singleline), + // Capture language identifier and then any character until the earliest triple backtick + // Language identifier is one word immediately after opening backticks, followed immediately by newline + // Blank lines at the beginning and end of content are trimmed + new Regex(@"```(?:(\w*)\n)?(.+?)```", DefaultRegexOptions | RegexOptions.Singleline), (_, m) => new MultiLineCodeBlockNode(m.Groups[1].Value, m.Groups[2].Value.Trim('\r', '\n')) ); /* Mentions */ - // Capture @everyone private static readonly IMatcher EveryoneMentionNodeMatcher = new StringMatcher( "@everyone", _ => new MentionNode(null, MentionKind.Everyone) ); - // Capture @here private static readonly IMatcher HereMentionNodeMatcher = new StringMatcher( "@here", _ => new MentionNode(null, MentionKind.Here) ); - // Capture <@123456> or <@!123456> private static readonly IMatcher UserMentionNodeMatcher = new RegexMatcher( - new Regex("<@!?(\\d+)>", DefaultRegexOptions), + // Capture <@123456> or <@!123456> + new Regex(@"<@!?(\d+)>", DefaultRegexOptions), (_, m) => new MentionNode(Snowflake.TryParse(m.Groups[1].Value), MentionKind.User) ); - // Capture <#123456> private static readonly IMatcher ChannelMentionNodeMatcher = new RegexMatcher( - new Regex("<#!?(\\d+)>", DefaultRegexOptions), + // Capture <#123456> + new Regex(@"<\#!?(\d+)>", DefaultRegexOptions), (_, m) => new MentionNode(Snowflake.TryParse(m.Groups[1].Value), MentionKind.Channel) ); - // Capture <@&123456> private 
static readonly IMatcher RoleMentionNodeMatcher = new RegexMatcher( - new Regex("<@&(\\d+)>", DefaultRegexOptions), + // Capture <@&123456> + new Regex(@"<@&(\d+)>", DefaultRegexOptions), (_, m) => new MentionNode(Snowflake.TryParse(m.Groups[1].Value), MentionKind.Role) ); /* Emoji */ - // Capture any country flag emoji (two regional indicator surrogate pairs) - // ... or "miscellaneous symbol" character - // ... or surrogate pair - // ... or digit followed by enclosing mark - // (this does not match all emoji in Discord but it's reasonably accurate enough) private static readonly IMatcher StandardEmojiNodeMatcher = new RegexMatcher( - new Regex("((?:[\\uD83C][\\uDDE6-\\uDDFF]){2}|[\\u2600-\\u2604\\u260E\\u2611\\u2614-\\u2615\\u2618\\u261D\\u2620\\u2622-\\u2623\\u2626\\u262A\\u262E-\\u262F\\u2638-\\u263A\\u2640\\u2642\\u2648-\\u2653\\u265F-\\u2660\\u2663\\u2665-\\u2666\\u2668\\u267B\\u267E-\\u267F\\u2692-\\u2697\\u2699\\u269B-\\u269C\\u26A0-\\u26A1\\u26A7\\u26AA-\\u26AB\\u26B0-\\u26B1\\u26BD-\\u26BE\\u26C4-\\u26C5\\u26C8\\u26CE-\\u26CF\\u26D1\\u26D3-\\u26D4\\u26E9-\\u26EA\\u26F0-\\u26F5\\u26F7-\\u26FA\\u26FD]|\\p{Cs}{2}|\\d\\p{Me})", DefaultRegexOptions), + new Regex( + """ + ( + # Country flag emoji (two regional indicator surrogate pairs) + (?:\uD83C[\uDDE6-\uDDFF]){2}| + # Digit emoji (digit followed by enclosing mark) + \d\p{Me}| + # Surrogate pair + \p{Cs}{2}| + # Miscellaneous characters + [ + \u2600-\u2604 + \u260E\u2611 + \u2614-\u2615 + \u2618\u261D\u2620 + \u2622-\u2623 + \u2626\u262A + \u262E-\u262F + \u2638-\u263A + \u2640\u2642 + \u2648-\u2653 + \u265F-\u2660 + \u2663 + \u2665-\u2666 + \u2668\u267B + \u267E-\u267F + \u2692-\u2697 + \u2699 + \u269B-\u269C + \u26A0-\u26A1 + \u26A7 + \u26AA-\u26AB + \u26B0-\u26B1 + \u26BD-\u26BE + \u26C4-\u26C5 + \u26C8 + \u26CE-\u26CF + \u26D1 + \u26D3-\u26D4 + \u26E9-\u26EA + \u26F0-\u26F5 + \u26F7-\u26FA + \u26FD + ] + ) + """, DefaultRegexOptions), (_, m) => new EmojiNode(m.Groups[1].Value) ); - // Capture :thinking: 
(but only for known emoji codes) private static readonly IMatcher CodedStandardEmojiNodeMatcher = new RegexMatcher( - new Regex(":([\\w_]+):", DefaultRegexOptions), - (_, m) => - { - var name = EmojiIndex.TryGetName(m.Groups[1].Value); - return !string.IsNullOrWhiteSpace(name) - ? new EmojiNode(name) - : null; - } + // Capture :thinking: for known emoji codes + new Regex(@":([\w_]+):", DefaultRegexOptions), + (_, m) => EmojiIndex.TryGetName(m.Groups[1].Value)?.Pipe(n => new EmojiNode(n)) ); - // Capture <:lul:123456> or <a:lul:123456> private static readonly IMatcher CustomEmojiNodeMatcher = new RegexMatcher( - new Regex("<(a)?:(.+?):(\\d+?)>", DefaultRegexOptions), + // Capture <:lul:123456> or <a:lul:123456> + new Regex(@"<(a)?:(.+?):(\d+?)>", DefaultRegexOptions), (_, m) => new EmojiNode( Snowflake.TryParse(m.Groups[3].Value), m.Groups[2].Value, @@ -188,60 +226,60 @@ internal static partial class MarkdownParser /* Links */ - // Capture [title](link) private static readonly IMatcher TitledLinkNodeMatcher = new RegexMatcher( - new Regex("\\[(.+?)\\]\\((.+?)\\)", DefaultRegexOptions), + // Capture [title](link) + new Regex(@"\[(.+?)\]\((.+?)\)", DefaultRegexOptions), (s, m) => new LinkNode(m.Groups[2].Value, Parse(s.Relocate(m.Groups[1]))) ); - // Capture any non-whitespace character after http:// or https:// - // until the last punctuation character or whitespace private static readonly IMatcher AutoLinkNodeMatcher = new RegexMatcher( - new Regex("(https?://\\S*[^\\.,:;\"\'\\s])", DefaultRegexOptions), + // Capture any non-whitespace character after http:// or https:// + // until the last punctuation character or whitespace + new Regex(@"(https?://\S*[^\.,:;""'\s])", DefaultRegexOptions), (_, m) => new LinkNode(m.Groups[1].Value) ); - // Same as auto link but also surrounded by angular 
brackets + new Regex(@"<(https?://\S*[^\.,:;""'\s])>", DefaultRegexOptions), (_, m) => new LinkNode(m.Groups[1].Value) ); /* Text */ - // Capture the shrug kaomoji - // This escapes it from matching for formatting private static readonly IMatcher ShrugTextNodeMatcher = new StringMatcher( + // Capture the shrug kaomoji + // This escapes it from matching for formatting @"¯\_(ツ)_/¯", s => new TextNode(s.ToString()) ); - // Capture some specific emoji that don't get rendered - // This escapes it from matching for emoji private static readonly IMatcher IgnoredEmojiTextNodeMatcher = new RegexMatcher( - new Regex("(\\u26A7|\\u2640|\\u2642|\\u2695|\\u267E|\\u00A9|\\u00AE|\\u2122)", DefaultRegexOptions), + // Capture some specific emoji that don't get rendered + // This escapes it from matching for emoji + new Regex(@"([\u26A7\u2640\u2642\u2695\u267E\u00A9\u00AE\u2122])", DefaultRegexOptions), (_, m) => new TextNode(m.Groups[1].Value) ); - // Capture any "symbol/other" character or surrogate pair preceded by a backslash - // This escapes it from matching for emoji private static readonly IMatcher EscapedSymbolTextNodeMatcher = new RegexMatcher( - new Regex("\\\\(\\p{So}|\\p{Cs}{2})", DefaultRegexOptions), + // Capture any "symbol/other" character or surrogate pair preceded by a backslash + // This escapes it from matching for emoji + new Regex(@"\\(\p{So}|\p{Cs}{2})", DefaultRegexOptions), (_, m) => new TextNode(m.Groups[1].Value) ); - // Capture any non-whitespace, non latin alphanumeric character preceded by a backslash - // This escapes it from matching for formatting or other tokens private static readonly IMatcher EscapedCharacterTextNodeMatcher = new RegexMatcher( - new Regex("\\\\([^a-zA-Z0-9\\s])", DefaultRegexOptions), + // Capture any non-whitespace, non latin alphanumeric character preceded by a backslash + // This escapes it from matching for formatting or other tokens + new Regex(@"\\([^a-zA-Z0-9\s])", DefaultRegexOptions), (_, m) => new 
TextNode(m.Groups[1].Value) ); /* Misc */ - // Capture <t:12345678> or <t:12345678:R> private static readonly IMatcher UnixTimestampNodeMatcher = new RegexMatcher( - new Regex("<t:(\\d+)(?::\\w)?>", DefaultRegexOptions), + // Capture <t:12345678> or <t:12345678:R> + new Regex(@"<t:(\d+)(?::\w)?>", DefaultRegexOptions), (_, m) => { // TODO: support formatting parameters