From 6fa7cbe568e2cca8f67f9eae67425d0f01893f28 Mon Sep 17 00:00:00 2001 From: Tyrrrz Date: Sat, 24 Jul 2021 17:20:28 +0300 Subject: [PATCH] Allow link nodes to have markdown children instead of just text Closes #640 --- .../MarkdownVisitors/HtmlMarkdownVisitor.cs | 51 ++++++------- .../Markdown/FormattedNode.cs | 21 ------ .../{TextFormatting.cs => FormattingKind.cs} | 2 +- .../Markdown/FormattingNode.cs | 29 ++++++++ DiscordChatExporter.Core/Markdown/LinkNode.cs | 21 ++++-- .../Markdown/Parsing/MarkdownParser.cs | 74 ++++++++++--------- .../Markdown/Parsing/MarkdownVisitor.cs | 15 ++-- 7 files changed, 117 insertions(+), 96 deletions(-) delete mode 100644 DiscordChatExporter.Core/Markdown/FormattedNode.cs rename DiscordChatExporter.Core/Markdown/{TextFormatting.cs => FormattingKind.cs} (82%) create mode 100644 DiscordChatExporter.Core/Markdown/FormattingNode.cs diff --git a/DiscordChatExporter.Core/Exporting/Writers/MarkdownVisitors/HtmlMarkdownVisitor.cs b/DiscordChatExporter.Core/Exporting/Writers/MarkdownVisitors/HtmlMarkdownVisitor.cs index 7a1c98e..ed999b9 100644 --- a/DiscordChatExporter.Core/Exporting/Writers/MarkdownVisitors/HtmlMarkdownVisitor.cs +++ b/DiscordChatExporter.Core/Exporting/Writers/MarkdownVisitors/HtmlMarkdownVisitor.cs @@ -30,22 +30,22 @@ namespace DiscordChatExporter.Core.Exporting.Writers.MarkdownVisitors return base.VisitText(text); } - protected override MarkdownNode VisitFormatted(FormattedNode formatted) + protected override MarkdownNode VisitFormatting(FormattingNode formatting) { - var (tagOpen, tagClose) = formatted.Formatting switch + var (tagOpen, tagClose) = formatting.Kind switch { - TextFormatting.Bold => ("", ""), - TextFormatting.Italic => ("", ""), - TextFormatting.Underline => ("", ""), - TextFormatting.Strikethrough => ("", ""), - TextFormatting.Spoiler => ( + FormattingKind.Bold => ("", ""), + FormattingKind.Italic => ("", ""), + FormattingKind.Underline => ("", ""), + FormattingKind.Strikethrough => ("", ""), + FormattingKind.Spoiler => ( "", ""), - TextFormatting.Quote => ("
", "
"), - _ => throw new ArgumentOutOfRangeException(nameof(formatted.Formatting)) + FormattingKind.Quote => ("
", "
"), + _ => throw new ArgumentOutOfRangeException(nameof(formatting.Kind)) }; _buffer.Append(tagOpen); - var result = base.VisitFormatted(formatted); + var result = base.VisitFormatting(formatting); _buffer.Append(tagClose); return result; @@ -77,25 +77,22 @@ namespace DiscordChatExporter.Core.Exporting.Writers.MarkdownVisitors protected override MarkdownNode VisitLink(LinkNode link) { - // Extract message ID if the link points to a Discord message - var linkedMessageId = Regex.Match(link.Url, "^https?://(?:discord|discordapp).com/channels/.*?/(\\d+)/?$").Groups[1].Value; + // Try to extract message ID if the link refers to a Discord message + var linkedMessageId = Regex.Match( + link.Url, + "^https?://(?:discord|discordapp).com/channels/.*?/(\\d+)/?$" + ).Groups[1].Value; - if (!string.IsNullOrWhiteSpace(linkedMessageId)) - { - _buffer - .Append($"") - .Append(HtmlEncode(link.Title)) - .Append(""); - } - else - { - _buffer - .Append($"") - .Append(HtmlEncode(link.Title)) - .Append(""); - } + _buffer.Append( + !string.IsNullOrWhiteSpace(linkedMessageId) + ? $"" + : $"" + ); - return base.VisitLink(link); + var result = base.VisitLink(link); + _buffer.Append(""); + + return result; } protected override MarkdownNode VisitEmoji(EmojiNode emoji) diff --git a/DiscordChatExporter.Core/Markdown/FormattedNode.cs b/DiscordChatExporter.Core/Markdown/FormattedNode.cs deleted file mode 100644 index 46781e4..0000000 --- a/DiscordChatExporter.Core/Markdown/FormattedNode.cs +++ /dev/null @@ -1,21 +0,0 @@ -using System.Collections.Generic; -using System.Diagnostics.CodeAnalysis; - -namespace DiscordChatExporter.Core.Markdown -{ - internal class FormattedNode : MarkdownNode - { - public TextFormatting Formatting { get; } - - public IReadOnlyList Children { get; } - - public FormattedNode(TextFormatting formatting, IReadOnlyList children) - { - Formatting = formatting; - Children = children; - } - - [ExcludeFromCodeCoverage] - public override string ToString() => $"<{Formatting}> (+{Children.Count})"; - } -} \ No newline at end of file diff --git a/DiscordChatExporter.Core/Markdown/TextFormatting.cs b/DiscordChatExporter.Core/Markdown/FormattingKind.cs similarity index 82% rename from DiscordChatExporter.Core/Markdown/TextFormatting.cs rename to DiscordChatExporter.Core/Markdown/FormattingKind.cs index f6f30b7..6859b4f 100644 --- a/DiscordChatExporter.Core/Markdown/TextFormatting.cs +++ b/DiscordChatExporter.Core/Markdown/FormattingKind.cs @@ -1,6 +1,6 @@ namespace DiscordChatExporter.Core.Markdown { - internal enum TextFormatting + internal enum FormattingKind { Bold, Italic, diff --git a/DiscordChatExporter.Core/Markdown/FormattingNode.cs b/DiscordChatExporter.Core/Markdown/FormattingNode.cs new file mode 100644 index 0000000..03bb110 --- /dev/null +++ b/DiscordChatExporter.Core/Markdown/FormattingNode.cs @@ -0,0 +1,29 @@ +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.Linq; + +namespace DiscordChatExporter.Core.Markdown +{ + internal class FormattingNode : MarkdownNode + { + public FormattingKind Kind { get; } + + public IReadOnlyList Children { get; } + + public FormattingNode(FormattingKind kind, IReadOnlyList children) + { + Kind = kind; + Children = children; + } + + [ExcludeFromCodeCoverage] + public override string ToString() + { + var childrenFormatted = Children.Count == 1 + ? Children.Single().ToString() + : "+" + Children.Count; + + return $"<{Kind}> ({childrenFormatted})"; + } + } +} \ No newline at end of file diff --git a/DiscordChatExporter.Core/Markdown/LinkNode.cs b/DiscordChatExporter.Core/Markdown/LinkNode.cs index c680238..df4aefd 100644 --- a/DiscordChatExporter.Core/Markdown/LinkNode.cs +++ b/DiscordChatExporter.Core/Markdown/LinkNode.cs @@ -1,4 +1,6 @@ -using System.Diagnostics.CodeAnalysis; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.Linq; namespace DiscordChatExporter.Core.Markdown { @@ -6,20 +8,27 @@ namespace DiscordChatExporter.Core.Markdown { public string Url { get; } - public string Title { get; } + public IReadOnlyList Children { get; } - public LinkNode(string url, string title) + public LinkNode(string url, IReadOnlyList children) { Url = url; - Title = title; + Children = children; } public LinkNode(string url) - : this(url, url) + : this(url, new[] {new TextNode(url)}) { } [ExcludeFromCodeCoverage] - public override string ToString() => $" {Title}"; + public override string ToString() + { + var childrenFormatted = Children.Count == 1 + ? Children.Single().ToString() + : "+" + Children.Count; + + return $" ({childrenFormatted})"; + } } } \ No newline at end of file diff --git a/DiscordChatExporter.Core/Markdown/Parsing/MarkdownParser.cs b/DiscordChatExporter.Core/Markdown/Parsing/MarkdownParser.cs index f18dda5..e69f709 100644 --- a/DiscordChatExporter.Core/Markdown/Parsing/MarkdownParser.cs +++ b/DiscordChatExporter.Core/Markdown/Parsing/MarkdownParser.cs @@ -7,7 +7,10 @@ using DiscordChatExporter.Core.Utils; namespace DiscordChatExporter.Core.Markdown.Parsing { - // The following parsing logic is meant to replicate Discord's markdown grammar as close as possible + // Discord does NOT use a recursive-descent parser for markdown which becomes evident in some + // scenarios, like when multiple formatting nodes are nested together. + // To replicate Discord's behavior, we're employing a special parser that uses a set of regular + // expressions that are executed sequentially in a first-match-first-serve manner. internal static partial class MarkdownParser { private const RegexOptions DefaultRegexOptions = @@ -18,64 +21,64 @@ namespace DiscordChatExporter.Core.Markdown.Parsing /* Formatting */ // Capture any character until the earliest double asterisk not followed by an asterisk - private static readonly IMatcher BoldFormattedNodeMatcher = new RegexMatcher( + private static readonly IMatcher BoldFormattingNodeMatcher = new RegexMatcher( new Regex("\\*\\*(.+?)\\*\\*(?!\\*)", DefaultRegexOptions | RegexOptions.Singleline), - (p, m) => new FormattedNode(TextFormatting.Bold, Parse(p.Slice(m.Groups[1]))) + (p, m) => new FormattingNode(FormattingKind.Bold, Parse(p.Slice(m.Groups[1]))) ); // Capture any character until the earliest single asterisk not preceded or followed by an asterisk // Opening asterisk must not be followed by whitespace // Closing asterisk must not be preceded by whitespace - private static readonly IMatcher ItalicFormattedNodeMatcher = new RegexMatcher( + private static readonly IMatcher ItalicFormattingNodeMatcher = new RegexMatcher( new Regex("\\*(?!\\s)(.+?)(? new FormattedNode(TextFormatting.Italic, Parse(p.Slice(m.Groups[1]))) + (p, m) => new FormattingNode(FormattingKind.Italic, Parse(p.Slice(m.Groups[1]))) ); // Capture any character until the earliest triple asterisk not followed by an asterisk - private static readonly IMatcher ItalicBoldFormattedNodeMatcher = new RegexMatcher( + private static readonly IMatcher ItalicBoldFormattingNodeMatcher = new RegexMatcher( new Regex("\\*(\\*\\*.+?\\*\\*)\\*(?!\\*)", DefaultRegexOptions | RegexOptions.Singleline), - (p, m) => new FormattedNode(TextFormatting.Italic, Parse(p.Slice(m.Groups[1]), BoldFormattedNodeMatcher)) + (p, m) => new FormattingNode(FormattingKind.Italic, Parse(p.Slice(m.Groups[1]), BoldFormattingNodeMatcher)) ); // Capture any character except underscore until an underscore // Closing underscore must not be followed by a word character - private static readonly IMatcher ItalicAltFormattedNodeMatcher = new RegexMatcher( + private static readonly IMatcher ItalicAltFormattingNodeMatcher = new RegexMatcher( new Regex("_([^_]+)_(?!\\w)", DefaultRegexOptions | RegexOptions.Singleline), - (p, m) => new FormattedNode(TextFormatting.Italic, Parse(p.Slice(m.Groups[1]))) + (p, m) => new FormattingNode(FormattingKind.Italic, Parse(p.Slice(m.Groups[1]))) ); // Capture any character until the earliest double underscore not followed by an underscore - private static readonly IMatcher UnderlineFormattedNodeMatcher = new RegexMatcher( + private static readonly IMatcher UnderlineFormattingNodeMatcher = new RegexMatcher( new Regex("__(.+?)__(?!_)", DefaultRegexOptions | RegexOptions.Singleline), - (p, m) => new FormattedNode(TextFormatting.Underline, Parse(p.Slice(m.Groups[1]))) + (p, m) => new FormattingNode(FormattingKind.Underline, Parse(p.Slice(m.Groups[1]))) ); // Capture any character until the earliest triple underscore not followed by an underscore - private static readonly IMatcher ItalicUnderlineFormattedNodeMatcher = + private static readonly IMatcher ItalicUnderlineFormattingNodeMatcher = new RegexMatcher( new Regex("_(__.+?__)_(?!_)", DefaultRegexOptions | RegexOptions.Singleline), - (p, m) => new FormattedNode(TextFormatting.Italic, - Parse(p.Slice(m.Groups[1]), UnderlineFormattedNodeMatcher)) + (p, m) => new FormattingNode(FormattingKind.Italic, + Parse(p.Slice(m.Groups[1]), UnderlineFormattingNodeMatcher)) ); // Capture any character until the earliest double tilde - private static readonly IMatcher StrikethroughFormattedNodeMatcher = + private static readonly IMatcher StrikethroughFormattingNodeMatcher = new RegexMatcher( new Regex("~~(.+?)~~", DefaultRegexOptions | RegexOptions.Singleline), - (p, m) => new FormattedNode(TextFormatting.Strikethrough, Parse(p.Slice(m.Groups[1]))) + (p, m) => new FormattingNode(FormattingKind.Strikethrough, Parse(p.Slice(m.Groups[1]))) ); // Capture any character until the earliest double pipe - private static readonly IMatcher SpoilerFormattedNodeMatcher = new RegexMatcher( + private static readonly IMatcher SpoilerFormattingNodeMatcher = new RegexMatcher( new Regex("\\|\\|(.+?)\\|\\|", DefaultRegexOptions | RegexOptions.Singleline), - (p, m) => new FormattedNode(TextFormatting.Spoiler, Parse(p.Slice(m.Groups[1]))) + (p, m) => new FormattingNode(FormattingKind.Spoiler, Parse(p.Slice(m.Groups[1]))) ); // Capture any character until the end of the line // Opening 'greater than' character must be followed by whitespace private static readonly IMatcher SingleLineQuoteNodeMatcher = new RegexMatcher( new Regex("^>\\s(.+\n?)", DefaultRegexOptions), - (p, m) => new FormattedNode(TextFormatting.Quote, Parse(p.Slice(m.Groups[1]))) + (p, m) => new FormattingNode(FormattingKind.Quote, Parse(p.Slice(m.Groups[1]))) ); // Repeatedly capture any character until the end of the line @@ -86,7 +89,7 @@ namespace DiscordChatExporter.Core.Markdown.Parsing (_, m) => { var content = string.Concat(m.Groups[1].Captures.Select(c => c.Value)); - return new FormattedNode(TextFormatting.Quote, Parse(content)); + return new FormattingNode(FormattingKind.Quote, Parse(content)); } ); @@ -94,7 +97,7 @@ namespace DiscordChatExporter.Core.Markdown.Parsing // Opening 'greater than' characters must be followed by whitespace private static readonly IMatcher MultiLineQuoteNodeMatcher = new RegexMatcher( new Regex("^>>>\\s(.+)", DefaultRegexOptions | RegexOptions.Singleline), - (p, m) => new FormattedNode(TextFormatting.Quote, Parse(p.Slice(m.Groups[1]))) + (p, m) => new FormattingNode(FormattingKind.Quote, Parse(p.Slice(m.Groups[1]))) ); /* Code blocks */ @@ -147,7 +150,7 @@ namespace DiscordChatExporter.Core.Markdown.Parsing (_, m) => new MentionNode(m.Groups[1].Value, MentionKind.Role) ); - /* Emojis */ + /* Emoji */ // Capture any country flag emoji (two regional indicator surrogate pairs) // ... or "miscellaneous symbol" character @@ -165,7 +168,7 @@ namespace DiscordChatExporter.Core.Markdown.Parsing (_, m) => { var name = EmojiIndex.TryGetName(m.Groups[1].Value); - return name is not null + return !string.IsNullOrWhiteSpace(name) ? new EmojiNode(name) : null; } @@ -182,10 +185,11 @@ namespace DiscordChatExporter.Core.Markdown.Parsing // Capture [title](link) private static readonly IMatcher TitledLinkNodeMatcher = new RegexMatcher( new Regex("\\[(.+?)\\]\\((.+?)\\)", DefaultRegexOptions), - (_, m) => new LinkNode(m.Groups[2].Value, m.Groups[1].Value) + (p, m) => new LinkNode(m.Groups[2].Value, Parse(p.Slice(m.Groups[1]))) ); - // Capture any non-whitespace character after http:// or https:// until the last punctuation character or whitespace + // Capture any non-whitespace character after http:// or https:// + // until the last punctuation character or whitespace private static readonly IMatcher AutoLinkNodeMatcher = new RegexMatcher( new Regex("(https?://\\S*[^\\.,:;\"\'\\s])", DefaultRegexOptions), (_, m) => new LinkNode(m.Groups[1].Value) @@ -199,14 +203,14 @@ namespace DiscordChatExporter.Core.Markdown.Parsing /* Text */ - // Capture the shrug emoticon + // Capture the shrug kaomoji // This escapes it from matching for formatting private static readonly IMatcher ShrugTextNodeMatcher = new StringMatcher( @"¯\_(ツ)_/¯", p => new TextNode(p.ToString()) ); - // Capture some specific emojis that don't get rendered + // Capture some specific emoji that don't get rendered // This escapes it from matching for emoji private static readonly IMatcher IgnoredEmojiTextNodeMatcher = new RegexMatcher( new Regex("(\\u26A7|\\u2640|\\u2642|\\u2695|\\u267E|\\u00A9|\\u00AE|\\u2122)", DefaultRegexOptions), @@ -257,14 +261,14 @@ namespace DiscordChatExporter.Core.Markdown.Parsing EscapedCharacterTextNodeMatcher, // Formatting - ItalicBoldFormattedNodeMatcher, - ItalicUnderlineFormattedNodeMatcher, - BoldFormattedNodeMatcher, - ItalicFormattedNodeMatcher, - UnderlineFormattedNodeMatcher, - ItalicAltFormattedNodeMatcher, - StrikethroughFormattedNodeMatcher, - SpoilerFormattedNodeMatcher, + ItalicBoldFormattingNodeMatcher, + ItalicUnderlineFormattingNodeMatcher, + BoldFormattingNodeMatcher, + ItalicFormattingNodeMatcher, + UnderlineFormattingNodeMatcher, + ItalicAltFormattingNodeMatcher, + StrikethroughFormattingNodeMatcher, + SpoilerFormattingNodeMatcher, MultiLineQuoteNodeMatcher, RepeatedSingleLineQuoteNodeMatcher, SingleLineQuoteNodeMatcher, diff --git a/DiscordChatExporter.Core/Markdown/Parsing/MarkdownVisitor.cs b/DiscordChatExporter.Core/Markdown/Parsing/MarkdownVisitor.cs index 5b98f7e..0f8154d 100644 --- a/DiscordChatExporter.Core/Markdown/Parsing/MarkdownVisitor.cs +++ b/DiscordChatExporter.Core/Markdown/Parsing/MarkdownVisitor.cs @@ -8,10 +8,10 @@ namespace DiscordChatExporter.Core.Markdown.Parsing protected virtual MarkdownNode VisitText(TextNode text) => text; - protected virtual MarkdownNode VisitFormatted(FormattedNode formatted) + protected virtual MarkdownNode VisitFormatting(FormattingNode formatting) { - Visit(formatted.Children); - return formatted; + Visit(formatting.Children); + return formatting; } protected virtual MarkdownNode VisitInlineCodeBlock(InlineCodeBlockNode inlineCodeBlock) => @@ -20,8 +20,11 @@ namespace DiscordChatExporter.Core.Markdown.Parsing protected virtual MarkdownNode VisitMultiLineCodeBlock(MultiLineCodeBlockNode multiLineCodeBlock) => multiLineCodeBlock; - protected virtual MarkdownNode VisitLink(LinkNode link) => - link; + protected virtual MarkdownNode VisitLink(LinkNode link) + { + Visit(link.Children); + return link; + } protected virtual MarkdownNode VisitEmoji(EmojiNode emoji) => emoji; @@ -35,7 +38,7 @@ namespace DiscordChatExporter.Core.Markdown.Parsing public MarkdownNode Visit(MarkdownNode node) => node switch { TextNode text => VisitText(text), - FormattedNode formatted => VisitFormatted(formatted), + FormattingNode formatting => VisitFormatting(formatting), InlineCodeBlockNode inlineCodeBlock => VisitInlineCodeBlock(inlineCodeBlock), MultiLineCodeBlockNode multiLineCodeBlock => VisitMultiLineCodeBlock(multiLineCodeBlock), LinkNode link => VisitLink(link),