Allow link nodes to have markdown children instead of just text

Closes #640
4 years ago · 6fa7cbe568
parent aae3790a5f
commit 6fa7cbe568
7 changed files with 117 additions and 96 deletions
--- a/DiscordChatExporter.Core/Exporting/Writers/MarkdownVisitors/HtmlMarkdownVisitor.cs
+++ b/DiscordChatExporter.Core/Exporting/Writers/MarkdownVisitors/HtmlMarkdownVisitor.cs
@@ -30,22 +30,22 @@ namespace DiscordChatExporter.Core.Exporting.Writers.MarkdownVisitors
            return base.VisitText(text);
        }

-        protected override MarkdownNode VisitFormatted(FormattedNode formatted)
+        protected override MarkdownNode VisitFormatting(FormattingNode formatting)
        {
-            var (tagOpen, tagClose) = formatted.Formatting switch
+            var (tagOpen, tagClose) = formatting.Kind switch
            {
-                TextFormatting.Bold => ("<strong>", "</strong>"),
-                TextFormatting.Italic => ("<em>", "</em>"),
-                TextFormatting.Underline => ("<u>", "</u>"),
-                TextFormatting.Strikethrough => ("<s>", "</s>"),
-                TextFormatting.Spoiler => (
+                FormattingKind.Bold => ("<strong>", "</strong>"),
+                FormattingKind.Italic => ("<em>", "</em>"),
+                FormattingKind.Underline => ("<u>", "</u>"),
+                FormattingKind.Strikethrough => ("<s>", "</s>"),
+                FormattingKind.Spoiler => (
                    "<span class=\"spoiler-text spoiler-text--hidden\" onclick=\"showSpoiler(event, this)\">", "</span>"),
-                TextFormatting.Quote => ("<div class=\"quote\">", "</div>"),
-                _ => throw new ArgumentOutOfRangeException(nameof(formatted.Formatting))
+                FormattingKind.Quote => ("<div class=\"quote\">", "</div>"),
+                _ => throw new ArgumentOutOfRangeException(nameof(formatting.Kind))
            };

            _buffer.Append(tagOpen);
-            var result = base.VisitFormatted(formatted);
+            var result = base.VisitFormatting(formatting);
            _buffer.Append(tagClose);

            return result;
@@ -77,25 +77,22 @@ namespace DiscordChatExporter.Core.Exporting.Writers.MarkdownVisitors

        protected override MarkdownNode VisitLink(LinkNode link)
        {
-            // Extract message ID if the link points to a Discord message
-            var linkedMessageId = Regex.Match(link.Url, "^https?://(?:discord|discordapp).com/channels/.*?/(\\d+)/?$").Groups[1].Value;
+            // Try to extract message ID if the link refers to a Discord message
+            var linkedMessageId = Regex.Match(
+                link.Url,
+                "^https?://(?:discord|discordapp).com/channels/.*?/(\\d+)/?$"
+            ).Groups[1].Value;

-            if (!string.IsNullOrWhiteSpace(linkedMessageId))
-            {
-                _buffer
-                    .Append($"<a href=\"{Uri.EscapeUriString(link.Url)}\" onclick=\"scrollToMessage(event, '{linkedMessageId}')\">")
-                    .Append(HtmlEncode(link.Title))
-                    .Append("</a>");
-            }
-            else
-            {
-                _buffer
-                    .Append($"<a href=\"{Uri.EscapeUriString(link.Url)}\">")
-                    .Append(HtmlEncode(link.Title))
-                    .Append("</a>");
-            }
+            _buffer.Append(
+                !string.IsNullOrWhiteSpace(linkedMessageId)
+                    ? $"<a href=\"{Uri.EscapeUriString(link.Url)}\" onclick=\"scrollToMessage(event, '{linkedMessageId}')\">"
+                    : $"<a href=\"{Uri.EscapeUriString(link.Url)}\">"
+            );

-            return base.VisitLink(link);
+            var result = base.VisitLink(link);
+            _buffer.Append("</a>");
+
+            return result;
        }

        protected override MarkdownNode VisitEmoji(EmojiNode emoji)
--- a/DiscordChatExporter.Core/Markdown/FormattedNode.cs
+++ b/DiscordChatExporter.Core/Markdown/FormattedNode.cs
@@ -1,21 +0,0 @@
-using System.Collections.Generic;
-using System.Diagnostics.CodeAnalysis;
-
-namespace DiscordChatExporter.Core.Markdown
-{
-    internal class FormattedNode : MarkdownNode
-    {
-        public TextFormatting Formatting { get; }
-
-        public IReadOnlyList<MarkdownNode> Children { get; }
-
-        public FormattedNode(TextFormatting formatting, IReadOnlyList<MarkdownNode> children)
-        {
-            Formatting = formatting;
-            Children = children;
-        }
-
-        [ExcludeFromCodeCoverage]
-        public override string ToString() => $"<{Formatting}> (+{Children.Count})";
-    }
-}
--- a/DiscordChatExporter.Core/Markdown/FormattingKind.cs
+++ b/DiscordChatExporter.Core/Markdown/FormattingKind.cs
@@ -1,6 +1,6 @@
 namespace DiscordChatExporter.Core.Markdown
 {
-    internal enum TextFormatting
+    internal enum FormattingKind
    {
        Bold,
        Italic,
--- a/DiscordChatExporter.Core/Markdown/FormattingNode.cs
+++ b/DiscordChatExporter.Core/Markdown/FormattingNode.cs
@@ -0,0 +1,29 @@
+using System.Collections.Generic;
+using System.Diagnostics.CodeAnalysis;
+using System.Linq;
+
+namespace DiscordChatExporter.Core.Markdown
+{
+    internal class FormattingNode : MarkdownNode
+    {
+        public FormattingKind Kind { get; }
+
+        public IReadOnlyList<MarkdownNode> Children { get; }
+
+        public FormattingNode(FormattingKind kind, IReadOnlyList<MarkdownNode> children)
+        {
+            Kind = kind;
+            Children = children;
+        }
+
+        [ExcludeFromCodeCoverage]
+        public override string ToString()
+        {
+            var childrenFormatted = Children.Count == 1
+                ? Children.Single().ToString()
+                : "+" + Children.Count;
+
+            return $"<{Kind}> ({childrenFormatted})";
+        }
+    }
+}
--- a/DiscordChatExporter.Core/Markdown/LinkNode.cs
+++ b/DiscordChatExporter.Core/Markdown/LinkNode.cs
@@ -1,4 +1,6 @@
-using System.Diagnostics.CodeAnalysis;
+using System.Collections.Generic;
+using System.Diagnostics.CodeAnalysis;
+using System.Linq;

 namespace DiscordChatExporter.Core.Markdown
 {
@@ -6,20 +8,27 @@ namespace DiscordChatExporter.Core.Markdown
    {
        public string Url { get; }

-        public string Title { get; }
+        public IReadOnlyList<MarkdownNode> Children { get; }

-        public LinkNode(string url, string title)
+        public LinkNode(string url, IReadOnlyList<MarkdownNode> children)
        {
            Url = url;
-            Title = title;
+            Children = children;
        }

        public LinkNode(string url)
-            : this(url, url)
+            : this(url, new[] {new TextNode(url)})
        {
        }

        [ExcludeFromCodeCoverage]
-        public override string ToString() => $"<Link> {Title}";
+        public override string ToString()
+        {
+            var childrenFormatted = Children.Count == 1
+                ? Children.Single().ToString()
+                : "+" + Children.Count;
+
+            return $"<Link> ({childrenFormatted})";
+        }
    }
 }
--- a/DiscordChatExporter.Core/Markdown/Parsing/MarkdownParser.cs
+++ b/DiscordChatExporter.Core/Markdown/Parsing/MarkdownParser.cs
@@ -7,7 +7,10 @@ using DiscordChatExporter.Core.Utils;

 namespace DiscordChatExporter.Core.Markdown.Parsing
 {
-    // The following parsing logic is meant to replicate Discord's markdown grammar as close as possible
+    // Discord does NOT use a recursive-descent parser for markdown, which becomes evident in some
+    // scenarios, such as when multiple formatting nodes are nested together.
+    // To replicate Discord's behavior, we employ a special parser that runs a set of regular
+    // expressions sequentially, on a first-match-wins basis.
    internal static partial class MarkdownParser
    {
        private const RegexOptions DefaultRegexOptions =
@@ -18,64 +21,64 @@ namespace DiscordChatExporter.Core.Markdown.Parsing
        /* Formatting */

        // Capture any character until the earliest double asterisk not followed by an asterisk
-        private static readonly IMatcher<MarkdownNode> BoldFormattedNodeMatcher = new RegexMatcher<MarkdownNode>(
+        private static readonly IMatcher<MarkdownNode> BoldFormattingNodeMatcher = new RegexMatcher<MarkdownNode>(
            new Regex("\\*\\*(.+?)\\*\\*(?!\\*)", DefaultRegexOptions | RegexOptions.Singleline),
-            (p, m) => new FormattedNode(TextFormatting.Bold, Parse(p.Slice(m.Groups[1])))
+            (p, m) => new FormattingNode(FormattingKind.Bold, Parse(p.Slice(m.Groups[1])))
        );

        // Capture any character until the earliest single asterisk not preceded or followed by an asterisk
        // Opening asterisk must not be followed by whitespace
        // Closing asterisk must not be preceded by whitespace
-        private static readonly IMatcher<MarkdownNode> ItalicFormattedNodeMatcher = new RegexMatcher<MarkdownNode>(
+        private static readonly IMatcher<MarkdownNode> ItalicFormattingNodeMatcher = new RegexMatcher<MarkdownNode>(
            new Regex("\\*(?!\\s)(.+?)(?<!\\s|\\*)\\*(?!\\*)", DefaultRegexOptions | RegexOptions.Singleline),
-            (p, m) => new FormattedNode(TextFormatting.Italic, Parse(p.Slice(m.Groups[1])))
+            (p, m) => new FormattingNode(FormattingKind.Italic, Parse(p.Slice(m.Groups[1])))
        );

        // Capture any character until the earliest triple asterisk not followed by an asterisk
-        private static readonly IMatcher<MarkdownNode> ItalicBoldFormattedNodeMatcher = new RegexMatcher<MarkdownNode>(
+        private static readonly IMatcher<MarkdownNode> ItalicBoldFormattingNodeMatcher = new RegexMatcher<MarkdownNode>(
            new Regex("\\*(\\*\\*.+?\\*\\*)\\*(?!\\*)", DefaultRegexOptions | RegexOptions.Singleline),
-            (p, m) => new FormattedNode(TextFormatting.Italic, Parse(p.Slice(m.Groups[1]), BoldFormattedNodeMatcher))
+            (p, m) => new FormattingNode(FormattingKind.Italic, Parse(p.Slice(m.Groups[1]), BoldFormattingNodeMatcher))
        );

        // Capture any character except underscore until an underscore
        // Closing underscore must not be followed by a word character
-        private static readonly IMatcher<MarkdownNode> ItalicAltFormattedNodeMatcher = new RegexMatcher<MarkdownNode>(
+        private static readonly IMatcher<MarkdownNode> ItalicAltFormattingNodeMatcher = new RegexMatcher<MarkdownNode>(
            new Regex("_([^_]+)_(?!\\w)", DefaultRegexOptions | RegexOptions.Singleline),
-            (p, m) => new FormattedNode(TextFormatting.Italic, Parse(p.Slice(m.Groups[1])))
+            (p, m) => new FormattingNode(FormattingKind.Italic, Parse(p.Slice(m.Groups[1])))
        );

        // Capture any character until the earliest double underscore not followed by an underscore
-        private static readonly IMatcher<MarkdownNode> UnderlineFormattedNodeMatcher = new RegexMatcher<MarkdownNode>(
+        private static readonly IMatcher<MarkdownNode> UnderlineFormattingNodeMatcher = new RegexMatcher<MarkdownNode>(
            new Regex("__(.+?)__(?!_)", DefaultRegexOptions | RegexOptions.Singleline),
-            (p, m) => new FormattedNode(TextFormatting.Underline, Parse(p.Slice(m.Groups[1])))
+            (p, m) => new FormattingNode(FormattingKind.Underline, Parse(p.Slice(m.Groups[1])))
        );

        // Capture any character until the earliest triple underscore not followed by an underscore
-        private static readonly IMatcher<MarkdownNode> ItalicUnderlineFormattedNodeMatcher =
+        private static readonly IMatcher<MarkdownNode> ItalicUnderlineFormattingNodeMatcher =
            new RegexMatcher<MarkdownNode>(
                new Regex("_(__.+?__)_(?!_)", DefaultRegexOptions | RegexOptions.Singleline),
-                (p, m) => new FormattedNode(TextFormatting.Italic,
-                    Parse(p.Slice(m.Groups[1]), UnderlineFormattedNodeMatcher))
+                (p, m) => new FormattingNode(FormattingKind.Italic,
+                    Parse(p.Slice(m.Groups[1]), UnderlineFormattingNodeMatcher))
            );

        // Capture any character until the earliest double tilde
-        private static readonly IMatcher<MarkdownNode> StrikethroughFormattedNodeMatcher =
+        private static readonly IMatcher<MarkdownNode> StrikethroughFormattingNodeMatcher =
            new RegexMatcher<MarkdownNode>(
                new Regex("~~(.+?)~~", DefaultRegexOptions | RegexOptions.Singleline),
-                (p, m) => new FormattedNode(TextFormatting.Strikethrough, Parse(p.Slice(m.Groups[1])))
+                (p, m) => new FormattingNode(FormattingKind.Strikethrough, Parse(p.Slice(m.Groups[1])))
            );

        // Capture any character until the earliest double pipe
-        private static readonly IMatcher<MarkdownNode> SpoilerFormattedNodeMatcher = new RegexMatcher<MarkdownNode>(
+        private static readonly IMatcher<MarkdownNode> SpoilerFormattingNodeMatcher = new RegexMatcher<MarkdownNode>(
            new Regex("\\|\\|(.+?)\\|\\|", DefaultRegexOptions | RegexOptions.Singleline),
-            (p, m) => new FormattedNode(TextFormatting.Spoiler, Parse(p.Slice(m.Groups[1])))
+            (p, m) => new FormattingNode(FormattingKind.Spoiler, Parse(p.Slice(m.Groups[1])))
        );

        // Capture any character until the end of the line
        // Opening 'greater than' character must be followed by whitespace
        private static readonly IMatcher<MarkdownNode> SingleLineQuoteNodeMatcher = new RegexMatcher<MarkdownNode>(
            new Regex("^>\\s(.+\n?)", DefaultRegexOptions),
-            (p, m) => new FormattedNode(TextFormatting.Quote, Parse(p.Slice(m.Groups[1])))
+            (p, m) => new FormattingNode(FormattingKind.Quote, Parse(p.Slice(m.Groups[1])))
        );

        // Repeatedly capture any character until the end of the line
@@ -86,7 +89,7 @@ namespace DiscordChatExporter.Core.Markdown.Parsing
                (_, m) =>
                {
                    var content = string.Concat(m.Groups[1].Captures.Select(c => c.Value));
-                    return new FormattedNode(TextFormatting.Quote, Parse(content));
+                    return new FormattingNode(FormattingKind.Quote, Parse(content));
                }
            );

@@ -94,7 +97,7 @@ namespace DiscordChatExporter.Core.Markdown.Parsing
        // Opening 'greater than' characters must be followed by whitespace
        private static readonly IMatcher<MarkdownNode> MultiLineQuoteNodeMatcher = new RegexMatcher<MarkdownNode>(
            new Regex("^>>>\\s(.+)", DefaultRegexOptions | RegexOptions.Singleline),
-            (p, m) => new FormattedNode(TextFormatting.Quote, Parse(p.Slice(m.Groups[1])))
+            (p, m) => new FormattingNode(FormattingKind.Quote, Parse(p.Slice(m.Groups[1])))
        );

        /* Code blocks */
@@ -147,7 +150,7 @@ namespace DiscordChatExporter.Core.Markdown.Parsing
            (_, m) => new MentionNode(m.Groups[1].Value, MentionKind.Role)
        );

-        /* Emojis */
+        /* Emoji */

        // Capture any country flag emoji (two regional indicator surrogate pairs)
        // ... or "miscellaneous symbol" character
@@ -165,7 +168,7 @@ namespace DiscordChatExporter.Core.Markdown.Parsing
            (_, m) =>
            {
                var name = EmojiIndex.TryGetName(m.Groups[1].Value);
-                return name is not null
+                return !string.IsNullOrWhiteSpace(name)
                    ? new EmojiNode(name)
                    : null;
            }
@@ -182,10 +185,11 @@ namespace DiscordChatExporter.Core.Markdown.Parsing
        // Capture [title](link)
        private static readonly IMatcher<MarkdownNode> TitledLinkNodeMatcher = new RegexMatcher<MarkdownNode>(
            new Regex("\\[(.+?)\\]\\((.+?)\\)", DefaultRegexOptions),
-            (_, m) => new LinkNode(m.Groups[2].Value, m.Groups[1].Value)
+            (p, m) => new LinkNode(m.Groups[2].Value, Parse(p.Slice(m.Groups[1])))
        );

-        // Capture any non-whitespace character after http:// or https:// until the last punctuation character or whitespace
+        // Capture any non-whitespace character after http:// or https://
+        // until the last punctuation character or whitespace
        private static readonly IMatcher<MarkdownNode> AutoLinkNodeMatcher = new RegexMatcher<MarkdownNode>(
            new Regex("(https?://\\S*[^\\.,:;\"\'\\s])", DefaultRegexOptions),
            (_, m) => new LinkNode(m.Groups[1].Value)
@@ -199,14 +203,14 @@ namespace DiscordChatExporter.Core.Markdown.Parsing

        /* Text */

-        // Capture the shrug emoticon
+        // Capture the shrug kaomoji
        // This escapes it from matching for formatting
        private static readonly IMatcher<MarkdownNode> ShrugTextNodeMatcher = new StringMatcher<MarkdownNode>(
            @"¯\_(ツ)_/¯",
            p => new TextNode(p.ToString())
        );

-        // Capture some specific emojis that don't get rendered
+        // Capture some specific emoji that don't get rendered
        // This escapes it from matching for emoji
        private static readonly IMatcher<MarkdownNode> IgnoredEmojiTextNodeMatcher = new RegexMatcher<MarkdownNode>(
            new Regex("(\\u26A7|\\u2640|\\u2642|\\u2695|\\u267E|\\u00A9|\\u00AE|\\u2122)", DefaultRegexOptions),
@@ -257,14 +261,14 @@ namespace DiscordChatExporter.Core.Markdown.Parsing
            EscapedCharacterTextNodeMatcher,

            // Formatting
-            ItalicBoldFormattedNodeMatcher,
-            ItalicUnderlineFormattedNodeMatcher,
-            BoldFormattedNodeMatcher,
-            ItalicFormattedNodeMatcher,
-            UnderlineFormattedNodeMatcher,
-            ItalicAltFormattedNodeMatcher,
-            StrikethroughFormattedNodeMatcher,
-            SpoilerFormattedNodeMatcher,
+            ItalicBoldFormattingNodeMatcher,
+            ItalicUnderlineFormattingNodeMatcher,
+            BoldFormattingNodeMatcher,
+            ItalicFormattingNodeMatcher,
+            UnderlineFormattingNodeMatcher,
+            ItalicAltFormattingNodeMatcher,
+            StrikethroughFormattingNodeMatcher,
+            SpoilerFormattingNodeMatcher,
            MultiLineQuoteNodeMatcher,
            RepeatedSingleLineQuoteNodeMatcher,
            SingleLineQuoteNodeMatcher,
--- a/DiscordChatExporter.Core/Markdown/Parsing/MarkdownVisitor.cs
+++ b/DiscordChatExporter.Core/Markdown/Parsing/MarkdownVisitor.cs
@@ -8,10 +8,10 @@ namespace DiscordChatExporter.Core.Markdown.Parsing
        protected virtual MarkdownNode VisitText(TextNode text) =>
            text;

-        protected virtual MarkdownNode VisitFormatted(FormattedNode formatted)
+        protected virtual MarkdownNode VisitFormatting(FormattingNode formatting)
        {
-            Visit(formatted.Children);
-            return formatted;
+            Visit(formatting.Children);
+            return formatting;
        }

        protected virtual MarkdownNode VisitInlineCodeBlock(InlineCodeBlockNode inlineCodeBlock) =>
@@ -20,8 +20,11 @@ namespace DiscordChatExporter.Core.Markdown.Parsing
        protected virtual MarkdownNode VisitMultiLineCodeBlock(MultiLineCodeBlockNode multiLineCodeBlock) =>
            multiLineCodeBlock;

-        protected virtual MarkdownNode VisitLink(LinkNode link) =>
-            link;
+        protected virtual MarkdownNode VisitLink(LinkNode link)
+        {
+            Visit(link.Children);
+            return link;
+        }

        protected virtual MarkdownNode VisitEmoji(EmojiNode emoji) =>
            emoji;
@@ -35,7 +38,7 @@ namespace DiscordChatExporter.Core.Markdown.Parsing
        public MarkdownNode Visit(MarkdownNode node) => node switch
        {
            TextNode text => VisitText(text),
-            FormattedNode formatted => VisitFormatted(formatted),
+            FormattingNode formatting => VisitFormatting(formatting),
            InlineCodeBlockNode inlineCodeBlock => VisitInlineCodeBlock(inlineCodeBlock),
            MultiLineCodeBlockNode multiLineCodeBlock => VisitMultiLineCodeBlock(multiLineCodeBlock),
            LinkNode link => VisitLink(link),