Refactor `StringPart` into `StringSegment`

pull/802/head
Oleksii Holub 3 years ago
parent fc7191d74c
commit bd98f4cb6a

@ -16,7 +16,7 @@ internal class AggregateMatcher<T> : IMatcher<T>
{
}
public ParsedMatch<T>? TryMatch(StringPart stringPart)
public ParsedMatch<T>? TryMatch(StringSegment segment)
{
ParsedMatch<T>? earliestMatch = null;
@ -24,19 +24,19 @@ internal class AggregateMatcher<T> : IMatcher<T>
foreach (var matcher in _matchers)
{
// Try to match
var match = matcher.TryMatch(stringPart);
var match = matcher.TryMatch(segment);
// If there's no match - continue
if (match is null)
continue;
// If this match is earlier than previous earliest - replace
if (earliestMatch is null || match.StringPart.StartIndex < earliestMatch.StringPart.StartIndex)
if (earliestMatch is null || match.Segment.StartIndex < earliestMatch.Segment.StartIndex)
earliestMatch = match;
// If the earliest match starts at the very beginning - break,
// because it's impossible to find a match earlier than that
if (earliestMatch.StringPart.StartIndex == stringPart.StartIndex)
if (earliestMatch.Segment.StartIndex == segment.StartIndex)
break;
}

@ -5,44 +5,57 @@ namespace DiscordChatExporter.Core.Markdown.Parsing;
internal interface IMatcher<T>
{
ParsedMatch<T>? TryMatch(StringPart stringPart);
ParsedMatch<T>? TryMatch(StringSegment segment);
}
internal static class MatcherExtensions
{
public static IEnumerable<ParsedMatch<T>> MatchAll<T>(this IMatcher<T> matcher,
StringPart stringPart, Func<StringPart, T> transformFallback)
public static IEnumerable<ParsedMatch<T>> MatchAll<T>(
this IMatcher<T> matcher,
StringSegment segment,
Func<StringSegment, T> transformFallback)
{
// Loop through segments divided by individual matches
var currentIndex = stringPart.StartIndex;
while (currentIndex < stringPart.EndIndex)
var currentIndex = segment.StartIndex;
while (currentIndex < segment.EndIndex)
{
// Find a match within this segment
var match = matcher.TryMatch(stringPart.Slice(currentIndex, stringPart.EndIndex - currentIndex));
var match = matcher.TryMatch(
segment.Relocate(
currentIndex,
segment.EndIndex - currentIndex
)
);
// If there's no match - break
if (match is null)
break;
// If this match doesn't start immediately at current index - transform and yield fallback first
if (match.StringPart.StartIndex > currentIndex)
// If this match doesn't start immediately at the current position - transform and yield fallback first
if (match.Segment.StartIndex > currentIndex)
{
var fallbackPart = stringPart.Slice(currentIndex, match.StringPart.StartIndex - currentIndex);
yield return new ParsedMatch<T>(fallbackPart, transformFallback(fallbackPart));
var fallbackSegment = segment.Relocate(
currentIndex,
match.Segment.StartIndex - currentIndex
);
yield return new ParsedMatch<T>(fallbackSegment, transformFallback(fallbackSegment));
}
// Yield match
yield return match;
// Shift current index to the end of the match
currentIndex = match.StringPart.StartIndex + match.StringPart.Length;
currentIndex = match.Segment.StartIndex + match.Segment.Length;
}
// If EOL wasn't reached - transform and yield remaining part as fallback
if (currentIndex < stringPart.EndIndex)
// If EOL hasn't been reached - transform and yield remaining part as fallback
if (currentIndex < segment.EndIndex)
{
var fallbackPart = stringPart.Slice(currentIndex);
yield return new ParsedMatch<T>(fallbackPart, transformFallback(fallbackPart));
var fallbackSegment = segment.Relocate(
currentIndex,
segment.EndIndex - currentIndex
);
yield return new ParsedMatch<T>(fallbackSegment, transformFallback(fallbackSegment));
}
}
}

@ -23,7 +23,7 @@ internal static partial class MarkdownParser
// Capture any character until the earliest double asterisk not followed by an asterisk
private static readonly IMatcher<MarkdownNode> BoldFormattingNodeMatcher = new RegexMatcher<MarkdownNode>(
new Regex("\\*\\*(.+?)\\*\\*(?!\\*)", DefaultRegexOptions | RegexOptions.Singleline),
(p, m) => new FormattingNode(FormattingKind.Bold, Parse(p.Slice(m.Groups[1])))
(s, m) => new FormattingNode(FormattingKind.Bold, Parse(s.Relocate(m.Groups[1])))
);
// Capture any character until the earliest single asterisk not preceded or followed by an asterisk
@ -31,54 +31,54 @@ internal static partial class MarkdownParser
// Closing asterisk must not be preceded by whitespace
private static readonly IMatcher<MarkdownNode> ItalicFormattingNodeMatcher = new RegexMatcher<MarkdownNode>(
new Regex("\\*(?!\\s)(.+?)(?<!\\s|\\*)\\*(?!\\*)", DefaultRegexOptions | RegexOptions.Singleline),
(p, m) => new FormattingNode(FormattingKind.Italic, Parse(p.Slice(m.Groups[1])))
(s, m) => new FormattingNode(FormattingKind.Italic, Parse(s.Relocate(m.Groups[1])))
);
// Capture any character until the earliest triple asterisk not followed by an asterisk
private static readonly IMatcher<MarkdownNode> ItalicBoldFormattingNodeMatcher = new RegexMatcher<MarkdownNode>(
new Regex("\\*(\\*\\*.+?\\*\\*)\\*(?!\\*)", DefaultRegexOptions | RegexOptions.Singleline),
(p, m) => new FormattingNode(FormattingKind.Italic, Parse(p.Slice(m.Groups[1]), BoldFormattingNodeMatcher))
(s, m) => new FormattingNode(FormattingKind.Italic, Parse(s.Relocate(m.Groups[1]), BoldFormattingNodeMatcher))
);
// Capture any character except underscore until an underscore
// Closing underscore must not be followed by a word character
private static readonly IMatcher<MarkdownNode> ItalicAltFormattingNodeMatcher = new RegexMatcher<MarkdownNode>(
new Regex("_([^_]+)_(?!\\w)", DefaultRegexOptions | RegexOptions.Singleline),
(p, m) => new FormattingNode(FormattingKind.Italic, Parse(p.Slice(m.Groups[1])))
(s, m) => new FormattingNode(FormattingKind.Italic, Parse(s.Relocate(m.Groups[1])))
);
// Capture any character until the earliest double underscore not followed by an underscore
private static readonly IMatcher<MarkdownNode> UnderlineFormattingNodeMatcher = new RegexMatcher<MarkdownNode>(
new Regex("__(.+?)__(?!_)", DefaultRegexOptions | RegexOptions.Singleline),
(p, m) => new FormattingNode(FormattingKind.Underline, Parse(p.Slice(m.Groups[1])))
(s, m) => new FormattingNode(FormattingKind.Underline, Parse(s.Relocate(m.Groups[1])))
);
// Capture any character until the earliest triple underscore not followed by an underscore
private static readonly IMatcher<MarkdownNode> ItalicUnderlineFormattingNodeMatcher =
new RegexMatcher<MarkdownNode>(
new Regex("_(__.+?__)_(?!_)", DefaultRegexOptions | RegexOptions.Singleline),
(p, m) => new FormattingNode(FormattingKind.Italic,
Parse(p.Slice(m.Groups[1]), UnderlineFormattingNodeMatcher))
(s, m) => new FormattingNode(FormattingKind.Italic,
Parse(s.Relocate(m.Groups[1]), UnderlineFormattingNodeMatcher))
);
// Capture any character until the earliest double tilde
private static readonly IMatcher<MarkdownNode> StrikethroughFormattingNodeMatcher =
new RegexMatcher<MarkdownNode>(
new Regex("~~(.+?)~~", DefaultRegexOptions | RegexOptions.Singleline),
(p, m) => new FormattingNode(FormattingKind.Strikethrough, Parse(p.Slice(m.Groups[1])))
(s, m) => new FormattingNode(FormattingKind.Strikethrough, Parse(s.Relocate(m.Groups[1])))
);
// Capture any character until the earliest double pipe
private static readonly IMatcher<MarkdownNode> SpoilerFormattingNodeMatcher = new RegexMatcher<MarkdownNode>(
new Regex("\\|\\|(.+?)\\|\\|", DefaultRegexOptions | RegexOptions.Singleline),
(p, m) => new FormattingNode(FormattingKind.Spoiler, Parse(p.Slice(m.Groups[1])))
(s, m) => new FormattingNode(FormattingKind.Spoiler, Parse(s.Relocate(m.Groups[1])))
);
// Capture any character until the end of the line
// Opening 'greater than' character must be followed by whitespace
private static readonly IMatcher<MarkdownNode> SingleLineQuoteNodeMatcher = new RegexMatcher<MarkdownNode>(
new Regex("^>\\s(.+\n?)", DefaultRegexOptions),
(p, m) => new FormattingNode(FormattingKind.Quote, Parse(p.Slice(m.Groups[1])))
(s, m) => new FormattingNode(FormattingKind.Quote, Parse(s.Relocate(m.Groups[1])))
);
// Repeatedly capture any character until the end of the line
@ -97,7 +97,7 @@ internal static partial class MarkdownParser
// Opening 'greater than' characters must be followed by whitespace
private static readonly IMatcher<MarkdownNode> MultiLineQuoteNodeMatcher = new RegexMatcher<MarkdownNode>(
new Regex("^>>>\\s(.+)", DefaultRegexOptions | RegexOptions.Singleline),
(p, m) => new FormattingNode(FormattingKind.Quote, Parse(p.Slice(m.Groups[1])))
(s, m) => new FormattingNode(FormattingKind.Quote, Parse(s.Relocate(m.Groups[1])))
);
/* Code blocks */
@ -185,7 +185,7 @@ internal static partial class MarkdownParser
// Capture [title](link)
private static readonly IMatcher<MarkdownNode> TitledLinkNodeMatcher = new RegexMatcher<MarkdownNode>(
new Regex("\\[(.+?)\\]\\((.+?)\\)", DefaultRegexOptions),
(p, m) => new LinkNode(m.Groups[2].Value, Parse(p.Slice(m.Groups[1])))
(s, m) => new LinkNode(m.Groups[2].Value, Parse(s.Relocate(m.Groups[1])))
);
// Capture any non-whitespace character after http:// or https://
@ -207,7 +207,7 @@ internal static partial class MarkdownParser
// This escapes it from matching for formatting
private static readonly IMatcher<MarkdownNode> ShrugTextNodeMatcher = new StringMatcher<MarkdownNode>(
@"¯\_(ツ)_/¯",
p => new TextNode(p.ToString())
s => new TextNode(s.ToString())
);
// Capture some specific emoji that don't get rendered
@ -323,24 +323,24 @@ internal static partial class MarkdownParser
UnixTimestampNodeMatcher
);
private static IReadOnlyList<MarkdownNode> Parse(StringPart stringPart, IMatcher<MarkdownNode> matcher) =>
private static IReadOnlyList<MarkdownNode> Parse(StringSegment segment, IMatcher<MarkdownNode> matcher) =>
matcher
.MatchAll(stringPart, p => new TextNode(p.ToString()))
.MatchAll(segment, s => new TextNode(s.ToString()))
.Select(r => r.Value)
.ToArray();
}
internal static partial class MarkdownParser
{
private static IReadOnlyList<MarkdownNode> Parse(StringPart stringPart) =>
Parse(stringPart, AggregateNodeMatcher);
private static IReadOnlyList<MarkdownNode> Parse(StringSegment segment) =>
Parse(segment, AggregateNodeMatcher);
private static IReadOnlyList<MarkdownNode> ParseMinimal(StringPart stringPart) =>
Parse(stringPart, MinimalAggregateNodeMatcher);
private static IReadOnlyList<MarkdownNode> ParseMinimal(StringSegment segment) =>
Parse(segment, MinimalAggregateNodeMatcher);
public static IReadOnlyList<MarkdownNode> Parse(string input) =>
Parse(new StringPart(input));
Parse(new StringSegment(input));
public static IReadOnlyList<MarkdownNode> ParseMinimal(string input) =>
ParseMinimal(new StringPart(input));
ParseMinimal(new StringSegment(input));
}

@ -2,13 +2,13 @@
internal class ParsedMatch<T>
{
public StringPart StringPart { get; }
public StringSegment Segment { get; }
public T Value { get; }
public ParsedMatch(StringPart stringPart, T value)
public ParsedMatch(StringSegment segment, T value)
{
StringPart = stringPart;
Segment = segment;
Value = value;
}
}

@ -6,17 +6,17 @@ namespace DiscordChatExporter.Core.Markdown.Parsing;
internal class RegexMatcher<T> : IMatcher<T>
{
private readonly Regex _regex;
private readonly Func<StringPart, Match, T?> _transform;
private readonly Func<StringSegment, Match, T?> _transform;
public RegexMatcher(Regex regex, Func<StringPart, Match, T?> transform)
public RegexMatcher(Regex regex, Func<StringSegment, Match, T?> transform)
{
_regex = regex;
_transform = transform;
}
public ParsedMatch<T>? TryMatch(StringPart stringPart)
public ParsedMatch<T>? TryMatch(StringSegment segment)
{
var match = _regex.Match(stringPart.Target, stringPart.StartIndex, stringPart.Length);
var match = _regex.Match(segment.Source, segment.StartIndex, segment.Length);
if (!match.Success)
return null;
@ -25,14 +25,14 @@ internal class RegexMatcher<T> : IMatcher<T>
// Which is super weird because regex.Match(string, int) takes the whole input in context.
// So in order to properly account for ^/$ regex tokens, we need to make sure that
// the expression also matches on the bigger part of the input.
if (!_regex.IsMatch(stringPart.Target[..stringPart.EndIndex], stringPart.StartIndex))
if (!_regex.IsMatch(segment.Source[..segment.EndIndex], segment.StartIndex))
return null;
var stringPartMatch = stringPart.Slice(match.Index, match.Length);
var value = _transform(stringPartMatch, match);
var segmentMatch = segment.Relocate(match);
var value = _transform(segmentMatch, match);
return value is not null
? new ParsedMatch<T>(stringPartMatch, value)
? new ParsedMatch<T>(segmentMatch, value)
: null;
}
}

@ -6,31 +6,31 @@ internal class StringMatcher<T> : IMatcher<T>
{
private readonly string _needle;
private readonly StringComparison _comparison;
private readonly Func<StringPart, T?> _transform;
private readonly Func<StringSegment, T?> _transform;
public StringMatcher(string needle, StringComparison comparison, Func<StringPart, T?> transform)
public StringMatcher(string needle, StringComparison comparison, Func<StringSegment, T?> transform)
{
_needle = needle;
_comparison = comparison;
_transform = transform;
}
public StringMatcher(string needle, Func<StringPart, T> transform)
public StringMatcher(string needle, Func<StringSegment, T> transform)
: this(needle, StringComparison.Ordinal, transform)
{
}
public ParsedMatch<T>? TryMatch(StringPart stringPart)
public ParsedMatch<T>? TryMatch(StringSegment segment)
{
var index = stringPart.Target.IndexOf(_needle, stringPart.StartIndex, stringPart.Length, _comparison);
var index = segment.Source.IndexOf(_needle, segment.StartIndex, segment.Length, _comparison);
if (index < 0)
return null;
var stringPartMatch = stringPart.Slice(index, _needle.Length);
var value = _transform(stringPartMatch);
var segmentMatch = segment.Relocate(index, _needle.Length);
var value = _transform(segmentMatch);
return value is not null
? new ParsedMatch<T>(stringPartMatch, value)
? new ParsedMatch<T>(segmentMatch, value)
: null;
}
}

@ -1,21 +0,0 @@
using System.Text.RegularExpressions;
namespace DiscordChatExporter.Core.Markdown.Parsing;
/// <summary>
/// A window over a string, described by the string itself (<c>Target</c>),
/// the index at which the window begins (<c>StartIndex</c>) and the number
/// of characters it covers (<c>Length</c>).
/// </summary>
internal readonly record struct StringPart(string Target, int StartIndex, int Length)
{
    /// <summary>
    /// Creates a part that covers the whole of <paramref name="target"/>.
    /// </summary>
    public StringPart(string target)
        : this(target, 0, target.Length)
    {
    }

    /// <summary>
    /// Index immediately past the last character of this part.
    /// </summary>
    public int EndIndex
    {
        get { return StartIndex + Length; }
    }

    /// <summary>
    /// Creates a part over the same target string with the given start index and length.
    /// The start index is used as-is; it is not offset by this part's own start index.
    /// No bounds checks are performed here.
    /// </summary>
    public StringPart Slice(int newStartIndex, int newLength)
    {
        return new StringPart(Target, newStartIndex, newLength);
    }

    /// <summary>
    /// Creates a part that begins at <paramref name="newStartIndex"/> and ends
    /// where this part ends.
    /// </summary>
    public StringPart Slice(int newStartIndex)
    {
        var remainingLength = EndIndex - newStartIndex;
        return Slice(newStartIndex, remainingLength);
    }

    /// <summary>
    /// Creates a part over the same target string using the index and length
    /// reported by <paramref name="capture"/>.
    /// </summary>
    public StringPart Slice(Capture capture)
    {
        return Slice(capture.Index, capture.Length);
    }

    /// <summary>
    /// Returns the characters of the target string covered by this part.
    /// </summary>
    public override string ToString()
    {
        return Target.Substring(StartIndex, Length);
    }
}

@ -0,0 +1,19 @@
using System.Text.RegularExpressions;
namespace DiscordChatExporter.Core.Markdown.Parsing;
/// <summary>
/// A window over a string, described by the string itself (<c>Source</c>),
/// the index at which the window begins (<c>StartIndex</c>) and the number
/// of characters it covers (<c>Length</c>).
/// </summary>
internal readonly record struct StringSegment(string Source, int StartIndex, int Length)
{
    /// <summary>
    /// Creates a segment that covers the whole of <paramref name="target"/>.
    /// </summary>
    public StringSegment(string target)
        : this(target, 0, target.Length)
    {
    }

    /// <summary>
    /// Index immediately past the last character of this segment.
    /// </summary>
    public int EndIndex
    {
        get { return StartIndex + Length; }
    }

    /// <summary>
    /// Creates a segment over the same source string with the given start index and length.
    /// The start index is used as-is; it is not offset by this segment's own start index.
    /// No bounds checks are performed here.
    /// </summary>
    public StringSegment Relocate(int newStartIndex, int newLength)
    {
        return new StringSegment(Source, newStartIndex, newLength);
    }

    /// <summary>
    /// Creates a segment over the same source string using the index and length
    /// reported by <paramref name="capture"/>.
    /// </summary>
    public StringSegment Relocate(Capture capture)
    {
        return Relocate(capture.Index, capture.Length);
    }

    /// <summary>
    /// Returns the characters of the source string covered by this segment.
    /// </summary>
    public override string ToString()
    {
        return Source.Substring(StartIndex, Length);
    }
}
Loading…
Cancel
Save