From 883f6074b5571646d19cfff5284159979e948df0 Mon Sep 17 00:00:00 2001 From: Nikita Shevyrin Date: Sat, 23 Nov 2024 12:26:30 +0500 Subject: [PATCH 01/26] created project Markdown --- Markdown/Markdown.sln | 16 ++++++++++++++++ Markdown/Markdown/IMd.cs | 6 ++++++ Markdown/Markdown/Markdown.csproj | 10 ++++++++++ Markdown/Markdown/Md.cs | 9 +++++++++ Markdown/Markdown/Program.cs | 1 + 5 files changed, 42 insertions(+) create mode 100644 Markdown/Markdown.sln create mode 100644 Markdown/Markdown/IMd.cs create mode 100644 Markdown/Markdown/Markdown.csproj create mode 100644 Markdown/Markdown/Md.cs create mode 100644 Markdown/Markdown/Program.cs diff --git a/Markdown/Markdown.sln b/Markdown/Markdown.sln new file mode 100644 index 000000000..18722da42 --- /dev/null +++ b/Markdown/Markdown.sln @@ -0,0 +1,16 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Markdown", "Markdown\Markdown.csproj", "{B8FD8A48-C2C3-434B-953F-B9AF324E3E95}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Release|Any CPU = Release|Any CPU + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {B8FD8A48-C2C3-434B-953F-B9AF324E3E95}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {B8FD8A48-C2C3-434B-953F-B9AF324E3E95}.Debug|Any CPU.Build.0 = Debug|Any CPU + {B8FD8A48-C2C3-434B-953F-B9AF324E3E95}.Release|Any CPU.ActiveCfg = Release|Any CPU + {B8FD8A48-C2C3-434B-953F-B9AF324E3E95}.Release|Any CPU.Build.0 = Release|Any CPU + EndGlobalSection +EndGlobal diff --git a/Markdown/Markdown/IMd.cs b/Markdown/Markdown/IMd.cs new file mode 100644 index 000000000..2aae2bc83 --- /dev/null +++ b/Markdown/Markdown/IMd.cs @@ -0,0 +1,6 @@ +namespace Markdown; + +public interface IMd +{ + public string Render(StreamReader reader); +} \ No newline at end of file diff --git a/Markdown/Markdown/Markdown.csproj b/Markdown/Markdown/Markdown.csproj new file mode 100644 index 000000000..2f4fc7765 --- /dev/null +++ b/Markdown/Markdown/Markdown.csproj @@ -0,0 +1,10 @@ + + + + Exe + net8.0 + enable + enable + + + diff --git a/Markdown/Markdown/Md.cs b/Markdown/Markdown/Md.cs new file mode 100644 index 000000000..9b420c377 --- /dev/null +++ b/Markdown/Markdown/Md.cs @@ -0,0 +1,9 @@ +namespace Markdown; + +public class Md : IMd +{ + public string Render(StreamReader reader) + { + throw new NotImplementedException(); + } +} \ No newline at end of file diff --git a/Markdown/Markdown/Program.cs b/Markdown/Markdown/Program.cs new file mode 100644 index 000000000..5f282702b --- /dev/null +++ b/Markdown/Markdown/Program.cs @@ -0,0 +1 @@ + \ No newline at end of file From c1f6b4d4596ad01991b8aea0b0fbc835f9e8b27b Mon Sep 17 00:00:00 2001 From: Nikita Shevyrin Date: Sun, 24 Nov 2024 16:47:22 +0500 Subject: [PATCH 02/26] initial design --- Markdown/Markdown/IMd.cs | 2 +- Markdown/Markdown/Md.cs | 41 ++++++++++++++++++- .../TokenConverters/BoldHtmlConverter.cs | 13 ++++++ .../TokenConverters/HeadingHtmlConverter.cs | 13 ++++++ .../TokenConverters/HtmlTokenConverter.cs | 11 +++++ .../TokenConverters/ITokenConverter.cs | 9 ++++ .../TokenConverters/ItalicHtmlConverter.cs | 13 ++++++ Markdown/Markdown/Tokenizers/BoldTokenizer.cs | 13 ++++++ .../Markdown/Tokenizers/HeadingTokenizer.cs | 13 ++++++ Markdown/Markdown/Tokenizers/ITokenizer.cs | 8 ++++ .../Markdown/Tokenizers/ItalicTokenizer.cs | 14 +++++++ .../Markdown/Tokenizers/MarkdownTokenizer.cs | 10 +++++ Markdown/Markdown/Tokens/MarkdownTokenType.cs | 9 ++++ Markdown/Markdown/Tokens/Token.cs | 13 ++++++ 14 files changed, 179 insertions(+), 3 deletions(-) create mode 100644 Markdown/Markdown/TokenConverters/BoldHtmlConverter.cs create mode 100644 Markdown/Markdown/TokenConverters/HeadingHtmlConverter.cs create mode 100644 Markdown/Markdown/TokenConverters/HtmlTokenConverter.cs create mode 100644 Markdown/Markdown/TokenConverters/ITokenConverter.cs create mode 100644 Markdown/Markdown/TokenConverters/ItalicHtmlConverter.cs create mode 100644 Markdown/Markdown/Tokenizers/BoldTokenizer.cs create mode 100644 Markdown/Markdown/Tokenizers/HeadingTokenizer.cs create mode 100644 Markdown/Markdown/Tokenizers/ITokenizer.cs create mode 100644 Markdown/Markdown/Tokenizers/ItalicTokenizer.cs create mode 100644 Markdown/Markdown/Tokenizers/MarkdownTokenizer.cs create mode 100644 Markdown/Markdown/Tokens/MarkdownTokenType.cs create mode 100644 Markdown/Markdown/Tokens/Token.cs diff --git a/Markdown/Markdown/IMd.cs b/Markdown/Markdown/IMd.cs index 2aae2bc83..2e029cbf3 100644 --- a/Markdown/Markdown/IMd.cs +++ b/Markdown/Markdown/IMd.cs @@ -2,5 +2,5 @@ namespace Markdown; public interface IMd { - public string Render(StreamReader reader); + public string Render(string markdown); } \ No newline at end of file diff --git a/Markdown/Markdown/Md.cs b/Markdown/Markdown/Md.cs index 9b420c377..1652cb2b0 100644 --- a/Markdown/Markdown/Md.cs +++ b/Markdown/Markdown/Md.cs @@ -1,9 +1,46 @@ +using System.Text; +using Markdown.TokenConverters; +using Markdown.Tokenizers; +using Markdown.Tokens; + namespace Markdown; public class Md : IMd { - public string Render(StreamReader reader) + private readonly ITokenizer[] _tokenizers = + { + new BoldTokenizer(), + new ItalicTokenizer(), + new HeadingTokenizer() + }; + private readonly ITokenConverter[] _converters = + { + new BoldHtmlConverter(), + new ItalicHtmlConverter(), + new HeadingHtmlConverter() + }; + + private readonly int _sliceSize = 1024; + + public string Render(string markdown) { - throw new NotImplementedException(); + var markdownSpan = markdown.AsSpan(); + var context = new StringBuilder(); + var stepCount = markdownSpan.Length / _sliceSize; + stepCount = markdownSpan.Length % _sliceSize == 0 ? stepCount : stepCount + 1; + var tokenList = new List(); + for (var step = 0; step < stepCount; step++) + { + var sliceStart = step * _sliceSize; + var sliceSize = Math.Min(_sliceSize, markdownSpan.Length - sliceStart); + var stepSpan = markdownSpan.Slice(sliceStart, sliceSize); + foreach (var tokenizer in _tokenizers) + tokenList.AddRange(tokenizer.Tokenize(stepSpan)); + foreach (var converter in _converters) + converter.ProcessTokens(tokenList, context); + tokenList.Clear(); + } + + return context.ToString(); } } \ No newline at end of file diff --git a/Markdown/Markdown/TokenConverters/BoldHtmlConverter.cs b/Markdown/Markdown/TokenConverters/BoldHtmlConverter.cs new file mode 100644 index 000000000..49755a6a0 --- /dev/null +++ b/Markdown/Markdown/TokenConverters/BoldHtmlConverter.cs @@ -0,0 +1,13 @@ +using System.Text; +using Markdown.Tokens; + +namespace Markdown.TokenConverters; + +public class BoldHtmlConverter : HtmlTokenConverter +{ + protected override MarkdownTokenType MarkdownTokenType { get; } = MarkdownTokenType.ToBold; + public override void ProcessTokens(IEnumerable tokens, StringBuilder context) + { + throw new NotImplementedException(); + } +} \ No newline at end of file diff --git a/Markdown/Markdown/TokenConverters/HeadingHtmlConverter.cs b/Markdown/Markdown/TokenConverters/HeadingHtmlConverter.cs new file mode 100644 index 000000000..63ad202aa --- /dev/null +++ b/Markdown/Markdown/TokenConverters/HeadingHtmlConverter.cs @@ -0,0 +1,13 @@ +using System.Text; +using Markdown.Tokens; + +namespace Markdown.TokenConverters; + +public class HeadingHtmlConverter : HtmlTokenConverter +{ + protected override MarkdownTokenType MarkdownTokenType { get; } = MarkdownTokenType.ToHeading; + public override void ProcessTokens(IEnumerable tokens, StringBuilder context) + { + throw new NotImplementedException(); + } +} \ No newline at end of file diff --git a/Markdown/Markdown/TokenConverters/HtmlTokenConverter.cs b/Markdown/Markdown/TokenConverters/HtmlTokenConverter.cs new file mode 100644 index 000000000..fec2e1a1b --- /dev/null +++ b/Markdown/Markdown/TokenConverters/HtmlTokenConverter.cs @@ -0,0 +1,11 @@ +using System.Text; +using Markdown.Tokens; + +namespace Markdown.TokenConverters; + +public abstract class HtmlTokenConverter : ITokenConverter +{ + protected IEnumerable? TokensOnPreviousSlice { get; set; } + protected abstract MarkdownTokenType MarkdownTokenType { get; } + public abstract void ProcessTokens(IEnumerable tokens, StringBuilder context); +} \ No newline at end of file diff --git a/Markdown/Markdown/TokenConverters/ITokenConverter.cs b/Markdown/Markdown/TokenConverters/ITokenConverter.cs new file mode 100644 index 000000000..59279e627 --- /dev/null +++ b/Markdown/Markdown/TokenConverters/ITokenConverter.cs @@ -0,0 +1,9 @@ +using System.Text; +using Markdown.Tokens; + +namespace Markdown.TokenConverters; + +public interface ITokenConverter +{ + public void ProcessTokens(IEnumerable tokens, StringBuilder context); +} \ No newline at end of file diff --git a/Markdown/Markdown/TokenConverters/ItalicHtmlConverter.cs b/Markdown/Markdown/TokenConverters/ItalicHtmlConverter.cs new file mode 100644 index 000000000..f5da61016 --- /dev/null +++ b/Markdown/Markdown/TokenConverters/ItalicHtmlConverter.cs @@ -0,0 +1,13 @@ +using System.Text; +using Markdown.Tokens; + +namespace Markdown.TokenConverters; + +public class ItalicHtmlConverter : HtmlTokenConverter +{ + protected override MarkdownTokenType MarkdownTokenType { get; } = MarkdownTokenType.ToItalic; + public override void ProcessTokens(IEnumerable tokens, StringBuilder context) + { + throw new NotImplementedException(); + } +} \ No newline at end of file diff --git a/Markdown/Markdown/Tokenizers/BoldTokenizer.cs b/Markdown/Markdown/Tokenizers/BoldTokenizer.cs new file mode 100644 index 000000000..54fe0fe0c --- /dev/null +++ b/Markdown/Markdown/Tokenizers/BoldTokenizer.cs @@ -0,0 +1,13 @@ +using Markdown.Tokens; + +namespace Markdown.Tokenizers; + +public class BoldTokenizer : MarkdownTokenizer +{ + protected override MarkdownTokenType MarkdownTokenType { get; } = MarkdownTokenType.ToBold; + + public override IEnumerable Tokenize(ReadOnlySpan input) + { + throw new NotImplementedException(); + } +} \ No newline at end of file diff --git a/Markdown/Markdown/Tokenizers/HeadingTokenizer.cs b/Markdown/Markdown/Tokenizers/HeadingTokenizer.cs new file mode 100644 index 000000000..9fd281b33 --- /dev/null +++ b/Markdown/Markdown/Tokenizers/HeadingTokenizer.cs @@ -0,0 +1,13 @@ +using Markdown.Tokens; + +namespace Markdown.Tokenizers; + +public class HeadingTokenizer : MarkdownTokenizer +{ + protected override MarkdownTokenType MarkdownTokenType { get; } = MarkdownTokenType.ToHeading; + + public override IEnumerable Tokenize(ReadOnlySpan input) + { + throw new NotImplementedException(); + } +} \ No newline at end of file diff --git a/Markdown/Markdown/Tokenizers/ITokenizer.cs b/Markdown/Markdown/Tokenizers/ITokenizer.cs new file mode 100644 index 000000000..79f231bb0 --- /dev/null +++ b/Markdown/Markdown/Tokenizers/ITokenizer.cs @@ -0,0 +1,8 @@ +using Markdown.Tokens; + +namespace Markdown.Tokenizers; + +public interface ITokenizer +{ + public IEnumerable Tokenize(ReadOnlySpan input); +} \ No newline at end of file diff --git a/Markdown/Markdown/Tokenizers/ItalicTokenizer.cs b/Markdown/Markdown/Tokenizers/ItalicTokenizer.cs new file mode 100644 index 000000000..5d49725cb --- /dev/null +++ b/Markdown/Markdown/Tokenizers/ItalicTokenizer.cs @@ -0,0 +1,14 @@ +using System.Text; +using Markdown.Tokens; + +namespace Markdown.Tokenizers; + +public class ItalicTokenizer : MarkdownTokenizer +{ + protected override MarkdownTokenType MarkdownTokenType { get; } = MarkdownTokenType.ToItalic; + + public override IEnumerable Tokenize(ReadOnlySpan input) + { + throw new NotImplementedException(); + } +} \ No newline at end of file diff --git a/Markdown/Markdown/Tokenizers/MarkdownTokenizer.cs b/Markdown/Markdown/Tokenizers/MarkdownTokenizer.cs new file mode 100644 index 000000000..eddd05b28 --- /dev/null +++ b/Markdown/Markdown/Tokenizers/MarkdownTokenizer.cs @@ -0,0 +1,10 @@ +using System.Text; +using Markdown.Tokens; + +namespace Markdown.Tokenizers; + +public abstract class MarkdownTokenizer : ITokenizer +{ + protected abstract MarkdownTokenType MarkdownTokenType { get; } + public abstract IEnumerable Tokenize(ReadOnlySpan input); +} \ No newline at end of file diff --git a/Markdown/Markdown/Tokens/MarkdownTokenType.cs b/Markdown/Markdown/Tokens/MarkdownTokenType.cs new file mode 100644 index 000000000..10eb9d8be --- /dev/null +++ b/Markdown/Markdown/Tokens/MarkdownTokenType.cs @@ -0,0 +1,9 @@ +namespace Markdown.Tokens; + +public enum MarkdownTokenType +{ + NoConversion, + ToItalic, + ToBold, + ToHeading +} \ No newline at end of file diff --git a/Markdown/Markdown/Tokens/Token.cs b/Markdown/Markdown/Tokens/Token.cs new file mode 100644 index 000000000..a21e3acd6 --- /dev/null +++ b/Markdown/Markdown/Tokens/Token.cs @@ -0,0 +1,13 @@ +namespace Markdown.Tokens; + +public readonly struct Token( + ReadOnlyMemory content, + MarkdownTokenType markdownTokenType, + int startIndex, + int endIndex) +{ + public ReadOnlyMemory Content { get; } = content; + public MarkdownTokenType MarkdownTokenType { get; } = markdownTokenType; + public int StartIndex { get; } = startIndex; + public int EndIndex { get; } = endIndex; +} \ No newline at end of file From 09601ae381b3cb4a15cbce47df5c989a29f3d3ef Mon Sep 17 00:00:00 2001 From: Nikita Shevyrin Date: Sat, 30 Nov 2024 13:30:55 +0500 Subject: [PATCH 03/26] initial design --- .../AbstractSyntaxTree/IAbstractSyntaxTree.cs | 8 ++++ .../MdAbstractSyntaxTree.cs | 37 +++++++++++++++++++ Markdown/Markdown/IMd.cs | 2 +- Markdown/Markdown/Md.cs | 5 ++- Markdown/Markdown/TokenType.cs | 9 +++++ Markdown/Markdown/Tokenizer/ITokenizer.cs | 9 +++++ Markdown/Markdown/Tokenizer/MdTokenizer.cs | 12 ++++++ 7 files changed, 80 insertions(+), 2 deletions(-) create mode 100644 Markdown/Markdown/AbstractSyntaxTree/IAbstractSyntaxTree.cs create mode 100644 Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree.cs create mode 100644 Markdown/Markdown/TokenType.cs create mode 100644 Markdown/Markdown/Tokenizer/ITokenizer.cs create mode 100644 Markdown/Markdown/Tokenizer/MdTokenizer.cs diff --git a/Markdown/Markdown/AbstractSyntaxTree/IAbstractSyntaxTree.cs b/Markdown/Markdown/AbstractSyntaxTree/IAbstractSyntaxTree.cs new file mode 100644 index 000000000..ed164d78b --- /dev/null +++ b/Markdown/Markdown/AbstractSyntaxTree/IAbstractSyntaxTree.cs @@ -0,0 +1,8 @@ +namespace Markdown.AbstractSyntaxTree; + +public interface IAbstractSyntaxTree +where TTokenType : Enum +{ + public void AddToken(TTokenType tokenType, ReadOnlyMemory? tokenValue = null); + public string ToText(); +} \ No newline at end of file diff --git a/Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree.cs b/Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree.cs new file mode 100644 index 000000000..fa61632c2 --- /dev/null +++ b/Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree.cs @@ -0,0 +1,37 @@ +using System.Collections.ObjectModel; + +namespace Markdown.AbstractSyntaxTree; + +public class MdAbstractSyntaxTree(ReadOnlyDictionary TokenConverters) : IAbstractSyntaxTree +{ + private struct Node + { + public Node() + { + Children = new List(); + } + + public Node(TokenType tokenType, ReadOnlyMemory? tokenValue) + { + TokenType = tokenType; + TokenValue = tokenValue; + Children = new List(); + } + + public TokenType? TokenType; + public ReadOnlyMemory? TokenValue; + public List Children; + } + + private Node _root = new(); + + public void AddToken(TokenType tokenType, ReadOnlyMemory? tokenValue = null) + { + throw new NotImplementedException(); + } + + public string ToText() + { + throw new NotImplementedException(); + } +} \ No newline at end of file diff --git a/Markdown/Markdown/IMd.cs b/Markdown/Markdown/IMd.cs index 2aae2bc83..c77626d39 100644 --- a/Markdown/Markdown/IMd.cs +++ b/Markdown/Markdown/IMd.cs @@ -2,5 +2,5 @@ namespace Markdown; public interface IMd { - public string Render(StreamReader reader); + public string Render(string input); } \ No newline at end of file diff --git a/Markdown/Markdown/Md.cs b/Markdown/Markdown/Md.cs index 9b420c377..c76403d83 100644 --- a/Markdown/Markdown/Md.cs +++ b/Markdown/Markdown/Md.cs @@ -1,8 +1,11 @@ +using System.Collections.ObjectModel; +using Markdown.Tokenizer; + namespace Markdown; public class Md : IMd { - public string Render(StreamReader reader) + public string Render(string input) { throw new NotImplementedException(); } diff --git a/Markdown/Markdown/TokenType.cs b/Markdown/Markdown/TokenType.cs new file mode 100644 index 000000000..5816793d9 --- /dev/null +++ b/Markdown/Markdown/TokenType.cs @@ -0,0 +1,9 @@ +namespace Markdown; + +public enum TokenType +{ + PlainText, + Italic, + Bold, + Heading +} \ No newline at end of file diff --git a/Markdown/Markdown/Tokenizer/ITokenizer.cs b/Markdown/Markdown/Tokenizer/ITokenizer.cs new file mode 100644 index 000000000..721ac5893 --- /dev/null +++ b/Markdown/Markdown/Tokenizer/ITokenizer.cs @@ -0,0 +1,9 @@ +using Markdown.AbstractSyntaxTree; + +namespace Markdown.Tokenizer; + +public interface ITokenizer +where TTokenType : Enum +{ + public IAbstractSyntaxTree Tokenize(IAbstractSyntaxTree tree, ReadOnlyMemory input); +} \ No newline at end of file diff --git a/Markdown/Markdown/Tokenizer/MdTokenizer.cs b/Markdown/Markdown/Tokenizer/MdTokenizer.cs new file mode 100644 index 000000000..98ba00688 --- /dev/null +++ b/Markdown/Markdown/Tokenizer/MdTokenizer.cs @@ -0,0 +1,12 @@ +using System.Collections.ObjectModel; +using Markdown.AbstractSyntaxTree; + +namespace Markdown.Tokenizer; + +public class MdTokenizer(ReadOnlyDictionary TokenAliases) : ITokenizer +{ + public IAbstractSyntaxTree Tokenize(IAbstractSyntaxTree tree, ReadOnlyMemory input) + { + throw new NotImplementedException(); + } +} \ No newline at end of file From f27204e7839e40979feacd75dce703a3f8c8afed Mon Sep 17 00:00:00 2001 From: Nikita Shevyrin Date: Sat, 30 Nov 2024 15:40:37 +0500 Subject: [PATCH 04/26] added ArgumentExceptionHelpers --- Markdown/Markdown/ArgumentExceptionHelpers.cs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 Markdown/Markdown/ArgumentExceptionHelpers.cs diff --git a/Markdown/Markdown/ArgumentExceptionHelpers.cs b/Markdown/Markdown/ArgumentExceptionHelpers.cs new file mode 100644 index 000000000..2fb150dc7 --- /dev/null +++ b/Markdown/Markdown/ArgumentExceptionHelpers.cs @@ -0,0 +1,16 @@ +namespace Markdown; + +public static class ArgumentExceptionHelpers +{ + public static void ThrowIfFalse(bool flag, string message) + { + if (!flag) + throw new ArgumentException(message); + } + + public static void ThrowIfNull(object? obj, string message) + { + if (obj == null) + throw new ArgumentException(message); + } +} \ No newline at end of file From bc620388c85e26bf8b493493e367a6fa43844a01 Mon Sep 17 00:00:00 2001 From: Nikita Shevyrin Date: Sat, 30 Nov 2024 15:41:51 +0500 Subject: [PATCH 05/26] implemented abstract syntax tree --- .../AbstractSyntaxTree/IAbstractSyntaxTree.cs | 2 + .../MdAbstractSyntaxTree.cs | 95 +++++++++++++++++-- 2 files changed, 89 insertions(+), 8 deletions(-) diff --git a/Markdown/Markdown/AbstractSyntaxTree/IAbstractSyntaxTree.cs b/Markdown/Markdown/AbstractSyntaxTree/IAbstractSyntaxTree.cs index ed164d78b..e8c1c3acc 100644 --- a/Markdown/Markdown/AbstractSyntaxTree/IAbstractSyntaxTree.cs +++ b/Markdown/Markdown/AbstractSyntaxTree/IAbstractSyntaxTree.cs @@ -4,5 +4,7 @@ public interface IAbstractSyntaxTree where TTokenType : Enum { public void AddToken(TTokenType tokenType, ReadOnlyMemory? tokenValue = null); + public bool TryEndCurrentToken(); + public bool TryEndToken(TokenType tokenType); public string ToText(); } \ No newline at end of file diff --git a/Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree.cs b/Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree.cs index fa61632c2..98adb7c3f 100644 --- a/Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree.cs +++ b/Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree.cs @@ -1,10 +1,11 @@ using System.Collections.ObjectModel; +using System.Text; namespace Markdown.AbstractSyntaxTree; -public class MdAbstractSyntaxTree(ReadOnlyDictionary TokenConverters) : IAbstractSyntaxTree +public class MdAbstractSyntaxTree : IAbstractSyntaxTree { - private struct Node + private class Node { public Node() { @@ -18,20 +19,98 @@ public Node(TokenType tokenType, ReadOnlyMemory? tokenValue) Children = new List(); } - public TokenType? TokenType; - public ReadOnlyMemory? TokenValue; - public List Children; + public TokenType? TokenType { get; } + public ReadOnlyMemory? TokenValue { get; } + public Node? Parent { get; private set; } + private List Children { get; } + + public void AddChild(Node node) + { + node.Parent = this; + Children.Add(node); + } + + public IEnumerable GetChildren() => Children; } - private Node _root = new(); + private readonly ReadOnlyDictionary _tokenTags; + private readonly Node _root; + private Node _current; + + public MdAbstractSyntaxTree(ReadOnlyDictionary tokenTags) + { + _tokenTags = tokenTags; + _root = new Node(); + _current = _root; + } public void AddToken(TokenType tokenType, ReadOnlyMemory? tokenValue = null) { - throw new NotImplementedException(); + if (tokenType == TokenType.PlainText) + { + ArgumentExceptionHelpers.ThrowIfNull(tokenValue, "tokenValue must not be null"); + _current.AddChild(new Node(tokenType, tokenValue)); + } + else + { + var newNode = new Node(tokenType, null); + _current.AddChild(newNode); + _current = newNode; + } + } + + public bool TryEndCurrentToken() + { + if (_current == _root) + return false; + _current = _current.Parent!; + return true; + } + + public bool TryEndToken(TokenType tokenType) + { + var tokenNode = FindTokenNode(_current, tokenType); + if (tokenNode != null) + { + _current = tokenNode.Parent!; + return true; + } + + return false; + } + + private Node? FindTokenNode(Node node, TokenType tokenType) + { + if (node == _root) + return null; + if (node.TokenType == tokenType) + return node; + + return FindTokenNode(node.Parent!, tokenType); } public string ToText() { - throw new NotImplementedException(); + var sb = new StringBuilder(); + ProcessChildren(_root, sb); + return sb.ToString(); + } + + private void ProcessChildren(Node node, StringBuilder sb) + { + foreach (var child in node.GetChildren()) + { + if (child.TokenType == TokenType.PlainText) + sb.Append(child.TokenValue); + else + SurroundWithTag(_tokenTags[child.TokenType!.Value], child, sb); + } + } + + private void SurroundWithTag(string tag, Node node, StringBuilder sb) + { + sb.Append($"<{tag}>"); + ProcessChildren(node, sb); + sb.Append($""); } } \ No newline at end of file From f0d106e3940942fa09f68ca68fd503527cf030c5 Mon Sep 17 00:00:00 2001 From: Nikita Shevyrin Date: Sat, 30 Nov 2024 19:37:22 +0500 Subject: [PATCH 06/26] did some syntax tree testing --- .../AbstractSyntaxTreeTests.cs | 128 ++++++++++++++++++ Markdown/Markdown.Tests/Markdown.Tests.csproj | 20 +++ Markdown/Markdown.sln | 6 + 3 files changed, 154 insertions(+) create mode 100644 Markdown/Markdown.Tests/AbstractSyntaxTree/AbstractSyntaxTreeTests.cs create mode 100644 Markdown/Markdown.Tests/Markdown.Tests.csproj diff --git a/Markdown/Markdown.Tests/AbstractSyntaxTree/AbstractSyntaxTreeTests.cs b/Markdown/Markdown.Tests/AbstractSyntaxTree/AbstractSyntaxTreeTests.cs new file mode 100644 index 000000000..88abd901a --- /dev/null +++ b/Markdown/Markdown.Tests/AbstractSyntaxTree/AbstractSyntaxTreeTests.cs @@ -0,0 +1,128 @@ +using System; +using System.Collections.Generic; +using FluentAssertions; +using Markdown.AbstractSyntaxTree; +using NUnit.Framework; +using MdAbstractSyntaxTree = Markdown.AbstractSyntaxTree.MdAbstractSyntaxTree; + +namespace Markdown.Tests.AbstractSyntaxTree; + +[TestFixture] +[TestOf(typeof(MdAbstractSyntaxTree))] +public class AbstractSyntaxTreeTests +{ + private IAbstractSyntaxTree _syntaxTree; + + [SetUp] + public void SetUp() + { + var tags = new Dictionary(); + tags.Add(TokenType.Italic, "em"); + tags.Add(TokenType.Bold, "strong"); + tags.Add(TokenType.Heading, "h1"); + _syntaxTree = new MdAbstractSyntaxTree(tags.AsReadOnly()); + } + + [Test] + [Description("Проверяем, что метод AddToken кидает исключение если тип токена == PlainText" + + " и содержимое токена, переданное в аргумент имеет значение null")] + [TestCase(TokenType.PlainText, "", false)] + [TestCase(TokenType.PlainText, "some text", false)] + [TestCase(TokenType.PlainText, null, true)] + [TestCase(TokenType.Italic, "some text", false)] + [TestCase(TokenType.Italic, null, false)] + public void AddToken_ThrowsWhen_TokenValueIsNull_And_TokenTypeIsNotPlainText( + TokenType tokenType, + string value, + bool expectedException = true) + { + Action act = () => _syntaxTree.AddToken(tokenType, value?.AsMemory()); + + if (expectedException) + act.Should() + .Throw() + .WithMessage("tokenValue must not be null"); + else + act.Should().NotThrow(); + } + + [Test] + [Description("Проверяем, что набор токенов правильно переводится в текст")] + [TestCaseSource(nameof(AddTokenTestSource))] + public void TreeConvertsTokensToTextCorrectly(TestToken[] tokens, string expectedResult) + { + AddTokens(tokens); + + _syntaxTree.ToText() + .Should() + .Be(expectedResult); + } + + public readonly struct TestToken(TokenType tokenType, string value = null, bool endToken = false) + { + public readonly bool EndToken = endToken; + public readonly TokenType TokenType = tokenType; + public readonly string Value = value; + } + + private void AddTokens(TestToken[] tokens) + { + foreach (var token in tokens) + { + if (token.TokenType == TokenType.PlainText) + _syntaxTree.AddToken(token.TokenType, token.Value.AsMemory()); + else if (token.EndToken && token.TokenType == TokenType.Heading) + _syntaxTree.TryEndToken(TokenType.Heading); + else if (token.EndToken) + _syntaxTree.TryEndCurrentToken(); + else + _syntaxTree.AddToken(token.TokenType); + } + } + + public static object[] AddTokenTestSource = + { + new object[] + { + new TestToken[] + { + new TestToken(TokenType.PlainText, "Next "), + new TestToken(TokenType.Italic), + new TestToken(TokenType.PlainText, "word"), + new TestToken(TokenType.Italic, null, true), + new TestToken(TokenType.PlainText, " is italic") + }, + "Next word is italic" + }, + new object[] + { + new TestToken[] + { + new TestToken(TokenType.Heading), + new TestToken(TokenType.PlainText, "This text is a heading. And "), + new TestToken(TokenType.Italic), + new TestToken(TokenType.PlainText, "this"), + new TestToken(TokenType.Italic, null, true), + new TestToken(TokenType.PlainText, " word is italic. And "), + new TestToken(TokenType.Bold), + new TestToken(TokenType.PlainText, "these words"), + new TestToken(TokenType.Bold, null, true), + new TestToken(TokenType.PlainText, " are in bold.") + }, + "

This text is a heading. And this word is italic. And these words are in bold.

" + }, + new object[] + { + new TestToken[] + { + new TestToken(TokenType.Heading), + new TestToken(TokenType.PlainText, "This text is a heading. And "), + new TestToken(TokenType.Bold), + new TestToken(TokenType.PlainText, "these words are in bold."), + new TestToken(TokenType.Heading, null, true), + new TestToken(TokenType.PlainText, "\nThis text is after a heading.") + }, + "

This text is a heading. And these words are in bold.

\nThis text is after a heading." + } + }; +} \ No newline at end of file diff --git a/Markdown/Markdown.Tests/Markdown.Tests.csproj b/Markdown/Markdown.Tests/Markdown.Tests.csproj new file mode 100644 index 000000000..94db3bb71 --- /dev/null +++ b/Markdown/Markdown.Tests/Markdown.Tests.csproj @@ -0,0 +1,20 @@ + + + + net8.0 + + false + + + + + + + + + + + + + + \ No newline at end of file diff --git a/Markdown/Markdown.sln b/Markdown/Markdown.sln index 18722da42..bd31bed26 100644 --- a/Markdown/Markdown.sln +++ b/Markdown/Markdown.sln @@ -2,6 +2,8 @@ Microsoft Visual Studio Solution File, Format Version 12.00 Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Markdown", "Markdown\Markdown.csproj", "{B8FD8A48-C2C3-434B-953F-B9AF324E3E95}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Markdown.Tests", "Markdown.Tests\Markdown.Tests.csproj", "{0B1D2315-E457-4F38-92C9-5BC11A8752B6}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -12,5 +14,9 @@ Global {B8FD8A48-C2C3-434B-953F-B9AF324E3E95}.Debug|Any CPU.Build.0 = Debug|Any CPU {B8FD8A48-C2C3-434B-953F-B9AF324E3E95}.Release|Any CPU.ActiveCfg = Release|Any CPU {B8FD8A48-C2C3-434B-953F-B9AF324E3E95}.Release|Any CPU.Build.0 = Release|Any CPU + {0B1D2315-E457-4F38-92C9-5BC11A8752B6}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {0B1D2315-E457-4F38-92C9-5BC11A8752B6}.Debug|Any CPU.Build.0 = Debug|Any CPU + {0B1D2315-E457-4F38-92C9-5BC11A8752B6}.Release|Any CPU.ActiveCfg = Release|Any CPU + {0B1D2315-E457-4F38-92C9-5BC11A8752B6}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection EndGlobal From 106f1138827da957c4b88c8e0e24cd131c0480c9 Mon Sep 17 00:00:00 2001 From: Nikita Shevyrin Date: Mon, 2 Dec 2024 16:48:12 +0500 Subject: [PATCH 07/26] tokenizer WIP --- .../AbstractSyntaxTree/IAbstractSyntaxTree.cs | 6 +- .../MdAbstractSyntaxTree.cs | 67 +++-- Markdown/Markdown/Md.cs | 30 ++- Markdown/Markdown/Tokenizer/MdTokenizer.cs | 237 +++++++++++++++++- 4 files changed, 311 insertions(+), 29 deletions(-) diff --git a/Markdown/Markdown/AbstractSyntaxTree/IAbstractSyntaxTree.cs b/Markdown/Markdown/AbstractSyntaxTree/IAbstractSyntaxTree.cs index e8c1c3acc..c84cc8255 100644 --- a/Markdown/Markdown/AbstractSyntaxTree/IAbstractSyntaxTree.cs +++ b/Markdown/Markdown/AbstractSyntaxTree/IAbstractSyntaxTree.cs @@ -3,8 +3,8 @@ namespace Markdown.AbstractSyntaxTree; public interface IAbstractSyntaxTree where TTokenType : Enum { - public void AddToken(TTokenType tokenType, ReadOnlyMemory? tokenValue = null); - public bool TryEndCurrentToken(); - public bool TryEndToken(TokenType tokenType); + public void AddToken(TTokenType tokenType, ReadOnlyMemory tokenValue); + public bool HasTokenInContext(TTokenType tokenType); + public void EndToken(TTokenType tokenType, ReadOnlyMemory? tokenValue = null); public string ToText(); } \ No newline at end of file diff --git a/Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree.cs b/Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree.cs index 98adb7c3f..1f6d4a44b 100644 --- a/Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree.cs +++ b/Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree.cs @@ -30,7 +30,14 @@ public void AddChild(Node node) Children.Add(node); } - public IEnumerable GetChildren() => Children; + public IEnumerable RemoveChildren() + { + var children = new List(GetChildren()); + Children.Clear(); + return children; + } + + public IEnumerable GetChildren() => Children.AsReadOnly(); } private readonly ReadOnlyDictionary _tokenTags; @@ -44,49 +51,63 @@ public MdAbstractSyntaxTree(ReadOnlyDictionary tokenTags) _current = _root; } - public void AddToken(TokenType tokenType, ReadOnlyMemory? tokenValue = null) + public void AddToken(TokenType tokenType, ReadOnlyMemory tokenValue) { + ArgumentExceptionHelpers.ThrowIfNull(tokenValue, "tokenValue must not be null"); if (tokenType == TokenType.PlainText) { - ArgumentExceptionHelpers.ThrowIfNull(tokenValue, "tokenValue must not be null"); _current.AddChild(new Node(tokenType, tokenValue)); } else { - var newNode = new Node(tokenType, null); + var newNode = new Node(tokenType, tokenValue); _current.AddChild(newNode); _current = newNode; } } - public bool TryEndCurrentToken() + public bool HasTokenInContext(TokenType tokenType) => HasParent(tokenType, _current); + + private bool HasParent(TokenType tokenType, Node node) { - if (_current == _root) + if (node == _root) return false; - _current = _current.Parent!; - return true; + if (node.TokenType == tokenType) + return true; + return HasParent(tokenType, node.Parent!); } - public bool TryEndToken(TokenType tokenType) + public void EndToken(TokenType tokenType, ReadOnlyMemory? tokenValue = null) { - var tokenNode = FindTokenNode(_current, tokenType); - if (tokenNode != null) + WalkUpToTheRoot(_current, tokenType, tokenValue); + } + + private void WalkUpToTheRoot(Node node, TokenType tokenType, ReadOnlyMemory? tokenValue) + { + if (node == _root) { - _current = tokenNode.Parent!; - return true; + AddTextToNodeAndMakeCurrent(node, tokenValue); + } + else if (node.TokenType == tokenType) + { + AddTextToNodeAndMakeCurrent(node, tokenValue); + _current = node.Parent!; + } + else + { + var parent = node.Parent!; + var children = node.RemoveChildren(); + foreach (var child in children) + parent.AddChild(child); + WalkUpToTheRoot(parent, tokenType, tokenValue); } - - return false; } - private Node? FindTokenNode(Node node, TokenType tokenType) + private void AddTextToNodeAndMakeCurrent(Node node, ReadOnlyMemory? tokenValue) { - if (node == _root) - return null; - if (node.TokenType == tokenType) - return node; - - return FindTokenNode(node.Parent!, tokenType); + _current = node; + if (tokenValue != null) + AddToken(TokenType.PlainText, tokenValue.Value); } public string ToText() @@ -100,7 +121,7 @@ private void ProcessChildren(Node node, StringBuilder sb) { foreach (var child in node.GetChildren()) { - if (child.TokenType == TokenType.PlainText) + if (child.TokenType == TokenType.PlainText || !child.GetChildren().Any()) sb.Append(child.TokenValue); else SurroundWithTag(_tokenTags[child.TokenType!.Value], child, sb); diff --git a/Markdown/Markdown/Md.cs b/Markdown/Markdown/Md.cs index c76403d83..9bad90d2e 100644 --- a/Markdown/Markdown/Md.cs +++ b/Markdown/Markdown/Md.cs @@ -1,12 +1,40 @@ using System.Collections.ObjectModel; +using Markdown.AbstractSyntaxTree; using Markdown.Tokenizer; namespace Markdown; public class Md : IMd { + private readonly ReadOnlyDictionary _tokenAliases; + private readonly ReadOnlyDictionary _endTokenAliases; + private readonly ReadOnlyDictionary _tokenTags; + + public Md() + { + var tokenAliases = new Dictionary(); + tokenAliases.Add("_", TokenType.Italic); + tokenAliases.Add("__", TokenType.Bold); + tokenAliases.Add("# ", TokenType.Heading); + _tokenAliases = tokenAliases.AsReadOnly(); + + var endTokenAliases = new Dictionary(); + endTokenAliases.Add("_", TokenType.Italic); + endTokenAliases.Add("__", TokenType.Bold); + endTokenAliases.Add("\n", TokenType.Heading); + _endTokenAliases = endTokenAliases.AsReadOnly(); + + var tokenTags = new Dictionary(); + tokenTags.Add(TokenType.Italic, "em"); + tokenTags.Add(TokenType.Bold, "strong"); + tokenTags.Add(TokenType.Heading, "h1"); + _tokenTags = tokenTags.AsReadOnly(); + } + public string Render(string input) { - throw new NotImplementedException(); + var tokenizer = new MdTokenizer(_tokenAliases, _endTokenAliases); + var syntaxTree = tokenizer.Tokenize(new MdAbstractSyntaxTree(_tokenTags), input.AsMemory()); + return syntaxTree.ToText(); } } \ No newline at end of file diff --git a/Markdown/Markdown/Tokenizer/MdTokenizer.cs b/Markdown/Markdown/Tokenizer/MdTokenizer.cs index 98ba00688..883e35434 100644 --- a/Markdown/Markdown/Tokenizer/MdTokenizer.cs +++ b/Markdown/Markdown/Tokenizer/MdTokenizer.cs @@ -1,12 +1,245 @@ using System.Collections.ObjectModel; +using System.Runtime.InteropServices; using Markdown.AbstractSyntaxTree; namespace Markdown.Tokenizer; -public class MdTokenizer(ReadOnlyDictionary TokenAliases) : ITokenizer +public class MdTokenizer( + ReadOnlyDictionary StartTokenAliases, + ReadOnlyDictionary EndTokenAliases + ) : ITokenizer { public IAbstractSyntaxTree Tokenize(IAbstractSyntaxTree tree, ReadOnlyMemory input) { - throw new NotImplementedException(); + ArgumentExceptionHelpers.ThrowIfFalse( + MemoryMarshal.TryGetString(input, out var str, out var start, out var length), + "Underlying object in the input argument is not a string"); + + var foundPlainText = false; + var plainTextStart = 0; + var increment = 1; + for (var i = start; i < length; ) + { + if (TryMatchTokenAliases(str!, i, tree, out var tokenType, out var tokenAlias, out var endToken)) + { + increment = tokenAlias.Length; + + if (endToken && tree.HasTokenInContext(tokenType)) + { + increment = tokenAlias.Length; + if (foundPlainText) + tree.EndToken(tokenType, input.Slice(plainTextStart, i - plainTextStart)); + else + { + tree.EndToken(tokenType); + tree.AddToken(TokenType.PlainText, input.Slice(i, tokenAlias.Length)); + } + foundPlainText = false; + } + else if (!endToken && !tree.HasTokenInContext(tokenType)) + { + if (tokenType != TokenType.Bold || !tree.HasTokenInContext(TokenType.Italic)) // двойное выделение не может быть внутри одинарного + { + if (foundPlainText) + tree.AddToken(TokenType.PlainText, input.Slice(plainTextStart, i - plainTextStart)); + tree.AddToken(tokenType, input.Slice(i, tokenAlias.Length)); + foundPlainText = false; + } + } + else + UpdatePlainTextState(ref foundPlainText, ref i, ref plainTextStart); + } + else + UpdatePlainTextState(ref foundPlainText, ref i, ref plainTextStart); + + i += increment; + if (increment > 1) + increment = 1; + } + + tree.EndToken(TokenType.Heading, + foundPlainText ? input.Slice(plainTextStart, str!.Length - plainTextStart) : null); + + return tree; + } + + private void UpdatePlainTextState(ref bool foundPlainText, ref int index, ref int plainTextStart) + { + if (!foundPlainText) + plainTextStart = index; + foundPlainText = true; + } + + private bool TryMatchTokenAliases( + string input, + int index, + IAbstractSyntaxTree tree, + out TokenType tokenType, + out string tokenAlias, + out bool endToken) + { + var matchedEndToken = false; + var mathcedStartToken = false; + + var startTokenType = default(TokenType); + var endTokenType = default(TokenType); + + endToken = true; + if (TryMatchTokenAliases(input, index, endToken, out var endTokenAlias)) + matchedEndToken = EndTokenAliases.TryGetValue(endTokenAlias, out endTokenType); + + endToken = false; + if (TryMatchTokenAliases(input, index, endToken, out var startTokenAlias) + && EnsureNotInSeparatedWords(input, index, startTokenAlias)) + mathcedStartToken = StartTokenAliases.TryGetValue(startTokenAlias, out startTokenType); + + if (matchedEndToken && mathcedStartToken) + { + if (startTokenAlias.Length > endTokenAlias.Length) + { + endToken = false; + tokenAlias = startTokenAlias; + tokenType = startTokenType; + return true; + } + + if (startTokenAlias.Length == endTokenAlias.Length) + { + // в этом случае startTokenType и endTokenType должны совпадать + if (tree.HasTokenInContext(startTokenType)) + { + endToken = true; + tokenAlias = endTokenAlias; + tokenType = endTokenType; + return true; + } + + endToken = false; + tokenAlias = startTokenAlias; + tokenType = startTokenType; + return true; + } + + endToken = true; + tokenAlias = endTokenAlias; + tokenType = endTokenType; + return true; + } + + if (matchedEndToken) + { + endToken = true; + tokenAlias = endTokenAlias; + tokenType = endTokenType; + return true; + } + + if (mathcedStartToken) + { + endToken = false; + tokenAlias = startTokenAlias; + tokenType = startTokenType; + return true; + } + + endToken = default; + tokenAlias = default; + tokenType = default; + return false; + } + + private bool TryMatchTokenAliases( + string input, + int index, + bool endToken, + out string tokenAlias) + { + tokenAlias = String.Empty; + var tokenAliases = endToken ? EndTokenAliases : StartTokenAliases; + foreach (var alias in tokenAliases.Keys) + { + if (TryMatchAlias(input, index, alias, endToken) && tokenAlias.Length < alias.Length) + tokenAlias = alias; + } + + return !String.IsNullOrEmpty(tokenAlias); + } + + /* + * для случаев типа: "сл_ово дру_гое слово", когда подчерки должны оставаться подчерками + */ + private bool EnsureNotInSeparatedWords(string input, int index, string alias) + { + if (!IsInsideAWord(input, index, alias)) + return true; + + var i = index + alias.Length; + while (i < input.Length && !IsWordDelimiter(input[i])) + { + if (TryMatchAlias(input, i, alias, true)) + return true; + i++; + } + + return false; + } + + private bool TryMatchAlias(string input, int index, string alias, bool isEndToken) + { + return TryMatchPattern(input, index, alias) + && (!IsBoldOrItalicAlias(alias, isEndToken) + || (IsInsideAWord(input, index, alias) + && !IsSurroundedByNumbers(input, index, alias)) + || HasAWhiteSpaceNearIt(input, index, alias, isEndToken)); + } + + private bool IsWordDelimiter(char c) + { + return c is ' ' or '\t' or '\n' or '\r'; + } + + private bool IsInsideAWord(string input, int index, string alias) + { + var trueForLeftEdge = index - 1 >= 0 && input[index - 1] != ' '; + var trueForRightEdge = index + alias.Length < input.Length + && input[index + alias.Length] != ' '; + return trueForLeftEdge && trueForRightEdge; + } + + private bool HasAWhiteSpaceNearIt(string input, int index, string alias, bool isEndToken) + { + var trueForLeftEdge = index - 1 < 0 || input[index - 1] == ' '; + var trueForRightEdge = index + alias.Length >= input.Length + || input[index + alias.Length] == ' '; + return isEndToken ? trueForRightEdge : trueForLeftEdge; + } + + private bool IsSurroundedByNumbers(string input, int index, string alias) + { + var trueForLeftEdge = index - 1 >= 0 + && int.TryParse(input[index - 1].ToString(), out _); + var trueForRightEdge = index + alias.Length < input.Length + && int.TryParse(input[index + alias.Length].ToString(), out _); + return trueForLeftEdge || trueForRightEdge; + } + + private bool IsBoldOrItalicAlias(string alias, bool isEndToken) + { + var tokenAliases = isEndToken ? EndTokenAliases : StartTokenAliases; + return tokenAliases[alias] == TokenType.Bold + || tokenAliases[alias] == TokenType.Italic; + } + + private bool TryMatchPattern(string input, int index, string pattern) + { + int i = 0; + foreach (var ch in pattern) + { + if (index + i >= input.Length || ch != input[index + i]) + return false; + i++; + } + + return true; } } \ No newline at end of file From 8de50816e3a3deba0cee27e26a611ecfc1045010 Mon Sep 17 00:00:00 2001 From: Nikita Shevyrin Date: Mon, 2 Dec 2024 20:45:24 +0500 Subject: [PATCH 08/26] addded tests for Md --- Markdown/Markdown.Tests/IMdTest.cs | 182 +++++++++++++++++++++++++++++ 1 file changed, 182 insertions(+) create mode 100644 Markdown/Markdown.Tests/IMdTest.cs diff --git a/Markdown/Markdown.Tests/IMdTest.cs b/Markdown/Markdown.Tests/IMdTest.cs new file mode 100644 index 000000000..50880fa46 --- /dev/null +++ b/Markdown/Markdown.Tests/IMdTest.cs @@ -0,0 +1,182 @@ +using System; +using System.Diagnostics; +using System.Text; +using FluentAssertions; +using NUnit.Framework; + +namespace Markdown.Tests; + +[TestFixture] +[TestOf(typeof(IMd))] +public class IMdTest +{ + private IMd _imd; + + [SetUp] + public void SetUp() + { + _imd = new Md(); + } + + [Test] + [TestCase("", "")] + [TestCase("Hello world", "Hello world")] + [TestCase("Hello _world_!", "Hello world!")] + [TestCase("# _Hello_ __world__!", "

Hello world!

")] + public void Render_ReturnsCorrectMarkdown_ForSimpleCases( + string input, + string expectedOutput) + { + _imd.Render(input) + .Should() + .Be(expectedOutput); + } + + [Test] + [TestCase("This __text _contains_ nested__ markdown", "This text contains nested markdown")] + [TestCase("This is _an example __of inversed__ nested_ markdown", "This is an example __of inversed__ nested markdown")] + public void Render_ReturnsCorrectMarkdown_ForCasesWithNesting( + string input, + string expectedOutput) + { + _imd.Render(input) + .Should() + .Be(expectedOutput); + } + + [Test] + [TestCase("Text_12_3", "Text_12_3")] + [TestCase("This _Text_12_3_ should be italic", "This Text_12_3 should be italic")] + [TestCase("5__12_3__4", "5__12_3__4")] + [TestCase("Text __that_12__3__ is in bold", "Text that_12__3 is in bold")] + public void Render_ReturnsCorrectMarkdown_ForTextWithNumbers( + string input, + string expectedOutput) + { + _imd.Render(input) + .Should() + .Be(expectedOutput); + } + + [Test] + [TestCase("_begin_ning", "beginning")] + [TestCase("mi_ddl_e", "middle")] + [TestCase("end_ing_", "ending")] + [TestCase("__begin__ning", "beginning")] + [TestCase("mi__ddl__e", "middle")] + [TestCase("end__ing__", "ending")] + public void Render_ReturnsCorrectMarkdown_ForPartsOfWords( + string input, + string expectedOutput) + { + _imd.Render(input) + .Should() + .Be(expectedOutput); + } + + [Test] + [TestCase("This sh_ould not cha_nge", "This sh_ould not cha_nge")] + [TestCase("As w__ell a__s this", "As w__ell a__s this")] + [TestCase("This sh__o_uld_ wo__rk like this", "This sh__ould wo__rk like this")] + public void Render_ReturnsCorrectMarkdown_ForMarkdownInDifferentWords( + string input, + string expectedOutput) + { + _imd.Render(input) + .Should() + .Be(expectedOutput); + } + + [Test] + [TestCase("__Unpaired_ markdown", "__Unpaired_ markdown")] + [TestCase("Another _unpaired markdown__", "Another _unpaired markdown__")] + public void Render_ReturnsCorrectMarkdown_ForUnpairedMarkdownSymbols( + string input, + string expectedOutput) + { + _imd.Render(input) + .Should() + .Be(expectedOutput); + } + + [Test] + [TestCase("This_ should not_ change", "This_ should not_ change")] + [TestCase("This _should _be in_ italics", "This should _be in italics")] + public void Render_ReturnsCorrectMarkdown_ForIncorrectlyPlacedUnderscores( + string input, + string expectedOutput) + { + _imd.Render(input) + .Should() + .Be(expectedOutput); + } + + // Это пока не работает + [Test] + [TestCase("Intersecting _markdown __should_ work__ like this", "Intersecting _markdown __should_ work__ like this")] + [TestCase("Another __example of _intersecting__ markdown_", "Another __example of _intersecting__ markdown_")] + public void Render_ReturnsCorrectMarkdown_ForIntersectingMarkdown( + string input, + string expectedOutput) + { + _imd.Render(input) + .Should() + .Be(expectedOutput); + } + + [Test] + [TestCase("This should ____ remain the same", "This should ____ remain the same")] + [TestCase("This also should __ not change", "This also should __ not change")] + public void Render_ReturnsCorrectMarkdown_ForEmptyMarkdown( + string input, + string expectedOutput) + { + _imd.Render(input) + .Should() + .Be(expectedOutput); + } + + // Это пока не работает + [Test] + [TestCase("This should \\_not turn\\_ into tags", "This should _not turn into tags")] + [TestCase("This should \\\\remain the\\\\ same", "This should \\\\remain the\\\\ same")] + public void Render_ReturnsCorrectMarkdown_ForEscapeCharacters( + string input, + string expectedOutput) + { + _imd.Render(input) + .Should() + .Be(expectedOutput); + } + + [Test] + public void Render_PerformanceTest() + { + var fullStr = ArrangePerformanceTest("_Hello_ world_12. Hel_lo world_", 100000); + Console.WriteLine($"Total length: {fullStr.Length}"); + + var totalTime = MeasureTime(fullStr); + Console.WriteLine($"Time elapsed in ms: {totalTime}"); + + totalTime + .Should() + .BeLessThan(1000); + } + + private long MeasureTime(string fullStr) + { + var sw = new Stopwatch(); + sw.Start(); + _imd.Render(fullStr); + sw.Stop(); + return sw.ElapsedMilliseconds; + } + + private string ArrangePerformanceTest(string input, int copyCount) + { + var sb = new StringBuilder(); + for (var i = 0; i < copyCount; i++) + sb.Append(input); + return sb.ToString(); + } +} \ No newline at end of file From 6e53af24a56129a20f423580d8c7cabf68a732dd Mon Sep 17 00:00:00 2001 From: Nikita Shevyrin Date: Mon, 2 Dec 2024 20:46:56 +0500 Subject: [PATCH 09/26] deleted tree tests --- .../AbstractSyntaxTreeTests.cs | 128 ------------------ 1 file changed, 128 deletions(-) delete mode 100644 Markdown/Markdown.Tests/AbstractSyntaxTree/AbstractSyntaxTreeTests.cs diff --git a/Markdown/Markdown.Tests/AbstractSyntaxTree/AbstractSyntaxTreeTests.cs b/Markdown/Markdown.Tests/AbstractSyntaxTree/AbstractSyntaxTreeTests.cs deleted file mode 100644 index 88abd901a..000000000 --- a/Markdown/Markdown.Tests/AbstractSyntaxTree/AbstractSyntaxTreeTests.cs +++ /dev/null @@ -1,128 +0,0 @@ -using System; -using System.Collections.Generic; -using FluentAssertions; -using Markdown.AbstractSyntaxTree; -using NUnit.Framework; -using MdAbstractSyntaxTree = Markdown.AbstractSyntaxTree.MdAbstractSyntaxTree; - -namespace Markdown.Tests.AbstractSyntaxTree; - -[TestFixture] -[TestOf(typeof(MdAbstractSyntaxTree))] -public class AbstractSyntaxTreeTests -{ - private IAbstractSyntaxTree _syntaxTree; - - [SetUp] - public void SetUp() - { - var tags = new Dictionary(); - tags.Add(TokenType.Italic, "em"); - tags.Add(TokenType.Bold, "strong"); - tags.Add(TokenType.Heading, "h1"); - _syntaxTree = new MdAbstractSyntaxTree(tags.AsReadOnly()); - } - - [Test] - [Description("Проверяем, что метод AddToken кидает исключение если тип токена == PlainText" + - " и содержимое токена, переданное в аргумент имеет значение null")] - [TestCase(TokenType.PlainText, "", false)] - [TestCase(TokenType.PlainText, "some text", false)] - [TestCase(TokenType.PlainText, null, true)] - [TestCase(TokenType.Italic, "some text", false)] - [TestCase(TokenType.Italic, null, false)] - public void AddToken_ThrowsWhen_TokenValueIsNull_And_TokenTypeIsNotPlainText( - TokenType tokenType, - string value, - bool expectedException = true) - { - Action act = () => _syntaxTree.AddToken(tokenType, value?.AsMemory()); - - if (expectedException) - act.Should() - .Throw() - .WithMessage("tokenValue must not be null"); - else - act.Should().NotThrow(); - } - - [Test] - [Description("Проверяем, что набор токенов правильно переводится в текст")] - [TestCaseSource(nameof(AddTokenTestSource))] - public void TreeConvertsTokensToTextCorrectly(TestToken[] tokens, string expectedResult) - { - AddTokens(tokens); - - _syntaxTree.ToText() - .Should() - .Be(expectedResult); - } - - public readonly struct TestToken(TokenType tokenType, string value = null, bool endToken = false) - { - public readonly bool EndToken = endToken; - public readonly TokenType TokenType = tokenType; - public readonly string Value = value; - } - - private void AddTokens(TestToken[] tokens) - { - foreach (var token in tokens) - { - if (token.TokenType == TokenType.PlainText) - _syntaxTree.AddToken(token.TokenType, token.Value.AsMemory()); - else if (token.EndToken && token.TokenType == TokenType.Heading) - _syntaxTree.TryEndToken(TokenType.Heading); - else if (token.EndToken) - _syntaxTree.TryEndCurrentToken(); - else - _syntaxTree.AddToken(token.TokenType); - } - } - - public static object[] AddTokenTestSource = - { - new object[] - { - new TestToken[] - { - new TestToken(TokenType.PlainText, "Next "), - new TestToken(TokenType.Italic), - new TestToken(TokenType.PlainText, "word"), - new TestToken(TokenType.Italic, null, true), - new TestToken(TokenType.PlainText, " is italic") - }, - "Next word is italic" - }, - new object[] - { - new TestToken[] - { - new TestToken(TokenType.Heading), - new TestToken(TokenType.PlainText, "This text is a heading. And "), - new TestToken(TokenType.Italic), - new TestToken(TokenType.PlainText, "this"), - new TestToken(TokenType.Italic, null, true), - new TestToken(TokenType.PlainText, " word is italic. And "), - new TestToken(TokenType.Bold), - new TestToken(TokenType.PlainText, "these words"), - new TestToken(TokenType.Bold, null, true), - new TestToken(TokenType.PlainText, " are in bold.") - }, - "

This text is a heading. And this word is italic. And these words are in bold.

" - }, - new object[] - { - new TestToken[] - { - new TestToken(TokenType.Heading), - new TestToken(TokenType.PlainText, "This text is a heading. And "), - new TestToken(TokenType.Bold), - new TestToken(TokenType.PlainText, "these words are in bold."), - new TestToken(TokenType.Heading, null, true), - new TestToken(TokenType.PlainText, "\nThis text is after a heading.") - }, - "

This text is a heading. And these words are in bold.

\nThis text is after a heading." - } - }; -} \ No newline at end of file From b67b916f5be861f9ec48c21228456f50aaf1702f Mon Sep 17 00:00:00 2001 From: Nikita Shevyrin Date: Mon, 2 Dec 2024 20:50:00 +0500 Subject: [PATCH 10/26] added usage example --- Markdown/Markdown/Program.cs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Markdown/Markdown/Program.cs b/Markdown/Markdown/Program.cs index 5f282702b..21b7b9395 100644 --- a/Markdown/Markdown/Program.cs +++ b/Markdown/Markdown/Program.cs @@ -1 +1,4 @@ - \ No newline at end of file +using Markdown; + +var md = new Md(); +Console.WriteLine(md.Render("# Hello World! _some words_ in italics\n __some other text__")); \ No newline at end of file From ae81615f2e747cf2967b8ba93b4659ea92356437 Mon Sep 17 00:00:00 2001 From: Nikita Shevyrin Date: Sat, 30 Nov 2024 13:30:55 +0500 Subject: [PATCH 11/26] implementation WIP 1 --- Markdown/Markdown.Tests/IMdTest.cs | 182 +++++++++++++ Markdown/Markdown.Tests/Markdown.Tests.csproj | 20 ++ Markdown/Markdown.sln | 6 + .../AbstractSyntaxTree/IAbstractSyntaxTree.cs | 10 + .../MdAbstractSyntaxTree.cs | 137 ++++++++++ Markdown/Markdown/ArgumentExceptionHelpers.cs | 16 ++ Markdown/Markdown/IMd.cs | 2 +- Markdown/Markdown/Md.cs | 66 +++-- Markdown/Markdown/Program.cs | 5 +- Markdown/Markdown/TokenType.cs | 9 + Markdown/Markdown/Tokenizer/ITokenizer.cs | 9 + Markdown/Markdown/Tokenizer/MdTokenizer.cs | 245 ++++++++++++++++++ 12 files changed, 669 insertions(+), 38 deletions(-) create mode 100644 Markdown/Markdown.Tests/IMdTest.cs create mode 100644 Markdown/Markdown.Tests/Markdown.Tests.csproj create mode 100644 Markdown/Markdown/AbstractSyntaxTree/IAbstractSyntaxTree.cs create mode 100644 Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree.cs create mode 100644 Markdown/Markdown/ArgumentExceptionHelpers.cs create mode 100644 Markdown/Markdown/TokenType.cs create mode 100644 Markdown/Markdown/Tokenizer/ITokenizer.cs create mode 100644 Markdown/Markdown/Tokenizer/MdTokenizer.cs diff --git a/Markdown/Markdown.Tests/IMdTest.cs b/Markdown/Markdown.Tests/IMdTest.cs new file mode 100644 index 000000000..50880fa46 --- /dev/null +++ b/Markdown/Markdown.Tests/IMdTest.cs @@ -0,0 +1,182 @@ +using System; +using System.Diagnostics; +using System.Text; +using FluentAssertions; +using NUnit.Framework; + +namespace Markdown.Tests; + +[TestFixture] +[TestOf(typeof(IMd))] +public class IMdTest +{ + private IMd _imd; + + [SetUp] + public void SetUp() + { + _imd = new Md(); + } + + [Test] + [TestCase("", "")] + [TestCase("Hello world", "Hello world")] + [TestCase("Hello _world_!", "Hello world!")] + [TestCase("# _Hello_ __world__!", "

Hello world!

")] + public void Render_ReturnsCorrectMarkdown_ForSimpleCases( + string input, + string expectedOutput) + { + _imd.Render(input) + .Should() + .Be(expectedOutput); + } + + [Test] + [TestCase("This __text _contains_ nested__ markdown", "This text contains nested markdown")] + [TestCase("This is _an example __of inversed__ nested_ markdown", "This is an example __of inversed__ nested markdown")] + public void Render_ReturnsCorrectMarkdown_ForCasesWithNesting( + string input, + string expectedOutput) + { + _imd.Render(input) + .Should() + .Be(expectedOutput); + } + + [Test] + [TestCase("Text_12_3", "Text_12_3")] + [TestCase("This _Text_12_3_ should be italic", "This Text_12_3 should be italic")] + [TestCase("5__12_3__4", "5__12_3__4")] + [TestCase("Text __that_12__3__ is in bold", "Text that_12__3 is in bold")] + public void Render_ReturnsCorrectMarkdown_ForTextWithNumbers( + string input, + string expectedOutput) + { + _imd.Render(input) + .Should() + .Be(expectedOutput); + } + + [Test] + [TestCase("_begin_ning", "beginning")] + [TestCase("mi_ddl_e", "middle")] + [TestCase("end_ing_", "ending")] + [TestCase("__begin__ning", "beginning")] + [TestCase("mi__ddl__e", "middle")] + [TestCase("end__ing__", "ending")] + public void Render_ReturnsCorrectMarkdown_ForPartsOfWords( + string input, + string expectedOutput) + { + _imd.Render(input) + .Should() + .Be(expectedOutput); + } + + [Test] + [TestCase("This sh_ould not cha_nge", "This sh_ould not cha_nge")] + [TestCase("As w__ell a__s this", "As w__ell a__s this")] + [TestCase("This sh__o_uld_ wo__rk like this", "This sh__ould wo__rk like this")] + public void Render_ReturnsCorrectMarkdown_ForMarkdownInDifferentWords( + string input, + string expectedOutput) + { + _imd.Render(input) + .Should() + .Be(expectedOutput); + } + + [Test] + [TestCase("__Unpaired_ markdown", "__Unpaired_ markdown")] + [TestCase("Another _unpaired markdown__", "Another _unpaired markdown__")] + public void Render_ReturnsCorrectMarkdown_ForUnpairedMarkdownSymbols( + string input, + string expectedOutput) + { + _imd.Render(input) + .Should() + .Be(expectedOutput); + } + + [Test] + [TestCase("This_ should not_ change", "This_ should not_ change")] + [TestCase("This _should _be in_ italics", "This should _be in italics")] + public void Render_ReturnsCorrectMarkdown_ForIncorrectlyPlacedUnderscores( + string input, + string expectedOutput) + { + _imd.Render(input) + .Should() + .Be(expectedOutput); + } + + // Это пока не работает + [Test] + [TestCase("Intersecting _markdown __should_ work__ like this", "Intersecting _markdown __should_ work__ like this")] + [TestCase("Another __example of _intersecting__ markdown_", "Another __example of _intersecting__ markdown_")] + public void Render_ReturnsCorrectMarkdown_ForIntersectingMarkdown( + string input, + string expectedOutput) + { + _imd.Render(input) + .Should() + .Be(expectedOutput); + } + + [Test] + [TestCase("This should ____ remain the same", "This should ____ remain the same")] + [TestCase("This also should __ not change", "This also should __ not change")] + public void Render_ReturnsCorrectMarkdown_ForEmptyMarkdown( + string input, + string expectedOutput) + { + _imd.Render(input) + .Should() + .Be(expectedOutput); + } + + // Это пока не работает + [Test] + [TestCase("This should \\_not turn\\_ into tags", "This should _not turn into tags")] + [TestCase("This should \\\\remain the\\\\ same", "This should \\\\remain the\\\\ same")] + public void Render_ReturnsCorrectMarkdown_ForEscapeCharacters( + string input, + string expectedOutput) + { + _imd.Render(input) + .Should() + .Be(expectedOutput); + } + + [Test] + public void Render_PerformanceTest() + { + var fullStr = ArrangePerformanceTest("_Hello_ world_12. Hel_lo world_", 100000); + Console.WriteLine($"Total length: {fullStr.Length}"); + + var totalTime = MeasureTime(fullStr); + Console.WriteLine($"Time elapsed in ms: {totalTime}"); + + totalTime + .Should() + .BeLessThan(1000); + } + + private long MeasureTime(string fullStr) + { + var sw = new Stopwatch(); + sw.Start(); + _imd.Render(fullStr); + sw.Stop(); + return sw.ElapsedMilliseconds; + } + + private string ArrangePerformanceTest(string input, int copyCount) + { + var sb = new StringBuilder(); + for (var i = 0; i < copyCount; i++) + sb.Append(input); + return sb.ToString(); + } +} \ No newline at end of file diff --git a/Markdown/Markdown.Tests/Markdown.Tests.csproj b/Markdown/Markdown.Tests/Markdown.Tests.csproj new file mode 100644 index 000000000..94db3bb71 --- /dev/null +++ b/Markdown/Markdown.Tests/Markdown.Tests.csproj @@ -0,0 +1,20 @@ + + + + net8.0 + + false + + + + + + + + + + + + + + \ No newline at end of file diff --git a/Markdown/Markdown.sln b/Markdown/Markdown.sln index 18722da42..bd31bed26 100644 --- a/Markdown/Markdown.sln +++ b/Markdown/Markdown.sln @@ -2,6 +2,8 @@ Microsoft Visual Studio Solution File, Format Version 12.00 Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Markdown", "Markdown\Markdown.csproj", "{B8FD8A48-C2C3-434B-953F-B9AF324E3E95}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Markdown.Tests", "Markdown.Tests\Markdown.Tests.csproj", "{0B1D2315-E457-4F38-92C9-5BC11A8752B6}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -12,5 +14,9 @@ Global {B8FD8A48-C2C3-434B-953F-B9AF324E3E95}.Debug|Any CPU.Build.0 = Debug|Any CPU {B8FD8A48-C2C3-434B-953F-B9AF324E3E95}.Release|Any CPU.ActiveCfg = Release|Any CPU {B8FD8A48-C2C3-434B-953F-B9AF324E3E95}.Release|Any CPU.Build.0 = Release|Any CPU + {0B1D2315-E457-4F38-92C9-5BC11A8752B6}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {0B1D2315-E457-4F38-92C9-5BC11A8752B6}.Debug|Any CPU.Build.0 = Debug|Any CPU + {0B1D2315-E457-4F38-92C9-5BC11A8752B6}.Release|Any CPU.ActiveCfg = Release|Any CPU + {0B1D2315-E457-4F38-92C9-5BC11A8752B6}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection EndGlobal diff --git a/Markdown/Markdown/AbstractSyntaxTree/IAbstractSyntaxTree.cs b/Markdown/Markdown/AbstractSyntaxTree/IAbstractSyntaxTree.cs new file mode 100644 index 000000000..c84cc8255 --- /dev/null +++ b/Markdown/Markdown/AbstractSyntaxTree/IAbstractSyntaxTree.cs @@ -0,0 +1,10 @@ +namespace Markdown.AbstractSyntaxTree; + +public interface IAbstractSyntaxTree +where TTokenType : Enum +{ + public void AddToken(TTokenType tokenType, ReadOnlyMemory tokenValue); + public bool HasTokenInContext(TTokenType tokenType); + public void EndToken(TTokenType tokenType, ReadOnlyMemory? tokenValue = null); + public string ToText(); +} \ No newline at end of file diff --git a/Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree.cs b/Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree.cs new file mode 100644 index 000000000..1f6d4a44b --- /dev/null +++ b/Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree.cs @@ -0,0 +1,137 @@ +using System.Collections.ObjectModel; +using System.Text; + +namespace Markdown.AbstractSyntaxTree; + +public class MdAbstractSyntaxTree : IAbstractSyntaxTree +{ + private class Node + { + public Node() + { + Children = new List(); + } + + public Node(TokenType tokenType, ReadOnlyMemory? tokenValue) + { + TokenType = tokenType; + TokenValue = tokenValue; + Children = new List(); + } + + public TokenType? TokenType { get; } + public ReadOnlyMemory? TokenValue { get; } + public Node? Parent { get; private set; } + private List Children { get; } + + public void AddChild(Node node) + { + node.Parent = this; + Children.Add(node); + } + + public IEnumerable RemoveChildren() + { + var children = new List(GetChildren()); + Children.Clear(); + return children; + } + + public IEnumerable GetChildren() => Children.AsReadOnly(); + } + + private readonly ReadOnlyDictionary _tokenTags; + private readonly Node _root; + private Node _current; + + public MdAbstractSyntaxTree(ReadOnlyDictionary tokenTags) + { + _tokenTags = tokenTags; + _root = new Node(); + _current = _root; + } + + public void AddToken(TokenType tokenType, ReadOnlyMemory tokenValue) + { + ArgumentExceptionHelpers.ThrowIfNull(tokenValue, "tokenValue must not be null"); + if (tokenType == TokenType.PlainText) + { + _current.AddChild(new Node(tokenType, tokenValue)); + } + else + { + var newNode = new Node(tokenType, tokenValue); + _current.AddChild(newNode); + _current = newNode; + } + } + + public bool HasTokenInContext(TokenType tokenType) => HasParent(tokenType, _current); + + private bool HasParent(TokenType tokenType, Node node) + { + if (node == _root) + return false; + if (node.TokenType == tokenType) + return true; + return HasParent(tokenType, node.Parent!); + } + + public void EndToken(TokenType tokenType, ReadOnlyMemory? tokenValue = null) + { + WalkUpToTheRoot(_current, tokenType, tokenValue); + } + + private void WalkUpToTheRoot(Node node, TokenType tokenType, ReadOnlyMemory? tokenValue) + { + if (node == _root) + { + AddTextToNodeAndMakeCurrent(node, tokenValue); + } + else if (node.TokenType == tokenType) + { + AddTextToNodeAndMakeCurrent(node, tokenValue); + _current = node.Parent!; + } + else + { + var parent = node.Parent!; + var children = node.RemoveChildren(); + foreach (var child in children) + parent.AddChild(child); + WalkUpToTheRoot(parent, tokenType, tokenValue); + } + } + + private void AddTextToNodeAndMakeCurrent(Node node, ReadOnlyMemory? tokenValue) + { + _current = node; + if (tokenValue != null) + AddToken(TokenType.PlainText, tokenValue.Value); + } + + public string ToText() + { + var sb = new StringBuilder(); + ProcessChildren(_root, sb); + return sb.ToString(); + } + + private void ProcessChildren(Node node, StringBuilder sb) + { + foreach (var child in node.GetChildren()) + { + if (child.TokenType == TokenType.PlainText || !child.GetChildren().Any()) + sb.Append(child.TokenValue); + else + SurroundWithTag(_tokenTags[child.TokenType!.Value], child, sb); + } + } + + private void SurroundWithTag(string tag, Node node, StringBuilder sb) + { + sb.Append($"<{tag}>"); + ProcessChildren(node, sb); + sb.Append($""); + } +} \ No newline at end of file diff --git a/Markdown/Markdown/ArgumentExceptionHelpers.cs b/Markdown/Markdown/ArgumentExceptionHelpers.cs new file mode 100644 index 000000000..2fb150dc7 --- /dev/null +++ b/Markdown/Markdown/ArgumentExceptionHelpers.cs @@ -0,0 +1,16 @@ +namespace Markdown; + +public static class ArgumentExceptionHelpers +{ + public static void ThrowIfFalse(bool flag, string message) + { + if (!flag) + throw new ArgumentException(message); + } + + public static void ThrowIfNull(object? obj, string message) + { + if (obj == null) + throw new ArgumentException(message); + } +} \ No newline at end of file diff --git a/Markdown/Markdown/IMd.cs b/Markdown/Markdown/IMd.cs index 2e029cbf3..c77626d39 100644 --- a/Markdown/Markdown/IMd.cs +++ b/Markdown/Markdown/IMd.cs @@ -2,5 +2,5 @@ namespace Markdown; public interface IMd { - public string Render(string markdown); + public string Render(string input); } \ No newline at end of file diff --git a/Markdown/Markdown/Md.cs b/Markdown/Markdown/Md.cs index 1652cb2b0..9bad90d2e 100644 --- a/Markdown/Markdown/Md.cs +++ b/Markdown/Markdown/Md.cs @@ -1,46 +1,40 @@ -using System.Text; -using Markdown.TokenConverters; -using Markdown.Tokenizers; -using Markdown.Tokens; +using System.Collections.ObjectModel; +using Markdown.AbstractSyntaxTree; +using Markdown.Tokenizer; namespace Markdown; public class Md : IMd { - private readonly ITokenizer[] _tokenizers = - { - new BoldTokenizer(), - new ItalicTokenizer(), - new HeadingTokenizer() - }; - private readonly ITokenConverter[] _converters = + private readonly ReadOnlyDictionary _tokenAliases; + private readonly ReadOnlyDictionary _endTokenAliases; + private readonly ReadOnlyDictionary _tokenTags; + + public Md() { - new BoldHtmlConverter(), - new ItalicHtmlConverter(), - new HeadingHtmlConverter() - }; - - private readonly int _sliceSize = 1024; + var tokenAliases = new Dictionary(); + tokenAliases.Add("_", TokenType.Italic); + tokenAliases.Add("__", TokenType.Bold); + tokenAliases.Add("# ", TokenType.Heading); + _tokenAliases = tokenAliases.AsReadOnly(); + + var endTokenAliases = new Dictionary(); + endTokenAliases.Add("_", TokenType.Italic); + endTokenAliases.Add("__", TokenType.Bold); + endTokenAliases.Add("\n", TokenType.Heading); + _endTokenAliases = endTokenAliases.AsReadOnly(); + + var tokenTags = new Dictionary(); + tokenTags.Add(TokenType.Italic, "em"); + tokenTags.Add(TokenType.Bold, "strong"); + tokenTags.Add(TokenType.Heading, "h1"); + _tokenTags = tokenTags.AsReadOnly(); + } - public string Render(string markdown) + public string Render(string input) { - var markdownSpan = markdown.AsSpan(); - var context = new StringBuilder(); - var stepCount = markdownSpan.Length / _sliceSize; - stepCount = markdownSpan.Length % _sliceSize == 0 ? stepCount : stepCount + 1; - var tokenList = new List(); - for (var step = 0; step < stepCount; step++) - { - var sliceStart = step * _sliceSize; - var sliceSize = Math.Min(_sliceSize, markdownSpan.Length - sliceStart); - var stepSpan = markdownSpan.Slice(sliceStart, sliceSize); - foreach (var tokenizer in _tokenizers) - tokenList.AddRange(tokenizer.Tokenize(stepSpan)); - foreach (var converter in _converters) - converter.ProcessTokens(tokenList, context); - tokenList.Clear(); - } - - return context.ToString(); + var tokenizer = new MdTokenizer(_tokenAliases, _endTokenAliases); + var syntaxTree = tokenizer.Tokenize(new MdAbstractSyntaxTree(_tokenTags), input.AsMemory()); + return syntaxTree.ToText(); } } \ No newline at end of file diff --git a/Markdown/Markdown/Program.cs b/Markdown/Markdown/Program.cs index 5f282702b..21b7b9395 100644 --- a/Markdown/Markdown/Program.cs +++ b/Markdown/Markdown/Program.cs @@ -1 +1,4 @@ - \ No newline at end of file +using Markdown; + +var md = new Md(); +Console.WriteLine(md.Render("# Hello World! _some words_ in italics\n __some other text__")); \ No newline at end of file diff --git a/Markdown/Markdown/TokenType.cs b/Markdown/Markdown/TokenType.cs new file mode 100644 index 000000000..5816793d9 --- /dev/null +++ b/Markdown/Markdown/TokenType.cs @@ -0,0 +1,9 @@ +namespace Markdown; + +public enum TokenType +{ + PlainText, + Italic, + Bold, + Heading +} \ No newline at end of file diff --git a/Markdown/Markdown/Tokenizer/ITokenizer.cs b/Markdown/Markdown/Tokenizer/ITokenizer.cs new file mode 100644 index 000000000..721ac5893 --- /dev/null +++ b/Markdown/Markdown/Tokenizer/ITokenizer.cs @@ -0,0 +1,9 @@ +using Markdown.AbstractSyntaxTree; + +namespace Markdown.Tokenizer; + +public interface ITokenizer +where TTokenType : Enum +{ + public IAbstractSyntaxTree Tokenize(IAbstractSyntaxTree tree, ReadOnlyMemory input); +} \ No newline at end of file diff --git a/Markdown/Markdown/Tokenizer/MdTokenizer.cs b/Markdown/Markdown/Tokenizer/MdTokenizer.cs new file mode 100644 index 000000000..883e35434 --- /dev/null +++ b/Markdown/Markdown/Tokenizer/MdTokenizer.cs @@ -0,0 +1,245 @@ +using System.Collections.ObjectModel; +using System.Runtime.InteropServices; +using Markdown.AbstractSyntaxTree; + +namespace Markdown.Tokenizer; + +public class MdTokenizer( + ReadOnlyDictionary StartTokenAliases, + ReadOnlyDictionary EndTokenAliases + ) : ITokenizer +{ + public IAbstractSyntaxTree Tokenize(IAbstractSyntaxTree tree, ReadOnlyMemory input) + { + ArgumentExceptionHelpers.ThrowIfFalse( + MemoryMarshal.TryGetString(input, out var str, out var start, out var length), + "Underlying object in the input argument is not a string"); + + var foundPlainText = false; + var plainTextStart = 0; + var increment = 1; + for (var i = start; i < length; ) + { + if (TryMatchTokenAliases(str!, i, tree, out var tokenType, out var tokenAlias, out var endToken)) + { + increment = tokenAlias.Length; + + if (endToken && tree.HasTokenInContext(tokenType)) + { + increment = tokenAlias.Length; + if (foundPlainText) + tree.EndToken(tokenType, input.Slice(plainTextStart, i - plainTextStart)); + else + { + tree.EndToken(tokenType); + tree.AddToken(TokenType.PlainText, input.Slice(i, tokenAlias.Length)); + } + foundPlainText = false; + } + else if (!endToken && !tree.HasTokenInContext(tokenType)) + { + if (tokenType != TokenType.Bold || !tree.HasTokenInContext(TokenType.Italic)) // двойное выделение не может быть внутри одинарного + { + if (foundPlainText) + tree.AddToken(TokenType.PlainText, input.Slice(plainTextStart, i - plainTextStart)); + tree.AddToken(tokenType, input.Slice(i, tokenAlias.Length)); + foundPlainText = false; + } + } + else + UpdatePlainTextState(ref foundPlainText, ref i, ref plainTextStart); + } + else + UpdatePlainTextState(ref foundPlainText, ref i, ref plainTextStart); + + i += increment; + if (increment > 1) + increment = 1; + } + + tree.EndToken(TokenType.Heading, + foundPlainText ? input.Slice(plainTextStart, str!.Length - plainTextStart) : null); + + return tree; + } + + private void UpdatePlainTextState(ref bool foundPlainText, ref int index, ref int plainTextStart) + { + if (!foundPlainText) + plainTextStart = index; + foundPlainText = true; + } + + private bool TryMatchTokenAliases( + string input, + int index, + IAbstractSyntaxTree tree, + out TokenType tokenType, + out string tokenAlias, + out bool endToken) + { + var matchedEndToken = false; + var mathcedStartToken = false; + + var startTokenType = default(TokenType); + var endTokenType = default(TokenType); + + endToken = true; + if (TryMatchTokenAliases(input, index, endToken, out var endTokenAlias)) + matchedEndToken = EndTokenAliases.TryGetValue(endTokenAlias, out endTokenType); + + endToken = false; + if (TryMatchTokenAliases(input, index, endToken, out var startTokenAlias) + && EnsureNotInSeparatedWords(input, index, startTokenAlias)) + mathcedStartToken = StartTokenAliases.TryGetValue(startTokenAlias, out startTokenType); + + if (matchedEndToken && mathcedStartToken) + { + if (startTokenAlias.Length > endTokenAlias.Length) + { + endToken = false; + tokenAlias = startTokenAlias; + tokenType = startTokenType; + return true; + } + + if (startTokenAlias.Length == endTokenAlias.Length) + { + // в этом случае startTokenType и endTokenType должны совпадать + if (tree.HasTokenInContext(startTokenType)) + { + endToken = true; + tokenAlias = endTokenAlias; + tokenType = endTokenType; + return true; + } + + endToken = false; + tokenAlias = startTokenAlias; + tokenType = startTokenType; + return true; + } + + endToken = true; + tokenAlias = endTokenAlias; + tokenType = endTokenType; + return true; + } + + if (matchedEndToken) + { + endToken = true; + tokenAlias = endTokenAlias; + tokenType = endTokenType; + return true; + } + + if (mathcedStartToken) + { + endToken = false; + tokenAlias = startTokenAlias; + tokenType = startTokenType; + return true; + } + + endToken = default; + tokenAlias = default; + tokenType = default; + return false; + } + + private bool TryMatchTokenAliases( + string input, + int index, + bool endToken, + out string tokenAlias) + { + tokenAlias = String.Empty; + var tokenAliases = endToken ? EndTokenAliases : StartTokenAliases; + foreach (var alias in tokenAliases.Keys) + { + if (TryMatchAlias(input, index, alias, endToken) && tokenAlias.Length < alias.Length) + tokenAlias = alias; + } + + return !String.IsNullOrEmpty(tokenAlias); + } + + /* + * для случаев типа: "сл_ово дру_гое слово", когда подчерки должны оставаться подчерками + */ + private bool EnsureNotInSeparatedWords(string input, int index, string alias) + { + if (!IsInsideAWord(input, index, alias)) + return true; + + var i = index + alias.Length; + while (i < input.Length && !IsWordDelimiter(input[i])) + { + if (TryMatchAlias(input, i, alias, true)) + return true; + i++; + } + + return false; + } + + private bool TryMatchAlias(string input, int index, string alias, bool isEndToken) + { + return TryMatchPattern(input, index, alias) + && (!IsBoldOrItalicAlias(alias, isEndToken) + || (IsInsideAWord(input, index, alias) + && !IsSurroundedByNumbers(input, index, alias)) + || HasAWhiteSpaceNearIt(input, index, alias, isEndToken)); + } + + private bool IsWordDelimiter(char c) + { + return c is ' ' or '\t' or '\n' or '\r'; + } + + private bool IsInsideAWord(string input, int index, string alias) + { + var trueForLeftEdge = index - 1 >= 0 && input[index - 1] != ' '; + var trueForRightEdge = index + alias.Length < input.Length + && input[index + alias.Length] != ' '; + return trueForLeftEdge && trueForRightEdge; + } + + private bool HasAWhiteSpaceNearIt(string input, int index, string alias, bool isEndToken) + { + var trueForLeftEdge = index - 1 < 0 || input[index - 1] == ' '; + var trueForRightEdge = index + alias.Length >= input.Length + || input[index + alias.Length] == ' '; + return isEndToken ? trueForRightEdge : trueForLeftEdge; + } + + private bool IsSurroundedByNumbers(string input, int index, string alias) + { + var trueForLeftEdge = index - 1 >= 0 + && int.TryParse(input[index - 1].ToString(), out _); + var trueForRightEdge = index + alias.Length < input.Length + && int.TryParse(input[index + alias.Length].ToString(), out _); + return trueForLeftEdge || trueForRightEdge; + } + + private bool IsBoldOrItalicAlias(string alias, bool isEndToken) + { + var tokenAliases = isEndToken ? EndTokenAliases : StartTokenAliases; + return tokenAliases[alias] == TokenType.Bold + || tokenAliases[alias] == TokenType.Italic; + } + + private bool TryMatchPattern(string input, int index, string pattern) + { + int i = 0; + foreach (var ch in pattern) + { + if (index + i >= input.Length || ch != input[index + i]) + return false; + i++; + } + + return true; + } +} \ No newline at end of file From 9005981d6024bd084e207e1682a1303d21184884 Mon Sep 17 00:00:00 2001 From: Nikita Shevyrin Date: Wed, 4 Dec 2024 16:52:22 +0500 Subject: [PATCH 12/26] renamed IMdTest to IMdTests --- Markdown/Markdown.Tests/{IMdTest.cs => IMdTests.cs} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename Markdown/Markdown.Tests/{IMdTest.cs => IMdTests.cs} (99%) diff --git a/Markdown/Markdown.Tests/IMdTest.cs b/Markdown/Markdown.Tests/IMdTests.cs similarity index 99% rename from Markdown/Markdown.Tests/IMdTest.cs rename to Markdown/Markdown.Tests/IMdTests.cs index 50880fa46..7c8bdf26f 100644 --- a/Markdown/Markdown.Tests/IMdTest.cs +++ b/Markdown/Markdown.Tests/IMdTests.cs @@ -8,7 +8,7 @@ namespace Markdown.Tests; [TestFixture] [TestOf(typeof(IMd))] -public class IMdTest +public class IMdTests { private IMd _imd; From 1ef1aa0a68c0efeafc432a86b8d46c3030a42bc0 Mon Sep 17 00:00:00 2001 From: Nikita Shevyrin Date: Sat, 7 Dec 2024 00:11:43 +0500 Subject: [PATCH 13/26] renamed files for refactoring --- .../MdAbstractSyntaxTree.cs | 137 ---------------- .../MdAbstractSyntaxTree_.cs | 155 ++++++++++++++++++ .../{TokenType.cs => Token/MdTokenType.cs} | 2 +- .../{MdTokenizer.cs => MdTokenizer_.cs} | 105 +++++------- 4 files changed, 195 insertions(+), 204 deletions(-) delete mode 100644 Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree.cs create mode 100644 Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree_.cs rename Markdown/Markdown/{TokenType.cs => Token/MdTokenType.cs} (75%) rename Markdown/Markdown/Tokenizer/{MdTokenizer.cs => MdTokenizer_.cs} (65%) diff --git a/Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree.cs b/Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree.cs deleted file mode 100644 index 1f6d4a44b..000000000 --- a/Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree.cs +++ /dev/null @@ -1,137 +0,0 @@ -using System.Collections.ObjectModel; -using System.Text; - -namespace Markdown.AbstractSyntaxTree; - -public class MdAbstractSyntaxTree : IAbstractSyntaxTree -{ - private class Node - { - public Node() - { - Children = new List(); - } - - public Node(TokenType tokenType, ReadOnlyMemory? tokenValue) - { - TokenType = tokenType; - TokenValue = tokenValue; - Children = new List(); - } - - public TokenType? TokenType { get; } - public ReadOnlyMemory? TokenValue { get; } - public Node? Parent { get; private set; } - private List Children { get; } - - public void AddChild(Node node) - { - node.Parent = this; - Children.Add(node); - } - - public IEnumerable RemoveChildren() - { - var children = new List(GetChildren()); - Children.Clear(); - return children; - } - - public IEnumerable GetChildren() => Children.AsReadOnly(); - } - - private readonly ReadOnlyDictionary _tokenTags; - private readonly Node _root; - private Node _current; - - public MdAbstractSyntaxTree(ReadOnlyDictionary tokenTags) - { - _tokenTags = tokenTags; - _root = new Node(); - _current = _root; - } - - public void AddToken(TokenType tokenType, ReadOnlyMemory tokenValue) - { - ArgumentExceptionHelpers.ThrowIfNull(tokenValue, "tokenValue must not be null"); - if (tokenType == TokenType.PlainText) - { - _current.AddChild(new Node(tokenType, tokenValue)); - } - else - { - var newNode = new Node(tokenType, tokenValue); - _current.AddChild(newNode); - _current = newNode; - } - } - - public bool HasTokenInContext(TokenType tokenType) => HasParent(tokenType, _current); - - private bool HasParent(TokenType tokenType, Node node) - { - if (node == _root) - return false; - if (node.TokenType == tokenType) - return true; - return HasParent(tokenType, node.Parent!); - } - - public void EndToken(TokenType tokenType, ReadOnlyMemory? tokenValue = null) - { - WalkUpToTheRoot(_current, tokenType, tokenValue); - } - - private void WalkUpToTheRoot(Node node, TokenType tokenType, ReadOnlyMemory? tokenValue) - { - if (node == _root) - { - AddTextToNodeAndMakeCurrent(node, tokenValue); - } - else if (node.TokenType == tokenType) - { - AddTextToNodeAndMakeCurrent(node, tokenValue); - _current = node.Parent!; - } - else - { - var parent = node.Parent!; - var children = node.RemoveChildren(); - foreach (var child in children) - parent.AddChild(child); - WalkUpToTheRoot(parent, tokenType, tokenValue); - } - } - - private void AddTextToNodeAndMakeCurrent(Node node, ReadOnlyMemory? tokenValue) - { - _current = node; - if (tokenValue != null) - AddToken(TokenType.PlainText, tokenValue.Value); - } - - public string ToText() - { - var sb = new StringBuilder(); - ProcessChildren(_root, sb); - return sb.ToString(); - } - - private void ProcessChildren(Node node, StringBuilder sb) - { - foreach (var child in node.GetChildren()) - { - if (child.TokenType == TokenType.PlainText || !child.GetChildren().Any()) - sb.Append(child.TokenValue); - else - SurroundWithTag(_tokenTags[child.TokenType!.Value], child, sb); - } - } - - private void SurroundWithTag(string tag, Node node, StringBuilder sb) - { - sb.Append($"<{tag}>"); - ProcessChildren(node, sb); - sb.Append($""); - } -} \ No newline at end of file diff --git a/Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree_.cs b/Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree_.cs new file mode 100644 index 000000000..5ff02a7cb --- /dev/null +++ b/Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree_.cs @@ -0,0 +1,155 @@ +using System.Collections.ObjectModel; +using System.Text; +using Markdown.Token; + +namespace Markdown.AbstractSyntaxTree; + +public class MdAbstractSyntaxTree +{ + public class Node + { + public Node() + { + Children = new List(); + } + + public Node(MdTokenType tokenType, ReadOnlyMemory? tokenValue) + { + TokenType = tokenType; + TokenValue = tokenValue; + Children = new List(); + } + + public MdTokenType? TokenType { get; set; } + public ReadOnlyMemory? TokenValue { get; } + public Node? Parent { get; set; } + public List Children { get; } + + public void AddChild(Node node) + { + node.Parent = this; + Children.Add(node); + } + + public IEnumerable RemoveChildren() + { + var children = new List(Children); + Children.Clear(); + return children; + } + } + + private readonly ReadOnlyDictionary _tokenTags; + private readonly Node _root; + private Node _current; + + public MdAbstractSyntaxTree(ReadOnlyDictionary tokenTags) + { + _tokenTags = tokenTags; + _root = new Node(); + _current = _root; + } + + public void AddToken(MdTokenType mdTokenType, ReadOnlyMemory tokenValue) + { + ArgumentExceptionHelpers.ThrowIfNull(tokenValue, "tokenValue must not be null"); + if (mdTokenType == MdTokenType.PlainText) + { + _current.AddChild(new Node(mdTokenType, tokenValue)); + } + else + { + var newNode = new Node(mdTokenType, tokenValue); + _current.AddChild(newNode); + _current = newNode; + } + } + + public bool HasTokenInContext(MdTokenType mdTokenType) => HasParent(mdTokenType, _current); + + private bool HasParent(MdTokenType mdTokenType, Node node) + { + if (node == _root) + return false; + if (node.TokenType == mdTokenType) + return true; + return HasParent(mdTokenType, node.Parent!); + } + + public void EndToken(MdTokenType? mdTokenType = null) + { + WalkUpToTheRoot(_current, mdTokenType); + } + + public List GetChildrenForCurrentToken() + { + return _current.Children; + } + + private void WalkUpToTheRoot(Node node, MdTokenType? mdTokenType) + { + if (node == _root) + _current = _root; + else if (node.TokenType == mdTokenType) + { + if (node.TokenType == MdTokenType.Italic) + { + var childNodes = new List(node.Children); + for (var i = 0; i < childNodes.Count; i++) + { + var childNode = childNodes[i]; + if (childNode.TokenType == MdTokenType.Bold) + { + var children = childNode.RemoveChildren().ToList(); + foreach (var child in children) + { + node.Children.Insert(i + 1, child); + child.Parent = node; + } + childNode.TokenType = MdTokenType.PlainText; + node.Children.Insert(i + 1 + children.Count, childNode); + } + } + } + _current = node.Parent!; + } + else + { + var parent = node.Parent!; + var children = node.RemoveChildren(); + foreach (var child in children) + parent.AddChild(child); + WalkUpToTheRoot(parent, mdTokenType); + } + } + + public void ReviseForIntersection(MdTokenType tokenType, ReadOnlyMemory tokenText) + { + throw new NotImplementedException(); + } + + public string ToText() + { + var sb = new StringBuilder(); + ProcessChildren(_root, sb); + return sb.ToString(); + } + + private void ProcessChildren(Node node, StringBuilder sb) + { + foreach (var child in node.Children) + { + if (child.TokenType == MdTokenType.PlainText || child.Children.Count == 0) + sb.Append(child.TokenValue); + else + SurroundWithTag(_tokenTags[child.TokenType!.Value], child, sb); + } + } + + private void SurroundWithTag(string tag, Node node, StringBuilder sb) + { + sb.Append($"<{tag}>"); + ProcessChildren(node, sb); + sb.Append($""); + } +} \ No newline at end of file diff --git a/Markdown/Markdown/TokenType.cs b/Markdown/Markdown/Token/MdTokenType.cs similarity index 75% rename from Markdown/Markdown/TokenType.cs rename to Markdown/Markdown/Token/MdTokenType.cs index 5816793d9..39c6cbddd 100644 --- a/Markdown/Markdown/TokenType.cs +++ b/Markdown/Markdown/Token/MdTokenType.cs @@ -1,6 +1,6 @@ namespace Markdown; -public enum TokenType +public enum MdTokenType { PlainText, Italic, diff --git a/Markdown/Markdown/Tokenizer/MdTokenizer.cs b/Markdown/Markdown/Tokenizer/MdTokenizer_.cs similarity index 65% rename from Markdown/Markdown/Tokenizer/MdTokenizer.cs rename to Markdown/Markdown/Tokenizer/MdTokenizer_.cs index 883e35434..7a56ed175 100644 --- a/Markdown/Markdown/Tokenizer/MdTokenizer.cs +++ b/Markdown/Markdown/Tokenizer/MdTokenizer_.cs @@ -1,15 +1,15 @@ using System.Collections.ObjectModel; using System.Runtime.InteropServices; -using Markdown.AbstractSyntaxTree; +using Markdown.Token; namespace Markdown.Tokenizer; public class MdTokenizer( - ReadOnlyDictionary StartTokenAliases, - ReadOnlyDictionary EndTokenAliases - ) : ITokenizer + ReadOnlyDictionary StartTokenAliases, + ReadOnlyDictionary EndTokenAliases + ) { - public IAbstractSyntaxTree Tokenize(IAbstractSyntaxTree tree, ReadOnlyMemory input) + public IEnumerable Tokenize(ReadOnlyMemory input) { ArgumentExceptionHelpers.ThrowIfFalse( MemoryMarshal.TryGetString(input, out var str, out var start, out var length), @@ -20,34 +20,17 @@ public IAbstractSyntaxTree Tokenize(IAbstractSyntaxTree tr var increment = 1; for (var i = start; i < length; ) { - if (TryMatchTokenAliases(str!, i, tree, out var tokenType, out var tokenAlias, out var endToken)) + if (TryMatchTokenAliases(str!, i, out var tokenType, out var tokenAlias, out var tokenBehaviour)) { increment = tokenAlias.Length; - if (endToken && tree.HasTokenInContext(tokenType)) - { - increment = tokenAlias.Length; - if (foundPlainText) - tree.EndToken(tokenType, input.Slice(plainTextStart, i - plainTextStart)); - else - { - tree.EndToken(tokenType); - tree.AddToken(TokenType.PlainText, input.Slice(i, tokenAlias.Length)); - } - foundPlainText = false; - } - else if (!endToken && !tree.HasTokenInContext(tokenType)) - { - if (tokenType != TokenType.Bold || !tree.HasTokenInContext(TokenType.Italic)) // двойное выделение не может быть внутри одинарного - { - if (foundPlainText) - tree.AddToken(TokenType.PlainText, input.Slice(plainTextStart, i - plainTextStart)); - tree.AddToken(tokenType, input.Slice(i, tokenAlias.Length)); - foundPlainText = false; - } - } - else - UpdatePlainTextState(ref foundPlainText, ref i, ref plainTextStart); + if (foundPlainText) + yield return new MdToken(MdTokenType.PlainText, MdTokenBehaviour.Undefined, + input.Slice(plainTextStart, i - plainTextStart)); + + yield return new MdToken(tokenType, tokenBehaviour, input.Slice(i, tokenAlias.Length)); + + foundPlainText = false; } else UpdatePlainTextState(ref foundPlainText, ref i, ref plainTextStart); @@ -57,10 +40,9 @@ public IAbstractSyntaxTree Tokenize(IAbstractSyntaxTree tr increment = 1; } - tree.EndToken(TokenType.Heading, - foundPlainText ? input.Slice(plainTextStart, str!.Length - plainTextStart) : null); - - return tree; + if (foundPlainText) + yield return new MdToken(MdTokenType.PlainText, MdTokenBehaviour.Undefined, + input.Slice(plainTextStart, str!.Length - plainTextStart)); } private void UpdatePlainTextState(ref bool foundPlainText, ref int index, ref int plainTextStart) @@ -73,18 +55,17 @@ private void UpdatePlainTextState(ref bool foundPlainText, ref int index, ref in private bool TryMatchTokenAliases( string input, int index, - IAbstractSyntaxTree tree, - out TokenType tokenType, + out MdTokenType mdTokenType, out string tokenAlias, - out bool endToken) + out MdTokenBehaviour behaviour) { var matchedEndToken = false; var mathcedStartToken = false; - var startTokenType = default(TokenType); - var endTokenType = default(TokenType); + var startTokenType = default(MdTokenType); + var endTokenType = default(MdTokenType); - endToken = true; + var endToken = true; if (TryMatchTokenAliases(input, index, endToken, out var endTokenAlias)) matchedEndToken = EndTokenAliases.TryGetValue(endTokenAlias, out endTokenType); @@ -92,59 +73,51 @@ private bool TryMatchTokenAliases( if (TryMatchTokenAliases(input, index, endToken, out var startTokenAlias) && EnsureNotInSeparatedWords(input, index, startTokenAlias)) mathcedStartToken = StartTokenAliases.TryGetValue(startTokenAlias, out startTokenType); - + + // это случается если элемент разметки находится внутри слова или стоит отдельно: "слово __ слово" if (matchedEndToken && mathcedStartToken) { if (startTokenAlias.Length > endTokenAlias.Length) { - endToken = false; + behaviour = MdTokenBehaviour.Opening; tokenAlias = startTokenAlias; - tokenType = startTokenType; + mdTokenType = startTokenType; return true; } - + if (startTokenAlias.Length == endTokenAlias.Length) { - // в этом случае startTokenType и endTokenType должны совпадать - if (tree.HasTokenInContext(startTokenType)) - { - endToken = true; - tokenAlias = endTokenAlias; - tokenType = endTokenType; - return true; - } - - endToken = false; + behaviour = MdTokenBehaviour.Undefined; tokenAlias = startTokenAlias; - tokenType = startTokenType; + mdTokenType = startTokenType; return true; } - endToken = true; + behaviour = MdTokenBehaviour.Closing; tokenAlias = endTokenAlias; - tokenType = endTokenType; + mdTokenType = endTokenType; return true; } if (matchedEndToken) { - endToken = true; + behaviour = MdTokenBehaviour.Closing; tokenAlias = endTokenAlias; - tokenType = endTokenType; + mdTokenType = endTokenType; return true; } if (mathcedStartToken) { - endToken = false; + behaviour = MdTokenBehaviour.Opening; tokenAlias = startTokenAlias; - tokenType = startTokenType; + mdTokenType = startTokenType; return true; } - endToken = default; - tokenAlias = default; - tokenType = default; + behaviour = MdTokenBehaviour.Undefined; + tokenAlias = String.Empty; + mdTokenType = MdTokenType.PlainText; return false; } @@ -226,8 +199,8 @@ private bool IsSurroundedByNumbers(string input, int index, string alias) private bool IsBoldOrItalicAlias(string alias, bool isEndToken) { var tokenAliases = isEndToken ? EndTokenAliases : StartTokenAliases; - return tokenAliases[alias] == TokenType.Bold - || tokenAliases[alias] == TokenType.Italic; + return tokenAliases[alias] == MdTokenType.Bold + || tokenAliases[alias] == MdTokenType.Italic; } private bool TryMatchPattern(string input, int index, string pattern) From 887266f6221a85df59c2a63ea4057c3affe5d917 Mon Sep 17 00:00:00 2001 From: Nikita Shevyrin Date: Sat, 7 Dec 2024 19:15:12 +0500 Subject: [PATCH 14/26] refactored the whole thing --- .../{IMdTests.cs => MdTests.cs} | 14 +- .../AbstractSyntaxTreeNodeView.cs | 6 + .../AbstractSyntaxTree/IAbstractSyntaxTree.cs | 13 +- .../MdAbstractSyntaxTree.cs | 134 ++++++++++++++ .../MdAbstractSyntaxTree_.cs | 4 +- Markdown/Markdown/{IMd.cs => IRenderer.cs} | 2 +- Markdown/Markdown/Md.cs | 74 +++++--- Markdown/Markdown/NodeView/BaseNodeView.cs | 3 + Markdown/Markdown/NodeView/ViewEnd.cs | 3 + Markdown/Markdown/ParseTree/IParseTree.cs | 11 ++ Markdown/Markdown/ParseTree/MdParseTree.cs | 71 ++++++++ .../Markdown/ParseTree/ParseTreeNodeView.cs | 9 + Markdown/Markdown/Parser/IParser.cs | 8 + Markdown/Markdown/Parser/MdParser.cs | 78 ++++++++ Markdown/Markdown/Parser/MdParser_.cs | 50 +++++ Markdown/Markdown/Program.cs | 2 +- Markdown/Markdown/SyntaxRules/ISyntaxRule.cs | 10 + Markdown/Markdown/Token/MdToken.cs | 3 + Markdown/Markdown/Token/MdTokenBehaviour.cs | 9 + Markdown/Markdown/Token/MdTokenType.cs | 8 +- Markdown/Markdown/Tokenizer/ITokenizer.cs | 7 +- Markdown/Markdown/Tokenizer/MdTokenizer.cs | 172 ++++++++++++++++++ Markdown/Markdown/Tokenizer/MdTokenizer_.cs | 2 +- Markdown/Markdown/Traversable/ITraversable.cs | 6 + 24 files changed, 649 insertions(+), 50 deletions(-) rename Markdown/Markdown.Tests/{IMdTests.cs => MdTests.cs} (93%) create mode 100644 Markdown/Markdown/AbstractSyntaxTree/AbstractSyntaxTreeNodeView.cs create mode 100644 Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree.cs rename Markdown/Markdown/{IMd.cs => IRenderer.cs} (70%) create mode 100644 Markdown/Markdown/NodeView/BaseNodeView.cs create mode 100644 Markdown/Markdown/NodeView/ViewEnd.cs create mode 100644 Markdown/Markdown/ParseTree/IParseTree.cs create mode 100644 Markdown/Markdown/ParseTree/MdParseTree.cs create mode 100644 Markdown/Markdown/ParseTree/ParseTreeNodeView.cs create mode 100644 Markdown/Markdown/Parser/IParser.cs create mode 100644 Markdown/Markdown/Parser/MdParser.cs create mode 100644 Markdown/Markdown/Parser/MdParser_.cs create mode 100644 Markdown/Markdown/SyntaxRules/ISyntaxRule.cs create mode 100644 Markdown/Markdown/Token/MdToken.cs create mode 100644 Markdown/Markdown/Token/MdTokenBehaviour.cs create mode 100644 Markdown/Markdown/Tokenizer/MdTokenizer.cs create mode 100644 Markdown/Markdown/Traversable/ITraversable.cs diff --git a/Markdown/Markdown.Tests/IMdTests.cs b/Markdown/Markdown.Tests/MdTests.cs similarity index 93% rename from Markdown/Markdown.Tests/IMdTests.cs rename to Markdown/Markdown.Tests/MdTests.cs index 7c8bdf26f..8cf927c1a 100644 --- a/Markdown/Markdown.Tests/IMdTests.cs +++ b/Markdown/Markdown.Tests/MdTests.cs @@ -7,10 +7,10 @@ namespace Markdown.Tests; [TestFixture] -[TestOf(typeof(IMd))] -public class IMdTests +[TestOf(typeof(Md))] +public class MdTests { - private IMd _imd; + private IRenderer _imd; [SetUp] public void SetUp() @@ -111,7 +111,6 @@ public void Render_ReturnsCorrectMarkdown_ForIncorrectlyPlacedUnderscores( .Be(expectedOutput); } - // Это пока не работает [Test] [TestCase("Intersecting _markdown __should_ work__ like this", "Intersecting _markdown __should_ work__ like this")] [TestCase("Another __example of _intersecting__ markdown_", "Another __example of _intersecting__ markdown_")] @@ -136,10 +135,9 @@ public void Render_ReturnsCorrectMarkdown_ForEmptyMarkdown( .Be(expectedOutput); } - // Это пока не работает [Test] - [TestCase("This should \\_not turn\\_ into tags", "This should _not turn into tags")] - [TestCase("This should \\\\remain the\\\\ same", "This should \\\\remain the\\\\ same")] + [TestCase(@"This should \_not turn\_ into tags", "This should _not turn_ into tags")] + [TestCase(@"This should \remain the\ same", @"This should \remain the\ same")] public void Render_ReturnsCorrectMarkdown_ForEscapeCharacters( string input, string expectedOutput) @@ -152,7 +150,7 @@ public void Render_ReturnsCorrectMarkdown_ForEscapeCharacters( [Test] public void Render_PerformanceTest() { - var fullStr = ArrangePerformanceTest("_Hello_ world_12. Hel_lo world_", 100000); + var fullStr = ArrangePerformanceTest("_Hello_ world_12. Hel_lo world_", 50000); Console.WriteLine($"Total length: {fullStr.Length}"); var totalTime = MeasureTime(fullStr); diff --git a/Markdown/Markdown/AbstractSyntaxTree/AbstractSyntaxTreeNodeView.cs b/Markdown/Markdown/AbstractSyntaxTree/AbstractSyntaxTreeNodeView.cs new file mode 100644 index 000000000..ec405e791 --- /dev/null +++ b/Markdown/Markdown/AbstractSyntaxTree/AbstractSyntaxTreeNodeView.cs @@ -0,0 +1,6 @@ +using Markdown.NodeView; + +namespace Markdown.AbstractSyntaxTree; + +public record AbstractSyntaxTreeNodeView(ReadOnlyMemory Text, TTokenType TokenType) + : BaseNodeView; \ No newline at end of file diff --git a/Markdown/Markdown/AbstractSyntaxTree/IAbstractSyntaxTree.cs b/Markdown/Markdown/AbstractSyntaxTree/IAbstractSyntaxTree.cs index c84cc8255..3482e1b4f 100644 --- a/Markdown/Markdown/AbstractSyntaxTree/IAbstractSyntaxTree.cs +++ b/Markdown/Markdown/AbstractSyntaxTree/IAbstractSyntaxTree.cs @@ -1,10 +1,11 @@ +using Markdown.NodeView; +using Markdown.SyntaxRules; +using Markdown.Traversable; + namespace Markdown.AbstractSyntaxTree; -public interface IAbstractSyntaxTree -where TTokenType : Enum +public interface IAbstractSyntaxTree : ITraversable> { - public void AddToken(TTokenType tokenType, ReadOnlyMemory tokenValue); - public bool HasTokenInContext(TTokenType tokenType); - public void EndToken(TTokenType tokenType, ReadOnlyMemory? tokenValue = null); - public string ToText(); + public IAbstractSyntaxTree AddRule(ISyntaxRule rule); + public IAbstractSyntaxTree ApplyRules(); } \ No newline at end of file diff --git a/Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree.cs b/Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree.cs new file mode 100644 index 000000000..70f8230f9 --- /dev/null +++ b/Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree.cs @@ -0,0 +1,134 @@ +using System.Collections.Immutable; +using Markdown.NodeView; +using Markdown.ParseTree; +using Markdown.SyntaxRules; +using Markdown.Token; + +namespace Markdown.AbstractSyntaxTree; + +public class MdAbstractSyntaxTree : IAbstractSyntaxTree +{ + private class Node + { + public Node( + MdTokenType type, + ReadOnlyMemory? text = null, + Node? parent = null) + { + Type = type; + Text = text ?? ReadOnlyMemory.Empty; + Parent = parent; + Children = new List(); + } + + public ReadOnlyMemory Text { get; } + public MdTokenType Type { get; } + public List Children { get; set; } + public Node? Parent { get; set; } + } + + private readonly Node _root; + private Node _current; + + private readonly ImmutableList> _rules; + + private MdAbstractSyntaxTree() + { + _root = new Node(MdTokenType.Document); + _current = _root; + _rules = ImmutableList>.Empty; + } + + private MdAbstractSyntaxTree( + MdAbstractSyntaxTree tree, ISyntaxRule rule) + { + _root = tree._root; + _current = _root; + _rules = tree._rules.Add(rule); + } + + public static MdAbstractSyntaxTree FromParseTree(IParseTree parseTree) + { + var syntaxTree = new MdAbstractSyntaxTree(); + foreach (var baseView in parseTree.Traverse()) + { + if (baseView is ParseTreeNodeView nodeView) + { + if (nodeView.TokenType != MdTokenType.Document) + { + if (nodeView.Complete) + { + var newNode = new Node(nodeView.TokenType, nodeView.Text, syntaxTree._current); + if (nodeView.TokenType != MdTokenType.PlainText) + syntaxTree.AddNode(newNode); + else + syntaxTree._current.Children.Add(newNode); + } + else + { + var newNode = new Node(MdTokenType.PlainText, nodeView.Text, syntaxTree._current); + syntaxTree._current.Children.Add(newNode); + } + } + } + else if (baseView is ViewEnd) + syntaxTree.EndCurrentNode(); + else + throw new InvalidOperationException("Unexpected node type"); + } + return syntaxTree; + } + + private void AddNode(Node node) + { + _current.Children.Add(node); + _current = node; + } + + private void EndCurrentNode() + { + if (_current != _root) + _current = _current.Parent!; + } + + public IEnumerable> Traverse() + { + return Traverse(_root); + } + + private static IEnumerable> Traverse(Node node) + { + yield return new AbstractSyntaxTreeNodeView(node.Text, node.Type); + var childNodes = node.Children.SelectMany(Traverse).ToList(); + foreach (var childNode in childNodes) + yield return childNode; + if (childNodes.Count > 0) + yield return new ViewEnd(node.Type); + } + + public IAbstractSyntaxTree AddRule(ISyntaxRule rule) + { + return new MdAbstractSyntaxTree(this, rule); + } + + public IAbstractSyntaxTree ApplyRules() + { + var newSyntaxTree = new MdAbstractSyntaxTree(); + foreach (var baseView in Traverse()) + { + if (baseView is AbstractSyntaxTreeNodeView nodeView + && nodeView.TokenType != MdTokenType.Document) + { + var parentNodeView = new AbstractSyntaxTreeNodeView( + newSyntaxTree._current.Text, newSyntaxTree._current.Type); + foreach (var rule in _rules) + { + var result = rule.Apply(nodeView, parentNodeView); + + } + } + } + + return newSyntaxTree; + } +} \ No newline at end of file diff --git a/Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree_.cs b/Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree_.cs index 5ff02a7cb..0d56a2f38 100644 --- a/Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree_.cs +++ b/Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree_.cs @@ -4,7 +4,7 @@ namespace Markdown.AbstractSyntaxTree; -public class MdAbstractSyntaxTree +public class MdAbstractSyntaxTree_ { public class Node { @@ -43,7 +43,7 @@ public IEnumerable RemoveChildren() private readonly Node _root; private Node _current; - public MdAbstractSyntaxTree(ReadOnlyDictionary tokenTags) + public MdAbstractSyntaxTree_(ReadOnlyDictionary tokenTags) { _tokenTags = tokenTags; _root = new Node(); diff --git a/Markdown/Markdown/IMd.cs b/Markdown/Markdown/IRenderer.cs similarity index 70% rename from Markdown/Markdown/IMd.cs rename to Markdown/Markdown/IRenderer.cs index c77626d39..0314b9797 100644 --- a/Markdown/Markdown/IMd.cs +++ b/Markdown/Markdown/IRenderer.cs @@ -1,6 +1,6 @@ namespace Markdown; -public interface IMd +public interface IRenderer { public string Render(string input); } \ No newline at end of file diff --git a/Markdown/Markdown/Md.cs b/Markdown/Markdown/Md.cs index 9bad90d2e..0684deac1 100644 --- a/Markdown/Markdown/Md.cs +++ b/Markdown/Markdown/Md.cs @@ -1,40 +1,66 @@ -using System.Collections.ObjectModel; +using System.Text; using Markdown.AbstractSyntaxTree; +using Markdown.NodeView; +using Markdown.Parser; +using Markdown.ParseTree; +using Markdown.Token; using Markdown.Tokenizer; namespace Markdown; -public class Md : IMd +public class Md : IRenderer { - private readonly ReadOnlyDictionary _tokenAliases; - private readonly ReadOnlyDictionary _endTokenAliases; - private readonly ReadOnlyDictionary _tokenTags; + private readonly Dictionary _tokenTags; + + private readonly MdTokenizer _tokenizer; + private readonly MdParser _parser; public Md() { - var tokenAliases = new Dictionary(); - tokenAliases.Add("_", TokenType.Italic); - tokenAliases.Add("__", TokenType.Bold); - tokenAliases.Add("# ", TokenType.Heading); - _tokenAliases = tokenAliases.AsReadOnly(); - - var endTokenAliases = new Dictionary(); - endTokenAliases.Add("_", TokenType.Italic); - endTokenAliases.Add("__", TokenType.Bold); - endTokenAliases.Add("\n", TokenType.Heading); - _endTokenAliases = endTokenAliases.AsReadOnly(); + var tokenAliases = new Dictionary(); + tokenAliases.Add("_", MdTokenType.Italic); + tokenAliases.Add("__", MdTokenType.Bold); + tokenAliases.Add("# ", MdTokenType.Heading); + tokenAliases.Add("\n", MdTokenType.Line); - var tokenTags = new Dictionary(); - tokenTags.Add(TokenType.Italic, "em"); - tokenTags.Add(TokenType.Bold, "strong"); - tokenTags.Add(TokenType.Heading, "h1"); - _tokenTags = tokenTags.AsReadOnly(); + _tokenTags = new Dictionary(); + _tokenTags.Add(MdTokenType.Italic, "em"); + _tokenTags.Add(MdTokenType.Bold, "strong"); + _tokenTags.Add(MdTokenType.Heading, "h1"); + + _tokenizer = new MdTokenizer(tokenAliases, '\\'); + _parser = new MdParser(new MdParseTree()); } public string Render(string input) { - var tokenizer = new MdTokenizer(_tokenAliases, _endTokenAliases); - var syntaxTree = tokenizer.Tokenize(new MdAbstractSyntaxTree(_tokenTags), input.AsMemory()); - return syntaxTree.ToText(); + var tokens = _tokenizer.Tokenize(input.AsMemory()); + var parseTree = _parser.Parse(tokens); + var syntaxTree = MdAbstractSyntaxTree.FromParseTree(parseTree); + return syntaxTree + .Traverse() + .Aggregate(new StringBuilder(), + (sb, node) => ProcessNode(node, sb)) + .ToString(); + } + + private StringBuilder ProcessNode(BaseNodeView? node, StringBuilder sb) + { + if (node is AbstractSyntaxTreeNodeView nodeView) + { + if (nodeView.TokenType is MdTokenType.PlainText or MdTokenType.Document or MdTokenType.Line) + sb.Append(nodeView.Text); + else + sb.Append($"<{_tokenTags[nodeView.TokenType]}>"); + } + else if (node is ViewEnd viewEnd) + { + if (viewEnd.TokenType is not (MdTokenType.PlainText or MdTokenType.Document or MdTokenType.Line)) + { + sb.Append($""); + } + } + + return sb; } } \ No newline at end of file diff --git a/Markdown/Markdown/NodeView/BaseNodeView.cs b/Markdown/Markdown/NodeView/BaseNodeView.cs new file mode 100644 index 000000000..b8464af85 --- /dev/null +++ b/Markdown/Markdown/NodeView/BaseNodeView.cs @@ -0,0 +1,3 @@ +namespace Markdown.NodeView; + +public record BaseNodeView(); \ No newline at end of file diff --git a/Markdown/Markdown/NodeView/ViewEnd.cs b/Markdown/Markdown/NodeView/ViewEnd.cs new file mode 100644 index 000000000..154680f22 --- /dev/null +++ b/Markdown/Markdown/NodeView/ViewEnd.cs @@ -0,0 +1,3 @@ +namespace Markdown.NodeView; + +public record ViewEnd(TTokenType TokenType) : BaseNodeView; \ No newline at end of file diff --git a/Markdown/Markdown/ParseTree/IParseTree.cs b/Markdown/Markdown/ParseTree/IParseTree.cs new file mode 100644 index 000000000..dab54aecc --- /dev/null +++ b/Markdown/Markdown/ParseTree/IParseTree.cs @@ -0,0 +1,11 @@ +using Markdown.NodeView; +using Markdown.Traversable; + +namespace Markdown.ParseTree; + +public interface IParseTree : ITraversable> +{ + public ParseTreeNodeView CurrentToken { get; } + public void OpenToken(TTokenType tokenType, ReadOnlyMemory text); + public void CloseCurrentToken(bool complete); +} \ No newline at end of file diff --git a/Markdown/Markdown/ParseTree/MdParseTree.cs b/Markdown/Markdown/ParseTree/MdParseTree.cs new file mode 100644 index 000000000..0b424d4ab --- /dev/null +++ b/Markdown/Markdown/ParseTree/MdParseTree.cs @@ -0,0 +1,71 @@ +using Markdown.NodeView; +using Markdown.Token; + +namespace Markdown.ParseTree; + +public class MdParseTree : IParseTree +{ + private class Node + { + public Node(MdTokenType type, + bool complete = false, + ReadOnlyMemory? text = null, + Node? parent = null + ) + { + Type = type; + Children = new List(); + Complete = complete; + Text = text ?? ReadOnlyMemory.Empty; + Parent = parent; + } + + public ReadOnlyMemory Text { get; set; } + public MdTokenType Type { get; set; } + public bool Complete { get; set; } + public List Children { get; set; } + public Node? Parent { get; set; } + } + + public MdParseTree() + { + _root = new Node(MdTokenType.Document, true); + _current = _root; + } + + private readonly Node _root; + private Node _current; + + public ParseTreeNodeView CurrentToken => + new(_current.Text, _current.Type, _current.Children.Count == 0, _current.Complete); + + public void OpenToken(MdTokenType tokenType, ReadOnlyMemory text) + { + var newNode = new Node(tokenType, false, text, _current); + _current.Children.Add(newNode); + _current = newNode; + } + + public void CloseCurrentToken(bool complete) + { + if (_current == _root) + throw new InvalidOperationException("Cannot call CloseCurrentToken when on root node"); + _current.Complete = complete; + _current = _current.Parent!; + } + + public IEnumerable> Traverse() + { + return Traverse(_root); + } + + private static IEnumerable> Traverse(Node node) + { + yield return new ParseTreeNodeView(node.Text, node.Type, node.Children.Count == 0, node.Complete); + var childNodes = node.Children.SelectMany(Traverse).ToList(); + foreach (var childNode in childNodes) + yield return childNode; + if (childNodes.Count > 0) + yield return new ViewEnd(node.Type); + } +} \ No newline at end of file diff --git a/Markdown/Markdown/ParseTree/ParseTreeNodeView.cs b/Markdown/Markdown/ParseTree/ParseTreeNodeView.cs new file mode 100644 index 000000000..061370887 --- /dev/null +++ b/Markdown/Markdown/ParseTree/ParseTreeNodeView.cs @@ -0,0 +1,9 @@ +using Markdown.NodeView; + +namespace Markdown.ParseTree; + +public record ParseTreeNodeView( + ReadOnlyMemory Text, + TTokenType TokenType, + bool Empty, + bool Complete) : BaseNodeView; \ No newline at end of file diff --git a/Markdown/Markdown/Parser/IParser.cs b/Markdown/Markdown/Parser/IParser.cs new file mode 100644 index 000000000..fba975904 --- /dev/null +++ b/Markdown/Markdown/Parser/IParser.cs @@ -0,0 +1,8 @@ +using Markdown.ParseTree; + +namespace Markdown.Parser; + +public interface IParser +{ + public IParseTree Parse(IEnumerable tokens); +} \ No newline at end of file diff --git a/Markdown/Markdown/Parser/MdParser.cs b/Markdown/Markdown/Parser/MdParser.cs new file mode 100644 index 000000000..2e5cfb0e4 --- /dev/null +++ b/Markdown/Markdown/Parser/MdParser.cs @@ -0,0 +1,78 @@ +using Markdown.ParseTree; +using Markdown.Token; + +namespace Markdown.Parser; + +public class MdParser(IParseTree parseTree) : IParser +{ + public IParseTree Parse(IEnumerable tokens) + { + parseTree.OpenToken(MdTokenType.Line, ReadOnlyMemory.Empty); + foreach (var token in tokens) + { + if (token.Type == MdTokenType.Heading) + { + if (parseTree.CurrentToken is { TokenType: MdTokenType.Line, Empty: true }) + { + parseTree.OpenToken(token.Type, token.Text); + } + else + { + parseTree.OpenToken(token.Type, token.Text); + parseTree.CloseCurrentToken(false); + } + } + else if (token.Type == MdTokenType.Line) + { + while (parseTree.CurrentToken.TokenType != MdTokenType.Document) + parseTree.CloseCurrentToken( + parseTree.CurrentToken.TokenType is MdTokenType.Heading or MdTokenType.Line); + parseTree.OpenToken(token.Type, token.Text); + } + else if (token.Behaviour == MdTokenBehaviour.Opening + && parseTree.CurrentToken.TokenType != token.Type) + { + parseTree.OpenToken(token.Type, token.Text); + } + else if (token.Behaviour == MdTokenBehaviour.Closing + && parseTree.CurrentToken.TokenType == token.Type) + { + if (parseTree.CurrentToken.Empty) + { + parseTree.CloseCurrentToken(false); + parseTree.OpenToken(token.Type, token.Text); + parseTree.CloseCurrentToken(false); + } + else + parseTree.CloseCurrentToken(true); + } + else if (token.Type == MdTokenType.PlainText) + { + parseTree.OpenToken(token.Type, token.Text); + parseTree.CloseCurrentToken(true); + } + else if (token.Behaviour == MdTokenBehaviour.InsideAWord) + { + if (parseTree.CurrentToken.TokenType == token.Type) + parseTree.CloseCurrentToken(true); + else + parseTree.OpenToken(token.Type, token.Text); + } + else + { + if (token.Behaviour == MdTokenBehaviour.Closing + && parseTree.CurrentToken.TokenType != MdTokenType.Document + && parseTree.CurrentToken.TokenType != MdTokenType.Line) + parseTree.CloseCurrentToken(false); + parseTree.OpenToken(token.Type, token.Text); + parseTree.CloseCurrentToken(false); + } + } + + while (parseTree.CurrentToken.TokenType != MdTokenType.Document) + parseTree.CloseCurrentToken( + parseTree.CurrentToken.TokenType is MdTokenType.Heading or MdTokenType.Line); + + return parseTree; + } +} \ No newline at end of file diff --git a/Markdown/Markdown/Parser/MdParser_.cs b/Markdown/Markdown/Parser/MdParser_.cs new file mode 100644 index 000000000..8affc1eab --- /dev/null +++ b/Markdown/Markdown/Parser/MdParser_.cs @@ -0,0 +1,50 @@ +using Markdown.AbstractSyntaxTree; +using Markdown.Token; + +namespace Markdown.Parser; + +public class MdParser_ +{ + public MdAbstractSyntaxTree_ Parse(IEnumerable tokens, MdAbstractSyntaxTree_ syntaxTree) + { + foreach (var token in tokens) + { + if (token.Behaviour == MdTokenBehaviour.Closing && syntaxTree.HasTokenInContext(token.Type)) + { + if (syntaxTree.GetChildrenForCurrentToken().Count == 0) + { + syntaxTree.EndToken(token.Type); + syntaxTree.AddToken(MdTokenType.PlainText, token.Text); + } + else + syntaxTree.EndToken(token.Type); + } + else if (token.Behaviour == MdTokenBehaviour.Opening && !syntaxTree.HasTokenInContext(token.Type)) + { + syntaxTree.AddToken(token.Type, token.Text); + } + else if (token.Behaviour == MdTokenBehaviour.Closing && !syntaxTree.HasTokenInContext(token.Type)) + { + // TODO intersecting markdown + syntaxTree.EndToken(); + syntaxTree.AddToken(MdTokenType.PlainText, token.Text); + } + else if (token.Behaviour == MdTokenBehaviour.Opening && syntaxTree.HasTokenInContext(token.Type)) + { + syntaxTree.AddToken(MdTokenType.PlainText, token.Text); + } + else if (token.Type == MdTokenType.PlainText || token.Behaviour == MdTokenBehaviour.Undefined) + { + syntaxTree.AddToken(MdTokenType.PlainText, token.Text); + } + else + { + throw new ArgumentException( + $"Unexpected token of type {token.Type}, with behaviour {token.Behaviour}, " + + $"with text: {token.Text}"); + } + } + + return syntaxTree; + } +} \ No newline at end of file diff --git a/Markdown/Markdown/Program.cs b/Markdown/Markdown/Program.cs index 21b7b9395..835324e1d 100644 --- a/Markdown/Markdown/Program.cs +++ b/Markdown/Markdown/Program.cs @@ -1,4 +1,4 @@ using Markdown; var md = new Md(); -Console.WriteLine(md.Render("# Hello World! _some words_ in italics\n __some other text__")); \ No newline at end of file +Console.WriteLine(md.Render("# Hello World! _some words_ in italics\n__some other text__")); \ No newline at end of file diff --git a/Markdown/Markdown/SyntaxRules/ISyntaxRule.cs b/Markdown/Markdown/SyntaxRules/ISyntaxRule.cs new file mode 100644 index 000000000..ecff61679 --- /dev/null +++ b/Markdown/Markdown/SyntaxRules/ISyntaxRule.cs @@ -0,0 +1,10 @@ +using Markdown.AbstractSyntaxTree; + +namespace Markdown.SyntaxRules; + +public interface ISyntaxRule +{ + public AbstractSyntaxTreeNodeView Apply( + AbstractSyntaxTreeNodeView node, + AbstractSyntaxTreeNodeView parentNode); +} \ No newline at end of file diff --git a/Markdown/Markdown/Token/MdToken.cs b/Markdown/Markdown/Token/MdToken.cs new file mode 100644 index 000000000..2c557cd27 --- /dev/null +++ b/Markdown/Markdown/Token/MdToken.cs @@ -0,0 +1,3 @@ +namespace Markdown.Token; + +public record MdToken(MdTokenType Type, MdTokenBehaviour Behaviour, ReadOnlyMemory Text); \ No newline at end of file diff --git a/Markdown/Markdown/Token/MdTokenBehaviour.cs b/Markdown/Markdown/Token/MdTokenBehaviour.cs new file mode 100644 index 000000000..77c85783b --- /dev/null +++ b/Markdown/Markdown/Token/MdTokenBehaviour.cs @@ -0,0 +1,9 @@ +namespace Markdown.Token; + +public enum MdTokenBehaviour +{ + Opening, + Closing, + InsideAWord, + Undefined, +} \ No newline at end of file diff --git a/Markdown/Markdown/Token/MdTokenType.cs b/Markdown/Markdown/Token/MdTokenType.cs index 39c6cbddd..2a1af971b 100644 --- a/Markdown/Markdown/Token/MdTokenType.cs +++ b/Markdown/Markdown/Token/MdTokenType.cs @@ -1,9 +1,13 @@ -namespace Markdown; +namespace Markdown.Token; public enum MdTokenType { PlainText, + Document, + Line, Italic, Bold, - Heading + Heading, + UnorderedList, + UnorderedListItem, } \ No newline at end of file diff --git a/Markdown/Markdown/Tokenizer/ITokenizer.cs b/Markdown/Markdown/Tokenizer/ITokenizer.cs index 721ac5893..4f72a6ea0 100644 --- a/Markdown/Markdown/Tokenizer/ITokenizer.cs +++ b/Markdown/Markdown/Tokenizer/ITokenizer.cs @@ -1,9 +1,6 @@ -using Markdown.AbstractSyntaxTree; - namespace Markdown.Tokenizer; -public interface ITokenizer -where TTokenType : Enum +public interface ITokenizer { - public IAbstractSyntaxTree Tokenize(IAbstractSyntaxTree tree, ReadOnlyMemory input); + public IEnumerable Tokenize(ReadOnlyMemory input); } \ No newline at end of file diff --git a/Markdown/Markdown/Tokenizer/MdTokenizer.cs b/Markdown/Markdown/Tokenizer/MdTokenizer.cs new file mode 100644 index 000000000..7eff4a465 --- /dev/null +++ b/Markdown/Markdown/Tokenizer/MdTokenizer.cs @@ -0,0 +1,172 @@ +using System.Runtime.InteropServices; +using Markdown.Token; + +namespace Markdown.Tokenizer; + +public class MdTokenizer(Dictionary tokenAliases, char escapeCharacter) : ITokenizer +{ + public IEnumerable Tokenize(ReadOnlyMemory input) + { + ArgumentExceptionHelpers.ThrowIfFalse( + MemoryMarshal.TryGetString(input, out var str, out var start, out var length), + "Underlying object in the input argument is not a string"); + + var foundPlainText = false; + var plainTextStart = 0; + var increment = 1; + for (var i = start; i < length; ) + { + if (escapeCharacter == str![i] && i + 1 < str.Length) + { + if (TryMatchTokenAliases(str, i + 1, out _, out _, out _)) + { + increment = 2; + yield return new MdToken(MdTokenType.PlainText, MdTokenBehaviour.Undefined, + input.Slice(plainTextStart, i - plainTextStart)); + yield return new MdToken(MdTokenType.PlainText, MdTokenBehaviour.Undefined, input.Slice(i + 1, 1)); + foundPlainText = false; + } + } + else if (TryMatchTokenAliases(str, i, out var tokenType, out var tokenAlias, out var tokenBehaviour)) + { + increment = tokenAlias.Length; + + if (foundPlainText) + yield return new MdToken(MdTokenType.PlainText, MdTokenBehaviour.Undefined, + input.Slice(plainTextStart, i - plainTextStart)); + + yield return new MdToken(tokenType, tokenBehaviour, input.Slice(i, tokenAlias.Length)); + + foundPlainText = false; + } + else + { + if (!foundPlainText) + plainTextStart = i; + foundPlainText = true; + } + + i += increment; + if (increment > 1) + increment = 1; + } + + if (foundPlainText) + yield return new MdToken(MdTokenType.PlainText, MdTokenBehaviour.Undefined, + input.Slice(plainTextStart, str!.Length - plainTextStart)); + } + + private bool TryMatchTokenAliases( + string input, + int index, + out MdTokenType mdTokenType, + out string tokenAlias, + out MdTokenBehaviour tokenBehaviour) + { + var matchedClosingToken = false; + var mathcedOpeningToken = false; + + var openingTokenType = default(MdTokenType); + var closingTokenType = default(MdTokenType); + + if (TryMatchTokenAliases(input, index, true, out var closingTokenAlias)) + matchedClosingToken = tokenAliases.TryGetValue(closingTokenAlias, out closingTokenType); + + if (TryMatchTokenAliases(input, index, false, out var openingTokenAlias)) + mathcedOpeningToken = tokenAliases.TryGetValue(openingTokenAlias, out openingTokenType); + + if (mathcedOpeningToken && matchedClosingToken) + { + var (alias, type, behaviour) = openingTokenAlias.Length > closingTokenAlias.Length + ? (openingTokenAlias, openingTokenType, MdTokenBehaviour.Opening) + : (closingTokenAlias, closingTokenType, MdTokenBehaviour.Closing); + + if (IsInsideAWord(input, index, alias)) + tokenBehaviour = MdTokenBehaviour.InsideAWord; + else + tokenBehaviour = behaviour; + tokenAlias = alias; + mdTokenType = type; + return true; + } + + if (mathcedOpeningToken) + { + tokenBehaviour = MdTokenBehaviour.Opening; + tokenAlias = openingTokenAlias; + mdTokenType = openingTokenType; + return true; + } + + if (matchedClosingToken) + { + tokenBehaviour = MdTokenBehaviour.Closing; + tokenAlias = closingTokenAlias; + mdTokenType = closingTokenType; + return true; + } + + tokenBehaviour = MdTokenBehaviour.Undefined; + tokenAlias = String.Empty; + mdTokenType = MdTokenType.PlainText; + return false; + } + + private bool TryMatchTokenAliases( + string input, + int index, + bool endToken, + out string tokenAlias) + { + tokenAlias = String.Empty; + foreach (var alias in tokenAliases.Keys) + { + if (TryMatchAlias(input, index, alias, endToken) && tokenAlias.Length < alias.Length) + tokenAlias = alias; + } + + return !String.IsNullOrEmpty(tokenAlias); + } + + private bool TryMatchAlias(string input, int index, string alias, bool isEndToken) + { + return TryMatchPattern(input, index, alias) + && HasANonDelimiterCharacterNearIt(input, index, alias, isEndToken); + } + + private bool IsWordDelimiter(char c) + { + return c is ' ' or '\t' or '\n' or '\r' or ',' or '.' + or '!' or '?'; + } + + private bool IsInsideAWord(string input, int index, string alias) + { + var trueForLeftEdge = index - 1 >= 0 && !IsWordDelimiter(input[index - 1]); + var trueForRightEdge = index + alias.Length < input.Length + && !IsWordDelimiter(input[index + alias.Length]); + return trueForLeftEdge && trueForRightEdge; + } + + private bool HasANonDelimiterCharacterNearIt(string input, int index, string alias, bool closingToken) + { + var trueForOpening = index + alias.Length < input.Length + && !IsWordDelimiter(input[index + alias.Length]); + var trueForClosing = index - 1 >= 0 + && !IsWordDelimiter(input[index - 1]); + return closingToken ? trueForClosing : trueForOpening; + } + + private bool TryMatchPattern(string input, int index, string pattern) + { + int i = 0; + foreach (var ch in pattern) + { + if (index + i >= input.Length || ch != input[index + i]) + return false; + i++; + } + + return true; + } +} \ No newline at end of file diff --git a/Markdown/Markdown/Tokenizer/MdTokenizer_.cs b/Markdown/Markdown/Tokenizer/MdTokenizer_.cs index 7a56ed175..fbe1b1927 100644 --- a/Markdown/Markdown/Tokenizer/MdTokenizer_.cs +++ b/Markdown/Markdown/Tokenizer/MdTokenizer_.cs @@ -4,7 +4,7 @@ namespace Markdown.Tokenizer; -public class MdTokenizer( +public class MdTokenizer_( ReadOnlyDictionary StartTokenAliases, ReadOnlyDictionary EndTokenAliases ) diff --git a/Markdown/Markdown/Traversable/ITraversable.cs b/Markdown/Markdown/Traversable/ITraversable.cs new file mode 100644 index 000000000..f7730f071 --- /dev/null +++ b/Markdown/Markdown/Traversable/ITraversable.cs @@ -0,0 +1,6 @@ +namespace Markdown.Traversable; + +public interface ITraversable +{ + public IEnumerable Traverse(); +} \ No newline at end of file From 5be7d4614d18fb92477f39d7adac7485abdc635e Mon Sep 17 00:00:00 2001 From: Nikita Shevyrin Date: Sat, 7 Dec 2024 21:36:01 +0500 Subject: [PATCH 15/26] added nesting syntax rule --- Markdown/Markdown.Tests/MdTests.cs | 42 +++++++++---- .../MdAbstractSyntaxTree.cs | 61 ++++++++++++++----- Markdown/Markdown/Md.cs | 43 +++++-------- Markdown/Markdown/Program.cs | 17 +++++- Markdown/Markdown/SyntaxRules/ISyntaxRule.cs | 6 +- Markdown/Markdown/SyntaxRules/NestingRule.cs | 19 ++++++ 6 files changed, 128 insertions(+), 60 deletions(-) create mode 100644 Markdown/Markdown/SyntaxRules/NestingRule.cs diff --git a/Markdown/Markdown.Tests/MdTests.cs b/Markdown/Markdown.Tests/MdTests.cs index 8cf927c1a..23d9684fe 100644 --- a/Markdown/Markdown.Tests/MdTests.cs +++ b/Markdown/Markdown.Tests/MdTests.cs @@ -1,7 +1,12 @@ using System; +using System.Collections.Generic; using System.Diagnostics; using System.Text; using FluentAssertions; +using Markdown.Parser; +using Markdown.ParseTree; +using Markdown.Token; +using Markdown.Tokenizer; using NUnit.Framework; namespace Markdown.Tests; @@ -10,12 +15,23 @@ namespace Markdown.Tests; [TestOf(typeof(Md))] public class MdTests { - private IRenderer _imd; + private IRenderer _md; [SetUp] public void SetUp() { - _imd = new Md(); + var tokenAliases = new Dictionary(); + tokenAliases.Add("_", MdTokenType.Italic); + tokenAliases.Add("__", MdTokenType.Bold); + tokenAliases.Add("# ", MdTokenType.Heading); + tokenAliases.Add("\n", MdTokenType.Line); + + var tokenTags = new Dictionary(); + tokenTags.Add(MdTokenType.Italic, "em"); + tokenTags.Add(MdTokenType.Bold, "strong"); + tokenTags.Add(MdTokenType.Heading, "h1"); + + _md = new Md(tokenTags, new MdTokenizer(tokenAliases, '\\'), new MdParser(new MdParseTree())); } [Test] @@ -27,7 +43,7 @@ public void Render_ReturnsCorrectMarkdown_ForSimpleCases( string input, string expectedOutput) { - _imd.Render(input) + _md.Render(input) .Should() .Be(expectedOutput); } @@ -39,7 +55,7 @@ public void Render_ReturnsCorrectMarkdown_ForCasesWithNesting( string input, string expectedOutput) { - _imd.Render(input) + _md.Render(input) .Should() .Be(expectedOutput); } @@ -53,7 +69,7 @@ public void Render_ReturnsCorrectMarkdown_ForTextWithNumbers( string input, string expectedOutput) { - _imd.Render(input) + _md.Render(input) .Should() .Be(expectedOutput); } @@ -69,7 +85,7 @@ public void Render_ReturnsCorrectMarkdown_ForPartsOfWords( string input, string expectedOutput) { - _imd.Render(input) + _md.Render(input) .Should() .Be(expectedOutput); } @@ -82,7 +98,7 @@ public void Render_ReturnsCorrectMarkdown_ForMarkdownInDifferentWords( string input, string expectedOutput) { - _imd.Render(input) + _md.Render(input) .Should() .Be(expectedOutput); } @@ -94,7 +110,7 @@ public void Render_ReturnsCorrectMarkdown_ForUnpairedMarkdownSymbols( string input, string expectedOutput) { - _imd.Render(input) + _md.Render(input) .Should() .Be(expectedOutput); } @@ -106,7 +122,7 @@ public void Render_ReturnsCorrectMarkdown_ForIncorrectlyPlacedUnderscores( string input, string expectedOutput) { - _imd.Render(input) + _md.Render(input) .Should() .Be(expectedOutput); } @@ -118,7 +134,7 @@ public void Render_ReturnsCorrectMarkdown_ForIntersectingMarkdown( string input, string expectedOutput) { - _imd.Render(input) + _md.Render(input) .Should() .Be(expectedOutput); } @@ -130,7 +146,7 @@ public void Render_ReturnsCorrectMarkdown_ForEmptyMarkdown( string input, string expectedOutput) { - _imd.Render(input) + _md.Render(input) .Should() .Be(expectedOutput); } @@ -142,7 +158,7 @@ public void Render_ReturnsCorrectMarkdown_ForEscapeCharacters( string input, string expectedOutput) { - _imd.Render(input) + _md.Render(input) .Should() .Be(expectedOutput); } @@ -165,7 +181,7 @@ private long MeasureTime(string fullStr) { var sw = new Stopwatch(); sw.Start(); - _imd.Render(fullStr); + _md.Render(fullStr); sw.Stop(); return sw.ElapsedMilliseconds; } diff --git a/Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree.cs b/Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree.cs index 70f8230f9..3dd52d58f 100644 --- a/Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree.cs +++ b/Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree.cs @@ -20,6 +20,13 @@ public Node( Parent = parent; Children = new List(); } + + public Node(AbstractSyntaxTreeNodeView nodeView) + { + Type = nodeView.TokenType; + Text = nodeView.Text; + Children = new List(); + } public ReadOnlyMemory Text { get; } public MdTokenType Type { get; } @@ -40,7 +47,8 @@ private MdAbstractSyntaxTree() } private MdAbstractSyntaxTree( - MdAbstractSyntaxTree tree, ISyntaxRule rule) + MdAbstractSyntaxTree tree, + ISyntaxRule rule) { _root = tree._root; _current = _root; @@ -114,21 +122,44 @@ public IAbstractSyntaxTree AddRule(ISyntaxRule rule) public IAbstractSyntaxTree ApplyRules() { var newSyntaxTree = new MdAbstractSyntaxTree(); - foreach (var baseView in Traverse()) + ProcessChildNodes(_root, newSyntaxTree); + return newSyntaxTree; + } + + private void ProcessChildNodes(Node node, MdAbstractSyntaxTree newSyntaxTree) + { + var parentNodeView = new AbstractSyntaxTreeNodeView(node.Text, node.Type); + + for (var i = 0; i < node.Children.Count; i++) { - if (baseView is AbstractSyntaxTreeNodeView nodeView - && nodeView.TokenType != MdTokenType.Document) - { - var parentNodeView = new AbstractSyntaxTreeNodeView( - newSyntaxTree._current.Text, newSyntaxTree._current.Type); - foreach (var rule in _rules) - { - var result = rule.Apply(nodeView, parentNodeView); - - } - } + var childNode = node.Children[i]; + var childNodeView = new AbstractSyntaxTreeNodeView(childNode.Text, childNode.Type); + var leftNeighbourView = i > 0 + ? new AbstractSyntaxTreeNodeView(node.Children[i - 1].Text, node.Children[i - 1].Type) + : null; + var rightNeighbourView = i < node.Children.Count - 1 + ? new AbstractSyntaxTreeNodeView(node.Children[i + 1].Text, node.Children[i + 1].Type) + : null; + var result = childNodeView; + var shouldCopy = false; + foreach (var rule in _rules) + (result, shouldCopy) = rule.Apply(result, parentNodeView, leftNeighbourView, rightNeighbourView); + + var newNode = new Node(result); + newSyntaxTree._current.Children.Add(newNode); + newNode.Parent = newSyntaxTree._current; + + if (newNode.Type != MdTokenType.PlainText && newNode.Type != MdTokenType.Document) + newSyntaxTree._current = newNode; + + if (childNode.Children.Count > 0) + ProcessChildNodes(childNode, newSyntaxTree); + + if (newNode.Parent.Type != MdTokenType.PlainText && newNode.Parent.Type != MdTokenType.Document) + newSyntaxTree._current = newNode.Parent; + + if (shouldCopy) + newSyntaxTree._current.Children.Add(newNode); } - - return newSyntaxTree; } } \ No newline at end of file diff --git a/Markdown/Markdown/Md.cs b/Markdown/Markdown/Md.cs index 0684deac1..8509cb1a6 100644 --- a/Markdown/Markdown/Md.cs +++ b/Markdown/Markdown/Md.cs @@ -2,41 +2,26 @@ using Markdown.AbstractSyntaxTree; using Markdown.NodeView; using Markdown.Parser; -using Markdown.ParseTree; +using Markdown.SyntaxRules; using Markdown.Token; using Markdown.Tokenizer; namespace Markdown; -public class Md : IRenderer +public class Md( + Dictionary tokenTags, + ITokenizer tokenizer, + IParser parser) : IRenderer { - private readonly Dictionary _tokenTags; - - private readonly MdTokenizer _tokenizer; - private readonly MdParser _parser; - - public Md() - { - var tokenAliases = new Dictionary(); - tokenAliases.Add("_", MdTokenType.Italic); - tokenAliases.Add("__", MdTokenType.Bold); - tokenAliases.Add("# ", MdTokenType.Heading); - tokenAliases.Add("\n", MdTokenType.Line); - - _tokenTags = new Dictionary(); - _tokenTags.Add(MdTokenType.Italic, "em"); - _tokenTags.Add(MdTokenType.Bold, "strong"); - _tokenTags.Add(MdTokenType.Heading, "h1"); - - _tokenizer = new MdTokenizer(tokenAliases, '\\'); - _parser = new MdParser(new MdParseTree()); - } - public string Render(string input) { - var tokens = _tokenizer.Tokenize(input.AsMemory()); - var parseTree = _parser.Parse(tokens); - var syntaxTree = MdAbstractSyntaxTree.FromParseTree(parseTree); + var tokens = tokenizer.Tokenize(input.AsMemory()); + var parseTree = parser.Parse(tokens); + var syntaxTree = MdAbstractSyntaxTree + .FromParseTree(parseTree) + .AddRule(new NestingRule()) + .ApplyRules(); + return syntaxTree .Traverse() .Aggregate(new StringBuilder(), @@ -51,13 +36,13 @@ private StringBuilder ProcessNode(BaseNodeView? node, StringBuilder if (nodeView.TokenType is MdTokenType.PlainText or MdTokenType.Document or MdTokenType.Line) sb.Append(nodeView.Text); else - sb.Append($"<{_tokenTags[nodeView.TokenType]}>"); + sb.Append($"<{tokenTags[nodeView.TokenType]}>"); } else if (node is ViewEnd viewEnd) { if (viewEnd.TokenType is not (MdTokenType.PlainText or MdTokenType.Document or MdTokenType.Line)) { - sb.Append($""); + sb.Append($""); } } diff --git a/Markdown/Markdown/Program.cs b/Markdown/Markdown/Program.cs index 835324e1d..d70a86f7a 100644 --- a/Markdown/Markdown/Program.cs +++ b/Markdown/Markdown/Program.cs @@ -1,4 +1,19 @@ using Markdown; +using Markdown.Parser; +using Markdown.ParseTree; +using Markdown.Token; +using Markdown.Tokenizer; -var md = new Md(); +var tokenAliases = new Dictionary(); +tokenAliases.Add("_", MdTokenType.Italic); +tokenAliases.Add("__", MdTokenType.Bold); +tokenAliases.Add("# ", MdTokenType.Heading); +tokenAliases.Add("\n", MdTokenType.Line); + +var tokenTags = new Dictionary(); +tokenTags.Add(MdTokenType.Italic, "em"); +tokenTags.Add(MdTokenType.Bold, "strong"); +tokenTags.Add(MdTokenType.Heading, "h1"); + +var md = new Md(tokenTags, new MdTokenizer(tokenAliases, '\\'), new MdParser(new MdParseTree())); Console.WriteLine(md.Render("# Hello World! _some words_ in italics\n__some other text__")); \ No newline at end of file diff --git a/Markdown/Markdown/SyntaxRules/ISyntaxRule.cs b/Markdown/Markdown/SyntaxRules/ISyntaxRule.cs index ecff61679..1b620a143 100644 --- a/Markdown/Markdown/SyntaxRules/ISyntaxRule.cs +++ b/Markdown/Markdown/SyntaxRules/ISyntaxRule.cs @@ -4,7 +4,9 @@ namespace Markdown.SyntaxRules; public interface ISyntaxRule { - public AbstractSyntaxTreeNodeView Apply( + public (AbstractSyntaxTreeNodeView result, bool shouldCopy) Apply( AbstractSyntaxTreeNodeView node, - AbstractSyntaxTreeNodeView parentNode); + AbstractSyntaxTreeNodeView parentNode, + AbstractSyntaxTreeNodeView? leftNeighbour, + AbstractSyntaxTreeNodeView? rightNeighbour); } \ No newline at end of file diff --git a/Markdown/Markdown/SyntaxRules/NestingRule.cs b/Markdown/Markdown/SyntaxRules/NestingRule.cs new file mode 100644 index 000000000..b74e3fdb1 --- /dev/null +++ b/Markdown/Markdown/SyntaxRules/NestingRule.cs @@ -0,0 +1,19 @@ +using Markdown.AbstractSyntaxTree; +using Markdown.Token; + +namespace Markdown.SyntaxRules; + +public class NestingRule : ISyntaxRule +{ + public (AbstractSyntaxTreeNodeView result, bool shouldCopy) Apply( + AbstractSyntaxTreeNodeView node, + AbstractSyntaxTreeNodeView parentNode, + AbstractSyntaxTreeNodeView? leftNeighbour, + AbstractSyntaxTreeNodeView? rightNeighbour) + { + if (node.TokenType == MdTokenType.Bold && parentNode.TokenType == MdTokenType.Italic) + return (new AbstractSyntaxTreeNodeView(node.Text, MdTokenType.PlainText), true); + + return (node, false); + } +} \ No newline at end of file From 90e83aa4ea331d57408b3ca78780cca285447a04 Mon Sep 17 00:00:00 2001 From: Nikita Shevyrin Date: Sun, 8 Dec 2024 13:27:56 +0500 Subject: [PATCH 16/26] refactored nesting rule --- .../AbstractSyntaxTreeNodeView.cs | 4 +- .../MdAbstractSyntaxTree.cs | 94 +++++-------------- Markdown/Markdown/NodeView/INodeView.cs | 10 ++ Markdown/Markdown/ParseTree/IParseTree.cs | 2 +- Markdown/Markdown/ParseTree/MdParseTree.cs | 12 ++- .../Markdown/ParseTree/ParseTreeNodeView.cs | 3 +- Markdown/Markdown/Parser/MdParser.cs | 2 +- Markdown/Markdown/SyntaxRules/ISyntaxRule.cs | 7 +- Markdown/Markdown/SyntaxRules/NestingRule.cs | 30 ++++-- 9 files changed, 73 insertions(+), 91 deletions(-) create mode 100644 Markdown/Markdown/NodeView/INodeView.cs diff --git a/Markdown/Markdown/AbstractSyntaxTree/AbstractSyntaxTreeNodeView.cs b/Markdown/Markdown/AbstractSyntaxTree/AbstractSyntaxTreeNodeView.cs index ec405e791..d39c4a2ad 100644 --- a/Markdown/Markdown/AbstractSyntaxTree/AbstractSyntaxTreeNodeView.cs +++ b/Markdown/Markdown/AbstractSyntaxTree/AbstractSyntaxTreeNodeView.cs @@ -2,5 +2,7 @@ namespace Markdown.AbstractSyntaxTree; -public record AbstractSyntaxTreeNodeView(ReadOnlyMemory Text, TTokenType TokenType) +public record AbstractSyntaxTreeNodeView( + ReadOnlyMemory Text, + TTokenType TokenType) : BaseNodeView; \ No newline at end of file diff --git a/Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree.cs b/Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree.cs index 3dd52d58f..78c8c87d5 100644 --- a/Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree.cs +++ b/Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree.cs @@ -8,36 +8,32 @@ namespace Markdown.AbstractSyntaxTree; public class MdAbstractSyntaxTree : IAbstractSyntaxTree { - private class Node + private class Node : INodeView { public Node( MdTokenType type, ReadOnlyMemory? text = null, - Node? parent = null) + Node? parent = null, + bool insideWord = false) { Type = type; Text = text ?? ReadOnlyMemory.Empty; Parent = parent; - Children = new List(); - } - - public Node(AbstractSyntaxTreeNodeView nodeView) - { - Type = nodeView.TokenType; - Text = nodeView.Text; - Children = new List(); + Children = new List>(); + InsideWord = insideWord; } - public ReadOnlyMemory Text { get; } - public MdTokenType Type { get; } - public List Children { get; set; } - public Node? Parent { get; set; } + public ReadOnlyMemory Text { get; set; } + public MdTokenType Type { get; set; } + public bool InsideWord { get; set; } + public List> Children { get; set; } + public INodeView? Parent { get; set; } } - private readonly Node _root; + private Node _root; private Node _current; - private readonly ImmutableList> _rules; + private ImmutableList> _rules; private MdAbstractSyntaxTree() { @@ -45,15 +41,6 @@ private MdAbstractSyntaxTree() _current = _root; _rules = ImmutableList>.Empty; } - - private MdAbstractSyntaxTree( - MdAbstractSyntaxTree tree, - ISyntaxRule rule) - { - _root = tree._root; - _current = _root; - _rules = tree._rules.Add(rule); - } public static MdAbstractSyntaxTree FromParseTree(IParseTree parseTree) { @@ -66,7 +53,8 @@ public static MdAbstractSyntaxTree FromParseTree(IParseTree parseTr { if (nodeView.Complete) { - var newNode = new Node(nodeView.TokenType, nodeView.Text, syntaxTree._current); + var newNode = new Node( + nodeView.TokenType, nodeView.Text, syntaxTree._current, nodeView.insideWord); if (nodeView.TokenType != MdTokenType.PlainText) syntaxTree.AddNode(newNode); else @@ -96,7 +84,7 @@ private void AddNode(Node node) private void EndCurrentNode() { if (_current != _root) - _current = _current.Parent!; + _current = (Node) _current.Parent!; } public IEnumerable> Traverse() @@ -104,7 +92,7 @@ public IEnumerable> Traverse() return Traverse(_root); } - private static IEnumerable> Traverse(Node node) + private static IEnumerable> Traverse(INodeView node) { yield return new AbstractSyntaxTreeNodeView(node.Text, node.Type); var childNodes = node.Children.SelectMany(Traverse).ToList(); @@ -116,50 +104,18 @@ private static IEnumerable> Traverse(Node node) public IAbstractSyntaxTree AddRule(ISyntaxRule rule) { - return new MdAbstractSyntaxTree(this, rule); + _rules = _rules.Add(rule); + return this; } public IAbstractSyntaxTree ApplyRules() { - var newSyntaxTree = new MdAbstractSyntaxTree(); - ProcessChildNodes(_root, newSyntaxTree); - return newSyntaxTree; - } - - private void ProcessChildNodes(Node node, MdAbstractSyntaxTree newSyntaxTree) - { - var parentNodeView = new AbstractSyntaxTreeNodeView(node.Text, node.Type); - - for (var i = 0; i < node.Children.Count; i++) - { - var childNode = node.Children[i]; - var childNodeView = new AbstractSyntaxTreeNodeView(childNode.Text, childNode.Type); - var leftNeighbourView = i > 0 - ? new AbstractSyntaxTreeNodeView(node.Children[i - 1].Text, node.Children[i - 1].Type) - : null; - var rightNeighbourView = i < node.Children.Count - 1 - ? new AbstractSyntaxTreeNodeView(node.Children[i + 1].Text, node.Children[i + 1].Type) - : null; - var result = childNodeView; - var shouldCopy = false; - foreach (var rule in _rules) - (result, shouldCopy) = rule.Apply(result, parentNodeView, leftNeighbourView, rightNeighbourView); - - var newNode = new Node(result); - newSyntaxTree._current.Children.Add(newNode); - newNode.Parent = newSyntaxTree._current; - - if (newNode.Type != MdTokenType.PlainText && newNode.Type != MdTokenType.Document) - newSyntaxTree._current = newNode; - - if (childNode.Children.Count > 0) - ProcessChildNodes(childNode, newSyntaxTree); - - if (newNode.Parent.Type != MdTokenType.PlainText && newNode.Parent.Type != MdTokenType.Document) - newSyntaxTree._current = newNode.Parent; - - if (shouldCopy) - newSyntaxTree._current.Children.Add(newNode); - } + INodeView syntaxTree = _root; + syntaxTree = _rules + .Aggregate(syntaxTree, + (current, rule) => rule.Apply(current)); + _root = (Node) syntaxTree; + _current = _root; + return this; } } \ No newline at end of file diff --git a/Markdown/Markdown/NodeView/INodeView.cs b/Markdown/Markdown/NodeView/INodeView.cs new file mode 100644 index 000000000..3840158ee --- /dev/null +++ b/Markdown/Markdown/NodeView/INodeView.cs @@ -0,0 +1,10 @@ +namespace Markdown.NodeView; + +public interface INodeView +{ + public ReadOnlyMemory Text { get; set; } + public TTokenType Type { get; set; } + public bool InsideWord { get; set; } + public List> Children { get; set; } + public INodeView? Parent { get; set; } +} \ No newline at end of file diff --git a/Markdown/Markdown/ParseTree/IParseTree.cs b/Markdown/Markdown/ParseTree/IParseTree.cs index dab54aecc..a34685935 100644 --- a/Markdown/Markdown/ParseTree/IParseTree.cs +++ b/Markdown/Markdown/ParseTree/IParseTree.cs @@ -6,6 +6,6 @@ namespace Markdown.ParseTree; public interface IParseTree : ITraversable> { public ParseTreeNodeView CurrentToken { get; } - public void OpenToken(TTokenType tokenType, ReadOnlyMemory text); + public void OpenToken(TTokenType tokenType, ReadOnlyMemory text, bool insideWord = false); public void CloseCurrentToken(bool complete); } \ No newline at end of file diff --git a/Markdown/Markdown/ParseTree/MdParseTree.cs b/Markdown/Markdown/ParseTree/MdParseTree.cs index 0b424d4ab..dff46b1df 100644 --- a/Markdown/Markdown/ParseTree/MdParseTree.cs +++ b/Markdown/Markdown/ParseTree/MdParseTree.cs @@ -9,6 +9,7 @@ private class Node { public Node(MdTokenType type, bool complete = false, + bool insideWord = false, ReadOnlyMemory? text = null, Node? parent = null ) @@ -16,6 +17,7 @@ public Node(MdTokenType type, Type = type; Children = new List(); Complete = complete; + InsideWord = insideWord; Text = text ?? ReadOnlyMemory.Empty; Parent = parent; } @@ -23,6 +25,7 @@ public Node(MdTokenType type, public ReadOnlyMemory Text { get; set; } public MdTokenType Type { get; set; } public bool Complete { get; set; } + public bool InsideWord { get; set; } public List Children { get; set; } public Node? Parent { get; set; } } @@ -37,11 +40,11 @@ public MdParseTree() private Node _current; public ParseTreeNodeView CurrentToken => - new(_current.Text, _current.Type, _current.Children.Count == 0, _current.Complete); + new(_current.Text, _current.Type, _current.Children.Count == 0, _current.Complete, _current.InsideWord); - public void OpenToken(MdTokenType tokenType, ReadOnlyMemory text) + public void OpenToken(MdTokenType tokenType, ReadOnlyMemory text, bool insideWord = false) { - var newNode = new Node(tokenType, false, text, _current); + var newNode = new Node(tokenType, false, insideWord, text, _current); _current.Children.Add(newNode); _current = newNode; } @@ -61,7 +64,8 @@ public IEnumerable> Traverse() private static IEnumerable> Traverse(Node node) { - yield return new ParseTreeNodeView(node.Text, node.Type, node.Children.Count == 0, node.Complete); + yield return new ParseTreeNodeView( + node.Text, node.Type, node.Children.Count == 0, node.Complete, node.InsideWord); var childNodes = node.Children.SelectMany(Traverse).ToList(); foreach (var childNode in childNodes) yield return childNode; diff --git a/Markdown/Markdown/ParseTree/ParseTreeNodeView.cs b/Markdown/Markdown/ParseTree/ParseTreeNodeView.cs index 061370887..01fd3d6fd 100644 --- a/Markdown/Markdown/ParseTree/ParseTreeNodeView.cs +++ b/Markdown/Markdown/ParseTree/ParseTreeNodeView.cs @@ -6,4 +6,5 @@ public record ParseTreeNodeView( ReadOnlyMemory Text, TTokenType TokenType, bool Empty, - bool Complete) : BaseNodeView; \ No newline at end of file + bool Complete, + bool insideWord) : BaseNodeView; \ No newline at end of file diff --git a/Markdown/Markdown/Parser/MdParser.cs b/Markdown/Markdown/Parser/MdParser.cs index 2e5cfb0e4..d9ee024fb 100644 --- a/Markdown/Markdown/Parser/MdParser.cs +++ b/Markdown/Markdown/Parser/MdParser.cs @@ -56,7 +56,7 @@ public IParseTree Parse(IEnumerable tokens) if (parseTree.CurrentToken.TokenType == token.Type) parseTree.CloseCurrentToken(true); else - parseTree.OpenToken(token.Type, token.Text); + parseTree.OpenToken(token.Type, token.Text, true); } else { diff --git a/Markdown/Markdown/SyntaxRules/ISyntaxRule.cs b/Markdown/Markdown/SyntaxRules/ISyntaxRule.cs index 1b620a143..7b6f7c47d 100644 --- a/Markdown/Markdown/SyntaxRules/ISyntaxRule.cs +++ b/Markdown/Markdown/SyntaxRules/ISyntaxRule.cs @@ -1,12 +1,9 @@ using Markdown.AbstractSyntaxTree; +using Markdown.NodeView; namespace Markdown.SyntaxRules; public interface ISyntaxRule { - public (AbstractSyntaxTreeNodeView result, bool shouldCopy) Apply( - AbstractSyntaxTreeNodeView node, - AbstractSyntaxTreeNodeView parentNode, - AbstractSyntaxTreeNodeView? leftNeighbour, - AbstractSyntaxTreeNodeView? rightNeighbour); + public INodeView Apply(INodeView nodeView); } \ No newline at end of file diff --git a/Markdown/Markdown/SyntaxRules/NestingRule.cs b/Markdown/Markdown/SyntaxRules/NestingRule.cs index b74e3fdb1..64364d179 100644 --- a/Markdown/Markdown/SyntaxRules/NestingRule.cs +++ b/Markdown/Markdown/SyntaxRules/NestingRule.cs @@ -1,19 +1,31 @@ -using Markdown.AbstractSyntaxTree; +using Markdown.NodeView; using Markdown.Token; namespace Markdown.SyntaxRules; public class NestingRule : ISyntaxRule { - public (AbstractSyntaxTreeNodeView result, bool shouldCopy) Apply( - AbstractSyntaxTreeNodeView node, - AbstractSyntaxTreeNodeView parentNode, - AbstractSyntaxTreeNodeView? leftNeighbour, - AbstractSyntaxTreeNodeView? rightNeighbour) + public INodeView Apply(INodeView nodeView) { - if (node.TokenType == MdTokenType.Bold && parentNode.TokenType == MdTokenType.Italic) - return (new AbstractSyntaxTreeNodeView(node.Text, MdTokenType.PlainText), true); + for (var i = 0; i < nodeView.Children.Count; i++) + { + var childNode = nodeView.Children[i]; + if (childNode.Type == MdTokenType.Bold && nodeView.Type == MdTokenType.Italic) + { + childNode.Type = MdTokenType.PlainText; + foreach (var toMove in childNode.Children) + { + nodeView.Children.Insert(i + 1, toMove); + toMove.Parent = nodeView; + } + + nodeView.Children.Insert(i + 1 + childNode.Children.Count, childNode); + childNode.Children.Clear(); + } + + Apply(childNode); + } - return (node, false); + return nodeView; } } \ No newline at end of file From f106f80dfadb37ab95f4358a7e6e8e52a73f891d Mon Sep 17 00:00:00 2001 From: Nikita Shevyrin Date: Sun, 8 Dec 2024 15:05:22 +0500 Subject: [PATCH 17/26] added new syntax rules and fixed potential bugs --- .../MdAbstractSyntaxTree.cs | 1 + .../Extensions/ReadOnlyMemoryExtensions.cs | 34 ++++++++++++++++++ Markdown/Markdown/Md.cs | 12 ++++--- Markdown/Markdown/Program.cs | 9 ++++- Markdown/Markdown/SyntaxRules/NestingRule.cs | 2 +- Markdown/Markdown/SyntaxRules/NumberRule.cs | 34 ++++++++++++++++++ .../SyntaxRules/TokensInDifferentWordsRule.cs | 35 +++++++++++++++++++ Markdown/Markdown/Tokenizer/MdTokenizer.cs | 2 +- 8 files changed, 121 insertions(+), 8 deletions(-) create mode 100644 Markdown/Markdown/Extensions/ReadOnlyMemoryExtensions.cs create mode 100644 Markdown/Markdown/SyntaxRules/NumberRule.cs create mode 100644 Markdown/Markdown/SyntaxRules/TokensInDifferentWordsRule.cs diff --git a/Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree.cs b/Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree.cs index 78c8c87d5..74de42377 100644 --- a/Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree.cs +++ b/Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree.cs @@ -116,6 +116,7 @@ public IAbstractSyntaxTree ApplyRules() (current, rule) => rule.Apply(current)); _root = (Node) syntaxTree; _current = _root; + _rules = ImmutableList>.Empty; return this; } } \ No newline at end of file diff --git a/Markdown/Markdown/Extensions/ReadOnlyMemoryExtensions.cs b/Markdown/Markdown/Extensions/ReadOnlyMemoryExtensions.cs new file mode 100644 index 000000000..3d71a4769 --- /dev/null +++ b/Markdown/Markdown/Extensions/ReadOnlyMemoryExtensions.cs @@ -0,0 +1,34 @@ +using System.Runtime.InteropServices; + +namespace Markdown.Extensions; + +public static class ReadOnlyMemoryExtensions +{ + public static bool Contains(this ReadOnlyMemory memory, char value) + { + ArgumentExceptionHelpers.ThrowIfFalse( + MemoryMarshal.TryGetString(memory, out var str, out var start, out var length), + "Underlying object in the input argument is not a string"); + for (var i = start; i < start + length; i++) + { + if (str![i] == value) + return true; + } + + return false; + } + + public static bool ContainsNumber(this ReadOnlyMemory memory) + { + ArgumentExceptionHelpers.ThrowIfFalse( + MemoryMarshal.TryGetString(memory, out var str, out var start, out var length), + "Underlying object in the input argument is not a string"); + for (var i = start; i < start + length; i++) + { + if (int.TryParse(str![i].ToString(), out _)) + return true; + } + + return false; + } +} \ No newline at end of file diff --git a/Markdown/Markdown/Md.cs b/Markdown/Markdown/Md.cs index 8509cb1a6..c4ad3f84b 100644 --- a/Markdown/Markdown/Md.cs +++ b/Markdown/Markdown/Md.cs @@ -11,18 +11,20 @@ namespace Markdown; public class Md( Dictionary tokenTags, ITokenizer tokenizer, - IParser parser) : IRenderer + IParser parser, + ISyntaxRule[] syntaxRules) : IRenderer { public string Render(string input) { var tokens = tokenizer.Tokenize(input.AsMemory()); var parseTree = parser.Parse(tokens); - var syntaxTree = MdAbstractSyntaxTree - .FromParseTree(parseTree) - .AddRule(new NestingRule()) - .ApplyRules(); + var syntaxTree = MdAbstractSyntaxTree.FromParseTree(parseTree); + + foreach (var syntaxRule in syntaxRules) + syntaxTree.AddRule(syntaxRule); return syntaxTree + .ApplyRules() .Traverse() .Aggregate(new StringBuilder(), (sb, node) => ProcessNode(node, sb)) diff --git a/Markdown/Markdown/Program.cs b/Markdown/Markdown/Program.cs index d70a86f7a..2875cbfe8 100644 --- a/Markdown/Markdown/Program.cs +++ b/Markdown/Markdown/Program.cs @@ -1,6 +1,7 @@ using Markdown; using Markdown.Parser; using Markdown.ParseTree; +using Markdown.SyntaxRules; using Markdown.Token; using Markdown.Tokenizer; @@ -15,5 +16,11 @@ tokenTags.Add(MdTokenType.Bold, "strong"); tokenTags.Add(MdTokenType.Heading, "h1"); -var md = new Md(tokenTags, new MdTokenizer(tokenAliases, '\\'), new MdParser(new MdParseTree())); +var syntaxRules = new List>(); +syntaxRules.Add(new NestingRule()); +syntaxRules.Add(new NumberRule()); +syntaxRules.Add(new TokensInDifferentWordsRule()); + +var md = new Md( + tokenTags, new MdTokenizer(tokenAliases, '\\'), new MdParser(new MdParseTree()), syntaxRules.ToArray()); Console.WriteLine(md.Render("# Hello World! _some words_ in italics\n__some other text__")); \ No newline at end of file diff --git a/Markdown/Markdown/SyntaxRules/NestingRule.cs b/Markdown/Markdown/SyntaxRules/NestingRule.cs index 64364d179..0b5ee941b 100644 --- a/Markdown/Markdown/SyntaxRules/NestingRule.cs +++ b/Markdown/Markdown/SyntaxRules/NestingRule.cs @@ -13,7 +13,7 @@ public INodeView Apply(INodeView nodeView) if (childNode.Type == MdTokenType.Bold && nodeView.Type == MdTokenType.Italic) { childNode.Type = MdTokenType.PlainText; - foreach (var toMove in childNode.Children) + foreach (var toMove in childNode.Children.AsEnumerable().Reverse()) { nodeView.Children.Insert(i + 1, toMove); toMove.Parent = nodeView; diff --git a/Markdown/Markdown/SyntaxRules/NumberRule.cs b/Markdown/Markdown/SyntaxRules/NumberRule.cs new file mode 100644 index 000000000..2bdda018e --- /dev/null +++ b/Markdown/Markdown/SyntaxRules/NumberRule.cs @@ -0,0 +1,34 @@ +using Markdown.Extensions; +using Markdown.NodeView; +using Markdown.Token; + +namespace Markdown.SyntaxRules; + +public class NumberRule : ISyntaxRule +{ + public INodeView Apply(INodeView nodeView) + { + for (var i = 0; i < nodeView.Children.Count; i++) + { + var childNode = nodeView.Children[i]; + if (childNode is { InsideWord: true, Type: MdTokenType.Bold or MdTokenType.Italic } + && childNode.Children.Any( + n => n.Text.ContainsNumber())) + { + childNode.Type = MdTokenType.PlainText; + foreach (var toMove in childNode.Children.AsEnumerable().Reverse()) + { + nodeView.Children.Insert(i + 1, toMove); + toMove.Parent = nodeView; + } + + nodeView.Children.Insert(i + 1 + childNode.Children.Count, childNode); + childNode.Children.Clear(); + } + + Apply(childNode); + } + + return nodeView; + } +} \ No newline at end of file diff --git a/Markdown/Markdown/SyntaxRules/TokensInDifferentWordsRule.cs b/Markdown/Markdown/SyntaxRules/TokensInDifferentWordsRule.cs new file mode 100644 index 000000000..d748a2809 --- /dev/null +++ b/Markdown/Markdown/SyntaxRules/TokensInDifferentWordsRule.cs @@ -0,0 +1,35 @@ +using Markdown.Extensions; +using Markdown.NodeView; +using Markdown.Token; + +namespace Markdown.SyntaxRules; + +public class TokensInDifferentWordsRule : ISyntaxRule +{ + private static readonly char[] Delimiters = new[] { ' ', '\t', '\r', '\n', ',', '.', '!', '?' }; + public INodeView Apply(INodeView nodeView) + { + for (var i = 0; i < nodeView.Children.Count; i++) + { + var childNode = nodeView.Children[i]; + if (childNode is { InsideWord: true, Type: MdTokenType.Bold or MdTokenType.Italic } + && childNode.Children.Any( + n => Delimiters.Any(x => n.Text.Contains(x)))) + { + childNode.Type = MdTokenType.PlainText; + foreach (var toMove in childNode.Children.AsEnumerable().Reverse()) + { + nodeView.Children.Insert(i + 1, toMove); + toMove.Parent = nodeView; + } + + nodeView.Children.Insert(i + 1 + childNode.Children.Count, childNode); + childNode.Children.Clear(); + } + + Apply(childNode); + } + + return nodeView; + } +} \ No newline at end of file diff --git a/Markdown/Markdown/Tokenizer/MdTokenizer.cs b/Markdown/Markdown/Tokenizer/MdTokenizer.cs index 7eff4a465..5c3e823fd 100644 --- a/Markdown/Markdown/Tokenizer/MdTokenizer.cs +++ b/Markdown/Markdown/Tokenizer/MdTokenizer.cs @@ -14,7 +14,7 @@ public IEnumerable Tokenize(ReadOnlyMemory input) var foundPlainText = false; var plainTextStart = 0; var increment = 1; - for (var i = start; i < length; ) + for (var i = start; i < start + length; ) { if (escapeCharacter == str![i] && i + 1 < str.Length) { From 8b6a4d8170f7f84e34e1aa6b2d13ced397ab6171 Mon Sep 17 00:00:00 2001 From: Nikita Shevyrin Date: Sun, 8 Dec 2024 15:05:47 +0500 Subject: [PATCH 18/26] added descriptions to tests --- Markdown/Markdown.Tests/MdTests.cs | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/Markdown/Markdown.Tests/MdTests.cs b/Markdown/Markdown.Tests/MdTests.cs index 23d9684fe..3e340708c 100644 --- a/Markdown/Markdown.Tests/MdTests.cs +++ b/Markdown/Markdown.Tests/MdTests.cs @@ -5,6 +5,7 @@ using FluentAssertions; using Markdown.Parser; using Markdown.ParseTree; +using Markdown.SyntaxRules; using Markdown.Token; using Markdown.Tokenizer; using NUnit.Framework; @@ -30,11 +31,18 @@ public void SetUp() tokenTags.Add(MdTokenType.Italic, "em"); tokenTags.Add(MdTokenType.Bold, "strong"); tokenTags.Add(MdTokenType.Heading, "h1"); + + var syntaxRules = new List>(); + syntaxRules.Add(new NestingRule()); + syntaxRules.Add(new NumberRule()); + syntaxRules.Add(new TokensInDifferentWordsRule()); - _md = new Md(tokenTags, new MdTokenizer(tokenAliases, '\\'), new MdParser(new MdParseTree())); + _md = new Md( + tokenTags, new MdTokenizer(tokenAliases, '\\'), new MdParser(new MdParseTree()), syntaxRules.ToArray()); } [Test] + [Description("Базовые тесты")] [TestCase("", "")] [TestCase("Hello world", "Hello world")] [TestCase("Hello _world_!", "Hello world!")] @@ -49,6 +57,7 @@ public void Render_ReturnsCorrectMarkdown_ForSimpleCases( } [Test] + [Description("Тесты на вложенность двойного и одинарного выделения")] [TestCase("This __text _contains_ nested__ markdown", "This text contains nested markdown")] [TestCase("This is _an example __of inversed__ nested_ markdown", "This is an example __of inversed__ nested markdown")] public void Render_ReturnsCorrectMarkdown_ForCasesWithNesting( @@ -61,10 +70,10 @@ public void Render_ReturnsCorrectMarkdown_ForCasesWithNesting( } [Test] + [Description("Тесты для разметки внутри текста с цифрами")] [TestCase("Text_12_3", "Text_12_3")] - [TestCase("This _Text_12_3_ should be italic", "This Text_12_3 should be italic")] [TestCase("5__12_3__4", "5__12_3__4")] - [TestCase("Text __that_12__3__ is in bold", "Text that_12__3 is in bold")] + [TestCase("Text __that_12_3__ is in bold", "Text that_12_3 is in bold")] public void Render_ReturnsCorrectMarkdown_ForTextWithNumbers( string input, string expectedOutput) @@ -75,6 +84,7 @@ public void Render_ReturnsCorrectMarkdown_ForTextWithNumbers( } [Test] + [Description("Тесты для разметки внутри слов")] [TestCase("_begin_ning", "beginning")] [TestCase("mi_ddl_e", "middle")] [TestCase("end_ing_", "ending")] @@ -91,6 +101,7 @@ public void Render_ReturnsCorrectMarkdown_ForPartsOfWords( } [Test] + [Description("Тесты для подчерков, находящихся внутри разных слов")] [TestCase("This sh_ould not cha_nge", "This sh_ould not cha_nge")] [TestCase("As w__ell a__s this", "As w__ell a__s this")] [TestCase("This sh__o_uld_ wo__rk like this", "This sh__ould wo__rk like this")] @@ -104,6 +115,7 @@ public void Render_ReturnsCorrectMarkdown_ForMarkdownInDifferentWords( } [Test] + [Description("Тесты для непарных символов разметки")] [TestCase("__Unpaired_ markdown", "__Unpaired_ markdown")] [TestCase("Another _unpaired markdown__", "Another _unpaired markdown__")] public void Render_ReturnsCorrectMarkdown_ForUnpairedMarkdownSymbols( @@ -116,6 +128,7 @@ public void Render_ReturnsCorrectMarkdown_ForUnpairedMarkdownSymbols( } [Test] + [Description("Проверяем, что подчерки должны следовать за (стоять перед) непробельным символом")] [TestCase("This_ should not_ change", "This_ should not_ change")] [TestCase("This _should _be in_ italics", "This should _be in italics")] public void Render_ReturnsCorrectMarkdown_ForIncorrectlyPlacedUnderscores( @@ -128,6 +141,7 @@ public void Render_ReturnsCorrectMarkdown_ForIncorrectlyPlacedUnderscores( } [Test] + [Description("Тесты на пересечение двойных и одинарных подчерков")] [TestCase("Intersecting _markdown __should_ work__ like this", "Intersecting _markdown __should_ work__ like this")] [TestCase("Another __example of _intersecting__ markdown_", "Another __example of _intersecting__ markdown_")] public void Render_ReturnsCorrectMarkdown_ForIntersectingMarkdown( @@ -140,6 +154,7 @@ public void Render_ReturnsCorrectMarkdown_ForIntersectingMarkdown( } [Test] + [Description("Тесты на пустую разметку")] [TestCase("This should ____ remain the same", "This should ____ remain the same")] [TestCase("This also should __ not change", "This also should __ not change")] public void Render_ReturnsCorrectMarkdown_ForEmptyMarkdown( @@ -152,6 +167,7 @@ public void Render_ReturnsCorrectMarkdown_ForEmptyMarkdown( } [Test] + [Description("Тесты на экранирование")] [TestCase(@"This should \_not turn\_ into tags", "This should _not turn_ into tags")] [TestCase(@"This should \remain the\ same", @"This should \remain the\ same")] public void Render_ReturnsCorrectMarkdown_ForEscapeCharacters( @@ -164,9 +180,10 @@ public void Render_ReturnsCorrectMarkdown_ForEscapeCharacters( } [Test] + [Description("Тест на производительность")] public void Render_PerformanceTest() { - var fullStr = ArrangePerformanceTest("_Hello_ world_12. Hel_lo world_", 50000); + var fullStr = ArrangePerformanceTest("_Hello_ world_12. Hel_lo world_", 20000); Console.WriteLine($"Total length: {fullStr.Length}"); var totalTime = MeasureTime(fullStr); From 8efaaf30f96b5449dab377027650d2260d8428e0 Mon Sep 17 00:00:00 2001 From: Nikita Shevyrin Date: Sun, 8 Dec 2024 15:08:21 +0500 Subject: [PATCH 19/26] deleted files with old version of existing code --- .../MdAbstractSyntaxTree_.cs | 155 ------------- Markdown/Markdown/Parser/MdParser_.cs | 50 ---- Markdown/Markdown/Tokenizer/MdTokenizer_.cs | 218 ------------------ 3 files changed, 423 deletions(-) delete mode 100644 Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree_.cs delete mode 100644 Markdown/Markdown/Parser/MdParser_.cs delete mode 100644 Markdown/Markdown/Tokenizer/MdTokenizer_.cs diff --git a/Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree_.cs b/Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree_.cs deleted file mode 100644 index 0d56a2f38..000000000 --- a/Markdown/Markdown/AbstractSyntaxTree/MdAbstractSyntaxTree_.cs +++ /dev/null @@ -1,155 +0,0 @@ -using System.Collections.ObjectModel; -using System.Text; -using Markdown.Token; - -namespace Markdown.AbstractSyntaxTree; - -public class MdAbstractSyntaxTree_ -{ - public class Node - { - public Node() - { - Children = new List(); - } - - public Node(MdTokenType tokenType, ReadOnlyMemory? tokenValue) - { - TokenType = tokenType; - TokenValue = tokenValue; - Children = new List(); - } - - public MdTokenType? TokenType { get; set; } - public ReadOnlyMemory? TokenValue { get; } - public Node? Parent { get; set; } - public List Children { get; } - - public void AddChild(Node node) - { - node.Parent = this; - Children.Add(node); - } - - public IEnumerable RemoveChildren() - { - var children = new List(Children); - Children.Clear(); - return children; - } - } - - private readonly ReadOnlyDictionary _tokenTags; - private readonly Node _root; - private Node _current; - - public MdAbstractSyntaxTree_(ReadOnlyDictionary tokenTags) - { - _tokenTags = tokenTags; - _root = new Node(); - _current = _root; - } - - public void AddToken(MdTokenType mdTokenType, ReadOnlyMemory tokenValue) - { - ArgumentExceptionHelpers.ThrowIfNull(tokenValue, "tokenValue must not be null"); - if (mdTokenType == MdTokenType.PlainText) - { - _current.AddChild(new Node(mdTokenType, tokenValue)); - } - else - { - var newNode = new Node(mdTokenType, tokenValue); - _current.AddChild(newNode); - _current = newNode; - } - } - - public bool HasTokenInContext(MdTokenType mdTokenType) => HasParent(mdTokenType, _current); - - private bool HasParent(MdTokenType mdTokenType, Node node) - { - if (node == _root) - return false; - if (node.TokenType == mdTokenType) - return true; - return HasParent(mdTokenType, node.Parent!); - } - - public void EndToken(MdTokenType? mdTokenType = null) - { - WalkUpToTheRoot(_current, mdTokenType); - } - - public List GetChildrenForCurrentToken() - { - return _current.Children; - } - - private void WalkUpToTheRoot(Node node, MdTokenType? mdTokenType) - { - if (node == _root) - _current = _root; - else if (node.TokenType == mdTokenType) - { - if (node.TokenType == MdTokenType.Italic) - { - var childNodes = new List(node.Children); - for (var i = 0; i < childNodes.Count; i++) - { - var childNode = childNodes[i]; - if (childNode.TokenType == MdTokenType.Bold) - { - var children = childNode.RemoveChildren().ToList(); - foreach (var child in children) - { - node.Children.Insert(i + 1, child); - child.Parent = node; - } - childNode.TokenType = MdTokenType.PlainText; - node.Children.Insert(i + 1 + children.Count, childNode); - } - } - } - _current = node.Parent!; - } - else - { - var parent = node.Parent!; - var children = node.RemoveChildren(); - foreach (var child in children) - parent.AddChild(child); - WalkUpToTheRoot(parent, mdTokenType); - } - } - - public void ReviseForIntersection(MdTokenType tokenType, ReadOnlyMemory tokenText) - { - throw new NotImplementedException(); - } - - public string ToText() - { - var sb = new StringBuilder(); - ProcessChildren(_root, sb); - return sb.ToString(); - } - - private void ProcessChildren(Node node, StringBuilder sb) - { - foreach (var child in node.Children) - { - if (child.TokenType == MdTokenType.PlainText || child.Children.Count == 0) - sb.Append(child.TokenValue); - else - SurroundWithTag(_tokenTags[child.TokenType!.Value], child, sb); - } - } - - private void SurroundWithTag(string tag, Node node, StringBuilder sb) - { - sb.Append($"<{tag}>"); - ProcessChildren(node, sb); - sb.Append($""); - } -} \ No newline at end of file diff --git a/Markdown/Markdown/Parser/MdParser_.cs b/Markdown/Markdown/Parser/MdParser_.cs deleted file mode 100644 index 8affc1eab..000000000 --- a/Markdown/Markdown/Parser/MdParser_.cs +++ /dev/null @@ -1,50 +0,0 @@ -using Markdown.AbstractSyntaxTree; -using Markdown.Token; - -namespace Markdown.Parser; - -public class MdParser_ -{ - public MdAbstractSyntaxTree_ Parse(IEnumerable tokens, MdAbstractSyntaxTree_ syntaxTree) - { - foreach (var token in tokens) - { - if (token.Behaviour == MdTokenBehaviour.Closing && syntaxTree.HasTokenInContext(token.Type)) - { - if (syntaxTree.GetChildrenForCurrentToken().Count == 0) - { - syntaxTree.EndToken(token.Type); - syntaxTree.AddToken(MdTokenType.PlainText, token.Text); - } - else - syntaxTree.EndToken(token.Type); - } - else if (token.Behaviour == MdTokenBehaviour.Opening && !syntaxTree.HasTokenInContext(token.Type)) - { - syntaxTree.AddToken(token.Type, token.Text); - } - else if (token.Behaviour == MdTokenBehaviour.Closing && !syntaxTree.HasTokenInContext(token.Type)) - { - // TODO intersecting markdown - syntaxTree.EndToken(); - syntaxTree.AddToken(MdTokenType.PlainText, token.Text); - } - else if (token.Behaviour == MdTokenBehaviour.Opening && syntaxTree.HasTokenInContext(token.Type)) - { - syntaxTree.AddToken(MdTokenType.PlainText, token.Text); - } - else if (token.Type == MdTokenType.PlainText || token.Behaviour == MdTokenBehaviour.Undefined) - { - syntaxTree.AddToken(MdTokenType.PlainText, token.Text); - } - else - { - throw new ArgumentException( - $"Unexpected token of type {token.Type}, with behaviour {token.Behaviour}, " + - $"with text: {token.Text}"); - } - } - - return syntaxTree; - } -} \ No newline at end of file diff --git a/Markdown/Markdown/Tokenizer/MdTokenizer_.cs b/Markdown/Markdown/Tokenizer/MdTokenizer_.cs deleted file mode 100644 index fbe1b1927..000000000 --- a/Markdown/Markdown/Tokenizer/MdTokenizer_.cs +++ /dev/null @@ -1,218 +0,0 @@ -using System.Collections.ObjectModel; -using System.Runtime.InteropServices; -using Markdown.Token; - -namespace Markdown.Tokenizer; - -public class MdTokenizer_( - ReadOnlyDictionary StartTokenAliases, - ReadOnlyDictionary EndTokenAliases - ) -{ - public IEnumerable Tokenize(ReadOnlyMemory input) - { - ArgumentExceptionHelpers.ThrowIfFalse( - MemoryMarshal.TryGetString(input, out var str, out var start, out var length), - "Underlying object in the input argument is not a string"); - - var foundPlainText = false; - var plainTextStart = 0; - var increment = 1; - for (var i = start; i < length; ) - { - if (TryMatchTokenAliases(str!, i, out var tokenType, out var tokenAlias, out var tokenBehaviour)) - { - increment = tokenAlias.Length; - - if (foundPlainText) - yield return new MdToken(MdTokenType.PlainText, MdTokenBehaviour.Undefined, - input.Slice(plainTextStart, i - plainTextStart)); - - yield return new MdToken(tokenType, tokenBehaviour, input.Slice(i, tokenAlias.Length)); - - foundPlainText = false; - } - else - UpdatePlainTextState(ref foundPlainText, ref i, ref plainTextStart); - - i += increment; - if (increment > 1) - increment = 1; - } - - if (foundPlainText) - yield return new MdToken(MdTokenType.PlainText, MdTokenBehaviour.Undefined, - input.Slice(plainTextStart, str!.Length - plainTextStart)); - } - - private void UpdatePlainTextState(ref bool foundPlainText, ref int index, ref int plainTextStart) - { - if (!foundPlainText) - plainTextStart = index; - foundPlainText = true; - } - - private bool TryMatchTokenAliases( - string input, - int index, - out MdTokenType mdTokenType, - out string tokenAlias, - out MdTokenBehaviour behaviour) - { - var matchedEndToken = false; - var mathcedStartToken = false; - - var startTokenType = default(MdTokenType); - var endTokenType = default(MdTokenType); - - var endToken = true; - if (TryMatchTokenAliases(input, index, endToken, out var endTokenAlias)) - matchedEndToken = EndTokenAliases.TryGetValue(endTokenAlias, out endTokenType); - - endToken = false; - if (TryMatchTokenAliases(input, index, endToken, out var startTokenAlias) - && EnsureNotInSeparatedWords(input, index, startTokenAlias)) - mathcedStartToken = StartTokenAliases.TryGetValue(startTokenAlias, out startTokenType); - - // это случается если элемент разметки находится внутри слова или стоит отдельно: "слово __ слово" - if (matchedEndToken && mathcedStartToken) - { - if (startTokenAlias.Length > endTokenAlias.Length) - { - behaviour = MdTokenBehaviour.Opening; - tokenAlias = startTokenAlias; - mdTokenType = startTokenType; - return true; - } - - if (startTokenAlias.Length == endTokenAlias.Length) - { - behaviour = MdTokenBehaviour.Undefined; - tokenAlias = startTokenAlias; - mdTokenType = startTokenType; - return true; - } - - behaviour = MdTokenBehaviour.Closing; - tokenAlias = endTokenAlias; - mdTokenType = endTokenType; - return true; - } - - if (matchedEndToken) - { - behaviour = MdTokenBehaviour.Closing; - tokenAlias = endTokenAlias; - mdTokenType = endTokenType; - return true; - } - - if (mathcedStartToken) - { - behaviour = MdTokenBehaviour.Opening; - tokenAlias = startTokenAlias; - mdTokenType = startTokenType; - return true; - } - - behaviour = MdTokenBehaviour.Undefined; - tokenAlias = String.Empty; - mdTokenType = MdTokenType.PlainText; - return false; - } - - private bool TryMatchTokenAliases( - string input, - int index, - bool endToken, - out string tokenAlias) - { - tokenAlias = String.Empty; - var tokenAliases = endToken ? EndTokenAliases : StartTokenAliases; - foreach (var alias in tokenAliases.Keys) - { - if (TryMatchAlias(input, index, alias, endToken) && tokenAlias.Length < alias.Length) - tokenAlias = alias; - } - - return !String.IsNullOrEmpty(tokenAlias); - } - - /* - * для случаев типа: "сл_ово дру_гое слово", когда подчерки должны оставаться подчерками - */ - private bool EnsureNotInSeparatedWords(string input, int index, string alias) - { - if (!IsInsideAWord(input, index, alias)) - return true; - - var i = index + alias.Length; - while (i < input.Length && !IsWordDelimiter(input[i])) - { - if (TryMatchAlias(input, i, alias, true)) - return true; - i++; - } - - return false; - } - - private bool TryMatchAlias(string input, int index, string alias, bool isEndToken) - { - return TryMatchPattern(input, index, alias) - && (!IsBoldOrItalicAlias(alias, isEndToken) - || (IsInsideAWord(input, index, alias) - && !IsSurroundedByNumbers(input, index, alias)) - || HasAWhiteSpaceNearIt(input, index, alias, isEndToken)); - } - - private bool IsWordDelimiter(char c) - { - return c is ' ' or '\t' or '\n' or '\r'; - } - - private bool IsInsideAWord(string input, int index, string alias) - { - var trueForLeftEdge = index - 1 >= 0 && input[index - 1] != ' '; - var trueForRightEdge = index + alias.Length < input.Length - && input[index + alias.Length] != ' '; - return trueForLeftEdge && trueForRightEdge; - } - - private bool HasAWhiteSpaceNearIt(string input, int index, string alias, bool isEndToken) - { - var trueForLeftEdge = index - 1 < 0 || input[index - 1] == ' '; - var trueForRightEdge = index + alias.Length >= input.Length - || input[index + alias.Length] == ' '; - return isEndToken ? trueForRightEdge : trueForLeftEdge; - } - - private bool IsSurroundedByNumbers(string input, int index, string alias) - { - var trueForLeftEdge = index - 1 >= 0 - && int.TryParse(input[index - 1].ToString(), out _); - var trueForRightEdge = index + alias.Length < input.Length - && int.TryParse(input[index + alias.Length].ToString(), out _); - return trueForLeftEdge || trueForRightEdge; - } - - private bool IsBoldOrItalicAlias(string alias, bool isEndToken) - { - var tokenAliases = isEndToken ? EndTokenAliases : StartTokenAliases; - return tokenAliases[alias] == MdTokenType.Bold - || tokenAliases[alias] == MdTokenType.Italic; - } - - private bool TryMatchPattern(string input, int index, string pattern) - { - int i = 0; - foreach (var ch in pattern) - { - if (index + i >= input.Length || ch != input[index + i]) - return false; - i++; - } - - return true; - } -} \ No newline at end of file From 8549226b9f10c640b4c54536fe97d90cb92a397f Mon Sep 17 00:00:00 2001 From: Nikita Shevyrin Date: Sun, 8 Dec 2024 15:11:39 +0500 Subject: [PATCH 20/26] removed old files --- .../Markdown/TokenConverters/BoldHtmlConverter.cs | 13 ------------- .../TokenConverters/HeadingHtmlConverter.cs | 13 ------------- .../Markdown/TokenConverters/HtmlTokenConverter.cs | 11 ----------- .../Markdown/TokenConverters/ITokenConverter.cs | 9 --------- .../TokenConverters/ItalicHtmlConverter.cs | 13 ------------- Markdown/Markdown/Tokenizers/BoldTokenizer.cs | 13 ------------- Markdown/Markdown/Tokenizers/HeadingTokenizer.cs | 13 ------------- Markdown/Markdown/Tokenizers/ITokenizer.cs | 8 -------- Markdown/Markdown/Tokenizers/ItalicTokenizer.cs | 14 -------------- Markdown/Markdown/Tokenizers/MarkdownTokenizer.cs | 10 ---------- Markdown/Markdown/Tokens/MarkdownTokenType.cs | 9 --------- Markdown/Markdown/Tokens/Token.cs | 13 ------------- 12 files changed, 139 deletions(-) delete mode 100644 Markdown/Markdown/TokenConverters/BoldHtmlConverter.cs delete mode 100644 Markdown/Markdown/TokenConverters/HeadingHtmlConverter.cs delete mode 100644 Markdown/Markdown/TokenConverters/HtmlTokenConverter.cs delete mode 100644 Markdown/Markdown/TokenConverters/ITokenConverter.cs delete mode 100644 Markdown/Markdown/TokenConverters/ItalicHtmlConverter.cs delete mode 100644 Markdown/Markdown/Tokenizers/BoldTokenizer.cs delete mode 100644 Markdown/Markdown/Tokenizers/HeadingTokenizer.cs delete mode 100644 Markdown/Markdown/Tokenizers/ITokenizer.cs delete mode 100644 Markdown/Markdown/Tokenizers/ItalicTokenizer.cs delete mode 100644 Markdown/Markdown/Tokenizers/MarkdownTokenizer.cs delete mode 100644 Markdown/Markdown/Tokens/MarkdownTokenType.cs delete mode 100644 Markdown/Markdown/Tokens/Token.cs diff --git a/Markdown/Markdown/TokenConverters/BoldHtmlConverter.cs b/Markdown/Markdown/TokenConverters/BoldHtmlConverter.cs deleted file mode 100644 index 49755a6a0..000000000 --- a/Markdown/Markdown/TokenConverters/BoldHtmlConverter.cs +++ /dev/null @@ -1,13 +0,0 @@ -using System.Text; -using Markdown.Tokens; - -namespace Markdown.TokenConverters; - -public class BoldHtmlConverter : HtmlTokenConverter -{ - protected override MarkdownTokenType MarkdownTokenType { get; } = MarkdownTokenType.ToBold; - public override void ProcessTokens(IEnumerable tokens, StringBuilder context) - { - throw new NotImplementedException(); - } -} \ No newline at end of file diff --git a/Markdown/Markdown/TokenConverters/HeadingHtmlConverter.cs b/Markdown/Markdown/TokenConverters/HeadingHtmlConverter.cs deleted file mode 100644 index 63ad202aa..000000000 --- a/Markdown/Markdown/TokenConverters/HeadingHtmlConverter.cs +++ /dev/null @@ -1,13 +0,0 @@ -using System.Text; -using Markdown.Tokens; - -namespace Markdown.TokenConverters; - -public class HeadingHtmlConverter : HtmlTokenConverter -{ - protected override MarkdownTokenType MarkdownTokenType { get; } = MarkdownTokenType.ToHeading; - public override void ProcessTokens(IEnumerable tokens, StringBuilder context) - { - throw new NotImplementedException(); - } -} \ No newline at end of file diff --git a/Markdown/Markdown/TokenConverters/HtmlTokenConverter.cs b/Markdown/Markdown/TokenConverters/HtmlTokenConverter.cs deleted file mode 100644 index fec2e1a1b..000000000 --- a/Markdown/Markdown/TokenConverters/HtmlTokenConverter.cs +++ /dev/null @@ -1,11 +0,0 @@ -using System.Text; -using Markdown.Tokens; - -namespace Markdown.TokenConverters; - -public abstract class HtmlTokenConverter : ITokenConverter -{ - protected IEnumerable? TokensOnPreviousSlice { get; set; } - protected abstract MarkdownTokenType MarkdownTokenType { get; } - public abstract void ProcessTokens(IEnumerable tokens, StringBuilder context); -} \ No newline at end of file diff --git a/Markdown/Markdown/TokenConverters/ITokenConverter.cs b/Markdown/Markdown/TokenConverters/ITokenConverter.cs deleted file mode 100644 index 59279e627..000000000 --- a/Markdown/Markdown/TokenConverters/ITokenConverter.cs +++ /dev/null @@ -1,9 +0,0 @@ -using System.Text; -using Markdown.Tokens; - -namespace Markdown.TokenConverters; - -public interface ITokenConverter -{ - public void ProcessTokens(IEnumerable tokens, StringBuilder context); -} \ No newline at end of file diff --git a/Markdown/Markdown/TokenConverters/ItalicHtmlConverter.cs b/Markdown/Markdown/TokenConverters/ItalicHtmlConverter.cs deleted file mode 100644 index f5da61016..000000000 --- a/Markdown/Markdown/TokenConverters/ItalicHtmlConverter.cs +++ /dev/null @@ -1,13 +0,0 @@ -using System.Text; -using Markdown.Tokens; - -namespace Markdown.TokenConverters; - -public class ItalicHtmlConverter : HtmlTokenConverter -{ - protected override MarkdownTokenType MarkdownTokenType { get; } = MarkdownTokenType.ToItalic; - public override void ProcessTokens(IEnumerable tokens, StringBuilder context) - { - throw new NotImplementedException(); - } -} \ No newline at end of file diff --git a/Markdown/Markdown/Tokenizers/BoldTokenizer.cs b/Markdown/Markdown/Tokenizers/BoldTokenizer.cs deleted file mode 100644 index 54fe0fe0c..000000000 --- a/Markdown/Markdown/Tokenizers/BoldTokenizer.cs +++ /dev/null @@ -1,13 +0,0 @@ -using Markdown.Tokens; - -namespace Markdown.Tokenizers; - -public class BoldTokenizer : MarkdownTokenizer -{ - protected override MarkdownTokenType MarkdownTokenType { get; } = MarkdownTokenType.ToBold; - - public override IEnumerable Tokenize(ReadOnlySpan input) - { - throw new NotImplementedException(); - } -} \ No newline at end of file diff --git a/Markdown/Markdown/Tokenizers/HeadingTokenizer.cs b/Markdown/Markdown/Tokenizers/HeadingTokenizer.cs deleted file mode 100644 index 9fd281b33..000000000 --- a/Markdown/Markdown/Tokenizers/HeadingTokenizer.cs +++ /dev/null @@ -1,13 +0,0 @@ -using Markdown.Tokens; - -namespace Markdown.Tokenizers; - -public class HeadingTokenizer : MarkdownTokenizer -{ - protected override MarkdownTokenType MarkdownTokenType { get; } = MarkdownTokenType.ToHeading; - - public override IEnumerable Tokenize(ReadOnlySpan input) - { - throw new NotImplementedException(); - } -} \ No newline at end of file diff --git a/Markdown/Markdown/Tokenizers/ITokenizer.cs b/Markdown/Markdown/Tokenizers/ITokenizer.cs deleted file mode 100644 index 79f231bb0..000000000 --- a/Markdown/Markdown/Tokenizers/ITokenizer.cs +++ /dev/null @@ -1,8 +0,0 @@ -using Markdown.Tokens; - -namespace Markdown.Tokenizers; - -public interface ITokenizer -{ - public IEnumerable Tokenize(ReadOnlySpan input); -} \ No newline at end of file diff --git a/Markdown/Markdown/Tokenizers/ItalicTokenizer.cs b/Markdown/Markdown/Tokenizers/ItalicTokenizer.cs deleted file mode 100644 index 5d49725cb..000000000 --- a/Markdown/Markdown/Tokenizers/ItalicTokenizer.cs +++ /dev/null @@ -1,14 +0,0 @@ -using System.Text; -using Markdown.Tokens; - -namespace Markdown.Tokenizers; - -public class ItalicTokenizer : MarkdownTokenizer -{ - protected override MarkdownTokenType MarkdownTokenType { get; } = MarkdownTokenType.ToItalic; - - public override IEnumerable Tokenize(ReadOnlySpan input) - { - throw new NotImplementedException(); - } -} \ No newline at end of file diff --git a/Markdown/Markdown/Tokenizers/MarkdownTokenizer.cs b/Markdown/Markdown/Tokenizers/MarkdownTokenizer.cs deleted file mode 100644 index eddd05b28..000000000 --- a/Markdown/Markdown/Tokenizers/MarkdownTokenizer.cs +++ /dev/null @@ -1,10 +0,0 @@ -using System.Text; -using Markdown.Tokens; - -namespace Markdown.Tokenizers; - -public abstract class MarkdownTokenizer : ITokenizer -{ - protected abstract MarkdownTokenType MarkdownTokenType { get; } - public abstract IEnumerable Tokenize(ReadOnlySpan input); -} \ No newline at end of file diff --git a/Markdown/Markdown/Tokens/MarkdownTokenType.cs b/Markdown/Markdown/Tokens/MarkdownTokenType.cs deleted file mode 100644 index 10eb9d8be..000000000 --- a/Markdown/Markdown/Tokens/MarkdownTokenType.cs +++ /dev/null @@ -1,9 +0,0 @@ -namespace Markdown.Tokens; - -public enum MarkdownTokenType -{ - NoConversion, - ToItalic, - ToBold, - ToHeading -} \ No newline at end of file diff --git a/Markdown/Markdown/Tokens/Token.cs b/Markdown/Markdown/Tokens/Token.cs deleted file mode 100644 index a21e3acd6..000000000 --- a/Markdown/Markdown/Tokens/Token.cs +++ /dev/null @@ -1,13 +0,0 @@ -namespace Markdown.Tokens; - -public readonly struct Token( - ReadOnlyMemory content, - MarkdownTokenType markdownTokenType, - int startIndex, - int endIndex) -{ - public ReadOnlyMemory Content { get; } = content; - public MarkdownTokenType MarkdownTokenType { get; } = markdownTokenType; - public int StartIndex { get; } = startIndex; - public int EndIndex { get; } = endIndex; -} \ No newline at end of file From f298b01888dbeb475e36ad037a43228d60f93455 Mon Sep 17 00:00:00 2001 From: Nikita Shevyrin Date: Sun, 8 Dec 2024 15:34:51 +0500 Subject: [PATCH 21/26] deleted old files --- Markdown/Markdown.Tests/IMdTest.cs | 182 ----------------------------- Markdown/Markdown/IMd.cs | 6 - 2 files changed, 188 deletions(-) delete mode 100644 Markdown/Markdown.Tests/IMdTest.cs delete mode 100644 Markdown/Markdown/IMd.cs diff --git a/Markdown/Markdown.Tests/IMdTest.cs b/Markdown/Markdown.Tests/IMdTest.cs deleted file mode 100644 index 50880fa46..000000000 --- a/Markdown/Markdown.Tests/IMdTest.cs +++ /dev/null @@ -1,182 +0,0 @@ -using System; -using System.Diagnostics; -using System.Text; -using FluentAssertions; -using NUnit.Framework; - -namespace Markdown.Tests; - -[TestFixture] -[TestOf(typeof(IMd))] -public class IMdTest -{ - private IMd _imd; - - [SetUp] - public void SetUp() - { - _imd = new Md(); - } - - [Test] - [TestCase("", "")] - [TestCase("Hello world", "Hello world")] - [TestCase("Hello _world_!", "Hello world!")] - [TestCase("# _Hello_ __world__!", "

Hello world!

")] - public void Render_ReturnsCorrectMarkdown_ForSimpleCases( - string input, - string expectedOutput) - { - _imd.Render(input) - .Should() - .Be(expectedOutput); - } - - [Test] - [TestCase("This __text _contains_ nested__ markdown", "This text contains nested markdown")] - [TestCase("This is _an example __of inversed__ nested_ markdown", "This is an example __of inversed__ nested markdown")] - public void Render_ReturnsCorrectMarkdown_ForCasesWithNesting( - string input, - string expectedOutput) - { - _imd.Render(input) - .Should() - .Be(expectedOutput); - } - - [Test] - [TestCase("Text_12_3", "Text_12_3")] - [TestCase("This _Text_12_3_ should be italic", "This Text_12_3 should be italic")] - [TestCase("5__12_3__4", "5__12_3__4")] - [TestCase("Text __that_12__3__ is in bold", "Text that_12__3 is in bold")] - public void Render_ReturnsCorrectMarkdown_ForTextWithNumbers( - string input, - string expectedOutput) - { - _imd.Render(input) - .Should() - .Be(expectedOutput); - } - - [Test] - [TestCase("_begin_ning", "beginning")] - [TestCase("mi_ddl_e", "middle")] - [TestCase("end_ing_", "ending")] - [TestCase("__begin__ning", "beginning")] - [TestCase("mi__ddl__e", "middle")] - [TestCase("end__ing__", "ending")] - public void Render_ReturnsCorrectMarkdown_ForPartsOfWords( - string input, - string expectedOutput) - { - _imd.Render(input) - .Should() - .Be(expectedOutput); - } - - [Test] - [TestCase("This sh_ould not cha_nge", "This sh_ould not cha_nge")] - [TestCase("As w__ell a__s this", "As w__ell a__s this")] - [TestCase("This sh__o_uld_ wo__rk like this", "This sh__ould wo__rk like this")] - public void Render_ReturnsCorrectMarkdown_ForMarkdownInDifferentWords( - string input, - string expectedOutput) - { - _imd.Render(input) - .Should() - .Be(expectedOutput); - } - - [Test] - [TestCase("__Unpaired_ markdown", "__Unpaired_ markdown")] - [TestCase("Another _unpaired markdown__", "Another _unpaired markdown__")] - public void Render_ReturnsCorrectMarkdown_ForUnpairedMarkdownSymbols( - string input, - string expectedOutput) - { - _imd.Render(input) - .Should() - .Be(expectedOutput); - } - - [Test] - [TestCase("This_ should not_ change", "This_ should not_ change")] - [TestCase("This _should _be in_ italics", "This should _be in italics")] - public void Render_ReturnsCorrectMarkdown_ForIncorrectlyPlacedUnderscores( - string input, - string expectedOutput) - { - _imd.Render(input) - .Should() - .Be(expectedOutput); - } - - // Это пока не работает - [Test] - [TestCase("Intersecting _markdown __should_ work__ like this", "Intersecting _markdown __should_ work__ like this")] - [TestCase("Another __example of _intersecting__ markdown_", "Another __example of _intersecting__ markdown_")] - public void Render_ReturnsCorrectMarkdown_ForIntersectingMarkdown( - string input, - string expectedOutput) - { - _imd.Render(input) - .Should() - .Be(expectedOutput); - } - - [Test] - [TestCase("This should ____ remain the same", "This should ____ remain the same")] - [TestCase("This also should __ not change", "This also should __ not change")] - public void Render_ReturnsCorrectMarkdown_ForEmptyMarkdown( - string input, - string expectedOutput) - { - _imd.Render(input) - .Should() - .Be(expectedOutput); - } - - // Это пока не работает - [Test] - [TestCase("This should \\_not turn\\_ into tags", "This should _not turn into tags")] - [TestCase("This should \\\\remain the\\\\ same", "This should \\\\remain the\\\\ same")] - public void Render_ReturnsCorrectMarkdown_ForEscapeCharacters( - string input, - string expectedOutput) - { - _imd.Render(input) - .Should() - .Be(expectedOutput); - } - - [Test] - public void Render_PerformanceTest() - { - var fullStr = ArrangePerformanceTest("_Hello_ world_12. Hel_lo world_", 100000); - Console.WriteLine($"Total length: {fullStr.Length}"); - - var totalTime = MeasureTime(fullStr); - Console.WriteLine($"Time elapsed in ms: {totalTime}"); - - totalTime - .Should() - .BeLessThan(1000); - } - - private long MeasureTime(string fullStr) - { - var sw = new Stopwatch(); - sw.Start(); - _imd.Render(fullStr); - sw.Stop(); - return sw.ElapsedMilliseconds; - } - - private string ArrangePerformanceTest(string input, int copyCount) - { - var sb = new StringBuilder(); - for (var i = 0; i < copyCount; i++) - sb.Append(input); - return sb.ToString(); - } -} \ No newline at end of file diff --git a/Markdown/Markdown/IMd.cs b/Markdown/Markdown/IMd.cs deleted file mode 100644 index c77626d39..000000000 --- a/Markdown/Markdown/IMd.cs +++ /dev/null @@ -1,6 +0,0 @@ -namespace Markdown; - -public interface IMd -{ - public string Render(string input); -} \ No newline at end of file From 9f425733b717b7af56b0c6b7629e991919ebcea5 Mon Sep 17 00:00:00 2001 From: Nikita Shevyrin Date: Wed, 11 Dec 2024 23:15:36 +0500 Subject: [PATCH 22/26] added factory method for Md --- Markdown/Markdown.Tests/MdTests.cs | 19 +----------- Markdown/Markdown/DefaultMdFactory.cs | 44 +++++++++++++++++++++++++++ Markdown/Markdown/Program.cs | 24 +-------------- 3 files changed, 46 insertions(+), 41 deletions(-) create mode 100644 Markdown/Markdown/DefaultMdFactory.cs diff --git a/Markdown/Markdown.Tests/MdTests.cs b/Markdown/Markdown.Tests/MdTests.cs index 3e340708c..cbf76b32d 100644 --- a/Markdown/Markdown.Tests/MdTests.cs +++ b/Markdown/Markdown.Tests/MdTests.cs @@ -21,24 +21,7 @@ public class MdTests [SetUp] public void SetUp() { - var tokenAliases = new Dictionary(); - tokenAliases.Add("_", MdTokenType.Italic); - tokenAliases.Add("__", MdTokenType.Bold); - tokenAliases.Add("# ", MdTokenType.Heading); - tokenAliases.Add("\n", MdTokenType.Line); - - var tokenTags = new Dictionary(); - tokenTags.Add(MdTokenType.Italic, "em"); - tokenTags.Add(MdTokenType.Bold, "strong"); - tokenTags.Add(MdTokenType.Heading, "h1"); - - var syntaxRules = new List>(); - syntaxRules.Add(new NestingRule()); - syntaxRules.Add(new NumberRule()); - syntaxRules.Add(new TokensInDifferentWordsRule()); - - _md = new Md( - tokenTags, new MdTokenizer(tokenAliases, '\\'), new MdParser(new MdParseTree()), syntaxRules.ToArray()); + _md = DefaultMdFactory.CreateMd(); } [Test] diff --git a/Markdown/Markdown/DefaultMdFactory.cs b/Markdown/Markdown/DefaultMdFactory.cs new file mode 100644 index 000000000..817dbed58 --- /dev/null +++ b/Markdown/Markdown/DefaultMdFactory.cs @@ -0,0 +1,44 @@ +using Markdown.Parser; +using Markdown.ParseTree; +using Markdown.SyntaxRules; +using Markdown.Token; +using Markdown.Tokenizer; + +namespace Markdown; + +public static class DefaultMdFactory +{ + private static readonly char[] _delimiters = { ' ', '\t', '\n', '\r', ',', '.', '!', '?' }; + + private static readonly Dictionary _tokenAliases = new Dictionary + { + { "_", MdTokenType.Italic }, + { "__", MdTokenType.Bold }, + { "# ", MdTokenType.Heading }, + { "\n", MdTokenType.Line } + }; + + private static readonly Dictionary _tokenTags = new Dictionary + { + { MdTokenType.Italic, "em" }, + { MdTokenType.Bold, "strong" }, + { MdTokenType.Heading, "h1" } + }; + + private static readonly List> _syntaxRules = new List> + { + new NestingRule(), + new NumberRule(), + new TokensInDifferentWordsRule() + }; + + public static Md CreateMd() + { + return new Md( + _tokenTags, + new MdTokenizer( + _tokenAliases, '\\', _delimiters), + new MdParser(new MdParseTree()), + _syntaxRules.ToArray()); + } +} \ No newline at end of file diff --git a/Markdown/Markdown/Program.cs b/Markdown/Markdown/Program.cs index 8ee79ea69..aae97738e 100644 --- a/Markdown/Markdown/Program.cs +++ b/Markdown/Markdown/Program.cs @@ -1,26 +1,4 @@ using Markdown; -using Markdown.Parser; -using Markdown.ParseTree; -using Markdown.SyntaxRules; -using Markdown.Token; -using Markdown.Tokenizer; -var tokenAliases = new Dictionary(); -tokenAliases.Add("_", MdTokenType.Italic); -tokenAliases.Add("__", MdTokenType.Bold); -tokenAliases.Add("# ", MdTokenType.Heading); -tokenAliases.Add("\n", MdTokenType.Line); - -var tokenTags = new Dictionary(); -tokenTags.Add(MdTokenType.Italic, "em"); -tokenTags.Add(MdTokenType.Bold, "strong"); -tokenTags.Add(MdTokenType.Heading, "h1"); - -var syntaxRules = new List>(); -syntaxRules.Add(new NestingRule()); -syntaxRules.Add(new NumberRule()); -syntaxRules.Add(new TokensInDifferentWordsRule()); - -var md = new Md( - tokenTags, new MdTokenizer(tokenAliases, '\\'), new MdParser(new MdParseTree()), syntaxRules.ToArray()); +var md = DefaultMdFactory.CreateMd(); Console.WriteLine(md.Render("# Hello World! _some words_ in italics\n__some other text__")); From 75251adc8afd06946e4ae4afe687c1aa20592524 Mon Sep 17 00:00:00 2001 From: Nikita Shevyrin Date: Wed, 11 Dec 2024 23:26:21 +0500 Subject: [PATCH 23/26] refactored MdTokenizer --- Markdown/Markdown/Tokenizer/MdTokenizer.cs | 57 +++++++++++++--------- 1 file changed, 33 insertions(+), 24 deletions(-) diff --git a/Markdown/Markdown/Tokenizer/MdTokenizer.cs b/Markdown/Markdown/Tokenizer/MdTokenizer.cs index 5c3e823fd..a6f795b8c 100644 --- a/Markdown/Markdown/Tokenizer/MdTokenizer.cs +++ b/Markdown/Markdown/Tokenizer/MdTokenizer.cs @@ -3,8 +3,25 @@ namespace Markdown.Tokenizer; -public class MdTokenizer(Dictionary tokenAliases, char escapeCharacter) : ITokenizer +public class MdTokenizer( + Dictionary tokenAliases, + char escapeCharacter, + char[] wordDelimiters) : ITokenizer { + private class TokenInfo + { + public TokenInfo(MdTokenType tokenType, string tokenAlias, MdTokenBehaviour tokenBehaviour) + { + TokenType = tokenType; + TokenAlias = tokenAlias; + TokenBehaviour = tokenBehaviour; + } + + public MdTokenType TokenType { get; set; } + public string TokenAlias { get; set; } + public MdTokenBehaviour TokenBehaviour { get; set; } + } + public IEnumerable Tokenize(ReadOnlyMemory input) { ArgumentExceptionHelpers.ThrowIfFalse( @@ -18,7 +35,7 @@ public IEnumerable Tokenize(ReadOnlyMemory input) { if (escapeCharacter == str![i] && i + 1 < str.Length) { - if (TryMatchTokenAliases(str, i + 1, out _, out _, out _)) + if (TryMatchTokenAliases(str, i + 1, out _)) { increment = 2; yield return new MdToken(MdTokenType.PlainText, MdTokenBehaviour.Undefined, @@ -27,15 +44,16 @@ public IEnumerable Tokenize(ReadOnlyMemory input) foundPlainText = false; } } - else if (TryMatchTokenAliases(str, i, out var tokenType, out var tokenAlias, out var tokenBehaviour)) + else if (TryMatchTokenAliases(str, i, out var tokenInfo)) { - increment = tokenAlias.Length; + increment = tokenInfo.TokenAlias.Length; if (foundPlainText) yield return new MdToken(MdTokenType.PlainText, MdTokenBehaviour.Undefined, input.Slice(plainTextStart, i - plainTextStart)); - yield return new MdToken(tokenType, tokenBehaviour, input.Slice(i, tokenAlias.Length)); + yield return new MdToken( + tokenInfo.TokenType, tokenInfo.TokenBehaviour, input.Slice(i, tokenInfo.TokenAlias.Length)); foundPlainText = false; } @@ -59,9 +77,7 @@ public IEnumerable Tokenize(ReadOnlyMemory input) private bool TryMatchTokenAliases( string input, int index, - out MdTokenType mdTokenType, - out string tokenAlias, - out MdTokenBehaviour tokenBehaviour) + out TokenInfo tokenInfo) { var matchedClosingToken = false; var mathcedOpeningToken = false; @@ -80,35 +96,29 @@ private bool TryMatchTokenAliases( var (alias, type, behaviour) = openingTokenAlias.Length > closingTokenAlias.Length ? (openingTokenAlias, openingTokenType, MdTokenBehaviour.Opening) : (closingTokenAlias, closingTokenType, MdTokenBehaviour.Closing); - + + MdTokenBehaviour tokenBehaviour; if (IsInsideAWord(input, index, alias)) tokenBehaviour = MdTokenBehaviour.InsideAWord; else tokenBehaviour = behaviour; - tokenAlias = alias; - mdTokenType = type; + tokenInfo = new TokenInfo(type, alias, tokenBehaviour); return true; } - + if (mathcedOpeningToken) { - tokenBehaviour = MdTokenBehaviour.Opening; - tokenAlias = openingTokenAlias; - mdTokenType = openingTokenType; + tokenInfo = new TokenInfo(openingTokenType, openingTokenAlias, MdTokenBehaviour.Opening); return true; } if (matchedClosingToken) { - tokenBehaviour = MdTokenBehaviour.Closing; - tokenAlias = closingTokenAlias; - mdTokenType = closingTokenType; + tokenInfo = new TokenInfo(closingTokenType, closingTokenAlias, MdTokenBehaviour.Closing); return true; } - - tokenBehaviour = MdTokenBehaviour.Undefined; - tokenAlias = String.Empty; - mdTokenType = MdTokenType.PlainText; + + tokenInfo = null; return false; } @@ -136,8 +146,7 @@ private bool TryMatchAlias(string input, int index, string alias, bool isEndToke private bool IsWordDelimiter(char c) { - return c is ' ' or '\t' or '\n' or '\r' or ',' or '.' - or '!' or '?'; + return wordDelimiters.Contains(c); } private bool IsInsideAWord(string input, int index, string alias) From 262d9a25ec757b263e6c6aeab09c8aae1ea11ac8 Mon Sep 17 00:00:00 2001 From: Nikita Shevyrin Date: Wed, 11 Dec 2024 23:41:04 +0500 Subject: [PATCH 24/26] refactored syntax rules --- Markdown/Markdown/DefaultMdFactory.cs | 8 ++--- .../Markdown/SyntaxRules/MdValidationRule.cs | 33 +++++++++++++++++++ Markdown/Markdown/SyntaxRules/NestingRule.cs | 25 ++------------ Markdown/Markdown/SyntaxRules/NumberRule.cs | 29 +++------------- .../SyntaxRules/TokensInDifferentWordsRule.cs | 30 +++-------------- 5 files changed, 50 insertions(+), 75 deletions(-) create mode 100644 Markdown/Markdown/SyntaxRules/MdValidationRule.cs diff --git a/Markdown/Markdown/DefaultMdFactory.cs b/Markdown/Markdown/DefaultMdFactory.cs index 817dbed58..be5495e05 100644 --- a/Markdown/Markdown/DefaultMdFactory.cs +++ b/Markdown/Markdown/DefaultMdFactory.cs @@ -10,7 +10,7 @@ public static class DefaultMdFactory { private static readonly char[] _delimiters = { ' ', '\t', '\n', '\r', ',', '.', '!', '?' }; - private static readonly Dictionary _tokenAliases = new Dictionary + private static readonly Dictionary _tokenAliases = new() { { "_", MdTokenType.Italic }, { "__", MdTokenType.Bold }, @@ -18,18 +18,18 @@ public static class DefaultMdFactory { "\n", MdTokenType.Line } }; - private static readonly Dictionary _tokenTags = new Dictionary + private static readonly Dictionary _tokenTags = new() { { MdTokenType.Italic, "em" }, { MdTokenType.Bold, "strong" }, { MdTokenType.Heading, "h1" } }; - private static readonly List> _syntaxRules = new List> + private static readonly List> _syntaxRules = new() { new NestingRule(), new NumberRule(), - new TokensInDifferentWordsRule() + new TokensInDifferentWordsRule(_delimiters) }; public static Md CreateMd() diff --git a/Markdown/Markdown/SyntaxRules/MdValidationRule.cs b/Markdown/Markdown/SyntaxRules/MdValidationRule.cs new file mode 100644 index 000000000..71f34bb4a --- /dev/null +++ b/Markdown/Markdown/SyntaxRules/MdValidationRule.cs @@ -0,0 +1,33 @@ +using Markdown.NodeView; +using Markdown.Token; + +namespace Markdown.SyntaxRules; + +public abstract class MdValidationRule : ISyntaxRule +{ + protected abstract bool CheckNode(INodeView currentNode, INodeView parentNode); + + public INodeView Apply(INodeView nodeView) + { + for (var i = 0; i < nodeView.Children.Count; i++) + { + var childNode = nodeView.Children[i]; + if (CheckNode(childNode, nodeView)) + { + childNode.Type = MdTokenType.PlainText; + foreach (var toMove in childNode.Children.AsEnumerable().Reverse()) + { + nodeView.Children.Insert(i + 1, toMove); + toMove.Parent = nodeView; + } + + nodeView.Children.Insert(i + 1 + childNode.Children.Count, childNode); + childNode.Children.Clear(); + } + + Apply(childNode); + } + + return nodeView; + } +} \ No newline at end of file diff --git a/Markdown/Markdown/SyntaxRules/NestingRule.cs b/Markdown/Markdown/SyntaxRules/NestingRule.cs index 0b5ee941b..a7199d28d 100644 --- a/Markdown/Markdown/SyntaxRules/NestingRule.cs +++ b/Markdown/Markdown/SyntaxRules/NestingRule.cs @@ -3,29 +3,10 @@ namespace Markdown.SyntaxRules; -public class NestingRule : ISyntaxRule +public class NestingRule : MdValidationRule { - public INodeView Apply(INodeView nodeView) + protected override bool CheckNode(INodeView currentNode, INodeView parentNode) { - for (var i = 0; i < nodeView.Children.Count; i++) - { - var childNode = nodeView.Children[i]; - if (childNode.Type == MdTokenType.Bold && nodeView.Type == MdTokenType.Italic) - { - childNode.Type = MdTokenType.PlainText; - foreach (var toMove in childNode.Children.AsEnumerable().Reverse()) - { - nodeView.Children.Insert(i + 1, toMove); - toMove.Parent = nodeView; - } - - nodeView.Children.Insert(i + 1 + childNode.Children.Count, childNode); - childNode.Children.Clear(); - } - - Apply(childNode); - } - - return nodeView; + return currentNode.Type == MdTokenType.Bold && parentNode.Type == MdTokenType.Italic; } } \ No newline at end of file diff --git a/Markdown/Markdown/SyntaxRules/NumberRule.cs b/Markdown/Markdown/SyntaxRules/NumberRule.cs index 2bdda018e..bdb89e726 100644 --- a/Markdown/Markdown/SyntaxRules/NumberRule.cs +++ b/Markdown/Markdown/SyntaxRules/NumberRule.cs @@ -4,31 +4,12 @@ namespace Markdown.SyntaxRules; -public class NumberRule : ISyntaxRule +public class NumberRule : MdValidationRule { - public INodeView Apply(INodeView nodeView) + protected override bool CheckNode(INodeView currentNode, INodeView parentNode) { - for (var i = 0; i < nodeView.Children.Count; i++) - { - var childNode = nodeView.Children[i]; - if (childNode is { InsideWord: true, Type: MdTokenType.Bold or MdTokenType.Italic } - && childNode.Children.Any( - n => n.Text.ContainsNumber())) - { - childNode.Type = MdTokenType.PlainText; - foreach (var toMove in childNode.Children.AsEnumerable().Reverse()) - { - nodeView.Children.Insert(i + 1, toMove); - toMove.Parent = nodeView; - } - - nodeView.Children.Insert(i + 1 + childNode.Children.Count, childNode); - childNode.Children.Clear(); - } - - Apply(childNode); - } - - return nodeView; + return currentNode is { InsideWord: true, Type: MdTokenType.Bold or MdTokenType.Italic } + && currentNode.Children.Any( + n => n.Text.ContainsNumber()); } } \ No newline at end of file diff --git a/Markdown/Markdown/SyntaxRules/TokensInDifferentWordsRule.cs b/Markdown/Markdown/SyntaxRules/TokensInDifferentWordsRule.cs index d748a2809..d6a7c94b2 100644 --- a/Markdown/Markdown/SyntaxRules/TokensInDifferentWordsRule.cs +++ b/Markdown/Markdown/SyntaxRules/TokensInDifferentWordsRule.cs @@ -4,32 +4,12 @@ namespace Markdown.SyntaxRules; -public class TokensInDifferentWordsRule : ISyntaxRule +public class TokensInDifferentWordsRule(char[] delimiters) : MdValidationRule { - private static readonly char[] Delimiters = new[] { ' ', '\t', '\r', '\n', ',', '.', '!', '?' }; - public INodeView Apply(INodeView nodeView) + protected override bool CheckNode(INodeView currentNode, INodeView parentNode) { - for (var i = 0; i < nodeView.Children.Count; i++) - { - var childNode = nodeView.Children[i]; - if (childNode is { InsideWord: true, Type: MdTokenType.Bold or MdTokenType.Italic } - && childNode.Children.Any( - n => Delimiters.Any(x => n.Text.Contains(x)))) - { - childNode.Type = MdTokenType.PlainText; - foreach (var toMove in childNode.Children.AsEnumerable().Reverse()) - { - nodeView.Children.Insert(i + 1, toMove); - toMove.Parent = nodeView; - } - - nodeView.Children.Insert(i + 1 + childNode.Children.Count, childNode); - childNode.Children.Clear(); - } - - Apply(childNode); - } - - return nodeView; + return currentNode is { InsideWord: true, Type: MdTokenType.Bold or MdTokenType.Italic } + && currentNode.Children.Any( + n => delimiters.Any(x => n.Text.Contains(x))); } } \ No newline at end of file From f7be9646e2a2980d7c7dc653ef354a572988e12f Mon Sep 17 00:00:00 2001 From: Nikita Shevyrin Date: Sat, 14 Dec 2024 12:57:50 +0500 Subject: [PATCH 25/26] moved private fields declarations --- Markdown/Markdown/ParseTree/MdParseTree.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Markdown/Markdown/ParseTree/MdParseTree.cs b/Markdown/Markdown/ParseTree/MdParseTree.cs index dff46b1df..7a54402c4 100644 --- a/Markdown/Markdown/ParseTree/MdParseTree.cs +++ b/Markdown/Markdown/ParseTree/MdParseTree.cs @@ -29,6 +29,9 @@ public Node(MdTokenType type, public List Children { get; set; } public Node? Parent { get; set; } } + + private readonly Node _root; + private Node _current; public MdParseTree() { @@ -36,9 +39,6 @@ public MdParseTree() _current = _root; } - private readonly Node _root; - private Node _current; - public ParseTreeNodeView CurrentToken => new(_current.Text, _current.Type, _current.Children.Count == 0, _current.Complete, _current.InsideWord); From 96b27965a79f9892aea3f7fb5efbfc7f11e8134c Mon Sep 17 00:00:00 2001 From: Nikita Shevyrin Date: Sat, 14 Dec 2024 14:34:09 +0500 Subject: [PATCH 26/26] moved performance tests to a different project and added more test cases --- .../Markdown.PerformanceTests.csproj | 21 +++++++ .../MdPerformanceTests.cs | 55 +++++++++++++++++++ Markdown/Markdown.Tests/MdTests.cs | 41 -------------- Markdown/Markdown.sln | 6 ++ Markdown/Markdown/PerformanceMeasurer.cs | 23 ++++++++ 5 files changed, 105 insertions(+), 41 deletions(-) create mode 100644 Markdown/Markdown.PerformanceTests/Markdown.PerformanceTests.csproj create mode 100644 Markdown/Markdown.PerformanceTests/MdPerformanceTests.cs create mode 100644 Markdown/Markdown/PerformanceMeasurer.cs diff --git a/Markdown/Markdown.PerformanceTests/Markdown.PerformanceTests.csproj b/Markdown/Markdown.PerformanceTests/Markdown.PerformanceTests.csproj new file mode 100644 index 000000000..f3221b756 --- /dev/null +++ b/Markdown/Markdown.PerformanceTests/Markdown.PerformanceTests.csproj @@ -0,0 +1,21 @@ + + + + Exe + net8.0 + enable + enable + + + + + + + + + + + + + + diff --git a/Markdown/Markdown.PerformanceTests/MdPerformanceTests.cs b/Markdown/Markdown.PerformanceTests/MdPerformanceTests.cs new file mode 100644 index 000000000..afb11db86 --- /dev/null +++ b/Markdown/Markdown.PerformanceTests/MdPerformanceTests.cs @@ -0,0 +1,55 @@ +using System.Text; +using FluentAssertions; +using NUnit.Framework; + +namespace Markdown.PerformanceTests; + +[TestFixture] +public class MdPerformanceTests +{ + private IRenderer _mdRenderer; + private PerformanceMeasurer _measurer; + + [SetUp] + public void SetUp() + { + _mdRenderer = DefaultMdFactory.CreateMd(); + _measurer = new PerformanceMeasurer(Console.WriteLine); + } + + [Test] + [TestCase("Hello _world_!\n", 18000)] + [TestCase("# _Hello_\n __world__!\n", 9000)] + [TestCase("This __text _contains_ nested__ markdown\n", 10000)] + [TestCase("This is _an example __of inversed__ nested_ markdown\n", 8000)] + [TestCase("Text_12_3\n", 15000)] + [TestCase("Text __that_12_3__ is in bold\n", 10000)] + [TestCase("_begin_ning\n", 20000)] + [TestCase("end_ing_\n", 20000)] + [TestCase("mi__ddl__e\n", 15000)] + [TestCase("This sh_ould not cha_nge\n", 10000)] + [TestCase("This sh__o_uld_ wo__rk like this\n", 9000)] + [TestCase("__Unpaired_ markdown\n", 20000)] + [TestCase("Another _unpaired markdown__\n", 18000)] + [TestCase("Intersecting _markdown __should_ work__ like this\n", 10000)] + [TestCase("This should ____ remain the same\n", 15000)] + [TestCase(@"This should \_not turn\_ into tags", 20000)] + [TestCase(@"This should \remain the\ same", 20000)] + public void PerformanceTest(string testInput, int stringRepetitions) + { + var str = ArrangePerformanceTest(testInput, stringRepetitions); + Console.WriteLine($"Total length: {str.Length}"); + + _measurer.MeasureAverageTime(() => _mdRenderer.Render(str), 10) + .Should() + .BeLessOrEqualTo(1000); + } + + private string ArrangePerformanceTest(string input, int copyCount) + { + var sb = new StringBuilder(); + for (var i = 0; i < copyCount; i++) + sb.Append(input); + return sb.ToString(); + } +} \ No newline at end of file diff --git a/Markdown/Markdown.Tests/MdTests.cs b/Markdown/Markdown.Tests/MdTests.cs index cbf76b32d..9d9a4a8cd 100644 --- a/Markdown/Markdown.Tests/MdTests.cs +++ b/Markdown/Markdown.Tests/MdTests.cs @@ -1,13 +1,4 @@ -using System; -using System.Collections.Generic; -using System.Diagnostics; -using System.Text; using FluentAssertions; -using Markdown.Parser; -using Markdown.ParseTree; -using Markdown.SyntaxRules; -using Markdown.Token; -using Markdown.Tokenizer; using NUnit.Framework; namespace Markdown.Tests; @@ -161,36 +152,4 @@ public void Render_ReturnsCorrectMarkdown_ForEscapeCharacters( .Should() .Be(expectedOutput); } - - [Test] - [Description("Тест на производительность")] - public void Render_PerformanceTest() - { - var fullStr = ArrangePerformanceTest("_Hello_ world_12. Hel_lo world_", 20000); - Console.WriteLine($"Total length: {fullStr.Length}"); - - var totalTime = MeasureTime(fullStr); - Console.WriteLine($"Time elapsed in ms: {totalTime}"); - - totalTime - .Should() - .BeLessThan(1000); - } - - private long MeasureTime(string fullStr) - { - var sw = new Stopwatch(); - sw.Start(); - _md.Render(fullStr); - sw.Stop(); - return sw.ElapsedMilliseconds; - } - - private string ArrangePerformanceTest(string input, int copyCount) - { - var sb = new StringBuilder(); - for (var i = 0; i < copyCount; i++) - sb.Append(input); - return sb.ToString(); - } } \ No newline at end of file diff --git a/Markdown/Markdown.sln b/Markdown/Markdown.sln index bd31bed26..e1ba4dda3 100644 --- a/Markdown/Markdown.sln +++ b/Markdown/Markdown.sln @@ -4,6 +4,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Markdown", "Markdown\Markdo EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Markdown.Tests", "Markdown.Tests\Markdown.Tests.csproj", "{0B1D2315-E457-4F38-92C9-5BC11A8752B6}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Markdown.PerformanceTests", "Markdown.PerformanceTests\Markdown.PerformanceTests.csproj", "{C030F3F2-BED4-42E7-830A-63B3A7541B4C}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -18,5 +20,9 @@ Global {0B1D2315-E457-4F38-92C9-5BC11A8752B6}.Debug|Any CPU.Build.0 = Debug|Any CPU {0B1D2315-E457-4F38-92C9-5BC11A8752B6}.Release|Any CPU.ActiveCfg = Release|Any CPU {0B1D2315-E457-4F38-92C9-5BC11A8752B6}.Release|Any CPU.Build.0 = Release|Any CPU + {C030F3F2-BED4-42E7-830A-63B3A7541B4C}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {C030F3F2-BED4-42E7-830A-63B3A7541B4C}.Debug|Any CPU.Build.0 = Debug|Any CPU + {C030F3F2-BED4-42E7-830A-63B3A7541B4C}.Release|Any CPU.ActiveCfg = Release|Any CPU + {C030F3F2-BED4-42E7-830A-63B3A7541B4C}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection EndGlobal diff --git a/Markdown/Markdown/PerformanceMeasurer.cs b/Markdown/Markdown/PerformanceMeasurer.cs new file mode 100644 index 000000000..a609a52d4 --- /dev/null +++ b/Markdown/Markdown/PerformanceMeasurer.cs @@ -0,0 +1,23 @@ +using System.Diagnostics; + +namespace Markdown; + +public class PerformanceMeasurer(Action logAction) +{ + public long MeasureAverageTime(Action action, int times) + { + var measures = new List(); + var stopwatch = new Stopwatch(); + for (var i = 0; i < times; i++) + { + stopwatch.Start(); + action(); + stopwatch.Stop(); + measures.Add(stopwatch.ElapsedMilliseconds); + stopwatch.Reset(); + } + var time = (long)Math.Round(measures.Average()); + logAction($"Average time in ms: {time}"); + return time; + } +} \ No newline at end of file