From 6f02ada4b9de454b1386f0bc1dab3864373868e4 Mon Sep 17 00:00:00 2001
From: "kashin.aleksandr"
Date: Wed, 11 Dec 2024 17:42:19 +0500
Subject: [PATCH] Refactor tokenizer

---
Review notes (free text between the "---" line and the diffstat is not part of the change):
 * TagProcessor.ProceedEscaped removes Escaped tokens again. The tokenizer used to do this
   with `tags = tags.Where(t => t.TagStatus != TagStatus.Escaped).ToList();` and the
   extraction into TagProcessor had dropped that step.
 * XML docs and comments were added to the new types, so the hunk sizes and the diffstat
   below differ from the original commit. The blob ids on the `index` lines and the commit
   id above were NOT regenerated.
 * Line structure was rebuilt from a whitespace-collapsed copy; blank context lines were
   placed using the hunk line counts. TODO: confirm against the repository.
 * Generic type arguments (`List`, `Stack`, `IEnumerable`) and the author e-mail appear to
   have been stripped from this copy. TODO: restore them from the repository.

 cs/Markdown/MarkdownRenderer.cs               |   9 +-
 .../Tests/Tokenizer/BoldHandlerTests.cs       |   4 +-
 .../Tests/Tokenizer/HeaderHandlerTests.cs     |   4 +-
 .../Tests/Tokenizer/ItalicHandlerTests.cs     |   4 +-
 cs/Markdown/Tokenizer/HandlerManager.cs       |  46 +++++
 cs/Markdown/Tokenizer/IHandlerManager.cs      |  16 ++
 cs/Markdown/Tokenizer/ITagProcessor.cs        |  14 ++
 cs/Markdown/Tokenizer/MarkdownTokenizer.cs    | 185 +++---------------
 cs/Markdown/Tokenizer/TagProcessor.cs         | 128 ++++++++++++
 9 files changed, 250 insertions(+), 160 deletions(-)
 create mode 100644 cs/Markdown/Tokenizer/HandlerManager.cs
 create mode 100644 cs/Markdown/Tokenizer/IHandlerManager.cs
 create mode 100644 cs/Markdown/Tokenizer/ITagProcessor.cs
 create mode 100644 cs/Markdown/Tokenizer/TagProcessor.cs

diff --git a/cs/Markdown/MarkdownRenderer.cs b/cs/Markdown/MarkdownRenderer.cs
index 0006e5ff3..1327312d8 100644
--- a/cs/Markdown/MarkdownRenderer.cs
+++ b/cs/Markdown/MarkdownRenderer.cs
@@ -1,5 +1,6 @@
 using Markdown.Render;
 using Markdown.Tokenizer;
+using Markdown.Tokenizer.Handlers;
 using Markdown.Tokenizer.Nodes;
 using Markdown.Tokenizer.Tags;
 
@@ -7,9 +8,15 @@ namespace Markdown;
 
 public class MarkdownRenderer : IMarkdown
 {
+    private readonly List handlers = new()
+    {
+        new HeaderHandler(),
+        new ItalicHandler(),
+        new BoldHandler(),
+    };
     public string Render(string markdown)
     {
-        var tokenizer = new MarkdownTokenizer();
+        var tokenizer = new MarkdownTokenizer(new HandlerManager(handlers), new TagProcessor());
         var renderer = new HtmlRenderer();
         var tokens = tokenizer.Tokenize(markdown);
         var tree = ToTree(tokens);
diff --git a/cs/Markdown/Tests/Tokenizer/BoldHandlerTests.cs b/cs/Markdown/Tests/Tokenizer/BoldHandlerTests.cs
index 1ceff7c41..a1519b5a0 100644
--- a/cs/Markdown/Tests/Tokenizer/BoldHandlerTests.cs
+++ b/cs/Markdown/Tests/Tokenizer/BoldHandlerTests.cs
@@ -1,5 +1,6 @@
 using FluentAssertions;
 using Markdown.Tokenizer;
+using Markdown.Tokenizer.Handlers;
 using Markdown.Tokenizer.Tags;
 
 namespace Markdown.Tests.Tokenizer;
@@ -10,7 +11,8 @@ public class BoldHandlerTests
     [TestCaseSource(nameof(BoldTokenSource))]
     public void BoldTokenizerTests((string input, Token[] tags) testCase)
     {
-        var tokenizer = new MarkdownTokenizer();
+        var handlers = new List() { new HeaderHandler(), new ItalicHandler(), new BoldHandler() };
+        var tokenizer = new MarkdownTokenizer(new HandlerManager(handlers), new TagProcessor());
         var res = tokenizer.Tokenize(testCase.input).ToArray();
 
         for (var i = 0; i < testCase.tags.Length; i++)
diff --git a/cs/Markdown/Tests/Tokenizer/HeaderHandlerTests.cs b/cs/Markdown/Tests/Tokenizer/HeaderHandlerTests.cs
index 2fc9cce02..5f81ce304 100644
--- a/cs/Markdown/Tests/Tokenizer/HeaderHandlerTests.cs
+++ b/cs/Markdown/Tests/Tokenizer/HeaderHandlerTests.cs
@@ -1,5 +1,6 @@
 using FluentAssertions;
 using Markdown.Tokenizer;
+using Markdown.Tokenizer.Handlers;
 using Markdown.Tokenizer.Tags;
 
 namespace Markdown.Tests.Tokenizer;
@@ -10,7 +11,8 @@ public class HeaderHandlerTests
     [TestCaseSource(nameof(HeaderTokenSource))]
     public void HeaderTokenizerTests((string input, Token[] tags) testCase)
     {
-        var tokenizer = new MarkdownTokenizer();
+        var handlers = new List() { new HeaderHandler(), new ItalicHandler(), new BoldHandler() };
+        var tokenizer = new MarkdownTokenizer(new HandlerManager(handlers), new TagProcessor());
         var res = tokenizer.Tokenize(testCase.input).ToArray();
 
         for (var i = 0; i < testCase.tags.Length; i++)
diff --git a/cs/Markdown/Tests/Tokenizer/ItalicHandlerTests.cs b/cs/Markdown/Tests/Tokenizer/ItalicHandlerTests.cs
index a07f4f2fd..c2b68e3c5 100644
--- a/cs/Markdown/Tests/Tokenizer/ItalicHandlerTests.cs
+++ b/cs/Markdown/Tests/Tokenizer/ItalicHandlerTests.cs
@@ -1,5 +1,6 @@
 using FluentAssertions;
 using Markdown.Tokenizer;
+using Markdown.Tokenizer.Handlers;
 using Markdown.Tokenizer.Tags;
 
 
@@ -11,7 +12,8 @@ public class ItalicParserTests
     [TestCaseSource(nameof(ItalicTokenSource))]
     public void ItalicTokenizerTests((string input, Token[] tags) testCase)
     {
-        var tokenizer = new MarkdownTokenizer();
+        var handlers = new List() { new HeaderHandler(), new ItalicHandler(), new BoldHandler() };
+        var tokenizer = new MarkdownTokenizer(new HandlerManager(handlers), new TagProcessor());
         var res = tokenizer.Tokenize(testCase.input).ToArray();
 
         for (var i = 0; i < testCase.tags.Length; i++)
diff --git a/cs/Markdown/Tokenizer/HandlerManager.cs b/cs/Markdown/Tokenizer/HandlerManager.cs
new file mode 100644
index 000000000..acc5f8969
--- /dev/null
+++ b/cs/Markdown/Tokenizer/HandlerManager.cs
@@ -0,0 +1,46 @@
+using System.Text;
+using Markdown.Tokenizer.Handlers;
+using Markdown.Tokenizer.Tags;
+
+namespace Markdown.Tokenizer;
+
+/// <summary>
+/// Offers the current symbol to the registered handlers in order and records the first tag one of them produces.
+/// </summary>
+public class HandlerManager(IEnumerable handlers) : IHandlerManager
+{
+    // Copy of the supplied handlers, taken once when the manager is constructed.
+    private readonly List handlers = handlers.ToList();
+
+    /// <summary>
+    /// Asks the handlers, in order, for a tag at the current position of <paramref name="context"/>.
+    /// </summary>
+    /// <remarks>
+    /// On the first non-null tag, any buffered text is flushed into <paramref name="tags"/> as a
+    /// <c>TextToken</c>; the tag is then appended to <paramref name="tags"/> and pushed onto
+    /// <paramref name="tagStack"/>. When no handler returns a tag, the current character is appended
+    /// to <paramref name="buffer"/>. This method does not call <c>context.Advance()</c> itself.
+    /// </remarks>
+    public void TryHandle(TokenizerContext context, StringBuilder buffer, List tags, Stack tagStack)
+    {
+        foreach (var handler in handlers)
+        {
+            var tag = handler.ProceedSymbol(context);
+            if (tag != null)
+            {
+                if (buffer.Length > 0)
+                {
+                    var token = new TextToken(buffer.ToString());
+                    tags.Add(token);
+                    buffer.Clear();
+                }
+
+                tags.Add(tag);
+                tagStack.Push(tag);
+                return;
+            }
+        }
+
+        buffer.Append(context.Current);
+    }
+}
\ No newline at end of file
diff --git a/cs/Markdown/Tokenizer/IHandlerManager.cs b/cs/Markdown/Tokenizer/IHandlerManager.cs
new file mode 100644
index 000000000..accf44f2c
--- /dev/null
+++ b/cs/Markdown/Tokenizer/IHandlerManager.cs
@@ -0,0 +1,16 @@
+using System.Text;
+using Markdown.Tokenizer.Tags;
+
+namespace Markdown.Tokenizer;
+
+/// <summary>
+/// Dispatches the tokenizer's current symbol to the registered tag handlers.
+/// </summary>
+public interface IHandlerManager
+{
+    /// <summary>
+    /// Processes the symbol at the current position of <paramref name="context"/>; implementations
+    /// may update <paramref name="buffer"/>, <paramref name="tags"/> and <paramref name="tagStack"/>.
+    /// </summary>
+    void TryHandle(TokenizerContext context, StringBuilder buffer, List tags, Stack tagStack);
+}
\ No newline at end of file
diff --git a/cs/Markdown/Tokenizer/ITagProcessor.cs b/cs/Markdown/Tokenizer/ITagProcessor.cs
new file mode 100644
index 000000000..7dd70d539
--- /dev/null
+++ b/cs/Markdown/Tokenizer/ITagProcessor.cs
@@ -0,0 +1,14 @@
+using Markdown.Tokenizer.Tags;
+
+namespace Markdown.Tokenizer;
+
+/// <summary>
+/// Post-processes the tokens collected by the tokenizer once the whole input has been read.
+/// </summary>
+public interface ITagProcessor
+{
+    /// <summary>
+    /// Adjusts <paramref name="tags"/> and the tags held on <paramref name="tagStack"/> in place.
+    /// </summary>
+    void Process(List tags, Stack tagStack);
+}
\ No newline at end of file
diff --git a/cs/Markdown/Tokenizer/MarkdownTokenizer.cs b/cs/Markdown/Tokenizer/MarkdownTokenizer.cs
index feba3617f..1d30c8779 100644
--- a/cs/Markdown/Tokenizer/MarkdownTokenizer.cs
+++ b/cs/Markdown/Tokenizer/MarkdownTokenizer.cs
@@ -5,192 +5,65 @@
 
 namespace Markdown.Tokenizer;
 
-public class MarkdownTokenizer : ITokenizer
+public class MarkdownTokenizer(IHandlerManager handlerManager, ITagProcessor tagProcessor) : ITokenizer
 {
     private readonly StringBuilder buffer = new();
     private List tags = new();
     private readonly Stack tagStack = new();
-    private readonly List handlers = new()
-    {
-        new HeaderHandler(),
-        new ItalicHandler(),
-        new BoldHandler(),
-    };
 
     public List Tokenize(string text)
     {
         var context = new TokenizerContext(text);
         while (!context.IsEnd)
         {
-            if (context.Current == '\n')
-            {
-                FlushBuffer();
-                var token = new NewLineToken();
-                tags.Add(token);
-                context.Advance();
-                continue;
-            }
-            if (context.Current == ' ')
-            {
-                if (buffer.Length > 0)
-                {
-                    tags.Add(new TextToken(buffer.ToString()));
-                    buffer.Clear();
-                }
-                buffer.Append(context.Current);
-                context.Advance();
-                continue;
-            }
-            if (context.Current == '\\')
-            {
-                FlushBuffer();
+            if(TryProceedSpecialSymbol(context)) continue;
 
-                tags.Add(new SlashToken());
-                context.Advance();
-                continue;
-            }
-
-            bool flag = false;
-            foreach (var handler in handlers)
-            {
-                var tag = handler.ProceedSymbol(context);
-                if (tag != null)
-                {
-                    if (buffer.Length > 0)
-                    {
-                        var token = new TextToken(buffer.ToString());
-                        tags.Add(token);
-                        buffer.Clear();
-                    }
-
-                    tags.Add(tag);
-                    tagStack.Push(tag);
-                    flag = true;
-                    break;
-                }
-            }
-
-            if (flag == false)
-            {
-                buffer.Append(context.Current);
-            }
+            handlerManager.TryHandle(context, buffer, tags, tagStack);
             context.Advance();
         }
 
         FlushBuffer();
-        ProceedEscaped();
-        ProceedInWords();
-        ProceedTags();
+
+        tagProcessor.Process(tags, tagStack);
+
         return tags;
     }
 
 
-    private void ProceedInWords()
+    // Handles '\n', ' ' and '\\' here and advances past them; returns false for every other symbol.
+    private bool TryProceedSpecialSymbol(TokenizerContext context)
     {
-        for (var i = 0; i < tags.Count; i++)
+        switch (context.Current)
         {
-            var current = tags[i];
-            if (current.TagStatus == TagStatus.InWord)
+            case '\n':
             {
-                if (i - 2 >= 0)
-                {
-                    if (tags[i - 1].TokenType == TokenType.String
-                        && tags[i - 2].TagStatus == TagStatus.Open)
-                    {
-                        current.TagStatus = TagStatus.Closed;
-                    }
-                }
+                FlushBuffer();
+                var token = new NewLineToken();
+                tags.Add(token);
+                context.Advance();
 
-                if (i + 2 < tags.Count)
-                {
-                    if (tags[i + 1].TokenType == TokenType.String)
-                    {
-                        if (tags[i + 2].TagStatus == TagStatus.Closed)
-                        {
-                            current.TagStatus = TagStatus.Open;
-                        }
-                        else if (tags[i + 2].TagStatus == TagStatus.InWord)
-                        {
-                            current.TagStatus = TagStatus.Open;
-                            tags[i + 2].TagStatus = TagStatus.Closed;
-                        }
-                    }
-                }
+                return true;
             }
-        }
-    }
-
-    private void ProceedEscaped()
-    {
-        for (var i = 0; i < tags.Count - 1; i++)
-        {
-            var current = tags[i];
-            var next = tags[i + 1];
-            if (current.TokenType is TokenType.Slash && current.TagStatus != TagStatus.Broken)
+            case ' ':
             {
-                if (next is { TokenType: TokenType.Slash })
-                {
-                    current.TagStatus = TagStatus.Escaped;
-                    next.TagStatus = TagStatus.Broken;
-                }
-                else if (next is { TagStatus: TagStatus.Open or TagStatus.Closed or TagStatus.Single })
+                if (buffer.Length > 0)
                 {
-                    next.TagStatus = TagStatus.Broken;
-                    current.TagStatus = TagStatus.Escaped;
+                    tags.Add(new TextToken(buffer.ToString()));
+                    buffer.Clear();
                 }
-            }
-        }
-
-        tags = tags.Where(t => t.TagStatus != TagStatus.Escaped).ToList();
-    }
-
-    private void ProceedTags()
-    {
-        var tempStack = new Stack();
-
-        while (tagStack.Count > 0)
-        {
-            var current = tagStack.Pop();
-
-            if (current.TagStatus != TagStatus.Broken && current.TagStatus != TagStatus.Single)
-            {
-                if (tempStack.Count > 0)
-                {
-                    var previousTag = tempStack.Peek();
+                buffer.Append(context.Current);
+                context.Advance();
 
-                    if (previousTag.TokenType == current.TokenType)
-                    {
-                        if (previousTag.TagStatus == TagStatus.Closed && current.TagStatus == TagStatus.Open)
-                        {
-                            tempStack.Pop();
-                        }
-                        else
-                        {
-                            tempStack.Push(current);
-                        }
-                    }
-                    else
-                    {
-                        if (current.TokenType == TokenType.Bold && previousTag.TokenType == TokenType.Italic)
-                        {
-                            current.TagStatus = TagStatus.Broken;
-                        }
-                        else
-                        {
-                            tempStack.Push(current);
-                        }
-                    }
-                }
-                else
-                {
-                    tempStack.Push(current);
-                }
+                return true;
             }
-        }
+            case '\\':
+                FlushBuffer();
+                tags.Add(new SlashToken());
+                context.Advance();
 
-        while (tempStack.Count > 0)
-        {
-            tempStack.Pop().TagStatus = TagStatus.Broken;
+                return true;
+            default:
+                return false;
         }
     }
 
diff --git a/cs/Markdown/Tokenizer/TagProcessor.cs b/cs/Markdown/Tokenizer/TagProcessor.cs
new file mode 100644
index 000000000..8fe8747c4
--- /dev/null
+++ b/cs/Markdown/Tokenizer/TagProcessor.cs
@@ -0,0 +1,128 @@
+using Markdown.Tokenizer.Tags;
+
+namespace Markdown.Tokenizer;
+
+/// <summary>
+/// Resolves the final status of the collected tags: escaping first, then in-word tags, then pairing.
+/// </summary>
+public class TagProcessor : ITagProcessor
+{
+    /// <summary>
+    /// Runs the escape, in-word and pairing passes in that order, mutating the given collections.
+    /// </summary>
+    public void Process(List tags, Stack tagStack)
+    {
+        ProceedEscaped(tags);
+        ProceedInWords(tags);
+        ProceedTags(tagStack);
+    }
+
+    // Resolves InWord tags to Open or Closed by looking at the tokens one and two positions away.
+    private void ProceedInWords(List tags)
+    {
+        for (var i = 0; i < tags.Count; i++)
+        {
+            var current = tags[i];
+            if (current.TagStatus == TagStatus.InWord)
+            {
+                if (i - 2 >= 0)
+                {
+                    if (tags[i - 1].TokenType == TokenType.String
+                        && tags[i - 2].TagStatus == TagStatus.Open)
+                    {
+                        current.TagStatus = TagStatus.Closed;
+                    }
+                }
+
+                if (i + 2 >= tags.Count) continue;
+                if (tags[i + 1].TokenType != TokenType.String) continue;
+                if (tags[i + 2].TagStatus == TagStatus.Closed)
+                {
+                    current.TagStatus = TagStatus.Open;
+                }
+                else if (tags[i + 2].TagStatus == TagStatus.InWord)
+                {
+                    current.TagStatus = TagStatus.Open;
+                    tags[i + 2].TagStatus = TagStatus.Closed;
+                }
+            }
+        }
+    }
+
+    // Marks escaping backslashes and the tokens they escape, then removes the escaping backslashes.
+    private void ProceedEscaped(List tags)
+    {
+        for (var i = 0; i < tags.Count - 1; i++)
+        {
+            var current = tags[i];
+            var next = tags[i + 1];
+            if (current.TokenType is TokenType.Slash && current.TagStatus != TagStatus.Broken)
+            {
+                if (next is { TokenType: TokenType.Slash })
+                {
+                    current.TagStatus = TagStatus.Escaped;
+                    next.TagStatus = TagStatus.Broken;
+                }
+                else if (next is { TagStatus: TagStatus.Open or TagStatus.Closed or TagStatus.Single })
+                {
+                    next.TagStatus = TagStatus.Broken;
+                    current.TagStatus = TagStatus.Escaped;
+                }
+            }
+        }
+
+        // Escaped tokens must be gone before the in-word pass inspects neighbouring positions.
+        // Remove them in place: the caller keeps using (and returns) this same list instance.
+        tags.RemoveAll(t => t.TagStatus == TagStatus.Escaped);
+    }
+
+    // Pops tags last-to-first, cancelling an Open tag against a pending Closed tag of the same type.
+    // A Bold tag met while an Italic tag is pending, and every tag left unpaired, becomes Broken.
+    private void ProceedTags(Stack tagStack)
+    {
+        var tempStack = new Stack();
+
+        while (tagStack.Count > 0)
+        {
+            var current = tagStack.Pop();
+
+            if (current.TagStatus != TagStatus.Broken && current.TagStatus != TagStatus.Single)
+            {
+                if (tempStack.Count > 0)
+                {
+                    var previousTag = tempStack.Peek();
+
+                    if (previousTag.TokenType == current.TokenType)
+                    {
+                        if (previousTag.TagStatus == TagStatus.Closed && current.TagStatus == TagStatus.Open)
+                        {
+                            tempStack.Pop();
+                        }
+                        else
+                        {
+                            tempStack.Push(current);
+                        }
+                    }
+                    else
+                    {
+                        if (current.TokenType == TokenType.Bold && previousTag.TokenType == TokenType.Italic)
+                        {
+                            current.TagStatus = TagStatus.Broken;
+                        }
+                        else
+                        {
+                            tempStack.Push(current);
+                        }
+                    }
+                }
+                else
+                {
+                    tempStack.Push(current);
+                }
+            }
+        }
+
+        while (tempStack.Count > 0)
+            tempStack.Pop().TagStatus = TagStatus.Broken;
+    }
+}
\ No newline at end of file