diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
new file mode 100644
index 0000000..97df1af
--- /dev/null
+++ b/.github/workflows/release.yml
@@ -0,0 +1,15 @@
+# Runs the release-on-push action on every push to main, asking it to bump the
+# minor version.
+on:
+  push:
+    branches:
+      - main
+
+jobs:
+  release-on-push:
+    # Least privilege instead of write-all: creating tags/releases needs
+    # contents: write, and reading pull request metadata needs pull-requests: read.
+    permissions:
+      contents: write
+      pull-requests: read
+    runs-on: ubuntu-latest
+    env:
+      GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+    steps:
+      # NOTE(review): @master is a moving ref; consider pinning to a release tag or commit SHA.
+      - uses: rymndhng/release-on-push-action@master
+        with:
+          bump_version_scheme: minor
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..62c8935
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+.idea/
\ No newline at end of file
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..b6d3e20
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,12 @@
+## We Use [Github Flow](https://guides.github.com/introduction/flow/index.html), So All Code Changes Happen Through Pull Requests
+Pull requests are the best way to propose changes to the codebase (we use [Github Flow](https://guides.github.com/introduction/flow/index.html)). We actively welcome your pull requests:
+
+1. Fork the repo and create your branch from `main`.
+2. If you've added code that should be tested, add tests.
+3. If you've changed APIs, update the documentation.
+4. Ensure the test suite passes.
+5. Make sure your code lints.
+6. Issue that pull request!
+
+## License
+By contributing, you agree that your contributions will be licensed under the project's MIT License.
\ No newline at end of file
diff --git a/LICENSE.md b/LICENSE.md
new file mode 100644
index 0000000..b85cb2f
--- /dev/null
+++ b/LICENSE.md
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2018 GitHub, Inc. and contributors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..7eab989
--- /dev/null
+++ b/README.md
@@ -0,0 +1,85 @@
+# Plain Text Extractor
+[![License](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)
+[![Go Report Card](https://goreportcard.com/badge/github.com/huantt/plaintext-extractor)](https://goreportcard.com/report/github.com/huantt/plaintext-extractor)
+
+Plain Text Extractor is a Golang library that helps you extract plain text from `HTML` and `Markdown`.
+
+It provides a flexible and extensible interface for extracting the plain text content using both the predefined extraction methods and your own custom extraction requirements.
+
+## Features
+- Parse HTML and Markdown documents into plain text.
+- Support for custom extraction functions.
+- Easy-to-use API to convert complex documents to simple plain text.
+
+## Installation
+```shell
+go get github.com/huantt/plaintext-extractor
+```
+
+## Usage
+### Markdown extractor
+```go
+markdownContent := "# H1 \n*italic* **bold** `code` `not code [link](https://example.com) ![image](https://image.com/image.png) ~~strikethrough~~"
+extractor := NewMarkdownExtractor()
+output, err := extractor.PlainText(markdownContent)
+if err != nil {
+ panic(err)
+}
+fmt.Println(*output)
+// Output: H1 \nitalic bold code `not code link image strikethrough
+```
+
+### Custom Markdown Tag
+```go
+markdownContent := "This is {color:#0A84FF}red{color}"
+
+customTag := markdown.Tag{
+ Name: "color-custom-tag",
+ FullRegex: regexp.MustCompile("{color:[a-zA-Z0-9#]+}(.*?){color}"),
+ StartRegex: regexp.MustCompile("{color:[a-zA-Z0-9#]+}"),
+ EndRegex: regexp.MustCompile("{color}"),
+}
+
+markdownExtractor := NewMarkdownExtractor(customTag)
+plaintextExtractor := plaintext.NewExtractor(markdownExtractor.PlainText)
+plaintext, err := plaintextExtractor.PlainText(markdownContent)
+if err != nil {
+ panic(err)
+}
+fmt.Println(*plaintext)
+// Output: This is red
+```
+
+### HTML Extractor
+```go
+html := `
`
+extractor := NewHtmlExtractor()
+output, err := extractor.PlainText(html)
+if err != nil {
+ panic(err)
+}
+fmt.Println(*output)
+// Output: This is a link
+```
+
+### Multiple extractors
+```go
+input := ` html
*markdown*`
+markdownExtractor := markdown.NewExtractor()
+htmlExtractor := html.NewExtractor()
+extractor := NewExtractor(markdownExtractor.PlainText, htmlExtractor.PlainText)
+output, err := extractor.PlainText(input)
+if err != nil {
+ panic(err)
+}
+fmt.Println(*output)
+// Output: html markdown
+```
+
+## Contribution
+Contributions to the Plain Text Extractor project are welcome!
+If you find any issues or want to add new features, please feel free to open an issue or submit a pull request.
+Please see the [CONTRIBUTING.md](./CONTRIBUTING.md) for more information.
+
+## License
+This project is released under the MIT License; see the [LICENSE](./LICENSE.md) file.
\ No newline at end of file
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..58a980f
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,14 @@
+module github.com/huantt/plaintext-extractor
+
+go 1.18
+
+require (
+ github.com/stretchr/testify v1.8.0
+ golang.org/x/net v0.12.0
+)
+
+require (
+ github.com/davecgh/go-spew v1.1.1 // indirect
+ github.com/pmezard/go-difflib v1.0.0 // indirect
+ gopkg.in/yaml.v3 v3.0.1 // indirect
+)
diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..fc6abb3
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,17 @@
+github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
+github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
+github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
+github.com/stretchr/testify v1.8.0 h1:pSgiaMZlXftHpm5L7V1+rVB+AZJydKsMxsQBIJw4PKk=
+github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
+golang.org/x/net v0.12.0 h1:cfawfvKITfUsFCeJIHJrbSxpeu/E81khclypR0GVT50=
+golang.org/x/net v0.12.0/go.mod h1:zEVYFnQC7m/vmpQFELhcD1EWkZlX69l4oqgmer6hfKA=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
+gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
+gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
diff --git a/html/.keep b/html/.keep
new file mode 100644
index 0000000..e69de29
diff --git a/html/extractor.go b/html/extractor.go
new file mode 100644
index 0000000..ebd87f2
--- /dev/null
+++ b/html/extractor.go
@@ -0,0 +1,65 @@
+package html
+
+import (
+ "golang.org/x/net/html"
+ "strings"
+)
+
+// Extractor is an HTML-specific plain text extractor.
+type Extractor struct {
+ // blockTags is the set of tag names treated as block-level: text whose
+ // parent tag is in this set is placed on a new line by extractText.
+ blockTags map[string]bool
+}
+
+// NewExtractor returns an Extractor whose block-level tag set is the package
+// default list (blockTags) plus any otherBlockTags supplied by the caller.
+// Duplicate names collapse into a single set entry.
+func NewExtractor(otherBlockTags ...string) *Extractor {
+	set := make(map[string]bool, len(blockTags)+len(otherBlockTags))
+	for _, group := range [][]string{blockTags, otherBlockTags} {
+		for _, name := range group {
+			set[name] = true
+		}
+	}
+	return &Extractor{blockTags: set}
+}
+
+// PlainText parses input as HTML and returns the text it contains.
+//
+// It returns a nil string and the parser's error when html.Parse fails;
+// otherwise the error is nil and the string pointer is non-nil.
+func (e *Extractor) PlainText(input string) (*string, error) {
+	root, err := html.Parse(strings.NewReader(input))
+	if err != nil {
+		return nil, err
+	}
+
+	var sb strings.Builder
+	e.extractText(&sb, root)
+
+	// extractText separates consecutive text nodes with a space, which leaves
+	// a stray space right after a newline it has just written; drop it.
+	cleaned := strings.ReplaceAll(sb.String(), "\n ", "\n")
+	return &cleaned, nil
+}
+
+// extractText walks the tree rooted at node in document order and appends the
+// text it finds to plainText.
+//
+// Every text node is trimmed; empty results are skipped. Unless it is the
+// first text written, a text node is preceded by "\n" when its parent element
+// is in e.blockTags and by " " otherwise. A <br> element writes "\n" and is
+// not descended into.
+//
+// Tag names are read from Node.Data rather than Node.DataAtom: DataAtom is
+// zero for names that are not in the parser's table of known tags, so custom
+// block tags passed to NewExtractor could never match through it.
+func (e *Extractor) extractText(plainText *strings.Builder, node *html.Node) {
+	switch node.Type {
+	case html.TextNode:
+		text := strings.TrimSpace(node.Data)
+		if text == "" {
+			return
+		}
+		if plainText.Len() > 0 {
+			sep := " "
+			parent := node.Parent
+			if parent != nil && parent.Type == html.ElementNode && e.blockTags[parent.Data] {
+				sep = "\n"
+			}
+			plainText.WriteString(sep)
+		}
+		plainText.WriteString(text)
+		return // text nodes have no children
+	case html.ElementNode:
+		if node.Data == "br" {
+			plainText.WriteString("\n")
+			return
+		}
+	}
+
+	for child := node.FirstChild; child != nil; child = child.NextSibling {
+		e.extractText(plainText, child)
+	}
+}
diff --git a/html/extractor_test.go b/html/extractor_test.go
new file mode 100644
index 0000000..98fb648
--- /dev/null
+++ b/html/extractor_test.go
@@ -0,0 +1,26 @@
+package html
+
+import (
+ "github.com/stretchr/testify/assert"
+ "testing"
+)
+
+// TestExtract runs each table entry through Extractor.PlainText and asserts
+// that no error is returned and that the output equals the expected string.
+// NOTE(review): several inputs look like their HTML markup was lost (for
+// example an empty input that is expected to yield "This is a link") —
+// confirm the literals against the original file.
+func TestExtract(t *testing.T) {
+ extractor := NewExtractor()
+ tests := []struct {
+ input string
+ expected string
+ }{
+ {`a
b`, "a\nb"},
+ {`a
b
`, "a\n\nb"},
+ {`link`, "link"},
+ {``, "This is a link"},
+ {"", "Heading 1\nHeading 2\nItem 1\nItem 2"},
+ }
+ for _, test := range tests {
+ output, err := extractor.PlainText(test.input)
+ assert.NoError(t, err)
+ assert.NotNil(t, output)
+ assert.Equal(t, test.expected, *output)
+ }
+}
diff --git a/html/html.go b/html/html.go
new file mode 100644
index 0000000..1494dd6
--- /dev/null
+++ b/html/html.go
@@ -0,0 +1,38 @@
+package html
+
+// blockTags lists the HTML tag names that NewExtractor treats as block-level
+// by default.
+var blockTags = []string{
+	"address", "article", "aside", "blockquote", "canvas",
+	"dd", "div", "dl", "dt",
+	"fieldset", "figcaption", "figure", "footer", "form",
+	"h1", "h2", "h3", "h4", "h5", "h6",
+	"header", "hr", "li", "main", "nav", "noscript",
+	"ol", "p", "pre", "section",
+	"table", "tfoot", "ul", "video",
+}
diff --git a/markdown/extractor.go b/markdown/extractor.go
new file mode 100644
index 0000000..bff5965
--- /dev/null
+++ b/markdown/extractor.go
@@ -0,0 +1,46 @@
+package markdown
+
+import (
+ "bytes"
+)
+
+// Extractor represents a Markdown specific plain text extractor with custom tags support.
+type Extractor struct {
+ // tags are the markup definitions that PlainText applies, in slice order.
+ tags []Tag
+}
+
+// NewExtractor returns a Markdown Extractor that applies the package's
+// built-in tags first, followed by any customTags in the order given.
+func NewExtractor(customTags ...Tag) *Extractor {
+	all := make([]Tag, 0, len(tags)+len(customTags))
+	all = append(all, tags...)
+	all = append(all, customTags...)
+	return &Extractor{tags: all}
+}
+
+// AddTag appends a custom tag to the Extractor's tag list and returns the same
+// Extractor so that calls can be chained.
+func (e *Extractor) AddTag(tag Tag) *Extractor {
+ e.tags = append(e.tags, tag)
+ return e
+}
+
+// PlainText extracts plain text from the input Markdown string.
+//
+// Tags are applied in slice order. For each tag, every FullRegex match is
+// replaced in place by the match with StartRegex and EndRegex removed (either
+// may be nil, in which case that side is left untouched). Replacement is done
+// by match position, so text that merely contains the same bytes as a match
+// somewhere the pattern did not match is left alone. Tags without a FullRegex
+// are skipped.
+//
+// The returned pointer is never nil and the error is always nil.
+func (e *Extractor) PlainText(input string) (*string, error) {
+	text := []byte(input)
+	for _, tag := range e.tags {
+		if tag.FullRegex == nil {
+			continue
+		}
+		spans := tag.FullRegex.FindAllIndex(text, -1)
+		if len(spans) == 0 {
+			continue
+		}
+
+		var rebuilt bytes.Buffer
+		rebuilt.Grow(len(text))
+		prev := 0
+		for _, span := range spans {
+			// Copy the untouched text between the previous match and this one.
+			rebuilt.Write(text[prev:span[0]])
+
+			plain := text[span[0]:span[1]]
+			if tag.StartRegex != nil {
+				plain = tag.StartRegex.ReplaceAll(plain, nil)
+			}
+			if tag.EndRegex != nil {
+				plain = tag.EndRegex.ReplaceAll(plain, nil)
+			}
+			rebuilt.Write(plain)
+			prev = span[1]
+		}
+		rebuilt.Write(text[prev:])
+		text = rebuilt.Bytes()
+	}
+	output := string(text)
+	return &output, nil
+}
diff --git a/markdown/extractor_test.go b/markdown/extractor_test.go
new file mode 100644
index 0000000..540476c
--- /dev/null
+++ b/markdown/extractor_test.go
@@ -0,0 +1,56 @@
+package markdown
+
+import (
+ "fmt"
+ "github.com/stretchr/testify/assert"
+ "regexp"
+ "testing"
+)
+
+// TestExtract feeds each Markdown sample through a default Extractor and
+// asserts that the returned plain text equals the expected string.
+func TestExtract(t *testing.T) {
+	extractor := NewExtractor()
+	cases := []struct {
+		input, expected string
+	}{
+		{input: "~~strikethrough~~", expected: "strikethrough"},
+		{
+			input:    "# H1 \n*italic* **bold** `code` `not code [link](https://example.com) ![image](https://image.com/image.png) ~~strikethrough~~",
+			expected: "H1 \nitalic bold code `not code link image strikethrough",
+		},
+		{input: "# H1 \n new line", expected: "H1 \n new line"},
+		{input: "*italic*", expected: "italic"},
+		{input: "**bold**", expected: "bold"},
+		{input: "`code`", expected: "code"},
+		{input: "`node code", expected: "`node code"},
+		{input: "[link](https://example.com)", expected: "link"},
+		{input: "[image](https://image.com/image.png)", expected: "image"},
+	}
+
+	for _, tc := range cases {
+		got, err := extractor.PlainText(tc.input)
+		assert.NoError(t, err)
+		assert.NotNil(t, got)
+		assert.Equal(t, tc.expected, *got)
+	}
+	fmt.Println()
+}
+
+// TestCustomTag registers a {color:...}...{color} tag through NewExtractor and
+// asserts that the markers are removed while the wrapped text is kept.
+func TestCustomTag(t *testing.T) {
+	colorTag := Tag{
+		Name:       "color-custom-tag",
+		FullRegex:  regexp.MustCompile("{color:[a-zA-Z0-9#]+}(.*?){color}"),
+		StartRegex: regexp.MustCompile("{color:[a-zA-Z0-9#]+}"),
+		EndRegex:   regexp.MustCompile("{color}"),
+	}
+	extractor := NewExtractor(colorTag)
+
+	cases := []struct {
+		input, expected string
+	}{
+		{input: "This is {color:#0A84FF}red{color}", expected: "This is red"},
+	}
+	for _, tc := range cases {
+		got, err := extractor.PlainText(tc.input)
+		assert.NoError(t, err)
+		assert.NotNil(t, got)
+		assert.Equal(t, tc.expected, *got)
+	}
+}
diff --git a/markdown/tag.go b/markdown/tag.go
new file mode 100644
index 0000000..37b707b
--- /dev/null
+++ b/markdown/tag.go
@@ -0,0 +1,55 @@
+package markdown
+
+import "regexp"
+
+// Tag describes one kind of Markdown markup and how to reduce it to plain text.
+type Tag struct {
+	// Name is a descriptive label for the tag.
+	Name string
+	// FullRegex matches one complete occurrence of the markup, delimiters included.
+	FullRegex *regexp.Regexp
+	// StartRegex, when non-nil, is removed from each FullRegex match
+	// (the opening delimiter).
+	StartRegex *regexp.Regexp
+	// EndRegex, when non-nil, is removed from each FullRegex match
+	// (the closing delimiter).
+	EndRegex *regexp.Regexp
+}
+
+// tags are the built-in Markdown tags, in the order they are applied.
+// Bold precedes Italic so "**x**" is not consumed as two italic markers, and
+// Image precedes Link so the leading "!" is removed together with the brackets.
+var tags = []Tag{
+	{
+		Name: "Header",
+		// (?m) makes ^ match at the start of every line, not only at the start
+		// of the input; [ \t]+ keeps the match from running across a newline.
+		FullRegex:  regexp.MustCompile(`(?m)^#{1,6}[ \t]+(.*)`),
+		StartRegex: regexp.MustCompile(`^#{1,6}[ \t]+`),
+		EndRegex:   nil,
+	},
+	{
+		Name: "Bold",
+		// \b keeps intraword underscores (a__b__c) from being read as markup.
+		FullRegex:  regexp.MustCompile(`\*\*(.*?)\*\*|\b__(.+?)__\b`),
+		StartRegex: regexp.MustCompile(`^(\*\*|__)`),
+		EndRegex:   regexp.MustCompile(`(\*\*|__)$`),
+	},
+	{
+		Name: "Italic",
+		// \b keeps identifiers such as snake_case_name intact.
+		FullRegex:  regexp.MustCompile(`\*(.*?)\*|\b_(.+?)_\b`),
+		StartRegex: regexp.MustCompile(`^[*_]`),
+		EndRegex:   regexp.MustCompile(`[*_]$`),
+	},
+	{
+		Name:       "Strikethrough",
+		FullRegex:  regexp.MustCompile(`~~(.*?)~~`),
+		StartRegex: regexp.MustCompile(`^~~`),
+		EndRegex:   regexp.MustCompile(`~~$`),
+	},
+	{
+		Name:       "InlineCode",
+		FullRegex:  regexp.MustCompile("\\`(.+?)\\`"),
+		StartRegex: regexp.MustCompile("\\`"),
+		EndRegex:   regexp.MustCompile("\\`"),
+	},
+	{
+		Name:       "Image",
+		FullRegex:  regexp.MustCompile(`\!\[(.*?)\]\((.*?)\)`),
+		StartRegex: regexp.MustCompile(`\!\[`),
+		EndRegex:   regexp.MustCompile(`\]\((.*?)\)`),
+	},
+	{
+		Name:       "Link",
+		FullRegex:  regexp.MustCompile(`\[(.*?)\]\((.*?)\)`),
+		StartRegex: regexp.MustCompile(`\[`),
+		EndRegex:   regexp.MustCompile(`\]\((.*?)\)`),
+	},
+}
diff --git a/parser.go b/parser.go
new file mode 100644
index 0000000..d98df99
--- /dev/null
+++ b/parser.go
@@ -0,0 +1,51 @@
+package plaintext
+
+import (
+ "github.com/huantt/plaintext-extractor/html"
+ "github.com/huantt/plaintext-extractor/markdown"
+)
+
+// Extractor is a plain text extractor that runs an input string through one or
+// more extract functions (for example the html or markdown ones).
+type Extractor struct {
+ // extractFuncs are applied in order by PlainText; each one receives the
+ // previous one's output.
+ extractFuncs []ExtractFunc
+}
+
+// NewExtractor returns an Extractor that applies extractFunc first and then
+// each of moreFuncs, in the order given.
+func NewExtractor(extractFunc ExtractFunc, moreFuncs ...ExtractFunc) *Extractor {
+	funcs := make([]ExtractFunc, 0, 1+len(moreFuncs))
+	funcs = append(funcs, extractFunc)
+	funcs = append(funcs, moreFuncs...)
+	return &Extractor{extractFuncs: funcs}
+}
+
+// NewMarkdownExtractor returns an Extractor whose only extract function is the
+// PlainText method of a markdown extractor built with the given customTags.
+func NewMarkdownExtractor(customTags ...markdown.Tag) *Extractor {
+ return NewExtractor(markdown.NewExtractor(customTags...).PlainText)
+}
+
+// NewHtmlExtractor returns an Extractor whose only extract function is the
+// PlainText method of an html extractor built with the given blockTags.
+func NewHtmlExtractor(blockTags ...string) *Extractor {
+ return NewExtractor(html.NewExtractor(blockTags...).PlainText)
+}
+
+// AddExtractor appends an extract function to the Extractor instance, so it
+// runs after the ones already registered, and returns the same Extractor so
+// that calls can be chained.
+func (p *Extractor) AddExtractor(extractor ExtractFunc) *Extractor {
+ p.extractFuncs = append(p.extractFuncs, extractor)
+ return p
+}
+
+// ExtractFunc is the function signature for extracting plain text from a given input string.
+// Implement it to add a custom extraction step and pass it to NewExtractor or Extractor.AddExtractor.
+type ExtractFunc func(input string) (*string, error)
+
+// PlainText extracts plain text from the input string using the registered
+// extract functions.
+//
+// The functions run in registration order, each receiving the previous one's
+// output. The first error stops the chain and is returned with a nil result.
+// Nil function entries are skipped. If a function returns a nil string with a
+// nil error, the chain stops and (nil, nil) is returned instead of
+// dereferencing the nil pointer in the next step.
+func (p *Extractor) PlainText(input string) (plainText *string, err error) {
+	current := &input
+	for _, extract := range p.extractFuncs {
+		if extract == nil {
+			continue
+		}
+		current, err = extract(*current)
+		if err != nil {
+			return nil, err
+		}
+		if current == nil {
+			// Nothing to hand to the next function.
+			return nil, nil
+		}
+	}
+	return current, nil
+}
diff --git a/parser_test.go b/parser_test.go
new file mode 100644
index 0000000..fcbaec0
--- /dev/null
+++ b/parser_test.go
@@ -0,0 +1,59 @@
+package plaintext
+
+import (
+ "github.com/huantt/plaintext-extractor/html"
+ "github.com/huantt/plaintext-extractor/markdown"
+ "github.com/stretchr/testify/assert"
+ "testing"
+)
+
+// TestParseHtml runs each table entry through NewHtmlExtractor().PlainText and
+// asserts that no error is returned and that the output equals the expected string.
+// NOTE(review): the input literal is empty although "This is a link" is
+// expected — its HTML markup appears to have been lost; confirm against the
+// original file.
+func TestParseHtml(t *testing.T) {
+ tests := []struct {
+ input string
+ expected string
+ }{
+ {``, "This is a link"},
+ }
+ for _, test := range tests {
+ extractor := NewHtmlExtractor()
+ output, err := extractor.PlainText(test.input)
+ assert.NoError(t, err)
+ assert.NotNil(t, output)
+ assert.Equal(t, test.expected, *output)
+ }
+}
+
+// TestParseMarkdown checks that an Extractor built by NewMarkdownExtractor
+// strips the built-in Markdown markup from a mixed sample.
+func TestParseMarkdown(t *testing.T) {
+	cases := []struct {
+		input, expected string
+	}{
+		{
+			input:    "# H1 \n*italic* **bold** `code` `not code [link](https://example.com) ![image](https://image.com/image.png) ~~strikethrough~~",
+			expected: "H1 \nitalic bold code `not code link image strikethrough",
+		},
+	}
+	for _, tc := range cases {
+		got, err := NewMarkdownExtractor().PlainText(tc.input)
+		assert.NoError(t, err)
+		assert.NotNil(t, got)
+		assert.Equal(t, tc.expected, *got)
+	}
+}
+
+// TestMultipleExtractors chains the markdown extractor and then the html
+// extractor through NewExtractor and asserts the combined output for each
+// table entry.
+// NOTE(review): the input literals span raw line breaks and look like their
+// HTML markup was lost; confirm against the original file.
+func TestMultipleExtractors(t *testing.T) {
+ tests := []struct {
+ input string
+ expected string
+ }{
+ {" html
*markdown*", "html markdown"},
+ {" *markdown in html*
", "markdown in html"},
+ }
+ for _, test := range tests {
+ markdownExtractor := markdown.NewExtractor()
+ htmlExtractor := html.NewExtractor()
+ extractor := NewExtractor(markdownExtractor.PlainText, htmlExtractor.PlainText)
+ output, err := extractor.PlainText(test.input)
+ assert.NoError(t, err)
+ assert.NotNil(t, output)
+ assert.Equal(t, test.expected, *output)
+ }
+}