diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..97df1af --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,15 @@ +on: + push: + branches: + - main + +jobs: + release-on-push: + permissions: write-all + runs-on: ubuntu-latest + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + steps: + - uses: rymndhng/release-on-push-action@master + with: + bump_version_scheme: minor diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..62c8935 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.idea/ \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..b6d3e20 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,12 @@ +## We Use [Github Flow](https://guides.github.com/introduction/flow/index.html), So All Code Changes Happen Through Pull Requests +Pull requests are the best way to propose changes to the codebase (we use [Github Flow](https://guides.github.com/introduction/flow/index.html)). We actively welcome your pull requests: + +1. Fork the repo and create your branch from `main`. +2. If you've added code that should be tested, add tests. +3. If you've changed APIs, update the documentation. +4. Ensure the test suite passes. +5. Make sure your code lints. +6. Issue that pull request! + +## License +By contributing, you agree that your contributions will be licensed under its MIT License. \ No newline at end of file diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..b85cb2f --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2018 GitHub, Inc. 
and contributors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..7eab989 --- /dev/null +++ b/README.md @@ -0,0 +1,85 @@ +# Plain Text Extractor +[![License](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT) +[![Go Report Card](https://goreportcard.com/badge/github.com/huantt/plaintext-parser)](https://goreportcard.com/report/github.com/huantt/plaintext-parser) + +Plain Text Extractor is a Golang library that helps you extract plain text from `HTML` and `Markdown`. + +It provides a flexible and extensible interface for extracting the plain text content using both the predefined extraction methods and your own custom extraction requirements. + +## Features +- Parse HTML and Markdown documents into plain text. +- Support for custom extraction functions. +- Easy-to-use API to convert complex documents to simple plain text. 
+ +## Installation +```shell +go get github.com/huantt/plaintext-extractor +``` + +## Usage +### Markdown extractor +```go +markdownContent := "# H1 \n*italic* **bold** `code` `not code [link](https://example.com) ![image](https://image.com/image.png) ~~strikethrough~~" +extractor := NewMarkdownExtractor() +output, err := extractor.PlainText(markdownContent) +if err != nil { + panic(err) +} +fmt.Println(*output) +// Output: H1 \nitalic bold code `not code link image strikethrough +``` + +### Custom Markdown Tag +```go +markdownContent := "This is {color:#0A84FF}red{color}" + +customTag := markdown.Tag{ + Name: "color-custom-tag", + FullRegex: regexp.MustCompile("{color:[a-zA-Z0-9#]+}(.*?){color}"), + StartRegex: regexp.MustCompile("{color:[a-zA-Z0-9#]+}"), + EndRegex: regexp.MustCompile("{color}"), +} + +markdownExtractor := NewMarkdownExtractor(customTag) +plaintextExtractor := plaintext.NewExtractor(markdownExtractor.PlainText) +plaintext, err := plaintextExtractor.PlainText(markdownContent) +if err != nil { + panic(err) +} +fmt.Println(*plaintext) +// Output: This is red +``` + +### HTML Extractor +```go +html := `
This is a link
` +extractor := NewHtmlExtractor() +output, err := extractor.PlainText(html) +if err != nil { + panic(err) +} +fmt.Println(*output) +// Output: This is a link +``` + +### Multiple extractors +```go +input := `
html
*markdown*` +markdownExtractor := markdown.NewExtractor() +htmlExtractor := html.NewExtractor() +extractor := NewExtractor(markdownExtractor.PlainText, htmlExtractor.PlainText) +output, err := extractor.PlainText(input) +if err != nil { + panic(err) +} +fmt.Println(*output) +// Output: html markdown +``` + +## Contribution +Contributions to the Plain Text Extractor project are welcome! +If you find any issues or want to add new features, please feel free to open an issue or submit a pull request. +Please see the [CONTRIBUTING.md](./CONTRIBUTING.md) for more information. + +## License +This project is released under the MIT License; refer to the [LICENSE](./LICENSE.md) file. \ No newline at end of file diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..58a980f --- /dev/null +++ b/go.mod @@ -0,0 +1,14 @@ +module github.com/huantt/plaintext-extractor + +go 1.18 + +require ( + github.com/stretchr/testify v1.8.0 + golang.org/x/net v0.12.0 +) + +require ( + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..fc6abb3 --- /dev/null +++ b/go.sum @@ -0,0 +1,17 @@ +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0 
h1:pSgiaMZlXftHpm5L7V1+rVB+AZJydKsMxsQBIJw4PKk= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +golang.org/x/net v0.12.0 h1:cfawfvKITfUsFCeJIHJrbSxpeu/E81khclypR0GVT50= +golang.org/x/net v0.12.0/go.mod h1:zEVYFnQC7m/vmpQFELhcD1EWkZlX69l4oqgmer6hfKA= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/html/.keep b/html/.keep new file mode 100644 index 0000000..e69de29 diff --git a/html/extractor.go b/html/extractor.go new file mode 100644 index 0000000..ebd87f2 --- /dev/null +++ b/html/extractor.go @@ -0,0 +1,65 @@ +package html + +import ( + "golang.org/x/net/html" + "strings" +) + +// HTMLExtractor represents an HTML-specific plain text extractor. +type Extractor struct { + blockTags map[string]bool +} + +// NewExtractor creates a new HTMLExtractor instance. +func NewExtractor(otherBlockTags ...string) *Extractor { + uniqueBlockTags := map[string]bool{} + for _, tag := range blockTags { + uniqueBlockTags[tag] = true + } + for _, tag := range otherBlockTags { + uniqueBlockTags[tag] = true + } + + return &Extractor{blockTags: uniqueBlockTags} +} + +// PlainText extracts plain text from the input HTML string. +func (e *Extractor) PlainText(input string) (*string, error) { + doc, err := html.Parse(strings.NewReader(input)) + if err != nil { + return nil, err + } + + var plainText strings.Builder + e.extractText(&plainText, doc) + + output := plainText.String() + output = strings.ReplaceAll(output, "\n ", "\n") + return &output, nil +} + +// Recursively extract plain text from the HTML nodes. 
+func (e *Extractor) extractText(plainText *strings.Builder, node *html.Node) { + if node.Type == html.TextNode { + // Trim and append the text content + text := strings.TrimSpace(node.Data) + if text != "" { + if plainText.Len() > 0 { + if found := e.blockTags[node.Parent.DataAtom.String()]; found { + plainText.WriteString("\n") + } else { + plainText.WriteString(" ") + } + } + plainText.WriteString(text) + } + } + if node.DataAtom.String() == "br" { + plainText.WriteString("\n") + return + } + + for child := node.FirstChild; child != nil; child = child.NextSibling { + e.extractText(plainText, child) + } +} diff --git a/html/extractor_test.go b/html/extractor_test.go new file mode 100644 index 0000000..98fb648 --- /dev/null +++ b/html/extractor_test.go @@ -0,0 +1,26 @@ +package html + +import ( + "github.com/stretchr/testify/assert" + "testing" +) + +func TestExtract(t *testing.T) { + extractor := NewExtractor() + tests := []struct { + input string + expected string + }{ + {`a
b`, "a\nb"}, + {`a

b

`, "a\n\nb"}, + {`link`, "link"}, + {`
This is a link
`, "This is a link"}, + {"

Heading 1

Heading 2

", "Heading 1\nHeading 2\nItem 1\nItem 2"}, + } + for _, test := range tests { + output, err := extractor.PlainText(test.input) + assert.NoError(t, err) + assert.NotNil(t, output) + assert.Equal(t, test.expected, *output) + } +} diff --git a/html/html.go b/html/html.go new file mode 100644 index 0000000..1494dd6 --- /dev/null +++ b/html/html.go @@ -0,0 +1,38 @@ +package html + +var blockTags = []string{ + "address", + "article", + "aside", + "blockquote", + "canvas", + "dd", + "div", + "dl", + "dt", + "fieldset", + "figcaption", + "figure", + "footer", + "form", + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "header", + "hr", + "li", + "main", + "nav", + "noscript", + "ol", + "p", + "pre", + "section", + "table", + "tfoot", + "ul", + "video", +} diff --git a/markdown/extractor.go b/markdown/extractor.go new file mode 100644 index 0000000..bff5965 --- /dev/null +++ b/markdown/extractor.go @@ -0,0 +1,46 @@ +package markdown + +import ( + "bytes" +) + +// Extractor represents a Markdown specific plain text extractor with custom tags support. +type Extractor struct { + tags []Tag +} + +// NewExtractor creates a new Markdown Extractor instance with optional custom tags. +func NewExtractor(customTags ...Tag) *Extractor { + extractor := &Extractor{ + tags: tags, + } + extractor.tags = append(extractor.tags, customTags...) + return extractor +} + +// AddTag adds a custom tag to the Markdown Extractor instance. +func (e *Extractor) AddTag(tag Tag) *Extractor { + e.tags = append(e.tags, tag) + return e +} + +// PlainText extracts plain text from the input Markdown string by removing the specified tags. +// It replaces the full tags and their contents with the plain text content inside those tags. 
+func (e *Extractor) PlainText(input string) (*string, error) { + tmp := []byte(input) + for _, tag := range e.tags { + listFullTag := tag.FullRegex.FindAll(tmp, -1) + for _, fullTag := range listFullTag { + var plain = fullTag + if tag.StartRegex != nil { + plain = tag.StartRegex.ReplaceAll(plain, nil) + } + if tag.EndRegex != nil { + plain = tag.EndRegex.ReplaceAll(plain, nil) + } + tmp = bytes.Replace(tmp, fullTag, plain, -1) + } + } + output := string(tmp) + return &output, nil +} diff --git a/markdown/extractor_test.go b/markdown/extractor_test.go new file mode 100644 index 0000000..540476c --- /dev/null +++ b/markdown/extractor_test.go @@ -0,0 +1,56 @@ +package markdown + +import ( + "fmt" + "github.com/stretchr/testify/assert" + "regexp" + "testing" +) + +func TestExtract(t *testing.T) { + tests := []struct { + input string + expected string + }{ + {"~~strikethrough~~", "strikethrough"}, + {"# H1 \n*italic* **bold** `code` `not code [link](https://example.com) ![image](https://image.com/image.png) ~~strikethrough~~", "H1 \nitalic bold code `not code link image strikethrough"}, + {"# H1 \n new line", "H1 \n new line"}, + {"*italic*", "italic"}, + {"**bold**", "bold"}, + {"`code`", "code"}, + {"`node code", "`node code"}, + {"[link](https://example.com)", "link"}, + {"[image](https://image.com/image.png)", "image"}, + } + + markdownExtractor := NewExtractor() + for _, test := range tests { + output, err := markdownExtractor.PlainText(test.input) + assert.NoError(t, err) + assert.NotNil(t, output) + assert.Equal(t, test.expected, *output) + } + fmt.Println() +} + +func TestCustomTag(t *testing.T) { + customTag := Tag{ + Name: "color-custom-tag", + FullRegex: regexp.MustCompile("{color:[a-zA-Z0-9#]+}(.*?){color}"), + StartRegex: regexp.MustCompile("{color:[a-zA-Z0-9#]+}"), + EndRegex: regexp.MustCompile("{color}"), + } + markdownExtractor := NewExtractor(customTag) + tests := []struct { + input string + expected string + }{ + {"This is 
// Tag describes one kind of Markdown markup and how to strip it.
type Tag struct {
	// Name identifies the tag; the extractor does not interpret it.
	Name string
	// FullRegex matches a complete marked-up span, delimiters included.
	FullRegex *regexp.Regexp
	// StartRegex, when non-nil, matches the text to delete from a span matched
	// by FullRegex (typically the opening delimiter).
	StartRegex *regexp.Regexp
	// EndRegex, when non-nil, matches further text to delete from that span
	// (typically the closing delimiter).
	EndRegex *regexp.Regexp
}

// tags is the default set of Markdown tags, in the order they are applied.
// Bold precedes Italic so that "**" is consumed before single "*" is examined,
// and Image precedes Link so that "![" is consumed before a bare "[".
var tags = []Tag{
	{
		Name: "Header",
		// (?m) lets ^ match at every line start so headings after the first
		// line are stripped too; [ \t]+ keeps the match on a single line.
		FullRegex:  regexp.MustCompile(`(?m)^#{1,6}[ \t]+(.*)`),
		StartRegex: regexp.MustCompile(`^#{1,6}[ \t]+`),
		EndRegex:   nil,
	},
	{
		Name: "Bold",
		// The underscore form requires word boundaries so identifiers that
		// contain "__" in the middle are not treated as emphasis.
		FullRegex:  regexp.MustCompile(`\*\*(.*?)\*\*|\b__(.+?)__\b`),
		StartRegex: regexp.MustCompile(`^(?:\*\*|__)`),
		EndRegex:   regexp.MustCompile(`(?:\*\*|__)$`),
	},
	{
		Name: "Italic",
		// The underscore form requires word boundaries so snake_case words are
		// left untouched.
		FullRegex:  regexp.MustCompile(`\*(.*?)\*|\b_(.+?)_\b`),
		StartRegex: regexp.MustCompile(`^[*_]`),
		EndRegex:   regexp.MustCompile(`[*_]$`),
	},
	{
		Name:       "Strikethrough",
		FullRegex:  regexp.MustCompile(`~~(.*?)~~`),
		StartRegex: regexp.MustCompile(`^~~`),
		EndRegex:   regexp.MustCompile(`~~$`),
	},
	{
		Name:       "InlineCode",
		FullRegex:  regexp.MustCompile("\\`(.+?)\\`"),
		StartRegex: regexp.MustCompile("\\`"),
		EndRegex:   regexp.MustCompile("\\`"),
	},
	{
		Name:       "Image",
		FullRegex:  regexp.MustCompile(`\!\[(.*?)\]\((.*?)\)`),
		StartRegex: regexp.MustCompile(`\!\[`),
		EndRegex:   regexp.MustCompile(`\]\((.*?)\)`),
	},
	{
		Name:       "Link",
		FullRegex:  regexp.MustCompile(`\[(.*?)\]\((.*?)\)`),
		StartRegex: regexp.MustCompile(`\[`),
		EndRegex:   regexp.MustCompile(`\]\((.*?)\)`),
	},
}
that can parse input strings using multiple extract functions (for example html or markdown). +type Extractor struct { + extractFuncs []ExtractFunc +} + +// NewExtractor creates a new Extractor instance with the given extract function. +func NewExtractor(extractFunc ExtractFunc, moreFuncs ...ExtractFunc) *Extractor { + extractor := &Extractor{ + extractFuncs: []ExtractFunc{extractFunc}, + } + extractor.extractFuncs = append(extractor.extractFuncs, moreFuncs...) + return extractor +} + +func NewMarkdownExtractor(customTags ...markdown.Tag) *Extractor { + return NewExtractor(markdown.NewExtractor(customTags...).PlainText) +} + +func NewHtmlExtractor(blockTags ...string) *Extractor { + return NewExtractor(html.NewExtractor(blockTags...).PlainText) +} + +// AddExtractor adds an extract function to the Extractor instance. +func (p *Extractor) AddExtractor(extractor ExtractFunc) *Extractor { + p.extractFuncs = append(p.extractFuncs, extractor) + return p +} + +// ExtractFunc is the function signature for extracting plain text from a given input string. +// Implement this function to extend availability of extracting plain text by passing into Extractor.AddExtractor function. +type ExtractFunc func(input string) (*string, error) + +// PlainText extracts plain text from the input string using registered extract functions. +// It iterates over all extract functions, applying them in sequence, and returns the final plain text. 
+func (p *Extractor) PlainText(input string) (plainText *string, err error) { + plainText = &input + for _, extractFunc := range p.extractFuncs { + plainText, err = extractFunc(*plainText) + if err != nil { + return nil, err + } + } + return plainText, nil +} diff --git a/parser_test.go b/parser_test.go new file mode 100644 index 0000000..fcbaec0 --- /dev/null +++ b/parser_test.go @@ -0,0 +1,59 @@ +package plaintext + +import ( + "github.com/huantt/plaintext-extractor/html" + "github.com/huantt/plaintext-extractor/markdown" + "github.com/stretchr/testify/assert" + "testing" +) + +func TestParseHtml(t *testing.T) { + tests := []struct { + input string + expected string + }{ + {`
This is a link
`, "This is a link"}, + } + for _, test := range tests { + extractor := NewHtmlExtractor() + output, err := extractor.PlainText(test.input) + assert.NoError(t, err) + assert.NotNil(t, output) + assert.Equal(t, test.expected, *output) + } +} + +func TestParseMarkdown(t *testing.T) { + tests := []struct { + input string + expected string + }{ + {"# H1 \n*italic* **bold** `code` `not code [link](https://example.com) ![image](https://image.com/image.png) ~~strikethrough~~", "H1 \nitalic bold code `not code link image strikethrough"}, + } + for _, test := range tests { + extractor := NewMarkdownExtractor() + output, err := extractor.PlainText(test.input) + assert.NoError(t, err) + assert.NotNil(t, output) + assert.Equal(t, test.expected, *output) + } +} + +func TestMultipleExtractors(t *testing.T) { + tests := []struct { + input string + expected string + }{ + {"
html
*markdown*", "html markdown"}, + {"
*markdown in html*
", "markdown in html"}, + } + for _, test := range tests { + markdownExtractor := markdown.NewExtractor() + htmlExtractor := html.NewExtractor() + extractor := NewExtractor(markdownExtractor.PlainText, htmlExtractor.PlainText) + output, err := extractor.PlainText(test.input) + assert.NoError(t, err) + assert.NotNil(t, output) + assert.Equal(t, test.expected, *output) + } +}