feat: init project

huantt · Jul 27, 2023 · 4e4e37a · 4e4e37a
commit 4e4e37a
Show file tree

Hide file tree

Showing 16 changed files with 561 additions and 0 deletions.
diff --git a/.gihub/workflows/release.yml b/.gihub/workflows/release.yml
@@ -0,0 +1,15 @@
+# Release workflow: runs on every push to the main branch.
+# NOTE(review): this diff adds the file under `.gihub/workflows/`; GitHub Actions
+# reads workflows from `.github/workflows/`, so the path looks like a typo — confirm.
+on:
+  push:
+    branches:
+      - main
+
+jobs:
+  release-on-push:
+    # Requests write access on every permission scope for this job's token.
+    permissions: write-all
+    runs-on: ubuntu-latest
+    env:
+      GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+    steps:
+      # NOTE(review): the action is referenced by the mutable `master` branch
+      # rather than a tag or commit SHA — consider pinning.
+      - uses: rymndhng/release-on-push-action@master
+        with:
+          # Presumably bumps the minor version component on each release —
+          # confirm against the action's documentation.
+          bump_version_scheme: minor
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+.idea/
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -0,0 +1,12 @@
+## We Use [Github Flow](https://guides.github.com/introduction/flow/index.html), So All Code Changes Happen Through Pull Requests
+Pull requests are the best way to propose changes to the codebase (we use [Github Flow](https://guides.github.com/introduction/flow/index.html)). We actively welcome your pull requests:
+
+1. Fork the repo and create your branch from `main`.
+2. If you've added code that should be tested, add tests.
+3. If you've changed APIs, update the documentation.
+4. Ensure the test suite passes.
+5. Make sure your code lints.
+6. Issue that pull request!
+
+## License
+By contributing, you agree that your contributions will be licensed under the project's MIT License.
diff --git a/LICENSE.md b/LICENSE.md
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2018 GitHub, Inc. and contributors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/README.md b/README.md
@@ -0,0 +1,85 @@
+# Plain Text Extractor
+[![License](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)
+[![Go Report Card](https://goreportcard.com/badge/github.com/huantt/plaintext-extractor)](https://goreportcard.com/report/github.com/huantt/plaintext-extractor)
+
+Plain Text Extractor is a Golang library that helps you extract plain text from `HTML` and `Markdown`. 
+
+It provides a flexible and extensible interface for extracting the plain text content using both the predefined extraction methods and your own custom extraction requirements.
+
+## Features
+- Parse HTML and Markdown documents into plain text.
+- Support for custom extraction functions.
+- Easy-to-use API to convert complex documents to simple plain text.
+
+## Installation
+```shell
+go get github.com/huantt/plaintext-extractor
+```
+
+## Usage
+### Markdown extractor
+```go
+markdownContent := "# H1 \n*italic* **bold** `code` `not code [link](https://example.com) ![image](https://image.com/image.png) ~~strikethrough~~"
+extractor := NewMarkdownExtractor()
+output, err := extractor.PlainText(markdownContent)
+if err != nil {
+    panic(err)
+}
+fmt.Println(output)
+// Output: H1 \nitalic bold code `not code link image strikethrough
+```
+
+### Custom Markdown Tag
+```go
+markdownContent := "This is {color:#0A84FF}red{color}"
+
+customTag := markdown.Tag{
+    Name:       "color-custom-tag",
+    FullRegex:  regexp.MustCompile("{color:[a-zA-Z0-9#]+}(.*?){color}"),
+    StartRegex: regexp.MustCompile("{color:[a-zA-Z0-9#]+}"),
+    EndRegex:   regexp.MustCompile("{color}"),
+}
+
+markdownExtractor := NewMarkdownExtractor(customTag)
+plaintextExtractor := plaintext.NewExtractor(markdownExtractor.PlainText)
+plaintext, err := plaintextExtractor.PlainText(markdownContent)
+if err != nil {
+    panic(err)
+}
+fmt.Println(plaintext)
+// Output: This is red
+```
+
+### HTML Extractor
+```go
+html := `<div>This is a <a href="https://example.com">link</a></div>`
+extractor := NewHtmlExtractor()
+output, err := extractor.PlainText(html)
+if err != nil {
+    panic(err)
+}
+fmt.Println(output)
+// Output: This is a link
+```
+
+### Multiple extractors
+```go
+input := `<div> html </div> *markdown*`
+markdownExtractor := markdown.NewExtractor()
+htmlExtractor := html.NewExtractor()
+extractor := NewExtractor(markdownExtractor.PlainText, htmlExtractor.PlainText)
+output, err := extractor.PlainText(input)
+if err != nil {
+    panic(err)
+}
+fmt.Println(output)
+// Output: html markdown
+```
+
+## Contribution
+Contributions to the Plain Text Extractor project are welcome!
+If you find any issues or want to add new features, please feel free to open an issue or submit a pull request. 
+Please see the [CONTRIBUTING.md](./CONTRIBUTING.md) for more information.
+
+## License
+This project is released under the MIT License; see the [LICENSE](./LICENSE.md) file for details.
diff --git a/go.mod b/go.mod
@@ -0,0 +1,14 @@
+module github.com/huantt/plaintext-extractor
+
+go 1.18
+
+require (
+	github.com/stretchr/testify v1.8.0
+	golang.org/x/net v0.12.0
+)
+
+require (
+	github.com/davecgh/go-spew v1.1.1 // indirect
+	github.com/pmezard/go-difflib v1.0.0 // indirect
+	gopkg.in/yaml.v3 v3.0.1 // indirect
+)
diff --git a/go.sum b/go.sum
@@ -0,0 +1,17 @@
+github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
+github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
+github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
+github.com/stretchr/testify v1.8.0 h1:pSgiaMZlXftHpm5L7V1+rVB+AZJydKsMxsQBIJw4PKk=
+github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
+golang.org/x/net v0.12.0 h1:cfawfvKITfUsFCeJIHJrbSxpeu/E81khclypR0GVT50=
+golang.org/x/net v0.12.0/go.mod h1:zEVYFnQC7m/vmpQFELhcD1EWkZlX69l4oqgmer6hfKA=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
+gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
+gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
diff --git a/html/.keep b/html/.keep
diff --git a/html/extractor.go b/html/extractor.go
@@ -0,0 +1,65 @@
+package html
+
+import (
+	"golang.org/x/net/html"
+	"strings"
+)
+
+// Extractor represents an HTML-specific plain text extractor.
+type Extractor struct {
+	// blockTags is the set of tag names treated as block-level: text whose
+	// parent tag is in this set is separated from earlier output by a newline
+	// instead of a space.
+	blockTags map[string]bool
+}
+
+// NewExtractor creates a new Extractor whose block-level tag set is the
+// package default blockTags plus any names passed in otherBlockTags.
+func NewExtractor(otherBlockTags ...string) *Extractor {
+	// A map is used so that a tag appearing in both lists collapses to one entry.
+	uniqueBlockTags := map[string]bool{}
+	for _, tag := range blockTags {
+		uniqueBlockTags[tag] = true
+	}
+	for _, tag := range otherBlockTags {
+		uniqueBlockTags[tag] = true
+	}
+
+	return &Extractor{blockTags: uniqueBlockTags}
+}
+
+// PlainText extracts plain text from the input HTML string.
+//
+// It returns a pointer to the extracted text, or a nil pointer together with
+// the error from html.Parse when the input cannot be parsed.
+func (e *Extractor) PlainText(input string) (*string, error) {
+	doc, err := html.Parse(strings.NewReader(input))
+	if err != nil {
+		return nil, err
+	}
+
+	var plainText strings.Builder
+	e.extractText(&plainText, doc)
+
+	output := plainText.String()
+	// extractText writes "\n" for <br> and then prefixes following inline text
+	// with a space separator, producing "\n "; drop that space.
+	output = strings.ReplaceAll(output, "\n ", "\n")
+	return &output, nil
+}
+
+// extractText walks node and its descendants in document order, appending the
+// text it finds to plainText.
+//
+// Text nodes are trimmed of surrounding whitespace and skipped when empty. When
+// plainText already holds content, a separator is written first: a newline if
+// the text node's parent element is in e.blockTags, otherwise a single space.
+// A <br> element contributes a newline and is not descended into.
+func (e *Extractor) extractText(plainText *strings.Builder, node *html.Node) {
+	if node.Type == html.TextNode {
+		// Trim and append the text content
+		text := strings.TrimSpace(node.Data)
+		if text != "" {
+			if plainText.Len() > 0 {
+				// Match on the parent's tag name (Data) rather than its DataAtom:
+				// DataAtom is zero for tag names outside the known-atom table, so
+				// custom block tags passed to NewExtractor would never match.
+				// The nil check guards against a detached text node.
+				if parent := node.Parent; parent != nil && e.blockTags[parent.Data] {
+					plainText.WriteString("\n")
+				} else {
+					plainText.WriteString(" ")
+				}
+			}
+			plainText.WriteString(text)
+		}
+	}
+	if node.DataAtom.String() == "br" {
+		plainText.WriteString("\n")
+		return
+	}
+
+	for child := node.FirstChild; child != nil; child = child.NextSibling {
+		e.extractText(plainText, child)
+	}
+}
diff --git a/html/extractor_test.go b/html/extractor_test.go
@@ -0,0 +1,26 @@
+package html
+
+import (
+	"github.com/stretchr/testify/assert"
+	"testing"
+)
+
+// TestExtract runs a default Extractor over a table of HTML snippets and
+// checks the extracted plain text: <br> handling, inline links, and newline
+// separation between block-level elements.
+func TestExtract(t *testing.T) {
+	extractor := NewExtractor()
+	tests := []struct {
+		input    string
+		expected string
+	}{
+		{`a<br>b`, "a\nb"},
+		{`a<br><h1>b</h1>`, "a\n\nb"},
+		{`<a href="https://example.com">link</a>`, "link"},
+		{`<div>This is a <a href="https://example.com">link</a></div>`, "This is a link"},
+		{"<div><h1>Heading 1</h1><h2>Heading 2</h2><ul><li>Item 1</li><li>Item 2</li></ul></div>", "Heading 1\nHeading 2\nItem 1\nItem 2"},
+	}
+	for _, test := range tests {
+		output, err := extractor.PlainText(test.input)
+		assert.NoError(t, err)
+		assert.NotNil(t, output)
+		assert.Equal(t, test.expected, *output)
+	}
+}
diff --git a/html/html.go b/html/html.go
@@ -0,0 +1,38 @@
+package html
+
+// blockTags lists the HTML tag names that NewExtractor registers as
+// block-level by default. Text whose parent element is one of these tags is
+// separated from preceding output by a newline rather than a space.
+var blockTags = []string{
+	"address",
+	"article",
+	"aside",
+	"blockquote",
+	"canvas",
+	"dd",
+	"div",
+	"dl",
+	"dt",
+	"fieldset",
+	"figcaption",
+	"figure",
+	"footer",
+	"form",
+	"h1",
+	"h2",
+	"h3",
+	"h4",
+	"h5",
+	"h6",
+	"header",
+	"hr",
+	"li",
+	"main",
+	"nav",
+	"noscript",
+	"ol",
+	"p",
+	"pre",
+	"section",
+	"table",
+	"tfoot",
+	"ul",
+	"video",
+}
diff --git a/markdown/extractor.go b/markdown/extractor.go
@@ -0,0 +1,46 @@
+package markdown
+
+import (
+	"bytes"
+)
+
+// Extractor represents a Markdown specific plain text extractor with custom tags support.
+type Extractor struct {
+	// tags holds the tag definitions that PlainText applies, in order.
+	tags []Tag
+}
+
+// NewExtractor creates a new Markdown Extractor instance with optional custom tags.
+//
+// The extractor's tag list is the package-level default tags followed by
+// customTags. Both are copied into a freshly allocated slice so that later
+// appends (for example through AddTag) can never write into the backing array
+// of the shared default slice.
+func NewExtractor(customTags ...Tag) *Extractor {
+	all := make([]Tag, 0, len(tags)+len(customTags))
+	all = append(all, tags...)
+	all = append(all, customTags...)
+	return &Extractor{tags: all}
+}
+
+// AddTag adds a custom tag to the Markdown Extractor instance.
+// The tag is appended after the existing ones, and the receiver is returned
+// so calls can be chained.
+func (e *Extractor) AddTag(tag Tag) *Extractor {
+	e.tags = append(e.tags, tag)
+	return e
+}
+
+// PlainText extracts plain text from the input Markdown string by removing the specified tags.
+// It replaces the full tags and their contents with the plain text content inside those tags.
+//
+// Tags are applied in the order they are stored on the extractor. The returned
+// error is always nil in the current implementation.
+func (e *Extractor) PlainText(input string) (*string, error) {
+	tmp := []byte(input)
+	for _, tag := range e.tags {
+		// All matches for this tag are collected before any replacement is made.
+		listFullTag := tag.FullRegex.FindAll(tmp, -1)
+		for _, fullTag := range listFullTag {
+			var plain = fullTag
+			// ReplaceAll removes every match of the start/end pattern inside the
+			// matched text, not only the leading or trailing one.
+			if tag.StartRegex != nil {
+				plain = tag.StartRegex.ReplaceAll(plain, nil)
+			}
+			if tag.EndRegex != nil {
+				plain = tag.EndRegex.ReplaceAll(plain, nil)
+			}
+			// Every occurrence of the matched text in the buffer is replaced (n = -1).
+			tmp = bytes.Replace(tmp, fullTag, plain, -1)
+		}
+	}
+	output := string(tmp)
+	return &output, nil
+}