Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Preserve whitespace #64

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cmd/html2text/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import (
"fmt"
"os"

"jaytaylor.com/html2text"
"github.com/jaytaylor/html2text"
)

func main() {
Expand Down
14 changes: 14 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
module github.com/jaytaylor/html2text

go 1.18

require (
github.com/olekukonko/tablewriter v0.0.5
github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf
golang.org/x/net v0.8.0
)

require (
github.com/mattn/go-runewidth v0.0.14 // indirect
github.com/rivo/uniseg v0.4.4 // indirect
)
12 changes: 12 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
github.com/mattn/go-runewidth v0.0.9/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI=
github.com/mattn/go-runewidth v0.0.14 h1:+xnbZSEeDbOIg5/mE6JF0w6n9duR1l3/WmbinWVwUuU=
github.com/mattn/go-runewidth v0.0.14/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N7AbDhec=
github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY=
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
github.com/rivo/uniseg v0.4.4 h1:8TfxU8dW6PdqD27gjM8MVNuicgxIjxpm4K7x4jp8sis=
github.com/rivo/uniseg v0.4.4/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf h1:pvbZ0lM0XWPBqUKqFU8cmavspvIl9nulOYwdy6IFRRo=
github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf/go.mod h1:RJID2RhlZKId02nZ62WenDCkgHFerpIOmW0iT7GKmXM=
golang.org/x/net v0.8.0 h1:Zrh2ngAOFYneWTAIAPethzeaQLuHwhuBkuV6ZiRnUaQ=
golang.org/x/net v0.8.0/go.mod h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc=
21 changes: 14 additions & 7 deletions html2text.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ type Options struct {
PrettyTables bool // Turns on pretty ASCII rendering for table elements.
PrettyTablesOptions *PrettyTablesOptions // Configures pretty ASCII rendering for table elements.
OmitLinks bool // Turns on omitting links
PreserveWhitespace bool // Turns on whitespace preservation.
TextOnly bool // Returns only plain text
}

Expand Down Expand Up @@ -408,10 +409,10 @@ func (ctx *textifyTraverseContext) traverse(node *html.Node) error {
return ctx.traverseChildren(node)

case html.TextNode:
var data string
data := node.Data
if ctx.isPre {
data = node.Data
} else {
} else if !ctx.options.PreserveWhitespace {
data = strings.TrimSpace(spacingRe.ReplaceAllString(node.Data, " "))
}
return ctx.emit(data)
Expand Down Expand Up @@ -442,19 +443,25 @@ func (ctx *textifyTraverseContext) emit(data string) error {
for _, line := range lines {
runes := []rune(line)
startsWithSpace := unicode.IsSpace(runes[0])
if !startsWithSpace && !ctx.endsWithSpace && !strings.HasPrefix(data, ".") {
missingSpaceSeperator := !startsWithSpace && !ctx.endsWithSpace && !strings.HasPrefix(data, ".")
if !ctx.options.PreserveWhitespace && missingSpaceSeperator {
if err = ctx.buf.WriteByte(' '); err != nil {
return err
}
ctx.lineLength++
}
ctx.endsWithSpace = unicode.IsSpace(runes[len(runes)-1])
for _, c := range line {
if _, err = ctx.buf.WriteString(string(c)); err != nil {
for _, line := range strings.SplitAfter(line, "\n") {
if len(line) == 0 {
continue
}

if _, err = ctx.buf.WriteString(line); err != nil {
return err
}
ctx.lineLength++
if c == '\n' {
ctx.lineLength += len(line)

if line[len(line)-1] == '\n' {
ctx.lineLength = 0
if ctx.prefix != "" {
if _, err = ctx.buf.WriteString(ctx.prefix); err != nil {
Expand Down
60 changes: 50 additions & 10 deletions html2text_test.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package html2text
package html2text_test

import (
"bytes"
Expand All @@ -9,6 +9,8 @@ import (
"regexp"
"strings"
"testing"

"github.com/jaytaylor/html2text"
)

const destPath = "testdata"
Expand Down Expand Up @@ -49,7 +51,7 @@ func TestParseUTF8(t *testing.T) {
if err != nil {
t.Fatal(err)
}
text, err := FromReader(bytes.NewReader(bs))
text, err := html2text.FromReader(bytes.NewReader(bs))
if err != nil {
t.Fatal(err)
}
Expand Down Expand Up @@ -98,6 +100,44 @@ func TestStrippingWhitespace(t *testing.T) {
}
}

func TestPreservingWhitespace(t *testing.T) {
testCases := []struct {
input string
output string
}{
{
"test text",
"test text",
},
{
" \ttext\ntext\n",
"text\ntext",
},
{
" \na \n\t \n \n a \t",
"a \n\t \n\na",
},
{
"test text",
"test text",
},
{
"test    text ",
"test    text",
},
}

for _, testCase := range testCases {
if msg, err := wantString(testCase.input, testCase.output, html2text.Options{
PreserveWhitespace: true,
}); err != nil {
t.Error(err)
} else if len(msg) > 0 {
t.Log(msg)
}
}
}

func TestParagraphsAndBreaks(t *testing.T) {
testCases := []struct {
input string
Expand Down Expand Up @@ -333,9 +373,9 @@ Table 2 Header 1 Table 2 Header 2 Table 2 Footer 1 Table 2 Footer 2 Table 2 Row
}

for _, testCase := range testCases {
options := Options{
options := html2text.Options{
PrettyTables: true,
PrettyTablesOptions: NewPrettyTablesOptions(),
PrettyTablesOptions: html2text.NewPrettyTablesOptions(),
}
// Check pretty tabular ASCII version.
if msg, err := wantString(testCase.input, testCase.tabularOutput, options); err != nil {
Expand Down Expand Up @@ -513,7 +553,7 @@ func TestOmitLinks(t *testing.T) {
}

for _, testCase := range testCases {
if msg, err := wantString(testCase.input, testCase.output, Options{OmitLinks: true}); err != nil {
if msg, err := wantString(testCase.input, testCase.output, html2text.Options{OmitLinks: true}); err != nil {
t.Error(err)
} else if len(msg) > 0 {
t.Log(msg)
Expand Down Expand Up @@ -904,16 +944,16 @@ func (m ExactStringMatcher) String() string {
return string(m)
}

func wantRegExp(input string, outputRE string, options ...Options) (string, error) {
func wantRegExp(input string, outputRE string, options ...html2text.Options) (string, error) {
return match(input, RegexpStringMatcher(outputRE), options...)
}

func wantString(input string, output string, options ...Options) (string, error) {
func wantString(input string, output string, options ...html2text.Options) (string, error) {
return match(input, ExactStringMatcher(output), options...)
}

func match(input string, matcher StringMatcher, options ...Options) (string, error) {
text, err := FromString(input, options...)
func match(input string, matcher StringMatcher, options ...html2text.Options) (string, error) {
text, err := html2text.FromString(input, options...)
if err != nil {
return "", err
}
Expand Down Expand Up @@ -1000,7 +1040,7 @@ func Example() {
</body>
</html>`

text, err := FromString(inputHTML, Options{PrettyTables: true})
text, err := html2text.FromString(inputHTML, html2text.Options{PrettyTables: true})
if err != nil {
panic(err)
}
Expand Down