From e1f48ce6ecefe29f5ae6bf288bcb6545a957cf02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20=27Necoro=27=20Neumann?= Date: Sun, 10 May 2020 17:55:13 +0200 Subject: [PATCH 1/6] Add go.mod --- go.mod | 9 +++++++++ go.sum | 12 ++++++++++++ 2 files changed, 21 insertions(+) create mode 100644 go.mod create mode 100644 go.sum diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..996fb08 --- /dev/null +++ b/go.mod @@ -0,0 +1,9 @@ +module github.com/jaytaylor/html2text + +go 1.14 + +require ( + github.com/olekukonko/tablewriter v0.0.4 + github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf + golang.org/x/net v0.0.0-20200506145744-7e3656a0809f +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..16034cd --- /dev/null +++ b/go.sum @@ -0,0 +1,12 @@ +github.com/mattn/go-runewidth v0.0.7 h1:Ei8KR0497xHyKJPAv59M1dkC+rOZCMBJ+t3fZ+twI54= +github.com/mattn/go-runewidth v0.0.7/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI= +github.com/olekukonko/tablewriter v0.0.4 h1:vHD/YYe1Wolo78koG299f7V/VAS08c6IpCLn+Ejf/w8= +github.com/olekukonko/tablewriter v0.0.4/go.mod h1:zq6QwlOf5SlnkVbMSr5EoBv3636FWnp+qbPhuoO21uA= +github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf h1:pvbZ0lM0XWPBqUKqFU8cmavspvIl9nulOYwdy6IFRRo= +github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf/go.mod h1:RJID2RhlZKId02nZ62WenDCkgHFerpIOmW0iT7GKmXM= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/net v0.0.0-20200506145744-7e3656a0809f h1:QBjCr1Fz5kw158VqdE9JfI9cJnl/ymnJWAdMuinqL7Y= +golang.org/x/net v0.0.0-20200506145744-7e3656a0809f/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= From 
5e13366bfd92809c1f866c36280e6c8962fd6aa8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20=27Necoro=27=20Neumann?= Date: Sat, 24 Jul 2021 13:04:44 +0200 Subject: [PATCH 2/6] Update dependencies --- go.mod | 4 ++-- go.sum | 21 +++++++++++---------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/go.mod b/go.mod index 996fb08..4d6c48f 100644 --- a/go.mod +++ b/go.mod @@ -3,7 +3,7 @@ module github.com/jaytaylor/html2text go 1.14 require ( - github.com/olekukonko/tablewriter v0.0.4 + github.com/olekukonko/tablewriter v0.0.5 github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf - golang.org/x/net v0.0.0-20200506145744-7e3656a0809f + golang.org/x/net v0.0.0-20210614182718-04defd469f4e ) diff --git a/go.sum b/go.sum index 16034cd..8f25d0c 100644 --- a/go.sum +++ b/go.sum @@ -1,12 +1,13 @@ -github.com/mattn/go-runewidth v0.0.7 h1:Ei8KR0497xHyKJPAv59M1dkC+rOZCMBJ+t3fZ+twI54= -github.com/mattn/go-runewidth v0.0.7/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI= -github.com/olekukonko/tablewriter v0.0.4 h1:vHD/YYe1Wolo78koG299f7V/VAS08c6IpCLn+Ejf/w8= -github.com/olekukonko/tablewriter v0.0.4/go.mod h1:zq6QwlOf5SlnkVbMSr5EoBv3636FWnp+qbPhuoO21uA= +github.com/mattn/go-runewidth v0.0.9 h1:Lm995f3rfxdpd6TSmuVCHVb/QhupuXlYr8sCI/QdE+0= +github.com/mattn/go-runewidth v0.0.9/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI= +github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N7AbDhec= +github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY= github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf h1:pvbZ0lM0XWPBqUKqFU8cmavspvIl9nulOYwdy6IFRRo= github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf/go.mod h1:RJID2RhlZKId02nZ62WenDCkgHFerpIOmW0iT7GKmXM= -golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= -golang.org/x/net v0.0.0-20200506145744-7e3656a0809f h1:QBjCr1Fz5kw158VqdE9JfI9cJnl/ymnJWAdMuinqL7Y= -golang.org/x/net 
v0.0.0-20200506145744-7e3656a0809f/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= -golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/net v0.0.0-20210614182718-04defd469f4e h1:XpT3nA5TvE525Ne3hInMh6+GETgn27Zfm9dxsThnX2Q= +golang.org/x/net v0.0.0-20210614182718-04defd469f4e/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= From 7dffa160e3085d91c7057aa764b0293d1a59a192 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20=27Necoro=27=20Neumann?= Date: Tue, 28 Mar 2023 19:53:46 +0200 Subject: [PATCH 3/6] Update deps; update to go-1.18 --- go.mod | 9 +++++++-- go.sum | 15 +++++++-------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/go.mod b/go.mod index 4d6c48f..f866373 100644 --- a/go.mod +++ b/go.mod @@ -1,9 +1,14 @@ module github.com/jaytaylor/html2text -go 1.14 +go 1.18 require ( github.com/olekukonko/tablewriter v0.0.5 github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf - golang.org/x/net v0.0.0-20210614182718-04defd469f4e + golang.org/x/net v0.8.0 +) + +require ( + github.com/mattn/go-runewidth v0.0.14 // indirect + github.com/rivo/uniseg v0.4.4 // indirect ) diff --git a/go.sum b/go.sum index 8f25d0c..acf09ad 100644 --- a/go.sum +++ b/go.sum @@ -1,13 +1,12 @@ 
-github.com/mattn/go-runewidth v0.0.9 h1:Lm995f3rfxdpd6TSmuVCHVb/QhupuXlYr8sCI/QdE+0= github.com/mattn/go-runewidth v0.0.9/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI= +github.com/mattn/go-runewidth v0.0.14 h1:+xnbZSEeDbOIg5/mE6JF0w6n9duR1l3/WmbinWVwUuU= +github.com/mattn/go-runewidth v0.0.14/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w= github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N7AbDhec= github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY= +github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= +github.com/rivo/uniseg v0.4.4 h1:8TfxU8dW6PdqD27gjM8MVNuicgxIjxpm4K7x4jp8sis= +github.com/rivo/uniseg v0.4.4/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf h1:pvbZ0lM0XWPBqUKqFU8cmavspvIl9nulOYwdy6IFRRo= github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf/go.mod h1:RJID2RhlZKId02nZ62WenDCkgHFerpIOmW0iT7GKmXM= -golang.org/x/net v0.0.0-20210614182718-04defd469f4e h1:XpT3nA5TvE525Ne3hInMh6+GETgn27Zfm9dxsThnX2Q= -golang.org/x/net v0.0.0-20210614182718-04defd469f4e/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= -golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= -golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/net v0.8.0 h1:Zrh2ngAOFYneWTAIAPethzeaQLuHwhuBkuV6ZiRnUaQ= +golang.org/x/net v0.8.0/go.mod h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc= From 9cf34e917d28ca9c635130528d330ad4b921a1ea Mon Sep 17 00:00:00 2001 From: Michael Kuc Date: Tue, 15 Aug 2023 
15:33:53 +0000 Subject: [PATCH 4/6] Migrate cmd to new module. Migrates the HTML2Text command to the new go module convention. --- cmd/html2text/main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/html2text/main.go b/cmd/html2text/main.go index e287c18..3ac3ad8 100644 --- a/cmd/html2text/main.go +++ b/cmd/html2text/main.go @@ -5,7 +5,7 @@ import ( "fmt" "os" - "jaytaylor.com/html2text" + "github.com/jaytaylor/html2text" ) func main() { From b9546a9fd0526869f870553d5a30ac7451cf6a8a Mon Sep 17 00:00:00 2001 From: Michael Kuc Date: Tue, 15 Aug 2023 15:32:22 +0000 Subject: [PATCH 5/6] Add test module. --- html2text_test.go | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/html2text_test.go b/html2text_test.go index 452b45e..efd838a 100644 --- a/html2text_test.go +++ b/html2text_test.go @@ -1,4 +1,4 @@ -package html2text +package html2text_test import ( "bytes" @@ -9,6 +9,8 @@ import ( "regexp" "strings" "testing" + + "github.com/jaytaylor/html2text" ) const destPath = "testdata" @@ -49,7 +51,7 @@ func TestParseUTF8(t *testing.T) { if err != nil { t.Fatal(err) } - text, err := FromReader(bytes.NewReader(bs)) + text, err := html2text.FromReader(bytes.NewReader(bs)) if err != nil { t.Fatal(err) } @@ -333,9 +335,9 @@ Table 2 Header 1 Table 2 Header 2 Table 2 Footer 1 Table 2 Footer 2 Table 2 Row } for _, testCase := range testCases { - options := Options{ + options := html2text.Options{ PrettyTables: true, - PrettyTablesOptions: NewPrettyTablesOptions(), + PrettyTablesOptions: html2text.NewPrettyTablesOptions(), } // Check pretty tabular ASCII version. 
if msg, err := wantString(testCase.input, testCase.tabularOutput, options); err != nil { @@ -513,7 +515,7 @@ func TestOmitLinks(t *testing.T) { } for _, testCase := range testCases { - if msg, err := wantString(testCase.input, testCase.output, Options{OmitLinks: true}); err != nil { + if msg, err := wantString(testCase.input, testCase.output, html2text.Options{OmitLinks: true}); err != nil { t.Error(err) } else if len(msg) > 0 { t.Log(msg) @@ -904,16 +906,16 @@ func (m ExactStringMatcher) String() string { return string(m) } -func wantRegExp(input string, outputRE string, options ...Options) (string, error) { +func wantRegExp(input string, outputRE string, options ...html2text.Options) (string, error) { return match(input, RegexpStringMatcher(outputRE), options...) } -func wantString(input string, output string, options ...Options) (string, error) { +func wantString(input string, output string, options ...html2text.Options) (string, error) { return match(input, ExactStringMatcher(output), options...) } -func match(input string, matcher StringMatcher, options ...Options) (string, error) { - text, err := FromString(input, options...) +func match(input string, matcher StringMatcher, options ...html2text.Options) (string, error) { + text, err := html2text.FromString(input, options...) if err != nil { return "", err } @@ -1000,7 +1002,7 @@ func Example() { ` - text, err := FromString(inputHTML, Options{PrettyTables: true}) + text, err := html2text.FromString(inputHTML, html2text.Options{PrettyTables: true}) if err != nil { panic(err) } From 9492f433606b37213c28efc9cdffd648740d7125 Mon Sep 17 00:00:00 2001 From: Michael Kuc Date: Tue, 15 Aug 2023 16:14:16 +0000 Subject: [PATCH 6/6] Add whitespace preservation. Adds whitespace preservation as a parsing option. This allows the output text to better reflect the input whitespace formatting. 
--- html2text.go | 21 ++++++++++++++------- html2text_test.go | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 7 deletions(-) diff --git a/html2text.go b/html2text.go index 8fe9000..4d8183b 100644 --- a/html2text.go +++ b/html2text.go @@ -18,6 +18,7 @@ type Options struct { PrettyTables bool // Turns on pretty ASCII rendering for table elements. PrettyTablesOptions *PrettyTablesOptions // Configures pretty ASCII rendering for table elements. OmitLinks bool // Turns on omitting links + PreserveWhitespace bool // Turns on whitespace preservation. TextOnly bool // Returns only plain text } @@ -408,10 +409,10 @@ func (ctx *textifyTraverseContext) traverse(node *html.Node) error { return ctx.traverseChildren(node) case html.TextNode: - var data string + data := node.Data if ctx.isPre { data = node.Data - } else { + } else if !ctx.options.PreserveWhitespace { data = strings.TrimSpace(spacingRe.ReplaceAllString(node.Data, " ")) } return ctx.emit(data) @@ -442,19 +443,25 @@ func (ctx *textifyTraverseContext) emit(data string) error { for _, line := range lines { runes := []rune(line) startsWithSpace := unicode.IsSpace(runes[0]) - if !startsWithSpace && !ctx.endsWithSpace && !strings.HasPrefix(data, ".") { + missingSpaceSeperator := !startsWithSpace && !ctx.endsWithSpace && !strings.HasPrefix(data, ".") + if !ctx.options.PreserveWhitespace && missingSpaceSeperator { if err = ctx.buf.WriteByte(' '); err != nil { return err } ctx.lineLength++ } ctx.endsWithSpace = unicode.IsSpace(runes[len(runes)-1]) - for _, c := range line { - if _, err = ctx.buf.WriteString(string(c)); err != nil { + for _, line := range strings.SplitAfter(line, "\n") { + if len(line) == 0 { + continue + } + + if _, err = ctx.buf.WriteString(line); err != nil { return err } - ctx.lineLength++ - if c == '\n' { + ctx.lineLength += len(line) + + if line[len(line)-1] == '\n' { ctx.lineLength = 0 if ctx.prefix != "" { if _, err = ctx.buf.WriteString(ctx.prefix); err != nil { diff 
--git a/html2text_test.go b/html2text_test.go index efd838a..5fd0ddd 100644 --- a/html2text_test.go +++ b/html2text_test.go @@ -100,6 +100,44 @@ func TestStrippingWhitespace(t *testing.T) { } } +func TestPreservingWhitespace(t *testing.T) { + testCases := []struct { + input string + output string + }{ + { + "test text", + "test text", + }, + { + " \ttext\ntext\n", + "text\ntext", + }, + { + " \na \n\t \n \n a \t", + "a \n\t \n\na", + }, + { + "test text", + "test text", + }, + { + "test    text ", + "test    text", + }, + } + + for _, testCase := range testCases { + if msg, err := wantString(testCase.input, testCase.output, html2text.Options{ + PreserveWhitespace: true, + }); err != nil { + t.Error(err) + } else if len(msg) > 0 { + t.Log(msg) + } + } +} + func TestParagraphsAndBreaks(t *testing.T) { testCases := []struct { input string