From 7dfab7ec0dbe357f5a992ee028509abec3a5f966 Mon Sep 17 00:00:00 2001 From: Luca Zambarda Date: Tue, 15 Mar 2022 14:50:06 +0000 Subject: [PATCH 1/6] fix(read): correctly use logic operator in header condition --- read.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/read.go b/read.go index 0d03e3f..c0572af 100644 --- a/read.go +++ b/read.go @@ -129,7 +129,7 @@ func NewReader(f io.ReaderAt, size int64) (*Reader, error) { func NewReaderEncrypted(f io.ReaderAt, size int64, pw func() string) (*Reader, error) { buf := make([]byte, 10) f.ReadAt(buf, 0) - if !bytes.HasPrefix(buf, []byte("%PDF-1.")) || buf[7] < '0' || buf[7] > '7' || buf[8] != '\r' && buf[8] != '\n' { + if !bytes.HasPrefix(buf, []byte("%PDF-1.")) || buf[7] < '0' || buf[7] > '7' || buf[8] != '\r' || buf[8] != '\n' { return nil, fmt.Errorf("not a PDF file: invalid header") } end := size From a962e8469e5a2223c432c5ceddf96c4b610795f8 Mon Sep 17 00:00:00 2001 From: Luca Zambarda Date: Tue, 15 Mar 2022 15:25:30 +0000 Subject: [PATCH 2/6] refactor(read): make header validation more flexible --- read.go | 10 ++++++++-- read_test.go | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 2 deletions(-) create mode 100644 read_test.go diff --git a/read.go b/read.go index c0572af..d21f954 100644 --- a/read.go +++ b/read.go @@ -72,6 +72,7 @@ import ( "io" "io/ioutil" "os" + "regexp" "sort" "strconv" ) @@ -122,14 +123,19 @@ func NewReader(f io.ReaderAt, size int64) (*Reader, error) { return NewReaderEncrypted(f, size, nil) } +// headerRegexp is used to check the validity of the header line of a PDF. +// This should be able to support extra spaces between the version and the +// newline (as inserted by libtiff/tiff2pdf) as well as supporting CRLF and LF. +var headerRegexp = regexp.MustCompile(`^%PDF-1\.[0-7]\s*\r?\n`) + // NewReaderEncrypted opens a file for reading, using the data in f with the given total size. // If the PDF is encrypted, NewReaderEncrypted calls pw repeatedly to obtain passwords // to try. If pw returns the empty string, NewReaderEncrypted stops trying to decrypt // the file and returns an error. func NewReaderEncrypted(f io.ReaderAt, size int64, pw func() string) (*Reader, error) { - buf := make([]byte, 10) + buf := make([]byte, 11) f.ReadAt(buf, 0) - if !bytes.HasPrefix(buf, []byte("%PDF-1.")) || buf[7] < '0' || buf[7] > '7' || buf[8] != '\r' || buf[8] != '\n' { + if !headerRegexp.Match(buf) { return nil, fmt.Errorf("not a PDF file: invalid header") } end := size diff --git a/read_test.go b/read_test.go new file mode 100644 index 0000000..9a387db --- /dev/null +++ b/read_test.go @@ -0,0 +1,54 @@ +package pdf + +import ( + "testing" +) + +func TestRead(t *testing.T) { + t.Run("HeaderValidation", testHeaderValidation) +} + +func testHeaderValidation(t *testing.T) { + tscs := map[string]struct { + input []byte + expectedValid bool + }{ + "nil": { + input: nil, + expectedValid: false, + }, + "empty": { + input: []byte{}, + expectedValid: false, + }, + "missing LF": { + input: []byte{37, 80, 68, 70, 45, 49, 46, 55}, + expectedValid: false, + }, + "ok LF": { + input: []byte{37, 80, 68, 70, 45, 49, 46, 55, 10}, + expectedValid: true, + }, + "invalid version 1.8": { + input: []byte{37, 80, 68, 70, 45, 49, 46, 58, 10}, + expectedValid: false, + }, + "ok CRLF": { + input: []byte{37, 80, 68, 70, 45, 49, 46, 55, 13, 10}, + expectedValid: true, + }, + "ok space + CRLF": { + input: []byte{37, 80, 68, 70, 45, 49, 46, 55, 32, 13, 10}, + expectedValid: true, + }, + } + for name, data := range tscs { + data := data + t.Run(name, func(t *testing.T) { + gotValid := headerRegexp.Match(data.input) + if gotValid != data.expectedValid { + t.Errorf("expected %t, got %t", data.expectedValid, gotValid) + } + }) + } +} From 0468b6602c1cde4467a2871350ad0c86e16753e6 Mon Sep 17 00:00:00 2001 From: Luca Zambarda Date: Tue, 15 Mar 2022 15:26:07 +0000 Subject: [PATCH 3/6] chore: add go.mod --- go.mod | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 go.mod diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..f306e2e --- /dev/null +++ b/go.mod @@ -0,0 +1,3 @@ +module github.com/ledongthuc/pdf + +go 1.17 From 7b000bd9ba69ecdb40d5144edd659ba38cd9f056 Mon Sep 17 00:00:00 2001 From: Luca Zambarda Date: Tue, 15 Mar 2022 15:59:07 +0000 Subject: [PATCH 4/6] refactor(read): make EOF and startxref read more flexible --- read.go | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/read.go b/read.go index d21f954..f8a9224 100644 --- a/read.go +++ b/read.go @@ -133,21 +133,23 @@ var headerRegexp = regexp.MustCompile(`^%PDF-1\.[0-7]\s*\r?\n`) // to try. If pw returns the empty string, NewReaderEncrypted stops trying to decrypt // the file and returns an error. func NewReaderEncrypted(f io.ReaderAt, size int64, pw func() string) (*Reader, error) { + const headerLen = 11 buf := make([]byte, 11) f.ReadAt(buf, 0) if !headerRegexp.Match(buf) { return nil, fmt.Errorf("not a PDF file: invalid header") } end := size - const endChunk = 100 + // https://stackoverflow.com/questions/11896858/does-the-eof-in-a-pdf-have-to-appear-within-the-last-1024-bytes-of-the-file + const endChunk = 1024 buf = make([]byte, endChunk) - f.ReadAt(buf, end-endChunk) - for len(buf) > 0 && buf[len(buf)-1] == '\n' || buf[len(buf)-1] == '\r' { - buf = buf[:len(buf)-1] + _, err := f.ReadAt(buf, end-endChunk) + if err != nil { + return nil, err } - buf = bytes.TrimRight(buf, "\r\n\t ") - if !bytes.HasSuffix(buf, []byte("%%EOF")) { - return nil, fmt.Errorf("not a PDF file: missing %%%%EOF") + const eof = "%%EOF" + if findLastLine(buf, eof) < 0 { + return nil, fmt.Errorf("not a PDF file: missing %s", eof) } i := findLastLine(buf, "startxref") if i < 0 { @@ -436,6 +438,8 @@ func readXrefTableData(b *buffer, table []xref) ([]xref, error) { return table, nil } +// findLastLine looks for the last index of s in the given buffer. The search +// term must be alone in the line (surrounded by newlines). func findLastLine(buf []byte, s string) int { bs := []byte(s) max := len(buf) @@ -444,7 +448,14 @@ func findLastLine(buf []byte, s string) int { if i <= 0 || i+len(bs) >= len(buf) { return -1 } - if (buf[i-1] == '\n' || buf[i-1] == '\r') && (buf[i+len(bs)] == '\n' || buf[i+len(bs)] == '\r') { + if buf[i-1] == '\n' || buf[i-1] == '\r' { + return i + } + if buf[i+len(bs)] == '\n' || buf[i+len(bs)] == '\r' { + return i + } + // libtiff/tiff2pdf can add an extra space before the newline + if buf[i+len(bs)] == ' ' || buf[i+len(bs)+1] == '\n' || buf[i+len(bs)+1] == '\r' { return i } max = i From e3c03b546d854b0a255081fbdde7dd2b02306723 Mon Sep 17 00:00:00 2001 From: Luca Zambarda Date: Wed, 16 Mar 2022 11:45:02 +0000 Subject: [PATCH 5/6] fix(page): correctly handle panic in GetTextByRow --- page.go | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/page.go b/page.go index b4245dd..d50f45c 100644 --- a/page.go +++ b/page.go @@ -643,9 +643,8 @@ type Row struct { type Rows []*Row // GetTextByRow returns the page's all text grouped by rows -func (p Page) GetTextByRow() (Rows, error) { - result := Rows{} - var err error +func (p Page) GetTextByRow() (result Rows, err error) { + result = Rows{} defer func() { if r := recover(); r != nil { From 63ec7d0da1bab8b6d6085fcc6627a49a30a50cfe Mon Sep 17 00:00:00 2001 From: Luca Zambarda Date: Wed, 16 Mar 2022 11:49:13 +0000 Subject: [PATCH 6/6] docs: add useful references in readme --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index 5d72176..ea57334 100644 --- a/README.md +++ b/README.md @@ -136,3 +136,11 @@ func readPdf(path string) (string, error) { ## Demo ![Run example](https://i.gyazo.com/01fbc539e9872593e0ff6bac7e954e6d.gif) + +## References + +List of useful references to how the PDF file format is structured: + +* https://web.archive.org/web/20210128014024/https://www.adobe.com/content/dam/acom/en/devnet/pdf/PDF32000_2008.pdf +* https://www.oreilly.com/library/view/pdf-explained/9781449321581/ch04.html +* https://commandlinefanatic.com/cgi-bin/showarticle.cgi?article=art019