From 7dfab7ec0dbe357f5a992ee028509abec3a5f966 Mon Sep 17 00:00:00 2001
From: Luca Zambarda <luca@blokur.com>
Date: Tue, 15 Mar 2022 14:50:06 +0000
Subject: [PATCH 1/6] fix(read): correctly use logic operator in header
 condition

---
 read.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/read.go b/read.go
index 0d03e3f..c0572af 100644
--- a/read.go
+++ b/read.go
@@ -129,7 +129,7 @@ func NewReader(f io.ReaderAt, size int64) (*Reader, error) {
 func NewReaderEncrypted(f io.ReaderAt, size int64, pw func() string) (*Reader, error) {
 	buf := make([]byte, 10)
 	f.ReadAt(buf, 0)
-	if !bytes.HasPrefix(buf, []byte("%PDF-1.")) || buf[7] < '0' || buf[7] > '7' || buf[8] != '\r' && buf[8] != '\n' {
+	if !bytes.HasPrefix(buf, []byte("%PDF-1.")) || buf[7] < '0' || buf[7] > '7' || buf[8] != '\r' || buf[8] != '\n' {
 		return nil, fmt.Errorf("not a PDF file: invalid header")
 	}
 	end := size

From a962e8469e5a2223c432c5ceddf96c4b610795f8 Mon Sep 17 00:00:00 2001
From: Luca Zambarda <luca@blokur.com>
Date: Tue, 15 Mar 2022 15:25:30 +0000
Subject: [PATCH 2/6] refactor(read): make header validation more flexible

---
 read.go      | 10 ++++++++--
 read_test.go | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 62 insertions(+), 2 deletions(-)
 create mode 100644 read_test.go

diff --git a/read.go b/read.go
index c0572af..d21f954 100644
--- a/read.go
+++ b/read.go
@@ -72,6 +72,7 @@ import (
 	"io"
 	"io/ioutil"
 	"os"
+	"regexp"
 	"sort"
 	"strconv"
 )
@@ -122,14 +123,19 @@ func NewReader(f io.ReaderAt, size int64) (*Reader, error) {
 	return NewReaderEncrypted(f, size, nil)
 }
 
+// headerRegexp validates the header line of a PDF (versions 1.0 through 1.7).
+// It tolerates extra whitespace between the version and the newline (as
+// inserted by libtiff/tiff2pdf) and accepts both CRLF and LF line endings.
+var headerRegexp = regexp.MustCompile(`^%PDF-1\.[0-7]\s*\r?\n`)
+
 // NewReaderEncrypted opens a file for reading, using the data in f with the given total size.
 // If the PDF is encrypted, NewReaderEncrypted calls pw repeatedly to obtain passwords
 // to try. If pw returns the empty string, NewReaderEncrypted stops trying to decrypt
 // the file and returns an error.
 func NewReaderEncrypted(f io.ReaderAt, size int64, pw func() string) (*Reader, error) {
-	buf := make([]byte, 10)
+	buf := make([]byte, 11)
 	f.ReadAt(buf, 0)
-	if !bytes.HasPrefix(buf, []byte("%PDF-1.")) || buf[7] < '0' || buf[7] > '7' || buf[8] != '\r' || buf[8] != '\n' {
+	if !headerRegexp.Match(buf) {
 		return nil, fmt.Errorf("not a PDF file: invalid header")
 	}
 	end := size
diff --git a/read_test.go b/read_test.go
new file mode 100644
index 0000000..9a387db
--- /dev/null
+++ b/read_test.go
@@ -0,0 +1,54 @@
+package pdf
+
+import (
+	"testing"
+)
+
+func TestRead(t *testing.T) { // entry point grouping the reader subtests
+	t.Run("HeaderValidation", testHeaderValidation) // matches headerRegexp against sample headers
+}
+
+func testHeaderValidation(t *testing.T) { // checks headerRegexp against valid and invalid header samples
+	tscs := map[string]struct { // test cases keyed by subtest name
+		input         []byte // raw bytes handed to headerRegexp.Match
+		expectedValid bool   // whether headerRegexp is expected to match input
+	}{
+		"nil": {
+			input:         nil,
+			expectedValid: false,
+		},
+		"empty": {
+			input:         []byte{},
+			expectedValid: false,
+		},
+		"missing LF": {
+			input:         []byte("%PDF-1.7"),
+			expectedValid: false,
+		},
+		"ok LF": {
+			input:         []byte("%PDF-1.7\n"),
+			expectedValid: true,
+		},
+		"invalid version 1.8": {
+			input:         []byte("%PDF-1.8\n"),
+			expectedValid: false,
+		},
+		"ok CRLF": {
+			input:         []byte("%PDF-1.7\r\n"),
+			expectedValid: true,
+		},
+		"ok space + CRLF": {
+			input:         []byte("%PDF-1.7 \r\n"),
+			expectedValid: true,
+		},
+	}
+	for name, data := range tscs {
+		data := data // per-iteration copy captured by the subtest closure
+		t.Run(name, func(t *testing.T) {
+			gotValid := headerRegexp.Match(data.input)
+			if gotValid != data.expectedValid {
+				t.Errorf("expected %t, got %t", data.expectedValid, gotValid)
+			}
+		})
+	}
+}

From 0468b6602c1cde4467a2871350ad0c86e16753e6 Mon Sep 17 00:00:00 2001
From: Luca Zambarda <luca@blokur.com>
Date: Tue, 15 Mar 2022 15:26:07 +0000
Subject: [PATCH 3/6] chore: add go.mod

---
 go.mod | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 go.mod

diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..f306e2e
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,3 @@
+module github.com/ledongthuc/pdf
+
+go 1.17

From 7b000bd9ba69ecdb40d5144edd659ba38cd9f056 Mon Sep 17 00:00:00 2001
From: Luca Zambarda <luca@blokur.com>
Date: Tue, 15 Mar 2022 15:59:07 +0000
Subject: [PATCH 4/6] refactor(read): make EOF and startxref read more flexible

---
 read.go | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/read.go b/read.go
index d21f954..f8a9224 100644
--- a/read.go
+++ b/read.go
@@ -133,21 +133,23 @@ var headerRegexp = regexp.MustCompile(`^%PDF-1\.[0-7]\s*\r?\n`)
 // to try. If pw returns the empty string, NewReaderEncrypted stops trying to decrypt
 // the file and returns an error.
 func NewReaderEncrypted(f io.ReaderAt, size int64, pw func() string) (*Reader, error) {
+	const headerLen = 11
 	buf := make([]byte, 11)
 	f.ReadAt(buf, 0)
 	if !headerRegexp.Match(buf) {
 		return nil, fmt.Errorf("not a PDF file: invalid header")
 	}
 	end := size
-	const endChunk = 100
+	// https://stackoverflow.com/questions/11896858/does-the-eof-in-a-pdf-have-to-appear-within-the-last-1024-bytes-of-the-file
+	const endChunk = 1024
 	buf = make([]byte, endChunk)
-	f.ReadAt(buf, end-endChunk)
-	for len(buf) > 0 && buf[len(buf)-1] == '\n' || buf[len(buf)-1] == '\r' {
-		buf = buf[:len(buf)-1]
+	_, err := f.ReadAt(buf, end-endChunk)
+	if err != nil {
+		return nil, err
 	}
-	buf = bytes.TrimRight(buf, "\r\n\t ")
-	if !bytes.HasSuffix(buf, []byte("%%EOF")) {
-		return nil, fmt.Errorf("not a PDF file: missing %%%%EOF")
+	const eof = "%%EOF"
+	if findLastLine(buf, eof) < 0 {
+		return nil, fmt.Errorf("not a PDF file: missing %s", eof)
 	}
 	i := findLastLine(buf, "startxref")
 	if i < 0 {
@@ -436,6 +438,8 @@ func readXrefTableData(b *buffer, table []xref) ([]xref, error) {
 	return table, nil
 }
 
+// findLastLine looks for the last index of s in the given buffer. The search
+// term must be alone in the line (surrounded by newlines).
 func findLastLine(buf []byte, s string) int {
 	bs := []byte(s)
 	max := len(buf)
@@ -444,7 +448,14 @@ func findLastLine(buf []byte, s string) int {
 		if i <= 0 || i+len(bs) >= len(buf) {
 			return -1
 		}
-		if (buf[i-1] == '\n' || buf[i-1] == '\r') && (buf[i+len(bs)] == '\n' || buf[i+len(bs)] == '\r') {
+		if buf[i-1] == '\n' || buf[i-1] == '\r' {
+			return i
+		}
+		if buf[i+len(bs)] == '\n' || buf[i+len(bs)] == '\r' {
+			return i
+		}
+		// libtiff/tiff2pdf can add an extra space before the newline
+		if buf[i+len(bs)] == ' ' || buf[i+len(bs)+1] == '\n' || buf[i+len(bs)+1] == '\r' {
 			return i
 		}
 		max = i

From e3c03b546d854b0a255081fbdde7dd2b02306723 Mon Sep 17 00:00:00 2001
From: Luca Zambarda <luca@blokur.com>
Date: Wed, 16 Mar 2022 11:45:02 +0000
Subject: [PATCH 5/6] fix(page): correctly handle panic in GetTextByRow

---
 page.go | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/page.go b/page.go
index b4245dd..d50f45c 100644
--- a/page.go
+++ b/page.go
@@ -643,9 +643,8 @@ type Row struct {
 type Rows []*Row
 
 // GetTextByRow returns the page's all text grouped by rows
-func (p Page) GetTextByRow() (Rows, error) {
-	result := Rows{}
-	var err error
+func (p Page) GetTextByRow() (result Rows, err error) {
+	result = Rows{}
 
 	defer func() {
 		if r := recover(); r != nil {

From 63ec7d0da1bab8b6d6085fcc6627a49a30a50cfe Mon Sep 17 00:00:00 2001
From: Luca Zambarda <luca@blokur.com>
Date: Wed, 16 Mar 2022 11:49:13 +0000
Subject: [PATCH 6/6] docs: add useful references in readme

---
 README.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/README.md b/README.md
index 5d72176..ea57334 100644
--- a/README.md
+++ b/README.md
@@ -136,3 +136,11 @@ func readPdf(path string) (string, error) {
 
 ## Demo
 ![Run example](https://i.gyazo.com/01fbc539e9872593e0ff6bac7e954e6d.gif)
+
+## References
+
+Useful references on how the PDF file format is structured:
+
+* https://web.archive.org/web/20210128014024/https://www.adobe.com/content/dam/acom/en/devnet/pdf/PDF32000_2008.pdf
+* https://www.oreilly.com/library/view/pdf-explained/9781449321581/ch04.html
+* https://commandlinefanatic.com/cgi-bin/showarticle.cgi?article=art019