diff --git a/README.md b/README.md index 5d72176..ea57334 100644 --- a/README.md +++ b/README.md @@ -136,3 +136,11 @@ func readPdf(path string) (string, error) { ## Demo ![Run example](https://i.gyazo.com/01fbc539e9872593e0ff6bac7e954e6d.gif) + +## References + +List of useful references to how the PDF file format is structured: + +* https://web.archive.org/web/20210128014024/https://www.adobe.com/content/dam/acom/en/devnet/pdf/PDF32000_2008.pdf +* https://www.oreilly.com/library/view/pdf-explained/9781449321581/ch04.html +* https://commandlinefanatic.com/cgi-bin/showarticle.cgi?article=art019 diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..f306e2e --- /dev/null +++ b/go.mod @@ -0,0 +1,3 @@ +module github.com/ledongthuc/pdf + +go 1.17 diff --git a/page.go b/page.go index b4245dd..d50f45c 100644 --- a/page.go +++ b/page.go @@ -643,9 +643,8 @@ type Row struct { type Rows []*Row // GetTextByRow returns the page's all text grouped by rows -func (p Page) GetTextByRow() (Rows, error) { - result := Rows{} - var err error +func (p Page) GetTextByRow() (result Rows, err error) { + result = Rows{} defer func() { if r := recover(); r != nil { diff --git a/read.go b/read.go index 0d03e3f..f8a9224 100644 --- a/read.go +++ b/read.go @@ -72,6 +72,7 @@ import ( "io" "io/ioutil" "os" + "regexp" "sort" "strconv" ) @@ -122,26 +123,33 @@ func NewReader(f io.ReaderAt, size int64) (*Reader, error) { return NewReaderEncrypted(f, size, nil) } +// headerRegexp is used to check the validity of the header line of a PDF. +// This should be able to support extra spaces between the version and the +// newline (as inserted by libtiff/tiff2pdf) as well as supporting CRLF and LF. +var headerRegexp = regexp.MustCompile(`^%PDF-1\.[0-7]\s*\r?\n`) + // NewReaderEncrypted opens a file for reading, using the data in f with the given total size. // If the PDF is encrypted, NewReaderEncrypted calls pw repeatedly to obtain passwords // to try. If pw returns the empty string, NewReaderEncrypted stops trying to decrypt // the file and returns an error. func NewReaderEncrypted(f io.ReaderAt, size int64, pw func() string) (*Reader, error) { - buf := make([]byte, 10) + const headerLen = 11 + buf := make([]byte, 11) f.ReadAt(buf, 0) - if !bytes.HasPrefix(buf, []byte("%PDF-1.")) || buf[7] < '0' || buf[7] > '7' || buf[8] != '\r' && buf[8] != '\n' { + if !headerRegexp.Match(buf) { return nil, fmt.Errorf("not a PDF file: invalid header") } end := size - const endChunk = 100 + // https://stackoverflow.com/questions/11896858/does-the-eof-in-a-pdf-have-to-appear-within-the-last-1024-bytes-of-the-file + const endChunk = 1024 buf = make([]byte, endChunk) - f.ReadAt(buf, end-endChunk) - for len(buf) > 0 && buf[len(buf)-1] == '\n' || buf[len(buf)-1] == '\r' { - buf = buf[:len(buf)-1] + _, err := f.ReadAt(buf, end-endChunk) + if err != nil { + return nil, err } - buf = bytes.TrimRight(buf, "\r\n\t ") - if !bytes.HasSuffix(buf, []byte("%%EOF")) { - return nil, fmt.Errorf("not a PDF file: missing %%%%EOF") + const eof = "%%EOF" + if findLastLine(buf, eof) < 0 { + return nil, fmt.Errorf("not a PDF file: missing %s", eof) } i := findLastLine(buf, "startxref") if i < 0 { @@ -430,6 +438,8 @@ func readXrefTableData(b *buffer, table []xref) ([]xref, error) { return table, nil } +// findLastLine looks for the last index of s in the given buffer. The search +// term must be alone in the line (surrounded by newlines). func findLastLine(buf []byte, s string) int { bs := []byte(s) max := len(buf) @@ -438,7 +448,14 @@ func findLastLine(buf []byte, s string) int { if i <= 0 || i+len(bs) >= len(buf) { return -1 } - if (buf[i-1] == '\n' || buf[i-1] == '\r') && (buf[i+len(bs)] == '\n' || buf[i+len(bs)] == '\r') { + if buf[i-1] == '\n' || buf[i-1] == '\r' { + return i + } + if buf[i+len(bs)] == '\n' || buf[i+len(bs)] == '\r' { + return i + } + // libtiff/tiff2pdf can add an extra space before the newline + if buf[i+len(bs)] == ' ' || buf[i+len(bs)+1] == '\n' || buf[i+len(bs)+1] == '\r' { return i } max = i diff --git a/read_test.go b/read_test.go new file mode 100644 index 0000000..9a387db --- /dev/null +++ b/read_test.go @@ -0,0 +1,54 @@ +package pdf + +import ( + "testing" +) + +func TestRead(t *testing.T) { + t.Run("HeaderValidation", testHeaderValidation) +} + +func testHeaderValidation(t *testing.T) { + tscs := map[string]struct { + input []byte + expectedValid bool + }{ + "nil": { + input: nil, + expectedValid: false, + }, + "empty": { + input: []byte{}, + expectedValid: false, + }, + "missing LF": { + input: []byte{37, 80, 68, 70, 45, 49, 46, 55}, + expectedValid: false, + }, + "ok LF": { + input: []byte{37, 80, 68, 70, 45, 49, 46, 55, 10}, + expectedValid: true, + }, + "invalid version 1.8": { + input: []byte{37, 80, 68, 70, 45, 49, 46, 58, 10}, + expectedValid: false, + }, + "ok CRLF": { + input: []byte{37, 80, 68, 70, 45, 49, 46, 55, 13, 10}, + expectedValid: true, + }, + "ok space + CRLF": { + input: []byte{37, 80, 68, 70, 45, 49, 46, 55, 32, 13, 10}, + expectedValid: true, + }, + } + for name, data := range tscs { + data := data + t.Run(name, func(t *testing.T) { + gotValid := headerRegexp.Match(data.input) + if gotValid != data.expectedValid { + t.Errorf("expected %t, got %t", data.expectedValid, gotValid) + } + }) + } +}