From 9cb03025cecd36144a4732a1d089eaa07828739c Mon Sep 17 00:00:00 2001 From: Gareth Watts Date: Sat, 25 Mar 2017 22:38:32 -0700 Subject: [PATCH 1/2] Make PDF header parser more tolerant Not all PDF content producers strictly adhere to the spec recommendations. --- read.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/read.go b/read.go index eb8b9aa..665861c 100644 --- a/read.go +++ b/read.go @@ -124,9 +124,11 @@ func NewReader(f io.ReaderAt, size int64) (*Reader, error) { func NewReaderEncrypted(f io.ReaderAt, size int64, pw func() string) (*Reader, error) { buf := make([]byte, 10) f.ReadAt(buf, 0) - if !bytes.HasPrefix(buf, []byte("%PDF-1.")) || buf[7] < '0' || buf[7] > '7' || buf[8] != '\r' && buf[8] != '\n' { + + if !bytes.HasPrefix(buf, []byte("%PDF-1.")) { return nil, fmt.Errorf("not a PDF file: invalid header") } + end := size const endChunk = 100 buf = make([]byte, endChunk) From 56940c70edec9e73084e1a0ec7d26b65cc7856e0 Mon Sep 17 00:00:00 2001 From: Gareth Watts Date: Sat, 25 Mar 2017 22:38:37 -0700 Subject: [PATCH 2/2] Basic support for the CCITTFaxDecode filter Allow reading of embedded faxes by wrapping the data with a suitable tiff header. Does not do any parsing of the data, but should be sufficient to read fax imagery in most cases, I think. 
--- read.go | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/read.go b/read.go index 665861c..ec1e435 100644 --- a/read.go +++ b/read.go @@ -67,6 +67,7 @@ import ( "crypto/cipher" "crypto/md5" "crypto/rc4" + "encoding/binary" "fmt" "io" "io/ioutil" @@ -824,6 +825,10 @@ func applyFilter(rd io.Reader, name string, param Value) io.Reader { switch name { default: panic("unknown filter " + name) + + case "CCITTFaxDecode": + return faxReader(rd, param) + case "FlateDecode": zr, err := zlib.NewReader(rd) if err != nil { @@ -1079,3 +1084,75 @@ func (r *cbcReader) Read(b []byte) (n int, err error) { r.pend = r.pend[n:] return n, nil } + +const tiffTagCount = 8 + +type tiffTag struct { + FieldTag uint16 + FieldType uint16 + FieldLength uint32 + DataOffset uint32 +} + +type tiffHeader struct { + ByteOrder [2]byte // II == Intel byte order + Version uint16 // always 42 + ImgDirOffset uint32 + TagCount uint16 + Tags [tiffTagCount]tiffTag + TagTerm uint32 +} + +// faxReader wraps a bare-bones tiff header around CCITT fax data +// this does not support mixed one- and two-dimensional encoded faxes (K>0) +// or anything that requires actually decoding the data +// (EndOfLine, EncodedByteAlign, EndOfBlock, DamagedRowsBeforeError parameters) +// or a missing row count. 
+func faxReader(rd io.Reader, param Value) io.Reader { + k := param.Key("K").Int64() + rows := param.Key("Rows").Int64() + cols := param.Key("Columns").Int64() + + if cols == 0 { + cols = 1728 // per spec + } + + var comp uint32 + switch { + case k < 0: + comp = 4 // CCITT Group 4 + case k == 0: + comp = 3 // CCITT Group 3 + default: + panic("unsupported encoding scheme") + } + + if rows <= 0 || cols <= 0 { + panic("invalid row/column count for fax data") + } + + // must know the actual amount of data + data, _ := ioutil.ReadAll(rd) + header := tiffHeader{ + ByteOrder: [...]byte{'I', 'I'}, + Version: 42, + ImgDirOffset: 8, + TagCount: tiffTagCount, + Tags: [...]tiffTag{ + {256, 4, 1, uint32(cols)}, // ImageWidth + {257, 4, 1, uint32(rows)}, // ImageHeight + {258, 3, 1, 1}, // BitsPerSample + {259, 3, 1, uint32(comp)}, // Compression, 3 == Group 3, 4 == Group 4 + {262, 3, 1, 0}, // PhotometricInterpretation, 0 = WhiteIsZero + {273, 4, 1, uint32(binary.Size(tiffHeader{}))}, // StripOffsets + {278, 4, 1, uint32(rows)}, // RowsPerStrip + {279, 4, 1, uint32(len(data))}, // StripByteCounts, size of image + }, + } + + var buf bytes.Buffer + if err := binary.Write(&buf, binary.LittleEndian, header); err != nil { + panic(fmt.Sprintf("binary write failed: %v", err)) + } + return io.MultiReader(&buf, bytes.NewReader(data)) +}