-
Notifications
You must be signed in to change notification settings - Fork 1
/
arithmetic.go
583 lines (492 loc) · 18.7 KB
/
arithmetic.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
package protean
import "errors"
// Here is some background reading on arithmetic coding and range coding.
// http://www.arturocampos.com/ac_arithmetic.html
// http://www.arturocampos.com/ac_range.html
// http://www.compressconsult.com/rangecoder/
// http://sachingarg.com/compression/entropy_coding/range_coder.pdf
// http://ezcodesample.com/reanatomy.html
// http://www.cc.gatech.edu/~jarek/courses/7491/Arithmetic2.pdf
// http://www.drdobbs.com/cpp/data-compression-with-arithmetic-encodin/240169251?pgno=2
// Summarized from "A Fast Renormalisation Method for Arithmetic Coding" by
// Michael Schindler:
//
// At any point during arithmetic coding the output consists of four parts:
// 1. The part already written into the output buffer and does not change.
// 2. One digit that may be changed by at most one carry when adding to the
// lower end of the interval. There will never be two carries. since the
// range when fixing that digit was <= 1 unit. Two carries would require
// range > 1 unit.
// 3. There is a (possibly empty) block of digits that pass on a carry. (255 for
// bytes) that are represented by a counter counting their number.
// 4. The last part is represented by the low end range variable of the encoder.
// Returns the sum of a list of numbers
func sum(items []uint32) uint32 {
var accum uint32
for _, item := range items {
accum = accum + item
}
return accum
}
// Takes an input list of integer and returns a list of integers where all of
// the input integer have been divided by a constant.
func scale(items []uint32, divisor uint32) []uint32 {
var results []uint32 = make([]uint32, len(items))
for index, item := range items {
scaled := item / divisor
if scaled == 0 {
results[index] = 1
} else {
results[index] = scaled
}
}
return results
}
// Takes a list of numbers where the inputs should all be integers from 0 to
// 255 and converts them to bytes in an []byte.
func SaveProbs(items []uint32) ([]byte, error) {
bs := make([]byte, len(items))
for index, item := range items {
if item >= 0 && item <= 255 {
bs[index] = byte(item)
} else {
return nil, errors.New("Probabilities must be between 0 and 255 inclusive.")
}
}
return bs, nil
}
// Models a symbol as an interval in the arithmetic coding space
type Interval struct {
// The byte that corresponds to this interval in the coding
symbol uint8
// The lower range of the interval
low uint32
// The length of this interval
length uint32
// The upper range of this interval
// This should always be lower+length
// This is precomputed and stored separately in order to increase the clarity
// of the implementation of the coding algorithm.
high uint32
}
// Creates a new interval
// This maintains the constraint that high = low + length.
func makeInterval(symbol uint8, low uint32, length uint32) Interval {
return Interval{symbol: symbol, low: low, length: length, high: low + length}
}
// The precision of the arithmetic coder
const CODE_BITS uint32 = 32
// The maximum possible signed value given the precision of the coder.
// 2^(32-1)
const TOP_VALUE uint32 = 2147483648
// The maximum possible unsigned value given the precision of the coder.
// (2^32)-1
const MAX_INT uint32 = 4294967295
// The number of bits to shift over during renormalization.
const SHIFT_BITS uint32 = CODE_BITS - 9
// Tne number of bits left over during renormalization.
const EXTRA_BITS uint32 = (CODE_BITS-2)%8 + 1
// The lowest possible value.
// Anything lower than this will be shifted up during renormalization.
const BOTTOM_VALUE uint32 = TOP_VALUE >> 8
// The state and initialiation code for arithmetic coding.
// This class is never instantiated directly.
// The subclasses Encoder and Decoder are used instead.
type Coder struct {
// The probability distribution of the input.
// This will be a list of 256 entries, each consisting of an integer from
// 0 to 255.
probabilities []uint32
// The low end of the encoded range. Starts at 0.
low uint32
// The high end of the encoded range. Starts at the maximum 32-bit value.
high uint32
// The extra bits that need to be stored for later if underflow occurs.
underflow uint32
// The current byte that's being constructed for eventual output.
working uint32
// A coding table derived from the probability distribution.
intervals map[uint8]Interval
// The total of the lengths of all intervals in the coding table.
// This determines the maximum amount that the range can change by encoding
// one symbol.
total uint32
// The input buffer. This is a list of bytes represented as numbers.
input []uint32
// The output buffer. This is a list of bytes represented as numbers.
output []uint32
}
// The Coder constructor normalizes the symbol probabilities and build the
// coding table.
func NewCoder(probs []uint32) Coder {
this := Coder{}
// Scale the symbol probabilities to fit constraints.
this.probabilities = adjustProbs(probs)
this.low = 0x00000000
this.high = 0xFFFFFFFF
this.intervals = make(map[uint8]Interval)
// Build the symbol table.
var low uint32
for index, prob := range probs {
this.intervals[uint8(index)] = makeInterval(uint8(index), low, prob)
low = low + prob
}
// Calculate the sum of the lengths of all intervals.
this.total = sum(this.probabilities)
return this
}
func max(items []uint32) uint32 {
var top uint32
for _, item := range items {
if item > top {
top = item
}
}
return top
}
// Scale the symbol probabilities to fit the following constraints:
// - No probability can be higher than 255.
// - The sum of all probabilities must be less than 2^14.
func adjustProbs(probs []uint32) []uint32 {
// The maximum value for any single probability
const MAX_PROB uint32 = 255
// The amount to scale probabilities if they are greater than the maximum.
const SCALER uint32 = 256
// The maximum value for the sum of all probabilities. 2^14
const MAX_SUM uint32 = 16384
// If any single probability is too high, rescale.
var highestProb = max(probs)
if highestProb > MAX_PROB {
divisor := highestProb / SCALER
probs = scale(probs, divisor)
}
// If the sum of probabilities is too high, rescale.
for sum(probs) >= MAX_SUM {
probs = scale(probs, 2)
}
return probs
}
// Encodes a sequence of bytes using a probability distribution with the goal of
// yielding a higher entropy sequence of bytes.
type Encoder struct {
// extends Coder
Coder
}
func NewEncoder(probs []uint32) Encoder {
return Encoder{Coder: NewCoder(probs)}
}
func NewDecoder(probs []uint32) Decoder {
return Decoder{Coder: NewCoder(probs)}
}
// Encode a sequence of bytes.
func (this *Encoder) Encode(input []byte) []byte {
// Initialize state.
// The Coder superclass initializes state common to Encoder and Decoder.
// Encoder and Decoder do some additional initialization that must be
// reset when encoding each byte sequence.
this.init()
// Encode all of the symbols in the input []byte
// The primary effect is to fill up the output buffer with output bytes.
// Internal state variables also change after encoding each symbol.
for _, b := range input {
this.encodeSymbol(b)
}
// Flush any remaining state in the internal state variables into
// the output buffer.
this.flush()
// Copy the output buffer into an []byte that can be returned.
var output = make([]byte, len(this.output))
for index, item := range this.output {
output[index] = byte(item)
}
// Return the []byte copy of the internal output buffer.
return output
}
// Initialize state.
// The Coder superclass initializes state common to Encoer and Decoder.
// Encoder and Decoder do some additional initialization that must be
// reset when encoding each byte sequence.
func (this *Encoder) init() {
this.low = 0
this.high = TOP_VALUE
this.working = 0xCA
this.underflow = 0
this.input = []uint32{}
this.output = []uint32{}
}
// Encode a symbol. The symbol is a byte represented as a number.
// The effect of this is to change internal state variables.
// As a consequence, bytes may of may not be written to the output buffer.
// When all symbols have been encoded, flush() must be called to recover any
// remaining state.
func (this *Encoder) encodeSymbol(symbol uint8) {
// Look up the corresponding interval for the symbol in the coding table.
// This is what we actually use for encoding.
interval := this.intervals[symbol]
// Renormalize. This is the complicated but less interesting part of coding.
// This is also where bytes are actually written to the output buffer.
this.renormalize()
// Now do the interesting part of arithmetic coding.
// Every sequence of symbols is mapped to a positive integer.
// As we encode each symbol we are calculating the digits of this integer
// using the interval information for the symbol.
// The result of encoding a symbol is a new range, as represented by are new
// values for low and high.
// The new symbol subdivides the existing range.
// Take the existing range and subdivide it by the total length of the
// intervals in the coding table.
newRange := this.high / this.total
// Find the place in the new subdivide range where the new symbol's interval
// begins.
temp := newRange * interval.low
// The case where the symbol being encoded has the highest range is a
// special case.
if interval.high >= this.total {
// Special case where the symbol being encoded has the highest range
// Adjust the high part of the range
this.high = this.high - temp
} else {
// General case
// Adjust the high part of the range
this.high = newRange * interval.length
}
// Adjust the low part of the range
this.low = this.low + temp
}
// Summarized from "A Fast Renormalisation Method for Arithmetic Coding" by
// Michael Schindler:
//
// When doing encoding renormalisation the following can happen:
// A No renormalisation is needed since the range is in the desired interval.
// B The low end plus the range (this is the upper end of the interval) will
// not produce any carry. In this case the second and third part can be
// output as they will never change. The digit produced will become part two
// and part three will be empty.
// C The low end has already produced a carry. Here the (changed) second and
// third part can be output. There will not be another carry. Set the second
// and third part as before.
// D The digit produced will pass on a possible future carry, so it is added
// to the third block.
func (this *Encoder) renormalize() {
// If renormalization is needed, we are in case B, C, or D.
// Otherwise, we are in case A.
for this.high <= BOTTOM_VALUE {
if this.low < (0xFF << SHIFT_BITS) {
// B The low end plus the range (this is the upper end of the interval) will
// not produce any carry. In this case the second and third part can be
// output as they will never change. The digit produced will become part two
// and part three will be empty.
this.write(this.working)
for this.underflow != 0 {
this.underflow = this.underflow - 1
this.write(0xFF)
}
this.working = (this.low >> SHIFT_BITS) & 0xFF
} else if (this.low & TOP_VALUE) != 0 {
// C The low end has already produced a carry. Here the (changed) second and
// third part can be output. There will not be another carry. Set the second
// and third part as before.
this.write(this.working + 1)
for this.underflow != 0 {
this.underflow = this.underflow - 1
this.write(0x00)
}
this.working = (this.low >> SHIFT_BITS) & 0xFF
} else {
// D The digit produced will pass on a possible future carry, so it is added
// to the third block.
this.underflow = this.underflow + 1
}
// This is the goal of renormalization, to move the whole range over 8
// bits in order to make room for more computation.
this.high = (this.high << 8) >> 0
this.low = ((this.low << 8) & (TOP_VALUE - 1)) >> 0
}
// A No renormalisation is needed since the range is in the desired interval.
}
func (this *Encoder) flush() {
// Output the internal state variables.
this.renormalize()
var temp = this.low >> SHIFT_BITS
if temp > 0xFF {
this.write(this.working + 1)
for this.underflow != 0 {
this.underflow = this.underflow - 1
this.write(0x00)
}
} else {
this.write(this.working)
for this.underflow != 0 {
this.underflow = this.underflow - 1
this.write(0xFF)
}
}
// Output the remaining internal state.
this.write(temp & 0xFF)
this.write((this.low >> (23 - 8)) & 0xFF)
// Output the length
this.write((uint32(len(this.output)) >> uint32(8)) & uint32(0xFF))
this.write(uint32(len(this.output)) & uint32(0xFF))
}
func (this *Encoder) write(b uint32) {
this.output = append(this.output, b)
}
// Decodes a sequence of bytes using a probability distribution with the goal of
// yielding a lower entropy sequence of bytes.
type Decoder struct {
//extends Coder
Coder
}
// Decode a sequence of bytes
func (this *Decoder) Decode(input []byte) []byte {
// Create an empty input buffer.)
this.input = make([]uint32, len(input))
// Fetch the size of the target output.
// This is encoded as two bytes at the end of the encoded byte sequence.
var sizeBytes = input[len(input)-2:]
// Decode the two-byte size into a number.
var size = decodeShort(sizeBytes) - 4
// Copy the bytes from the given []byte into the internal input buffer.
for index := uint16(0); index < uint16(len(input)); index++ {
this.input[index] = uint32(input[index])
}
// Initialize state.
// The Coder superclass initializes state common to Encoder and Decoder.
// Encoder and Decoder do some additional initialization that must be
// reset when encoding each byte sequence.
this.init()
// Decode all of the symbols in the input buffer
// The primary effect is to fill up the output buffer with output bytes.
// Internal state variables also change after decoding each symbol.
this.decodeSymbols()
// Flush any remaining state in the internal state variables into
// the output buffer.
this.flush()
// Copy the output buffer into an []byte that can be returned.
output := make([]byte, len(this.output))
for index, item := range this.output {
output[index] = byte(item)
}
if uint16(len(output)) > size {
output = output[:size]
}
return output
}
// Initialize state variables for decoding.
func (this *Decoder) init() {
// Discard first byte because the encoder is weird.
this.input = this.input[1:]
this.working = this.input[0]
this.input = this.input[1:]
this.low = this.working >> (8 - EXTRA_BITS)
this.high = 1 << EXTRA_BITS
this.underflow = 0
this.output = []uint32{}
}
// Decode symbols from the input buffer until it is empty.
func (this *Decoder) decodeSymbols() {
for len(this.input) > 0 {
this.decodeSymbol()
}
}
// Run the decoding algorithm. This uses internal state variables and
// may or may not consume bytes from the input buffer.
// The primary result of running this is changing internal state variables
// and one byte will always be written to the output buffer.
// After decoding symbols, flush must be called to get the remaining state
// out of the internal state variables.
func (this *Decoder) decodeSymbol() {
// Renormalize. This is the complicated but less interesting part of coding.
// This is also where bytes are actually read from the input buffer.
this.renormalize()
//
this.underflow = this.high >> 8
temp := (this.low / this.underflow) >> 0
// Calculate the byte to output.
// There is a special case for 255.
var result uint32
if temp>>8 != 0 {
// Special case.
// Output 255.
result = 255
} else {
// General case.
// Output the byte that has been calculated.
result = temp
}
// Output the decoded byte into the output buffer.
this.output = append(this.output, result)
// Update the internal state variables base on the byte that was decoded.
this.update(result)
}
// Renormalizing is the tricky but boring part of coding.
// The purpose of renormalizing is to allow the computation of an arbitrary
// precision fraction using only 32 bits of space.
// In the range coding variant of arithmetic coding implemented here,
// renormalization happens at bytes instead of bits. This means that it
// happens less frequently and so is faster to compute.
func (this *Decoder) renormalize() {
// Renormalization clears bits out of the working area to make room for
// more bits for computation. Continue until the working area is clear.
for this.high <= BOTTOM_VALUE {
// More the high bits over to make room.
// This might have caused the sign bit to be set, so coerce from a float
// to a 32-bit unsigned int.
this.high = (this.high << 8) >> 0
// Shift the low end of the range over to make room.
// Shift the working byte and move it into the low end of the range.
this.low = (this.low << 8) | ((this.working << EXTRA_BITS) & 0xFF)
// Obtain new bits to decode if there are any in the input buffer.
// There is a special case when the input buffer is empty.
if len(this.input) == 0 {
// Special case. The input buffer is empty.
// This will only be called for flushing the internal state variables.
this.working = 0
} else {
// General case. There input buffer has bits that have not been decoded.
// Put them in the working byte.
this.working = this.input[0]
this.input = this.input[1:]
}
// Load the bits from the new working byte into the low end of the range.
// Be careful not to overwrite the bits we stored in there from the old
// working byte.
this.low = (this.low | (this.working >> (8 - EXTRA_BITS)))
// Coerce the low end of the range from a float to a 32-bit unsigned int.
this.low = this.low >> 0
}
}
// Update internal state variables based on the symbol that was last decoded.
func (this *Decoder) update(symbol uint32) {
// Look up the corresponding interval for the symbol in the coding table.
// This is what we actually use for encoding.
interval := this.intervals[uint8(symbol)]
// Recover the bits stored from the underflow
// This will be 0 if there are no underflow bits.
temp := this.underflow * interval.low
// Adjust the low value to account for underflow.
// There is no adjustment if there are no underflow bits.
this.low = this.low - temp
// The case where the symbol being encoded has the highest range is a
// special case.
if interval.high >= this.total {
// Special case where the symbol being encoded has the highest range
// Adjust the high part of the range
this.high = this.high - temp
} else {
// General case
// Adjust the high part of the range
this.high = this.underflow * interval.length
}
}
// Get the remaining information from the internal state variables and
// write it to the output buffer.
// This should be called after the input buffer is empty.
func (this *Decoder) flush() {
// Attempt to decode a symbol even though the input buffer is empty.
// This should get the remaining state out of working.
this.decodeSymbol()
// Renormalize. This should get the remaining state out of the rest of the
// internal state variables.
this.renormalize()
}