diff --git a/algorithm.md b/algorithm.md index 2e646f3..e7b8773 100644 --- a/algorithm.md +++ b/algorithm.md @@ -19,25 +19,28 @@ This is done in two phases: imohash takes two parameters, as well as the message length: -* sample size (s) -* sampling threshold (t) -* message length (L) +- sample size (s) +- sampling threshold (t) +- message length (L) -There are two mode of operation: **sampled** and **full**. Mode is -determined as follows: +There are two modes of operation: **sampled** and **full**. + +**Full** mode is a single hash of the entire message. While sampling is the key point of imohash, sometimes it doesn't make sense and a full hash is used. It is used when the message length is less than the sampling threshold, or is less than twice the sample size minus one, i.e. (2s - 1) (in order to sample from the middle of the message). **Full** mode is also used when the sample size parameter is less than 1. + +In all other cases **sampled** mode is used. Summarized: ``` -if (s > 0) && (t > 0) && (L > t) && (t > 2s) - mode = sampled -else +if (s < 1) || (L < t) || (L < (2s - 1)) mode = full +else + mode = sampled ``` ### Hash calculation The core hashing routine uses [MurmurHash3](https://code.google.com/p/smhasher/wiki/MurmurHash3) in a 128-bit configuration. -Hashing in *Full* mode is identical to passing the entire -message to Murmhash3. *Sampled* mode constructs a new message using +Hashing in **Full** mode is identical to passing the entire +message to MurmurHash3. **Sampled** mode constructs a new message using three samples from the original: Message M of length L is an array of bytes, M[0]...M[L-1]. If @@ -51,6 +54,7 @@ S2 = M[L-s:L-1] h' = Murmur3(concat(S0, S1, S2)) ``` + ### Size injection Size is inserted into the hash directly. This means that two files @@ -119,8 +123,8 @@ threshold t. 
{16384, 131073, 131072, "808008282d3f3b53e1fd132cc51fcc1d"}, {16384, 131072, 500000, "a0c21e44a0ba3bddee802a9d1c5332ca"}, {50, 131072, 300000, "e0a712edd8815c606344aed13c44adcf"}, +{0, 100, 1000, "e80753211a57ee0de67c756e98e00496"}, +{50, 9999, 1000, "e80753211a57ee0de67c756e98e00496"}, +{501, 20, 1000, "e80753211a57ee0de67c756e98e00496"}, +{501, 20, 1001, "e9079899cffb46f60c8645a01f12f9c9"}, ``` - - - - diff --git a/imohash.go b/imohash.go index 655a755..5203c07 100644 --- a/imohash.go +++ b/imohash.go @@ -25,7 +25,6 @@ type ImoHash struct { hasher murmur3.Hash128 sampleSize int sampleThreshold int - bytesAdded int } // New returns a new ImoHash using the default sample size @@ -79,11 +78,10 @@ func (imo *ImoHash) Sum(data []byte) [Size]byte { // SumFile hashes a file using using the ImoHash parameters. func (imo *ImoHash) SumFile(filename string) ([Size]byte, error) { f, err := os.Open(filename) - defer f.Close() - if err != nil { return emptyArray, err } + defer f.Close() fi, err := f.Stat() if err != nil { @@ -104,27 +102,30 @@ func (imo *ImoHash) hashCore(f *io.SectionReader) ([Size]byte, error) { imo.hasher.Reset() - if f.Size() < int64(imo.sampleThreshold) || imo.sampleSize < 1 { + msgLen := f.Size() + if imo.sampleSize < 1 || + msgLen < int64(imo.sampleThreshold) || + msgLen < int64(2*imo.sampleSize-1) { if _, err := io.Copy(imo.hasher, f); err != nil { return emptyArray, err } } else { buffer := make([]byte, imo.sampleSize) - if _, err := f.Read(buffer); err != nil { + if _, err := io.ReadFull(f, buffer); err != nil { return emptyArray, err } imo.hasher.Write(buffer) // these Writes never fail if _, err := f.Seek(f.Size()/2, 0); err != nil { return emptyArray, err } - if _, err := f.Read(buffer); err != nil { + if _, err := io.ReadFull(f, buffer); err != nil { return emptyArray, err } imo.hasher.Write(buffer) if _, err := f.Seek(int64(-imo.sampleSize), 2); err != nil { return emptyArray, err } - if _, err := f.Read(buffer); err != nil { + if _, err := 
io.ReadFull(f, buffer); err != nil { return emptyArray, err } imo.hasher.Write(buffer) diff --git a/spec_test.go b/spec_test.go index e46909d..4b903bb 100644 --- a/spec_test.go +++ b/spec_test.go @@ -25,12 +25,17 @@ func TestSpec(t *testing.T) { {16384, 131073, 131072, "808008282d3f3b53e1fd132cc51fcc1d"}, {16384, 131072, 500000, "a0c21e44a0ba3bddee802a9d1c5332ca"}, {50, 131072, 300000, "e0a712edd8815c606344aed13c44adcf"}, + + {0, 100, 1000, "e80753211a57ee0de67c756e98e00496"}, + {50, 9999, 1000, "e80753211a57ee0de67c756e98e00496"}, + {501, 20, 1000, "e80753211a57ee0de67c756e98e00496"}, + {501, 20, 1001, "e9079899cffb46f60c8645a01f12f9c9"}, } for _, test := range tests { i := NewCustom(test.s, test.t) hashStr = fmt.Sprintf("%x", i.Sum(M(test.n))) - equal(t, hashStr, test.hash) + equal(t, test.hash, hashStr) } }