Skip to content

Commit

Permalink
Handle case when file size is too small for sample size
Browse files Browse the repository at this point in the history
This fixes the case where hashing may crash when using custom parameters
if the sample size is too large for the file.

- Update the spec and code to correctly constrain mode based on file size
- Update tests
- Minor lint fixes
  • Loading branch information
kalafut committed Aug 26, 2024
1 parent 6312005 commit a00abe6
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 22 deletions.
32 changes: 18 additions & 14 deletions algorithm.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,25 +19,28 @@ This is done in two phases:

imohash takes two parameters, as well as the message length:

* sample size (s)
* sampling threshold (t)
* message length (L)
- sample size (s)
- sampling threshold (t)
- message length (L)

There are two mode of operation: **sampled** and **full**. Mode is
determined as follows:
There are two modes of operation: **sampled** and **full**.

**Full** mode is a single hash of the entire message. While sampling is the key point of imohash, sometimes it doesn't make sense and a full hash is used instead. It is used when the message length is less than the sampling threshold, or less than 2s - 1 (twice the sample size, minus one), so that there is enough data to sample from the middle of the message. **Full** mode is also used when the sample size parameter is less than 1.

In all other cases **sampled** mode is used. Summarized:

```
if (s > 0) && (t > 0) && (L > t) && (t > 2s)
mode = sampled
else
if (s < 1) || (L < t) || (L < (2s - 1))
mode = full
else
mode = sampled
```

### Hash calculation

The core hashing routine uses [MurmurHash3](https://code.google.com/p/smhasher/wiki/MurmurHash3) in a 128-bit configuration.
Hashing in *Full* mode is identical to passing the entire
message to Murmhash3. *Sampled* mode constructs a new message using
Hashing in **Full** mode is identical to passing the entire
message to MurmurHash3. **Sampled** mode constructs a new message using
three samples from the original:

Message M of length L is an array of bytes, M[0]...M[L-1]. If
Expand All @@ -51,6 +54,7 @@ S2 = M[L-s:L-1]
h' = Murmur3(concat(S0, S1, S2))
```

### Size injection

Size is inserted into the hash directly. This means that two files
Expand Down Expand Up @@ -119,8 +123,8 @@ threshold t.
{16384, 131073, 131072, "808008282d3f3b53e1fd132cc51fcc1d"},
{16384, 131072, 500000, "a0c21e44a0ba3bddee802a9d1c5332ca"},
{50, 131072, 300000, "e0a712edd8815c606344aed13c44adcf"},
{0, 100, 1000, "e80753211a57ee0de67c756e98e00496"},
{50, 9999, 1000, "e80753211a57ee0de67c756e98e00496"},
{501, 20, 1000, "e80753211a57ee0de67c756e98e00496"},
{501, 20, 1001, "e9079899cffb46f60c8645a01f12f9c9"},
```




15 changes: 8 additions & 7 deletions imohash.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ type ImoHash struct {
hasher murmur3.Hash128
sampleSize int
sampleThreshold int
bytesAdded int
}

// New returns a new ImoHash using the default sample size
Expand Down Expand Up @@ -79,11 +78,10 @@ func (imo *ImoHash) Sum(data []byte) [Size]byte {
// SumFile hashes a file using the ImoHash parameters.
func (imo *ImoHash) SumFile(filename string) ([Size]byte, error) {
f, err := os.Open(filename)
defer f.Close()

if err != nil {
return emptyArray, err
}
defer f.Close()

fi, err := f.Stat()
if err != nil {
Expand All @@ -104,27 +102,30 @@ func (imo *ImoHash) hashCore(f *io.SectionReader) ([Size]byte, error) {

imo.hasher.Reset()

if f.Size() < int64(imo.sampleThreshold) || imo.sampleSize < 1 {
msgLen := f.Size()
if imo.sampleSize < 1 ||
msgLen < int64(imo.sampleThreshold) ||
msgLen < int64(2*imo.sampleSize-1) {
if _, err := io.Copy(imo.hasher, f); err != nil {
return emptyArray, err
}
} else {
buffer := make([]byte, imo.sampleSize)
if _, err := f.Read(buffer); err != nil {
if _, err := io.ReadFull(f, buffer); err != nil {
return emptyArray, err
}
imo.hasher.Write(buffer) // these Writes never fail
if _, err := f.Seek(f.Size()/2, 0); err != nil {
return emptyArray, err
}
if _, err := f.Read(buffer); err != nil {
if _, err := io.ReadFull(f, buffer); err != nil {
return emptyArray, err
}
imo.hasher.Write(buffer)
if _, err := f.Seek(int64(-imo.sampleSize), 2); err != nil {
return emptyArray, err
}
if _, err := f.Read(buffer); err != nil {
if _, err := io.ReadFull(f, buffer); err != nil {
return emptyArray, err
}
imo.hasher.Write(buffer)
Expand Down
7 changes: 6 additions & 1 deletion spec_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,17 @@ func TestSpec(t *testing.T) {
{16384, 131073, 131072, "808008282d3f3b53e1fd132cc51fcc1d"},
{16384, 131072, 500000, "a0c21e44a0ba3bddee802a9d1c5332ca"},
{50, 131072, 300000, "e0a712edd8815c606344aed13c44adcf"},

{0, 100, 1000, "e80753211a57ee0de67c756e98e00496"},
{50, 9999, 1000, "e80753211a57ee0de67c756e98e00496"},
{501, 20, 1000, "e80753211a57ee0de67c756e98e00496"},
{501, 20, 1001, "e9079899cffb46f60c8645a01f12f9c9"},
}

for _, test := range tests {
i := NewCustom(test.s, test.t)
hashStr = fmt.Sprintf("%x", i.Sum(M(test.n)))
equal(t, hashStr, test.hash)
equal(t, test.hash, hashStr)
}
}

Expand Down

0 comments on commit a00abe6

Please sign in to comment.