diff --git a/go.sum b/go.sum index 79b4957..e69de29 100644 --- a/go.sum +++ b/go.sum @@ -1,4 +0,0 @@ -github.com/klauspost/cpuid/v2 v2.2.7 h1:ZWSB3igEs+d0qvnxR/ZBzXVmxkgt8DdzP6m9pfuVLDM= -github.com/klauspost/cpuid/v2 v2.2.7/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws= -golang.org/x/sys v0.5.0 h1:MUK/U/4lj1t1oPg0HfuXDN/Z1wv31ZJ/YcPiGccS4DU= -golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= diff --git a/xelishash.go b/xelishash.go index 69a3adb..37ea643 100644 --- a/xelishash.go +++ b/xelishash.go @@ -75,9 +75,15 @@ func XelisHash(input []byte, scratch_pad *ScratchPad) ([32]byte, error) { for i := 0; i < ITERS; i++ { for j := 0; j < len(small_pad)/SLOT_LENGTH; j++ { - // Initialize indices + // Initialize indices and precompute the total sum of small pad + var total_sum uint32 = 0 for k := 0; k < SLOT_LENGTH; k++ { indices[k] = uint16(k) + if slots[k]>>31 == 0 { + total_sum += small_pad[j*SLOT_LENGTH+k] + } else { + total_sum -= small_pad[j*SLOT_LENGTH+k] + } } for slot_idx := SLOT_LENGTH - 1; slot_idx >= 0; slot_idx-- { @@ -86,29 +92,21 @@ func XelisHash(input []byte, scratch_pad *ScratchPad) ([32]byte, error) { index := int(indices[index_in_indices]) indices[index_in_indices] = indices[slot_idx] - // THIS IS THE MOST PERFORMANCE-CRITICAL SECTION - - // Split the loop in two to avoid checking k == index - sum := slots[index] - offset := j * SLOT_LENGTH - for k := 0; k < index; k++ { - pad := small_pad[offset+k] - if slots[k]>>31 == 0 { - sum = sum + pad - } else { - sum = sum - pad - } - } - for k := index + 1; k < SLOT_LENGTH; k++ { - pad := small_pad[offset+k] - if slots[k]>>31 == 0 { - sum = sum + pad - } else { - sum = sum - pad - } + local_sum := total_sum + s1 := int32(slots[index] >> 31) + pad_value := small_pad[j*SLOT_LENGTH+index] + if s1 == 0 { + local_sum -= pad_value + } else { + local_sum += pad_value } - slots[index] = sum + // Apply the sum to the slot + slots[index] += local_sum + + // Update the total sum + s2 := int32(slots[index] >> 31) + total_sum -= 2 * small_pad[j*SLOT_LENGTH+index] * uint32(-s1+s2) } } }