-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathiscc.go
325 lines (272 loc) · 8.08 KB
/
iscc.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
package iscc
import (
"crypto/sha256"
"encoding/binary"
"encoding/hex"
"github.com/OneOfOne/xxhash"
"github.com/coblo/iscc-golang/packages/base58"
"github.com/coblo/iscc-golang/packages/cdc"
"github.com/coblo/iscc-golang/packages/hashes"
"github.com/pkg/errors"
"image"
_ "image/jpeg"
_ "image/png"
"io"
"strings"
)
// ISCC groups four fixed-size 11-byte slots (Meta, Content, Data, Instance)
// with two extra fields. None of the functions visible in this file read or
// populate it; the generators below return base58 strings instead.
type ISCC struct {
// Meta is an 11-byte slot, presumably for the Meta-ID component — TODO confirm encoding.
Meta [11]byte
// Partial presumably mirrors the `partial` flag of the Content-ID generators — TODO confirm.
Partial bool
// Gmt is an integer whose meaning is not shown in this file — TODO document.
Gmt int
// Content is an 11-byte slot, presumably for the Content-ID component — TODO confirm encoding.
Content [11]byte
// Data is an 11-byte slot, presumably for the Data-ID component — TODO confirm encoding.
Data [11]byte
// Instance is an 11-byte slot, presumably for the Instance-ID component — TODO confirm encoding.
Instance [11]byte
}
// Tunables and one-byte component headers used by the ID generators in this file.
const (
// INPUT_TRIM is not referenced by the functions visible here; presumably the
// limit applied by textTrim — TODO confirm.
INPUT_TRIM = 128
// WINDOW_SIZE_MID is the letter n-gram width MetaId passes to
// createNGramWindowsLetterWise.
WINDOW_SIZE_MID = 4
// WINDOW_SIZE_CID_T is the word shingle width ContentIdText passes to
// createNGramWindowsWordWise.
WINDOW_SIZE_CID_T = 5
// Header bytes: each generator prepends exactly one of these to its digest
// before base58 encoding. The *_PCF variant is chosen when the caller passes
// partial=true.
// HEAD_MID is prepended by MetaId.
HEAD_MID byte = '\x00'
// HEAD_CID_T / HEAD_CID_T_PCF are prepended by ContentIdText.
HEAD_CID_T byte = '\x10'
HEAD_CID_T_PCF byte = '\x11'
// HEAD_CID_I / HEAD_CID_I_PCF are prepended by ContentIdImage.
HEAD_CID_I byte = '\x12'
HEAD_CID_I_PCF byte = '\x13'
// HEAD_CID_A* and HEAD_CID_V* are not used by any function visible here
// (presumably audio and video content IDs — TODO confirm). Unlike their
// neighbours, these and HEAD_CID_M* are declared without an explicit type and
// are therefore untyped rune constants.
HEAD_CID_A = '\x14'
HEAD_CID_A_PCF = '\x15'
HEAD_CID_V = '\x16'
HEAD_CID_V_PCF = '\x17'
// HEAD_CID_M / HEAD_CID_M_PCF are prepended by ContentIdMixed.
HEAD_CID_M = '\x18'
HEAD_CID_M_PCF = '\x19'
// HEAD_DID is prepended by DataId.
HEAD_DID byte = '\x20'
// HEAD_IID is prepended by InstanceId.
HEAD_IID byte = '\x30'
)
// MetaId derives the Meta-ID component from a title and optional extra text.
//
// Only version 1 is accepted. Both inputs are pre-normalized and trimmed,
// joined with a single space, normalized, cut into letter n-grams of width
// WINDOW_SIZE_MID, hashed per n-gram with xxhash64 and folded by the similarity
// hash. The result is prefixed with HEAD_MID and encoded with base58.
//
// It returns the encoded Meta-ID together with the trimmed title and extra
// strings. When a step after trimming fails, the trimmed strings are still
// returned alongside the error and the Meta-ID is empty.
func MetaId(title, extra string, version int) (metaId, processedTitle, processedExtra string, err error) {
	if version != 1 {
		return "", "", "", errors.New("Only version 1 is supported")
	}

	// Pre-normalize and trim each input on its own; these are handed back to the caller.
	processedTitle = textTrim(textPreNormalize(title))
	processedExtra = textTrim(textPreNormalize(extra))

	// Join, strip the outer whitespace left by an empty part, then normalize.
	joined := strings.TrimSpace(processedTitle + "\u0020" + processedExtra)
	grams, err := createNGramWindowsLetterWise(textNormalize(joined), WINDOW_SIZE_MID)
	if err != nil {
		return "", processedTitle, processedExtra, err
	}

	// One 64-bit xxhash digest per n-gram; a single hasher is reused across grams.
	hasher := xxhash.New64()
	digests := make([][]byte, 0, len(grams))
	for _, gram := range grams {
		hasher.Reset()
		hasher.Write(gram)
		digests = append(digests, hasher.Sum(nil))
	}

	simhash, err := hashes.SimilarityHash(digests)
	if err != nil {
		return "", processedTitle, processedExtra, err
	}

	// Header byte first, similarity hash after it, then base58.
	body := append([]byte{HEAD_MID}, simhash...)
	metaId, err = base58.Encode(body)
	return metaId, processedTitle, processedExtra, err
}
// ContentIdText derives the text Content-ID for text.
//
// The text is pre-normalized and normalized, split on single spaces, and cut
// into word shingles of width WINDOW_SIZE_CID_T. Each shingle is hashed with
// xxhash32, the features are reduced by the minimum hash, the least significant
// bits are packed into two 64-bit digests, and those are folded by the
// similarity hash. The result is prefixed with HEAD_CID_T (or HEAD_CID_T_PCF
// when partial is true) and base58 encoded.
//
// An error from shingling, the similarity hash or the encoder is returned with
// an empty string.
func ContentIdText(text string, partial bool) (string, error) {
	text = textNormalize(textPreNormalize(text))
	words := strings.Split(text, " ")

	wordNGrams, err := createNGramWindowsWordWise(words, WINDOW_SIZE_CID_T)
	if err != nil {
		return "", err
	}

	// One 32-bit feature per shingle; the shingle is its words joined by a space.
	features := make([]uint32, len(wordNGrams))
	for i, shingle := range wordNGrams {
		features[i] = xxhash.Checksum32([]byte(strings.Join(shingle, "\u0020")))
	}

	lsb := getLSBDigests(hashes.MinHash(features))

	// The similarity hash can fail; do not encode a digest that came with an error.
	simhashDigest, err := hashes.SimilarityHash(lsb)
	if err != nil {
		return "", err
	}

	header := HEAD_CID_T
	if partial {
		header = HEAD_CID_T_PCF
	}
	return base58.Encode(append([]byte{header}, simhashDigest...))
}
// ContentIdImage derives the image Content-ID for img.
//
// The image is normalized by imageNormalize, hashed with hashes.ImageHash, and
// the 64-bit hash is written big-endian after a one-byte header: HEAD_CID_I, or
// HEAD_CID_I_PCF when partial is true. The nine bytes are base58 encoded.
//
// A normalization or encoding error is returned with an empty string.
func ContentIdImage(img image.Image, partial bool) (contentId string, err error) {
	grayImage, err := imageNormalize(img)
	if err != nil {
		// grayImage must not be dereferenced when normalization failed.
		return "", err
	}

	hashDigest := hashes.ImageHash(*grayImage)

	// Byte 0 is the component header, bytes 1..8 the big-endian image hash.
	digest := make([]byte, 9)
	digest[0] = HEAD_CID_I
	if partial {
		digest[0] = HEAD_CID_I_PCF
	}
	binary.BigEndian.PutUint64(digest[1:], hashDigest)

	return base58.Encode(digest)
}
// ContentIdImageFromFile decodes an image from reader with image.Decode and
// returns its Content-ID as computed by ContentIdImage. A decode failure is
// returned with an empty string.
func ContentIdImageFromFile(reader io.Reader, partial bool) (contentId string, err error) {
	decoded, _, decodeErr := image.Decode(reader)
	if decodeErr != nil {
		return "", decodeErr
	}
	contentId, err = ContentIdImage(decoded, partial)
	return contentId, err
}
// ContentIdMixed combines several encoded Content-IDs into one mixed Content-ID.
//
// Each entry of cids is base58 decoded and cut to its first 8 bytes. The
// truncated digests are folded by the similarity hash, prefixed with HEAD_CID_M
// (or HEAD_CID_M_PCF when partial is true) and base58 encoded.
//
// An error is returned when an entry cannot be decoded, when it decodes to
// fewer than 8 bytes, or when the similarity hash or the encoder fails.
func ContentIdMixed(cids []string, partial bool) (string, error) {
	truncated := make([][]byte, len(cids))
	for i, cid := range cids {
		raw, err := base58.Decode(cid)
		if err != nil {
			return "", err
		}
		// Slicing past the length would panic or expose stale capacity, so
		// reject short input explicitly.
		if len(raw) < 8 {
			return "", errors.New("decoded content id is shorter than 8 bytes")
		}
		truncated[i] = raw[:8]
	}

	simhashDigest, err := hashes.SimilarityHash(truncated)
	if err != nil {
		return "", err
	}

	var header byte = HEAD_CID_M
	if partial {
		header = HEAD_CID_M_PCF
	}
	return base58.Encode(append([]byte{header}, simhashDigest...))
}
func DataId(r io.Reader) (string, error) {
// 1 & 2. xxHash32 over CDC
features := cdc.GetHashedCDC(r)
// 3. Apply minimum hash
mhash := hashes.MinHash(features)
// 4. & 5. Collect lsb and create 64-bit digests
lsb := getLSBDigests(mhash)
// 6. Apply simhash
simHash, err := hashes.SimilarityHash(lsb)
if err != nil {
return "", err
}
// 7. Prepend 1-byte header
data_id_digest := append([]byte{HEAD_DID}, simHash...)
// 8. encode and return
return base58.Encode(data_id_digest)
}
// InstanceId computes the Instance-ID of the data read from r.
//
// The stream is cut into chunks of 64000 bytes (only the last one may be
// shorter). Each chunk is prefixed with a 0x00 byte and hashed with double
// SHA-256; the resulting leaf digests are reduced to one digest by topHash.
//
// It returns the base58 encoding of HEAD_IID followed by the first 8 bytes of
// that digest, and the full digest as a hex string.
//
// The signature has no error result, so a read error ends the input exactly
// like end-of-file: bytes delivered before the error are still hashed. When r
// yields no data at all there is nothing to hash and both results are empty
// strings.
func InstanceId(r io.Reader) (code string, hex_hash string) {
	const chunkSize = 64000

	// buffer[0] stays 0x00 (the leaf marker); the chunk is read in after it so
	// marker and data can be hashed together without copying each chunk.
	buffer := make([]byte, 1+chunkSize)
	var leafNodeDigests [][32]byte
	for {
		// A single Read may return fewer bytes than requested; io.ReadFull keeps
		// reading so that chunk boundaries depend only on the data, not on how
		// the reader happens to deliver it.
		n, err := io.ReadFull(r, buffer[1:])
		if n > 0 {
			leafNodeDigests = append(leafNodeDigests, doubleSha256(buffer[:1+n]))
		}
		if err != nil {
			break
		}
	}
	if len(leafNodeDigests) == 0 {
		return "", ""
	}

	topHashDigest := topHash(leafNodeDigests)

	// Header byte followed by the first 8 bytes of the top hash.
	instanceIdDigest := append([]byte{HEAD_IID}, topHashDigest[:8]...)

	// The encoder's error cannot be reported through this signature; on failure
	// code is whatever the encoder returned.
	code, _ = base58.Encode(instanceIdDigest)
	hex_hash = hex.EncodeToString(topHashDigest[:])
	return code, hex_hash
}
// createNGramWindowsLetterWise slides a window of width runes over text and
// returns the UTF-8 bytes of every window position, in order.
//
// A width below 2 is rejected with an error. When text has fewer than width
// runes, the whole text is returned as the only window.
func createNGramWindowsLetterWise(text string, width int) ([][]byte, error) {
	if width < 2 {
		return nil, errors.New("Sliding window width must be 2 or bigger")
	}

	// Work on runes so that a window never splits a multi-byte character.
	runes := []rune(text)
	if len(runes) < width {
		return [][]byte{[]byte(text)}, nil
	}

	lastStart := len(runes) - width
	grams := make([][]byte, 0, lastStart+1)
	for start := 0; start <= lastStart; start++ {
		grams = append(grams, []byte(string(runes[start:start+width])))
	}
	return grams, nil
}
// TODO: introduce a shared abstraction so the letter-wise and word-wise n-gram builders can be combined.
// createNGramWindowsWordWise slides a window of width words over words and
// returns every window position, in order. Each window is a sub-slice of
// words, not a copy.
//
// A width below 2 is rejected with an error. When there are fewer than width
// words, the input slice itself is returned as the only window.
func createNGramWindowsWordWise(words []string, width int) ([][]string, error) {
	if width < 2 {
		return nil, errors.New("Sliding window width must be 2 or bigger")
	}
	if len(words) < width {
		return [][]string{words}, nil
	}

	lastStart := len(words) - width
	shingles := make([][]string, 0, lastStart+1)
	for start := 0; start <= lastStart; start++ {
		shingles = append(shingles, words[start:start+width])
	}
	return shingles, nil
}
// doubleSha256 returns SHA-256 applied twice: sha256(sha256(data)).
func doubleSha256(data []byte) (res [32]byte) {
	res = sha256.Sum256(data)
	return sha256.Sum256(res[:])
}

// topHash reduces a list of digests to a single digest by hashing neighbours
// pairwise with hashInnerNodes, level by level, until one digest is left.
//
// A level with an odd number of digests pairs its last digest with itself. A
// single digest is returned unchanged. An empty list has nothing to combine and
// yields the all-zero digest.
func topHash(hashes [][32]byte) [32]byte {
	size := len(hashes)
	switch size {
	case 0:
		// Without this guard the recursion below would never terminate.
		return [32]byte{}
	case 1:
		return hashes[0]
	}

	pairwiseHashed := make([][32]byte, 0, (size+1)/2)
	// Only complete pairs are combined here, so i+1 is always in range.
	for i := 0; i+1 < size; i += 2 {
		pairwiseHashed = append(pairwiseHashed, hashInnerNodes(hashes[i], hashes[i+1]))
	}
	// A trailing unpaired digest is combined with itself.
	if size%2 == 1 {
		pairwiseHashed = append(pairwiseHashed, hashInnerNodes(hashes[size-1], hashes[size-1]))
	}
	return topHash(pairwiseHashed)
}

// hashInnerNodes returns the double SHA-256 of a 0x01 byte followed by h1 and h2.
func hashInnerNodes(h1, h2 [32]byte) [32]byte {
	concat := make([]byte, 0, 1+len(h1)+len(h2))
	concat = append(concat, '\x01')
	concat = append(concat, h1[:]...)
	concat = append(concat, h2[:]...)
	return doubleSha256(concat)
}
// getLSBDigests packs the least significant bit of each of the 128 values in
// mhash into two 8-byte big-endian digests: values 0..63 fill the first digest
// and values 64..127 the second. Within each half, the first value lands in the
// most significant bit.
func getLSBDigests(mhash [128]uint32) [][]byte {
	const bitsPerDigest = 64

	digests := make([][]byte, 2)
	for half := range digests {
		var packed uint64
		values := mhash[half*bitsPerDigest : (half+1)*bitsPerDigest]
		for pos, value := range values {
			if value&1 != 0 {
				packed |= uint64(1) << uint(bitsPerDigest-1-pos)
			}
		}
		digests[half] = make([]byte, 8)
		binary.BigEndian.PutUint64(digests[half], packed)
	}
	return digests
}