forked from markfasheh/duperemove
-
Notifications
You must be signed in to change notification settings - Fork 0
/
hash-tree.h
120 lines (102 loc) · 3.7 KB
/
hash-tree.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
/*
* hash-tree.h
*
* Copyright (C) 2016 SUSE. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License version 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#ifndef __HASH_TREE__
#define __HASH_TREE__
extern unsigned int blocksize;
struct hash_tree {
struct rb_root root;
struct list_head size_list; /* This is for sorting by dl_num_elem */
uint64_t num_blocks;
uint64_t num_hashes;
};
struct dupe_blocks_list {
struct rb_node dl_node; /* sorted by hash */
struct list_head dl_size_list; /* sorted by dl_num_elem */
unsigned long long dl_num_elem;
struct list_head dl_list;
unsigned int dl_num_files;
struct rb_root dl_files_root; /* stores file_hash_head nodes */
unsigned char dl_hash[DIGEST_LEN_MAX];
};
/* Fiemap flags that would cause us to skip comparison of the block */
#define FIEMAP_SKIP_FLAGS (FIEMAP_EXTENT_DATA_INLINE|FIEMAP_EXTENT_UNWRITTEN)
#define FILE_BLOCK_SKIP_COMPARE 0x0001
#define FILE_BLOCK_DEDUPED 0x0002
#define FILE_BLOCK_HOLE 0x0004
#define FILE_BLOCK_PARTIAL 0x0008
struct file_block {
struct dupe_blocks_list *b_parent;
struct filerec *b_file;
uint64_t b_loff;
unsigned int b_flags;
/*
* All blocks with this md5. Can be on dupe_blocks_list->dl_list, or
* block_dedupe_list->bd_block_list (see run_dedupe.c).
*/
struct list_head b_list;
struct rb_node b_file_next; /* filerec->block_tree */
struct list_head b_head_list; /* file_hash_head->h_blocks */
};
static inline unsigned long block_len(struct file_block *block)
{
/*
* Avoid storing the length of each block and instead use a
* flag for partial blocks.
*
* NOTE: This only works if we assume that partial blocks are
* at the end of a file
*/
if (block->b_flags & FILE_BLOCK_PARTIAL)
return block->b_file->size % blocksize;
return blocksize;
}
int insert_hashed_block(struct hash_tree *tree, unsigned char *digest,
struct filerec *file, uint64_t loff, unsigned int flags);
int remove_hashed_block(struct hash_tree *tree, struct file_block *block);
/*
* We must call sort_file_hash_heads after inserting blocks into our
* hash_tree. The scan in find_dupes requires them to be in order of
* increasing offset.
*
* NOTE: Using an rbtree instead of doing the list sort winds up in a
* large performance loss. The item walk in lookup_walk_file_hash_head()
* is highly cpu bound so the extra instructions to do a linear tree walk
* really shows up during benchmarking.
*/
void sort_file_hash_heads(struct hash_tree *tree);
void sort_hashes_by_size(struct hash_tree *tree);
/*
* Stores a list of blocks with the same hash / filerec
* combination. Each dupe_blocks_list keeps a tree of these (sorted by
* file).
*
* This speeds up the extent search by allowing us to skip blocks that
* don't belong to the file we are 'walking'. Blocks are inserted into
* h_blocks in the same order they are given to insert_hashed_block()
* (that is, in order of increasing offset).
*/
struct file_hash_head {
struct filerec *h_file;
struct rb_node h_node;
struct list_head h_blocks;
};
int file_in_dups_list(struct dupe_blocks_list *dups, struct filerec *file);
struct file_hash_head *find_file_hash_head(struct dupe_blocks_list *dups,
struct filerec *file);
void init_hash_tree(struct hash_tree *tree);
void free_hash_tree(struct hash_tree *tree);
void debug_print_block(struct file_block *e);
void debug_print_hash_tree(struct hash_tree *tree);
#endif /* __HASH_TREE__ */