#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""diskover - Elasticsearch file system crawler
diskover is a file system crawler that indexes
your file metadata into Elasticsearch.
See README.md or https://github.com/shirosaidev/diskover
for more information.

Amazon S3 inventory module for diskover

Copyright (C) Chris Park 2017-2018
diskover is released under the Apache 2.0 license. See
LICENSE for the full license text.
"""
import os
import gzip
import csv
from datetime import datetime
import time
import hashlib
try:
    from Queue import Queue as pyQueue
except ImportError:
    from queue import Queue as pyQueue
from threading import Thread, RLock
from diskover import config, plugins, progress_bar
from diskover_bot_module import get_worker_name, auto_tag, es_bulk_add, file_excluded
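
# fake directory and bucket root paths already created, shared across the
# csv reader threads (updates are protected by s3threadlock)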
fake_dirs = []
buckets = []
workername = get_worker_name()
# queue of s3 inventory files to import and lock shared by the reader threads
s3queue = pyQueue()
s3threadlock = RLock()


def process_line(row, tree_dirs, tree_files, cliargs):
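    """Process a single row from an S3 inventory csv.
    Creates fake directory docs for the path components of the s3 key,
    builds a file or directory doc for the key itself, appends them to
    tree_dirs / tree_files and returns both lists.
    """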
    global fake_dirs

    n = 2
    # S3 Inventory csv column headers
    inventory_dict = {'s3_bucket': row[0], 's3_key': row[1]}
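    # the remaining columns are optional and depend on which fields were
    # enabled when the S3 inventory was configured, so each lookup is
    # guarded against IndexError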
    try:
        inventory_dict['s3_size'] = int(row[n])
        n = n + 1
    except IndexError:
        pass
    try:
        inventory_dict['s3_last_modified_date'] = row[n]
        n = n + 1
    except IndexError:
        pass
    try:
        inventory_dict['s3_etag'] = row[n]
        n = n + 1
    except IndexError:
        pass
    try:
        inventory_dict['s3_storage_class'] = row[n]
        n = n + 1
    except IndexError:
        pass
    try:
        inventory_dict['s3_multipart_upload'] = row[n]
        n = n + 1
    except IndexError:
        pass
    try:
        inventory_dict['s3_replication_status'] = row[n]
        n = n + 1
    except IndexError:
        pass
    try:
        inventory_dict['s3_encryption_status'] = row[n]
    except IndexError:
        pass
    # prepare inventory dict for diskover index

    # fake path /s3/bucketname/key
    bucket = '/s3/' + row[0] + '/'
    path = os.path.join(bucket, inventory_dict['s3_key'])

    # check if directory
    if path.endswith('/'):
        isdir = True
        path = path.rstrip('/')
        s3threadlock.acquire()
        fake_dirs.append(path)
        s3threadlock.release()
    else:
        isdir = False

    # add any directories in path to fake dirs
    splitpath = inventory_dict['s3_key'].split('/')
    # remove file name
    splitpath = splitpath[:-1]
    prev_path = bucket.rstrip('/')
    for p in splitpath:
        # create fake directory entry
        s3threadlock.acquire()
        dir_dict = make_fake_s3_dir(prev_path, p, cliargs)
        s3threadlock.release()
        current_path = os.path.join(prev_path, p)
        if dir_dict is None:
            prev_path = current_path
            continue
        tree_dirs.append(dir_dict)
        # increment items counts of parentdir
        for d in tree_dirs:
            if d['filename'] == os.path.basename(dir_dict['path_parent']):
                d['items_subdirs'] += 1
                d['items'] += 1
                break
        prev_path = current_path
    size = inventory_dict['s3_size']
    # filename
    filename = os.path.basename(path)
    # check if file is in excluded_files list
    extension = os.path.splitext(filename)[1][1:].strip().lower()
    if file_excluded(filename, extension, path, cliargs['verbose']):
        return tree_dirs, tree_files
    # Skip files smaller than minsize cli flag
    if not isdir and size < cliargs['minsize']:
        return tree_dirs, tree_files

    # modified time
    mtime_utc = inventory_dict['s3_last_modified_date'].partition('.')[0]
    # modified time in unix
    mtime_unix = time.mktime(time.strptime(mtime_utc, '%Y-%m-%dT%H:%M:%S'))
    # get time
    indextime_utc = datetime.utcnow().isoformat()
    # get absolute path of parent directory
    parentdir = os.path.abspath(os.path.join(path, os.pardir))
    # absolute full path
    fullpath = os.path.abspath(os.path.join(parentdir, filename))

    # remove any keys (fields) we don't want to add to ES
    inventory_dict.pop('s3_size', None)
    inventory_dict.pop('s3_last_modified_date', None)
    if isdir:  # directory
        inventory_dict['filename'] = filename
        inventory_dict['path_parent'] = parentdir
        inventory_dict["filesize"] = 0
        inventory_dict["items"] = 1  # 1 for itself
        inventory_dict["items_files"] = 0
        inventory_dict["items_subdirs"] = 0
        inventory_dict["last_modified"] = mtime_utc
        inventory_dict["tag"] = ""
        inventory_dict["tag_custom"] = ""
        inventory_dict["indexing_date"] = indextime_utc
        inventory_dict["worker_name"] = workername
        inventory_dict["change_percent_filesize"] = ""
        inventory_dict["change_percent_items"] = ""
        inventory_dict["change_percent_items_files"] = ""
        inventory_dict["change_percent_items_subdirs"] = ""
        inventory_dict["_type"] = "directory"
        # increment items counts of parentdir
        for d in tree_dirs:
            if d['filename'] == os.path.basename(parentdir):
                d['items_subdirs'] += 1
                d['items'] += 1
                break
        # add any autotags to inventory_dict
        if cliargs['autotag'] and len(config['autotag_dirs']) > 0:
            auto_tag(inventory_dict, 'directory', mtime_unix, None, None)
        # check plugins for adding extra meta data to inventory_dict
        for plugin in plugins:
            try:
                # check if plugin is for directory doc
                mappings = {'mappings': {'directory': {'properties': {}}}}
                plugin.add_mappings(mappings)
                inventory_dict.update(plugin.add_meta(fullpath))
            except KeyError:
                pass
        tree_dirs.append(inventory_dict)
    else:  # file
        # Convert time in days (mtime cli arg) to seconds
        time_sec = cliargs['mtime'] * 86400
        file_mtime_sec = time.time() - mtime_unix
        # Only process files modified at least x days ago
        if file_mtime_sec < time_sec:
            return tree_dirs, tree_files
        # create md5 hash of file using metadata filesize and mtime
        filestring = str(size) + str(mtime_unix)
        filehash = hashlib.md5(filestring.encode('utf-8')).hexdigest()
        inventory_dict['filename'] = filename
        inventory_dict['path_parent'] = parentdir
        inventory_dict["extension"] = extension
        inventory_dict["filesize"] = size
        inventory_dict["last_modified"] = mtime_utc
        inventory_dict["filehash"] = filehash
        inventory_dict["tag"] = ""
        inventory_dict["tag_custom"] = ""
        inventory_dict["dupe_md5"] = ""
        inventory_dict["indexing_date"] = indextime_utc
        inventory_dict["worker_name"] = workername
        inventory_dict["_type"] = "file"
        # add file size and increment items counts to parentdir
        for d in tree_dirs:
            if d['filename'] == os.path.basename(parentdir):
                d['filesize'] += size
                d['items_files'] += 1
                d['items'] += 1
                break
        # check plugins for adding extra meta data to inventory_dict
        for plugin in plugins:
            try:
                # check if plugin is for file doc
                mappings = {'mappings': {'file': {'properties': {}}}}
                plugin.add_mappings(mappings)
                inventory_dict.update(plugin.add_meta(fullpath))
            except KeyError:
                pass
        # add any autotags to inventory_dict
        if cliargs['autotag'] and len(config['autotag_files']) > 0:
            auto_tag(inventory_dict, 'file', mtime_unix, None, None)
        tree_files.append(inventory_dict)

    return tree_dirs, tree_files


def process_s3_inventory(inventory_file, cliargs):
"""Process s3 inventory function.
Takes an S3 inventory file (gzipped csv), processes and bulk adds it
into diskover index.
"""
    global buckets

    tree_dirs = []
    tree_files = []

    with gzip.open(inventory_file, mode='rt') as f:
        reader = csv.reader(f, delimiter=',', quotechar='"')
        l = 1
        for row in reader:
            # get bucket name from first line of inventory file
            if l == 1:
                # add fake root /s3 directory entry to list
                if "/s3" not in buckets:
                    s3threadlock.acquire()
                    buckets.append("/s3")
                    # create fake root /s3 directory entry
                    root_dict = make_fake_s3_dir('/', 's3', cliargs)
                    s3threadlock.release()
                    # check if fake root dir already created
                    if root_dict:
                        tree_dirs.append(root_dict)
                # add fake root /s3/bucketname directory entry for s3 bucket to list
                bucket_path = os.path.abspath(os.path.join('/s3', row[0]))
                if bucket_path not in buckets:
                    s3threadlock.acquire()
                    buckets.append(bucket_path)
                    # create fake root /s3/bucketname directory entry for s3 bucket
                    root_dict = make_fake_s3_dir('/s3', row[0], cliargs)
                    s3threadlock.release()
                    # check if bucket fake dir already created
                    if root_dict:
                        tree_dirs.append(root_dict)
            tree_dirs, tree_files = process_line(row, tree_dirs, tree_files, cliargs)
            l += 1

    if len(tree_dirs) + len(tree_files) > 0:
        es_bulk_add(workername, tree_dirs, tree_files, cliargs, 0)


def make_fake_s3_dir(parent, file, cliargs):
"""Make fake s3 directory function.
Creates a fake directory doc for es.
Returns dictionary for directory doc.
"""
    global fake_dirs

    fullpath = os.path.abspath(os.path.join(parent, file))
    if fullpath in fake_dirs:
        return None
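
    # fake directories have no real modification time in the s3 inventory,
    # so stamp them with the epoch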
mtime_utc = "1970-01-01T00:00:00"
mtime_unix = time.mktime(time.strptime(mtime_utc, '%Y-%m-%dT%H:%M:%S'))
dir_dict = {}
dir_dict['filename'] = file
dir_dict['path_parent'] = parent
dir_dict["filesize"] = 0
dir_dict["items"] = 1 # 1 for itself
dir_dict["items_files"] = 0
dir_dict["items_subdirs"] = 0
dir_dict["last_modified"] = mtime_utc
dir_dict["tag"] = ""
dir_dict["tag_custom"] = ""
dir_dict["indexing_date"] = datetime.utcnow().isoformat()
dir_dict["worker_name"] = workername
dir_dict["change_percent_filesize"] = ""
dir_dict["change_percent_items"] = ""
dir_dict["change_percent_items_files"] = ""
dir_dict["change_percent_items_subdirs"] = ""
dir_dict["_type"] = "directory"
# add any autotags to inventory_dict
if cliargs['autotag'] and len(config['autotag_dirs']) > 0:
auto_tag(dir_dict, 'directory', mtime_unix, None, None)
# check plugins for adding extra meta data to dirmeta_dict
for plugin in plugins:
try:
# check if plugin is for directory doc
mappings = {'mappings': {'directory': {'properties': {}}}}
plugin.add_mappings(mappings)
dir_dict.update(plugin.add_meta(fullpath))
except KeyError:
pass
# store in fake_dirs
s3threadlock.acquire()
fake_dirs.append(fullpath)
s3threadlock.release()
return dir_dict
def get_s3_mappings(config):
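    """Return the Elasticsearch index settings and mappings (directory and
    file doc types) used for the s3 inventory index.
    """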
    mappings = {
        "settings": {
            "index": {
                "number_of_shards": config['index_shards'],
                "number_of_replicas": config['index_replicas']
            }
        },
        "mappings": {
            "directory": {
                "properties": {
                    "s3_bucket": {"type": "keyword"},
                    "s3_key": {"type": "keyword"},
                    "s3_etag": {"type": "keyword"},
                    "s3_storage_class": {"type": "keyword"},
                    "s3_multipart_upload": {"type": "boolean"},
                    "s3_replication_status": {"type": "keyword"},
                    "s3_encryption_status": {"type": "keyword"},
                    "filename": {"type": "keyword"},
                    "path_parent": {"type": "keyword"},
                    "filesize": {"type": "long"},
                    "items": {"type": "long"},
                    "items_files": {"type": "long"},
                    "items_subdirs": {"type": "long"},
                    "last_modified": {"type": "date"},
                    "tag": {"type": "keyword"},
                    "tag_custom": {"type": "keyword"},
                    "indexing_date": {"type": "date"},
                    "worker_name": {"type": "keyword"},
                    "change_percent_filesize": {"type": "float"},
                    "change_percent_items": {"type": "float"},
                    "change_percent_items_files": {"type": "float"},
                    "change_percent_items_subdirs": {"type": "float"}
                }
            },
            "file": {
                "properties": {
                    "s3_bucket": {"type": "keyword"},
                    "s3_key": {"type": "keyword"},
                    "s3_etag": {"type": "keyword"},
                    "s3_storage_class": {"type": "keyword"},
                    "s3_multipart_upload": {"type": "boolean"},
                    "s3_replication_status": {"type": "keyword"},
                    "s3_encryption_status": {"type": "keyword"},
                    "filename": {"type": "keyword"},
                    "extension": {"type": "keyword"},
                    "path_parent": {"type": "keyword"},
                    "filesize": {"type": "long"},
                    "last_modified": {"type": "date"},
                    "filehash": {"type": "keyword"},
                    "tag": {"type": "keyword"},
                    "tag_custom": {"type": "keyword"},
                    "dupe_md5": {"type": "keyword"},
                    "indexing_date": {"type": "date"},
                    "worker_name": {"type": "keyword"}
                }
            }
        }
    }
    return mappings


def csv_file_reader(q):
"""s3 inventory file reader thread function.
"""
    while True:
        item = q.get()
        inventory_file, cliargs = item
        process_s3_inventory(inventory_file, cliargs)
        q.task_done()


def start_importing(es, cliargs, logger):
"""Start importing s3 inventory file function.
"""
    for i in range(4):
        thread = Thread(target=csv_file_reader, args=(s3queue,))
        thread.daemon = True
        thread.start()

    # start importing S3 inventory file(s)
    inventory_files = cliargs['s3']
    logger.info('Importing %s S3 inventory file(s)...' % len(inventory_files))

    # add fake disk space to index with path set to /s3
    data = {
        "path": '/s3',
        "total": 0,
        "used": 0,
        "free": 0,
        "available": 0,
        "indexing_date": datetime.utcnow().isoformat()
    }
    es.index(index=cliargs['index'], doc_type='diskspace', body=data)

    # add all s3 inventory files to queue
    for file in inventory_files:
        s3queue.put((file, cliargs))

    # set up progress bar
    bar = progress_bar('Importing')
    bar.start()

    if not cliargs['quiet'] and not cliargs['debug'] and not cliargs['verbose']:
        i = 1
        while s3queue.qsize() > 0:
            try:
                percent = int("{0:.0f}".format(100 * ((len(inventory_files) - s3queue.qsize())
                                                      / float(len(inventory_files)))))
                bar.update(percent)
            except ZeroDivisionError:
                bar.update(0)
            except ValueError:
                bar.update(0)
            time.sleep(.5)
            i += 1
        bar.finish()

    # wait for queue to be empty
    s3queue.join()