Skip to content

Commit

Permalink
RFC: add optional persistence to the lookup table
Browse files Browse the repository at this point in the history
Signed-off-by: Uri Okrent <[email protected]>
  • Loading branch information
ugtar committed Oct 16, 2018
1 parent 914bda4 commit 40f74d3
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 0 deletions.
10 changes: 10 additions & 0 deletions scrubadub/filth/base.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import pickle

from .. import exceptions
from .. import utils

Expand Down Expand Up @@ -36,6 +38,14 @@ def identifier(self):
i = self.lookup[(self.type, self.text.lower())]
return u'%s-%d' % (self.placeholder, i)

@classmethod
def switch_to_persistent_lookup(cls, path):
    """Replace the shared ``lookup`` table with a persistent one.

    If ``path`` can be opened, the object previously pickled there is
    loaded and installed as ``cls.lookup``. Otherwise a fresh
    ``utils.PersistentLookup`` bound to ``path`` is installed instead.

    :param path: location of the pickled lookup table.
    """
    try:
        with open(path, 'rb') as f:
            # SECURITY: unpickling can execute arbitrary code, so ``path``
            # must only ever point at a file this application wrote.
            lookup = pickle.load(f)
    except (IOError, OSError):
        # ``open`` raises IOError on Python 2, where it is not a subclass
        # of OSError; on Python 3 both names refer to the same class.
        lookup = utils.PersistentLookup(path)
    else:
        # The pickled object remembers the path it was created with.
        # Re-point it at the file it was just read from so that a later
        # ``save()`` writes back to the same place.
        lookup.path = path
    cls.lookup = lookup

def replace_with(self, replace_with='placeholder', **kwargs):
if replace_with == 'placeholder':
return self.prefix + self.placeholder + self.suffix
Expand Down
6 changes: 6 additions & 0 deletions scrubadub/scrubbers.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,3 +94,9 @@ def iter_filth(self, text):
else:
filth = filth.merge(next_filth)
yield filth

def persist_identifiers(self, path):
    """Back ``Filth`` identifiers with the lookup table stored at ``path``.

    This delegates to ``Filth.switch_to_persistent_lookup``, which
    replaces the lookup table on the ``Filth`` class itself, so the
    change is not limited to this scrubber instance.

    :param path: location of the pickled lookup table.
    """
    switch = Filth.switch_to_persistent_lookup
    switch(path)

def save_identifiers(self):
    """Write the current ``Filth`` lookup table out via its ``save()``.

    The table used is whatever is currently installed as
    ``Filth.lookup``; this method simply calls ``save()`` on it.
    """
    lookup = Filth.lookup
    lookup.save()
30 changes: 30 additions & 0 deletions scrubadub/utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
import pickle
import uuid
import hashlib

try:
unicode
except NameError:
Expand Down Expand Up @@ -58,3 +62,29 @@ def __getitem__(self, key):
except KeyError:
self.table[key] = len(self.table)
return self.table[key]


class PersistentLookup(Lookup):
    """Lookup table for ``Filth`` identifiers that can be saved to disk.

    Persisting the table keeps identifiers consistent across runs.
    Because the table is written to a file, the text part of every key is
    replaced by a salted hash before it is stored, which makes this class
    slower than a plain ``Lookup``.

    The salt is an attribute of the instance and is therefore pickled
    together with the table by ``save()``; this is what lets a reloaded
    table produce the same hashes again.
    """

    def __init__(self, path):
        """Create an empty table that ``save()`` will write to ``path``.

        :param path: file the pickled table is written to.
        """
        self.path = path
        # Random per-table salt mixed into every hashed key.
        self.salt = uuid.uuid4().bytes
        # Explicit two-argument form keeps this working on Python 2, which
        # the module otherwise supports; ``__init__`` must not return a
        # value, so the call is not returned.
        super(PersistentLookup, self).__init__()

    def obfuscate_key(self, key):
        """Return ``key`` with its text component replaced by a salted hash.

        :param key: a pair whose second item is the text to hide; the
            first item is kept unchanged.
        :returns: ``(key[0], <hex SHA-1 of UTF-8 text + salt>)``.
        """
        digest = hashlib.sha1(key[1].encode('utf-8') + self.salt).hexdigest()
        return (key[0], digest)

    def __getitem__(self, key):
        """Look up ``key`` in the parent table under its obfuscated form."""
        hidden = self.obfuscate_key(key)
        return super(PersistentLookup, self).__getitem__(hidden)

    def save(self):
        """Pickle this whole object (table, salt and path) to ``self.path``."""
        with open(self.path, 'wb') as f:
            pickle.dump(self, f)

0 comments on commit 40f74d3

Please sign in to comment.