diff --git a/src/olympia/blocklist/cron.py b/src/olympia/blocklist/cron.py index 58cd878d08df..0594cbe6379c 100644 --- a/src/olympia/blocklist/cron.py +++ b/src/olympia/blocklist/cron.py @@ -1,4 +1,5 @@ from datetime import datetime +from typing import List import waffle from django_statsd.clients import statsd @@ -13,7 +14,7 @@ from .mlbf import MLBF from .models import Block, BlocklistSubmission, BlockType -from .tasks import cleanup_old_files, process_blocklistsubmission, upload_filter +from .tasks import process_blocklistsubmission, upload_filter from .utils import datetime_to_ts @@ -28,8 +29,8 @@ def get_last_generation_time(): return get_config(MLBF_TIME_CONFIG_KEY, None, json_value=True) -def get_base_generation_time(): - return get_config(MLBF_BASE_ID_CONFIG_KEY, None, json_value=True) +def get_base_generation_time(block_type: BlockType): + return get_config(MLBF_BASE_ID_CONFIG_KEY(block_type), None, json_value=True) def get_blocklist_last_modified_time(): @@ -64,76 +65,78 @@ def _upload_mlbf_to_remote_settings(*, force_base=False): # An add-on version/file from after this time can't be reliably asserted - # there may be false positives or false negatives. # https://github.com/mozilla/addons-server/issues/13695 - generation_time = get_generation_time() - # This timestamp represents the last time the MLBF was generated and uploaded. - # It could have been a base filter or a stash. - last_generation_time = get_last_generation_time() - # This timestamp represents the point in time when - # the base filter was generated and uploaded. - base_generation_time = get_base_generation_time() - - mlbf = MLBF.generate_from_db(generation_time) - - base_filter = ( - MLBF.load_from_storage(base_generation_time) - if base_generation_time is not None - else None + mlbf = MLBF.generate_from_db(get_generation_time()) + previous_filter = MLBF.load_from_storage( + # This timestamp represents the last time the MLBF was generated and uploaded. + # It could have been a base filter or a stash. + get_last_generation_time() ) - previous_filter = ( - # Only load previoous filter if there is a timestamp to use - # and that timestamp is not the same as the base_filter - MLBF.load_from_storage(last_generation_time) - if last_generation_time is not None - and (base_filter is None or base_filter.created_at != last_generation_time) - else base_filter - ) + base_filters_to_update: List[BlockType] = [] + update_stash = False + + # Determine which base filters need to be re uploaded + # and whether the stash needs to be updated + for block_type in BlockType: + # This prevents us from updating a stash or filter based on new soft blocks + # until we are ready to enable soft blocking. 
+ if block_type == BlockType.SOFT_BLOCKED and not waffle.switch_is_active( + 'enable-soft-blocking' + ): + continue + + base_filter = MLBF.load_from_storage(get_base_generation_time(block_type)) + + should_update_filter = ( + force_base + or base_filter is None + or mlbf.blocks_changed_since_previous(block_type, base_filter) + > BASE_REPLACE_THRESHOLD + ) - changes_count = mlbf.blocks_changed_since_previous( - BlockType.BLOCKED, previous_filter - ) - statsd.incr( - 'blocklist.cron.upload_mlbf_to_remote_settings.blocked_changed', changes_count - ) - need_update = ( - force_base - or base_filter is None - or ( - previous_filter is not None - and previous_filter.created_at < get_blocklist_last_modified_time() + should_update_stash = ( + mlbf.blocks_changed_since_previous( + block_type, previous_filter or base_filter + ) + > 0 ) - or changes_count > 0 - ) - if not need_update: + + # add this block type to the list of filters to be re-uploaded + if should_update_filter: + base_filters_to_update.append(block_type) + # only update the stash if we should AND if + # we aren't already reuploading the filter for this block type + elif should_update_stash: + update_stash = True + + skip_update = len(base_filters_to_update) == 0 and not update_stash + if skip_update: log.info('No new/modified/deleted Blocks in database; skipping MLBF generation') - return + return mlbf.delete() statsd.incr( 'blocklist.cron.upload_mlbf_to_remote_settings.blocked_count', len(mlbf.data.blocked_items), ) + statsd.incr( + 'blocklist.cron.upload_mlbf_to_remote_settings.soft_blocked_count', + len(mlbf.data.soft_blocked_items), + ) statsd.incr( 'blocklist.cron.upload_mlbf_to_remote_settings.not_blocked_count', len(mlbf.data.not_blocked_items), ) - make_base_filter = ( - force_base - or base_filter is None - or previous_filter is None - or mlbf.blocks_changed_since_previous(BlockType.BLOCKED, base_filter) - > BASE_REPLACE_THRESHOLD - ) - - if make_base_filter: - mlbf.generate_and_write_filter() - else: - mlbf.generate_and_write_stash(previous_filter) + mlbf.generate_and_write_stash(previous_filter) if update_stash else None - upload_filter.delay(generation_time, is_base=make_base_filter) + for block_type in base_filters_to_update: + mlbf.generate_and_write_filter(block_type) - if base_filter: - cleanup_old_files.delay(base_filter_id=base_filter.created_at) + upload_filter.delay( + mlbf.created_at, + filter_list=[key.name for key in base_filters_to_update], + upload_stash=update_stash, + ) def process_blocklistsubmissions(): diff --git a/src/olympia/blocklist/management/commands/export_blocklist.py b/src/olympia/blocklist/management/commands/export_blocklist.py index 1846ac92246f..85402295034e 100644 --- a/src/olympia/blocklist/management/commands/export_blocklist.py +++ b/src/olympia/blocklist/management/commands/export_blocklist.py @@ -4,6 +4,7 @@ import olympia.core.logger from olympia.blocklist.mlbf import MLBF +from olympia.blocklist.models import BlockType log = olympia.core.logger.getLogger('z.amo.blocklist') @@ -29,6 +30,11 @@ def add_arguments(self, parser): 'the database', default=None, ) + parser.add_argument( + '--block-type', + help='Block type to export', + default=None, + ) def load_json(self, json_path): with open(json_path) as json_file: @@ -38,6 +44,7 @@ def load_json(self, json_path): def handle(self, *args, **options): log.debug('Exporting blocklist to file') mlbf = MLBF.generate_from_db(options.get('id')) + block_type = BlockType[options.get('block_type')] if options.get('block_guids_input'): 
mlbf.blocked_items = list( @@ -52,4 +59,4 @@ def handle(self, *args, **options): ) ) - mlbf.generate_and_write_filter() + mlbf.generate_and_write_filter(block_type) diff --git a/src/olympia/blocklist/mlbf.py b/src/olympia/blocklist/mlbf.py index d5abb62fe14a..23fabcdacc53 100644 --- a/src/olympia/blocklist/mlbf.py +++ b/src/olympia/blocklist/mlbf.py @@ -77,7 +77,8 @@ class BaseMLBFLoader: def __init__(self, storage: SafeStorage): self.storage = storage - def data_type_key(self, key: MLBFDataType) -> str: + @classmethod + def data_type_key(cls, key: MLBFDataType) -> str: return key.name.lower() @cached_property @@ -208,31 +209,44 @@ def hash_filter_inputs(cls, input_list): for (guid, version) in input_list ] - @property - def filter_path(self): - return self.storage.path('filter') + def filter_path(self, block_type: BlockType): + return self.storage.path(f'filter-{BaseMLBFLoader.data_type_key(block_type)}') @property def stash_path(self): return self.storage.path('stash.json') - def generate_and_write_filter(self): + def delete(self): + if self.storage.exists(self.storage.base_location): + self.storage.rm_stored_dir(self.storage.base_location) + log.info(f'Deleted {self.storage.base_location}') + + def generate_and_write_filter(self, block_type: BlockType): stats = {} + blocked = self.data[block_type] + + not_blocked = [ + self.data[_block_type] + for _block_type in BlockType + if _block_type != block_type + ] + bloomfilter = generate_mlbf( stats=stats, - blocked=self.data.blocked_items, - not_blocked=self.data.not_blocked_items, + blocked=blocked, + not_blocked=not_blocked, ) # write bloomfilter - mlbf_path = self.filter_path + mlbf_path = self.filter_path(block_type) with self.storage.open(mlbf_path, 'wb') as filter_file: log.info(f'Writing to file {mlbf_path}') bloomfilter.tofile(filter_file) stats['mlbf_filesize'] = os.stat(mlbf_path).st_size log.info(json.dumps(stats)) + return bloomfilter def generate_diffs( self, previous_mlbf: 'MLBF' = None @@ -277,7 +291,7 @@ def generate_and_write_stash(self, previous_mlbf: 'MLBF' = None): 'unblocked': blocked_removed, } - if waffle.switch_is_active('mlbf-soft-blocks-enabled'): + if waffle.switch_is_active('enable-soft-blocking'): soft_blocked_added, soft_blocked_removed, _ = diffs[BlockType.SOFT_BLOCKED] stash_json['softblocked'] = soft_blocked_added stash_json['unblocked'] = [ diff --git a/src/olympia/blocklist/tasks.py b/src/olympia/blocklist/tasks.py index dbb0f85622b0..ecfafce6dd6d 100644 --- a/src/olympia/blocklist/tasks.py +++ b/src/olympia/blocklist/tasks.py @@ -2,6 +2,7 @@ import os import re from datetime import datetime, timedelta +from typing import List from django.conf import settings from django.contrib.admin.models import CHANGE, LogEntry @@ -21,10 +22,10 @@ REMOTE_SETTINGS_COLLECTION_MLBF, ) from olympia.lib.remote_settings import RemoteSettings -from olympia.zadmin.models import set_config +from olympia.zadmin.models import get_config, set_config from .mlbf import MLBF -from .models import BlocklistSubmission +from .models import BlocklistSubmission, BlockType from .utils import ( datetime_to_ts, ) @@ -35,7 +36,15 @@ bracket_open_regex = re.compile(r'(? 
0 + oldest_base_filter_id = None + if is_base: - # clear the collection for the base - we want to be the only filter - server.delete_all_records() - statsd.incr('blocklist.tasks.upload_filter.reset_collection') - # Then the bloomfilter - data = { - 'key_format': MLBF.KEY_FORMAT, - 'generation_time': generation_time, - 'attachment_type': BLOCKLIST_RECORD_MLBF_BASE, - } - with mlbf.storage.open(mlbf.filter_path, 'rb') as filter_file: - attachment = ('filter.bin', filter_file, 'application/octet-stream') - server.publish_attachment(data, attachment) - statsd.incr('blocklist.tasks.upload_filter.upload_mlbf') + for block_type in filter_list: + data = { + 'key_format': MLBF.KEY_FORMAT, + 'generation_time': generation_time, + 'attachment_type': BLOCKLIST_RECORD_MLBF_BASE(block_type), + } + with mlbf.storage.open(mlbf.filter_path(block_type), 'rb') as filter_file: + attachment = ('filter.bin', filter_file, 'application/octet-stream') + server.publish_attachment(data, attachment) + base_filter_id = get_config( + MLBF_BASE_ID_CONFIG_KEY(block_type), json_value=True + ) + if base_filter_id is not None: + if oldest_base_filter_id is None: + oldest_base_filter_id = base_filter_id + else: + oldest_base_filter_id = min(oldest_base_filter_id, base_filter_id) + + statsd.incr('blocklist.tasks.upload_filter.upload_mlbf') statsd.incr('blocklist.tasks.upload_filter.upload_mlbf.base') - else: + + if oldest_base_filter_id is not None: + for record in server.records(): + record_time = ( + record['stash_time'] + if 'stash_time' in record + else record['generation_time'] + ) + if record_time < oldest_base_filter_id: + server.delete_record(record['id']) + + cleanup_old_files.delay(base_filter_id=oldest_base_filter_id) + statsd.incr('blocklist.tasks.upload_filter.reset_collection') + + # It is possible to upload a stash and a filter in the same task + if upload_stash: with mlbf.storage.open(mlbf.stash_path, 'r') as stash_file: stash_data = json.load(stash_file) # If we have a stash, write that @@ -123,8 +161,10 @@ def upload_filter(generation_time, is_base=True): server.complete_session() set_config(MLBF_TIME_CONFIG_KEY, generation_time, json_value=True) - if is_base: - set_config(MLBF_BASE_ID_CONFIG_KEY, generation_time, json_value=True) + for block_type in filter_list: + set_config( + MLBF_BASE_ID_CONFIG_KEY(block_type), generation_time, json_value=True + ) @task diff --git a/src/olympia/blocklist/tests/test_commands.py b/src/olympia/blocklist/tests/test_commands.py index 3df8df064b6a..c33d9fc44482 100644 --- a/src/olympia/blocklist/tests/test_commands.py +++ b/src/olympia/blocklist/tests/test_commands.py @@ -9,6 +9,7 @@ version_factory, ) from olympia.blocklist.mlbf import MLBF +from olympia.blocklist.models import BlockType class TestExportBlocklist(TestCase): @@ -36,6 +37,11 @@ def test_command(self): updated_by=user, ) - call_command('export_blocklist', '1') + call_command('export_blocklist', '1', '--block-type', BlockType.BLOCKED.name) mlbf = MLBF.load_from_storage(1) - assert mlbf.storage.exists(mlbf.filter_path) + assert mlbf.storage.exists(mlbf.filter_path(BlockType.BLOCKED)) + call_command( + 'export_blocklist', '1', '--block-type', BlockType.SOFT_BLOCKED.name + ) + mlbf = MLBF.load_from_storage(1) + assert mlbf.storage.exists(mlbf.filter_path(BlockType.SOFT_BLOCKED)) diff --git a/src/olympia/blocklist/tests/test_cron.py b/src/olympia/blocklist/tests/test_cron.py index 0441c64dd3a8..6febb5a25470 100644 --- a/src/olympia/blocklist/tests/test_cron.py +++ b/src/olympia/blocklist/tests/test_cron.py @@ -26,6 
+26,7 @@ ) from olympia.blocklist.mlbf import MLBF from olympia.blocklist.models import Block, BlocklistSubmission, BlockType, BlockVersion +from olympia.blocklist.tasks import upload_filter from olympia.blocklist.utils import datetime_to_ts from olympia.constants.blocklist import MLBF_BASE_ID_CONFIG_KEY, MLBF_TIME_CONFIG_KEY from olympia.zadmin.models import set_config @@ -45,7 +46,6 @@ def setUp(self): self.mocks: dict[str, mock.Mock] = {} for mock_name in ( 'olympia.blocklist.cron.statsd.incr', - 'olympia.blocklist.cron.cleanup_old_files.delay', 'olympia.blocklist.cron.upload_filter.delay', 'olympia.blocklist.cron.get_generation_time', 'olympia.blocklist.cron.get_last_generation_time', @@ -98,11 +98,16 @@ def test_skip_update_unless_force_base(self): assert self.mocks['olympia.blocklist.cron.upload_filter.delay'].called - # Check that a filter was created on the second attempt + # Check that both filters were created on the second attempt mlbf = MLBF.load_from_storage(self.current_time) - assert mlbf.storage.exists(mlbf.filter_path) + assert mlbf.storage.exists(mlbf.filter_path(BlockType.BLOCKED)) + assert not mlbf.storage.exists(mlbf.filter_path(BlockType.SOFT_BLOCKED)) assert not mlbf.storage.exists(mlbf.stash_path) + with override_switch('enable-soft-blocking', active=True): + upload_mlbf_to_remote_settings(force_base=True) + assert mlbf.storage.exists(mlbf.filter_path(BlockType.SOFT_BLOCKED)) + def test_skip_update_unless_no_base_mlbf(self): """ skip update unless there is no base mlbf @@ -123,10 +128,12 @@ def test_missing_last_filter_uses_base_filter(self): When there is a base filter and no last filter, fallback to using the base filter """ - self._block_version(is_signed=True) - # Re-created the last filter created after the new block + block_version = self._block_version(is_signed=True) + # Re-create the last filter so we ensure + # the block is already processed comparing to previous MLBF.generate_from_db(self.last_time) + assert datetime_to_ts(block_version.modified) < self.last_time # We skip the update at this point because the new last filter already # accounted for the new block. upload_mlbf_to_remote_settings(force_base=False) @@ -138,30 +145,22 @@ def test_missing_last_filter_uses_base_filter(self): 'olympia.blocklist.cron.get_last_generation_time' ].return_value = None upload_mlbf_to_remote_settings(force_base=False) - assert self.mocks['olympia.blocklist.cron.upload_filter.delay'].called assert ( mock.call( - 'blocklist.cron.upload_mlbf_to_remote_settings.blocked_changed', 1 + self.current_time, + filter_list=[], + upload_stash=True, ) - in self.mocks['olympia.blocklist.cron.statsd.incr'].call_args_list - ) + ) in self.mocks['olympia.blocklist.cron.upload_filter.delay'].call_args_list - def test_skip_update_unless_recent_modified_blocks(self): + def test_skip_update_if_unsigned_blocks_added(self): """ - skip update unless there are recent modified blocks + skip update if there are only unsigned new blocks """ - upload_mlbf_to_remote_settings(force_base=False) - assert not self.mocks['olympia.blocklist.cron.upload_filter.delay'].called - - # Now the last filter is older than the most recently modified block. 
- older_last_time = datetime_to_ts(self.block.modified - timedelta(seconds=1)) - self.mocks[ - 'olympia.blocklist.cron.get_last_generation_time' - ].return_value = older_last_time - MLBF.generate_from_db(older_last_time) + self._block_version(is_signed=False) upload_mlbf_to_remote_settings(force_base=False) - assert self.mocks['olympia.blocklist.cron.upload_filter.delay'].called + assert not self.mocks['olympia.blocklist.cron.upload_filter.delay'].called def test_skip_update_unless_new_blocks(self): """ @@ -180,6 +179,7 @@ def test_send_statsd_counts(self): Send statsd counts for the number of blocked and not blocked items. """ self._block_version(is_signed=True) + self._block_version(block_type=BlockType.SOFT_BLOCKED) upload_mlbf_to_remote_settings() statsd_calls = self.mocks['olympia.blocklist.cron.statsd.incr'].call_args_list @@ -188,6 +188,12 @@ def test_send_statsd_counts(self): mock.call('blocklist.cron.upload_mlbf_to_remote_settings.blocked_count', 1) in statsd_calls ) + assert ( + mock.call( + 'blocklist.cron.upload_mlbf_to_remote_settings.soft_blocked_count', 1 + ) + in statsd_calls + ) assert ( mock.call( 'blocklist.cron.upload_mlbf_to_remote_settings.not_blocked_count', 1 @@ -212,20 +218,42 @@ def test_upload_stash_unless_force_base(self): We expect to upload a stash, unless the force_base is true, in which case we upload a new filter. """ - force_base = False self._block_version(is_signed=True) - upload_mlbf_to_remote_settings(force_base=force_base) + upload_mlbf_to_remote_settings(force_base=False) assert self.mocks[ 'olympia.blocklist.cron.upload_filter.delay' ].call_args_list == [ mock.call( self.current_time, - is_base=force_base, + filter_list=[], + upload_stash=True, ) ] + mlbf = MLBF.load_from_storage(self.current_time) - assert mlbf.storage.exists(mlbf.filter_path) == force_base - assert mlbf.storage.exists(mlbf.stash_path) != force_base + assert not mlbf.storage.exists(mlbf.filter_path(BlockType.BLOCKED)) + assert mlbf.storage.exists(mlbf.stash_path) + + upload_mlbf_to_remote_settings(force_base=True) + assert mlbf.storage.exists(mlbf.filter_path(BlockType.BLOCKED)) + assert ( + mock.call( + self.current_time, + filter_list=[BlockType.BLOCKED.name], + upload_stash=False, + ) + ) in self.mocks['olympia.blocklist.cron.upload_filter.delay'].call_args_list + + with override_switch('enable-soft-blocking', active=True): + upload_mlbf_to_remote_settings(force_base=True) + assert mlbf.storage.exists(mlbf.filter_path(BlockType.SOFT_BLOCKED)) + assert ( + mock.call( + self.current_time, + filter_list=[BlockType.BLOCKED.name, BlockType.SOFT_BLOCKED.name], + upload_stash=False, + ) + ) in self.mocks['olympia.blocklist.cron.upload_filter.delay'].call_args_list def test_upload_stash_unless_missing_base_filter(self): """ @@ -238,11 +266,13 @@ def test_upload_stash_unless_missing_base_filter(self): ].call_args_list == [ mock.call( self.current_time, - is_base=False, + filter_list=[], + upload_stash=True, ) ] mlbf = MLBF.load_from_storage(self.current_time) - assert not mlbf.storage.exists(mlbf.filter_path) + assert not mlbf.storage.exists(mlbf.filter_path(BlockType.BLOCKED)) + assert not mlbf.storage.exists(mlbf.filter_path(BlockType.SOFT_BLOCKED)) assert mlbf.storage.exists(mlbf.stash_path) self.mocks[ @@ -252,11 +282,23 @@ def test_upload_stash_unless_missing_base_filter(self): assert ( mock.call( self.current_time, - is_base=True, + filter_list=[BlockType.BLOCKED.name], + upload_stash=False, ) in self.mocks['olympia.blocklist.cron.upload_filter.delay'].call_args_list ) - assert 
mlbf.storage.exists(mlbf.filter_path) + assert mlbf.storage.exists(mlbf.filter_path(BlockType.BLOCKED)) + + with override_switch('enable-soft-blocking', active=True): + upload_mlbf_to_remote_settings() + assert mlbf.storage.exists(mlbf.filter_path(BlockType.SOFT_BLOCKED)) + assert ( + mock.call( + self.current_time, + filter_list=[BlockType.BLOCKED.name, BlockType.SOFT_BLOCKED.name], + upload_stash=False, + ) + ) in self.mocks['olympia.blocklist.cron.upload_filter.delay'].call_args_list @mock.patch('olympia.blocklist.cron.BASE_REPLACE_THRESHOLD', 1) def test_upload_stash_unless_enough_changes(self): @@ -271,7 +313,8 @@ def test_upload_stash_unless_enough_changes(self): ].call_args_list == [ mock.call( self.current_time, - is_base=False, + filter_list=[], + upload_stash=True, ) ] mlbf = MLBF.load_from_storage(self.current_time) @@ -288,30 +331,52 @@ def test_upload_stash_unless_enough_changes(self): assert ( mock.call( self.current_time, - is_base=True, + filter_list=[BlockType.BLOCKED.name], + upload_stash=False, ) in self.mocks['olympia.blocklist.cron.upload_filter.delay'].call_args_list ) new_mlbf = MLBF.load_from_storage(self.current_time) - assert new_mlbf.storage.exists(new_mlbf.filter_path) + assert new_mlbf.storage.exists(new_mlbf.filter_path(BlockType.BLOCKED)) assert not new_mlbf.storage.exists(new_mlbf.stash_path) - def test_cleanup_old_files(self): + @mock.patch('olympia.blocklist.cron.BASE_REPLACE_THRESHOLD', 1) + def test_upload_stash_even_if_filter_is_updated(self): """ - Cleanup old files only if a base filter already exists. + If enough changes of one type are made, update the filter, but still upload + a stash if there are changes of other types. """ - upload_mlbf_to_remote_settings(force_base=True) + self._block_version(is_signed=True, block_type=BlockType.BLOCKED) + self._block_version(is_signed=True, block_type=BlockType.BLOCKED) + self._block_version(is_signed=True, block_type=BlockType.SOFT_BLOCKED) + upload_mlbf_to_remote_settings() assert self.mocks[ - 'olympia.blocklist.cron.cleanup_old_files.delay' - ].call_args_list == [mock.call(base_filter_id=self.base_time)] + 'olympia.blocklist.cron.upload_filter.delay' + ].call_args_list == [ + mock.call( + self.current_time, + filter_list=[BlockType.BLOCKED.name], + upload_stash=False, + ) + ] - self.mocks[ - 'olympia.blocklist.cron.get_base_generation_time' - ].return_value = None - upload_mlbf_to_remote_settings(force_base=True) - assert ( - self.mocks['olympia.blocklist.cron.cleanup_old_files.delay'].call_count == 1 - ) + with override_switch('enable-soft-blocking', active=True): + self._block_version(is_signed=True, block_type=BlockType.BLOCKED) + self._block_version(is_signed=True, block_type=BlockType.BLOCKED) + upload_mlbf_to_remote_settings() + self.mocks['olympia.blocklist.cron.upload_filter.delay'].assert_called_with( + self.current_time, + filter_list=[BlockType.BLOCKED.name], + upload_stash=True, + ) + + def test_remove_storage_if_no_update(self): + """ + If there is no update, remove the storage used by the current mlbf. 
+ """ + upload_mlbf_to_remote_settings(force_base=False) + assert not self.mocks['olympia.blocklist.cron.upload_filter.delay'].called + assert MLBF.load_from_storage(self.current_time) is None def test_creates_base_filter_if_base_generation_time_invalid(self): """ @@ -345,13 +410,10 @@ def test_dont_skip_update_if_all_blocked_or_not_blocked(self): def test_invalid_cache_results_in_diff(self): self._block_version(block_type=BlockType.BLOCKED) - # First we re-create the last filter including the blocked version - self.mocks[ - 'olympia.blocklist.cron.get_generation_time' - ].return_value = self.last_time + # First we create the current filter including the blocked version upload_mlbf_to_remote_settings() - base_mlbf = MLBF.load_from_storage(self.last_time) + base_mlbf = MLBF.load_from_storage(self.current_time) # Remove the blocked version from the cache.json file so we can test that # the next generation includes the blocked version. @@ -362,23 +424,44 @@ def test_invalid_cache_results_in_diff(self): json.dump(data, f) f.truncate() - # Reset the generation time to the current time so we can test that the + # Set the generation time to after the current time so we can test that the # diff includes the blocked version after it is removed from the cache.json + next_time = self.current_time + 1 self.mocks[ 'olympia.blocklist.cron.get_generation_time' - ].return_value = self.current_time + ].return_value = next_time upload_mlbf_to_remote_settings() # We expect to upload a stash because the cache.json we are comparing against # is missing the blocked version. assert ( mock.call( - self.current_time, - is_base=False, + next_time, + filter_list=[], + upload_stash=True, ) in self.mocks['olympia.blocklist.cron.upload_filter.delay'].call_args_list ) + def test_pass_correct_arguments_to_upload_filter(self): + self.mocks['olympia.blocklist.cron.upload_filter.delay'].stop() + with mock.patch( + 'olympia.blocklist.cron.upload_filter.delay', wraps=upload_filter.delay + ) as spy_delay: + upload_mlbf_to_remote_settings(force_base=True) + spy_delay.assert_called_with( + self.current_time, + filter_list=[BlockType.BLOCKED.name], + upload_stash=False, + ) + with override_switch('enable-soft-blocking', active=True): + upload_mlbf_to_remote_settings(force_base=True) + spy_delay.assert_called_with( + self.current_time, + filter_list=[BlockType.BLOCKED.name, BlockType.SOFT_BLOCKED.name], + upload_stash=False, + ) + class TestTimeMethods(TestCase): @freeze_time('2024-10-10 12:34:56') @@ -392,9 +475,10 @@ def test_get_last_generation_time(self): assert get_last_generation_time() == 1 def test_get_base_generation_time(self): - assert get_base_generation_time() is None - set_config(MLBF_BASE_ID_CONFIG_KEY, 1) - assert get_base_generation_time() == 1 + for block_type in BlockType: + assert get_base_generation_time(block_type) is None + set_config(MLBF_BASE_ID_CONFIG_KEY(block_type), 1) + assert get_base_generation_time(block_type) == 1 @pytest.mark.django_db diff --git a/src/olympia/blocklist/tests/test_mlbf.py b/src/olympia/blocklist/tests/test_mlbf.py index 0637b56f97ea..af1d67864ca8 100644 --- a/src/olympia/blocklist/tests/test_mlbf.py +++ b/src/olympia/blocklist/tests/test_mlbf.py @@ -560,7 +560,7 @@ def test_diff_all_possible_changes(self): 'unblocked': [], } - with override_switch('mlbf-soft-blocks-enabled', active=True): + with override_switch('enable-soft-blocking', active=True): assert first_mlbf.generate_and_write_stash() == { 'blocked': [five_hash, six_hash], 'softblocked': [three_hash, four_hash], @@ -612,7 
+612,7 @@ def test_diff_all_possible_changes(self): 'unblocked': [five_hash, six_hash], } - with override_switch('mlbf-soft-blocks-enabled', active=True): + with override_switch('enable-soft-blocking', active=True): assert second_mlbf.generate_and_write_stash(previous_mlbf=first_mlbf) == { 'blocked': [three_hash, two_hash], 'softblocked': [five_hash, one_hash], @@ -638,7 +638,7 @@ def test_generate_stash_returns_expected_stash(self): 'unblocked': [], } - with override_switch('mlbf-soft-blocks-enabled', active=True): + with override_switch('enable-soft-blocking', active=True): assert mlbf.generate_and_write_stash() == { 'blocked': MLBF.hash_filter_inputs(expected_blocked), 'softblocked': [], @@ -658,13 +658,41 @@ def test_generate_stash_returns_expected_stash(self): 'unblocked': MLBF.hash_filter_inputs(expected_unblocked), } - with override_switch('mlbf-soft-blocks-enabled', active=True): + with override_switch('enable-soft-blocking', active=True): assert next_mlbf.generate_and_write_stash(previous_mlbf=mlbf) == { 'blocked': [], 'softblocked': [], 'unblocked': MLBF.hash_filter_inputs(expected_unblocked), } + def test_generate_filter_returns_expected_data(self): + addon, block = self._blocked_addon() + not_blocked = self._version(addon) + not_blocked_version = not_blocked.version + hard_blocked = self._block_version( + block, self._version(addon), block_type=BlockType.BLOCKED + ) + hard_blocked_version = hard_blocked.version.version + soft_blocked = self._block_version( + block, self._version(addon), block_type=BlockType.SOFT_BLOCKED + ) + soft_blocked_version = soft_blocked.version.version + mlbf = MLBF.generate_from_db('test') + + mlbf.generate_and_write_filter(BlockType.BLOCKED).verify( + include=MLBF.hash_filter_inputs([(addon.guid, hard_blocked_version)]), + exclude=MLBF.hash_filter_inputs( + [(addon.guid, soft_blocked_version), (addon.guid, not_blocked_version)] + ), + ) + + mlbf.generate_and_write_filter(BlockType.SOFT_BLOCKED).verify( + include=MLBF.hash_filter_inputs([(addon.guid, soft_blocked_version)]), + exclude=MLBF.hash_filter_inputs( + [(addon.guid, hard_blocked_version), (addon.guid, not_blocked_version)] + ), + ) + def test_changed_count_returns_expected_count(self): addon, block = self._blocked_addon() self._block_version(block, self._version(addon), block_type=BlockType.BLOCKED) @@ -718,16 +746,19 @@ def test_changed_count_returns_expected_count(self): == 1 ) + def _test_not_raises_if_versions_blocked(self, block_type: BlockType): + mlbf = MLBF.generate_from_db('test') + self._blocked_addon(file_kw={'is_signed': True}, block_type=block_type) + assert mlbf.data[block_type] == [] + mlbf.generate_and_write_filter(block_type) + def test_generate_filter_not_raises_if_all_versions_unblocked(self): """ When we create a bloom filter where all versions fall into the "not filtered" category This can create invalid error rates because the error rate depends on these numbers being non-zero. """ - mlbf = MLBF.generate_from_db('test') - self._blocked_addon(file_kw={'is_signed': True}) - assert mlbf.data.blocked_items == [] - mlbf.generate_and_write_filter() + self._test_not_raises_if_versions_blocked(BlockType.BLOCKED) def test_generate_filter_not_raises_if_all_versions_blocked(self): """ @@ -735,10 +766,7 @@ def test_generate_filter_not_raises_if_all_versions_blocked(self): the "not filtered" category This can create invalid error rates because the error rate depends on these numbers being non-zero. 
""" - mlbf = MLBF.generate_from_db('test') - self._blocked_addon(file_kw={'is_signed': False}) - assert mlbf.data.not_blocked_items == [] - mlbf.generate_and_write_filter() + self._test_not_raises_if_versions_blocked(BlockType.SOFT_BLOCKED) def test_duplicate_guid_is_blocked(self): """ diff --git a/src/olympia/blocklist/tests/test_tasks.py b/src/olympia/blocklist/tests/test_tasks.py index cf17fc07dca1..bba82f4442eb 100644 --- a/src/olympia/blocklist/tests/test_tasks.py +++ b/src/olympia/blocklist/tests/test_tasks.py @@ -1,6 +1,7 @@ import json import os from datetime import datetime, timedelta +from typing import Dict, List from unittest import TestCase, mock from django.conf import settings @@ -18,6 +19,7 @@ ) from olympia.blocklist.mlbf import MLBF from olympia.constants.blocklist import MLBF_BASE_ID_CONFIG_KEY, MLBF_TIME_CONFIG_KEY +from olympia.zadmin.models import set_config from ..models import BlocklistSubmission, BlockType, BlockVersion from ..tasks import ( @@ -132,12 +134,14 @@ def setUp(self): prefix = 'olympia.blocklist.tasks.' self.mocks = { - 'delete_all_records': f'{prefix}RemoteSettings.delete_all_records', + 'records': f'{prefix}RemoteSettings.records', + 'delete_record': f'{prefix}RemoteSettings.delete_record', 'publish_attachment': f'{prefix}RemoteSettings.publish_attachment', 'publish_record': f'{prefix}RemoteSettings.publish_record', 'complete_session': f'{prefix}RemoteSettings.complete_session', 'set_config': f'{prefix}set_config', 'statsd.incr': f'{prefix}statsd.incr', + 'cleanup_old_files.delay': f'{prefix}cleanup_old_files.delay', } for mock_name, mock_path in self.mocks.items(): patcher = mock.patch(mock_path) @@ -157,43 +161,136 @@ def _block_version( block=block, version=version, block_type=block_type ) - def test_upload_base_filter(self): + def test_server_setup(self): + pass + + def test_statsd_increments(self): + pass + + def test_invalid_block_type_raises(self): + with self.assertRaises(ValueError): + BLOCKLIST_RECORD_MLBF_BASE('foo') + + with self.assertRaises(KeyError): + upload_filter.delay( + self.generation_time, + filter_list=['foo'], + ) + + def _test_upload_base_filter(self, *block_types: BlockType): self._block_version(is_signed=True) mlbf = MLBF.generate_from_db(self.generation_time) - mlbf.generate_and_write_filter() + for block_type in block_types: + mlbf.generate_and_write_filter(block_type) - upload_filter.delay(self.generation_time, is_base=True) + upload_filter.delay( + self.generation_time, + filter_list=[block_type.name for block_type in block_types], + ) - assert self.mocks['delete_all_records'].called - with mlbf.storage.open(mlbf.filter_path, 'rb') as filter_file: - actual_data, actual_attchment = self.mocks[ - 'publish_attachment' - ].call_args_list[0][0] + assert not self.mocks['delete_record'].called + actual_files = [ + mock[0][1][1].name + for mock in self.mocks['publish_attachment'].call_args_list + ] - assert actual_data == { - 'key_format': MLBF.KEY_FORMAT, - 'generation_time': self.generation_time, - 'attachment_type': BLOCKLIST_RECORD_MLBF_BASE, - } - name, file, content_type = actual_attchment - assert name == 'filter.bin' - assert file.name == filter_file.name - assert content_type == 'application/octet-stream' - - assert all( - call in self.mocks['statsd.incr'].call_args_list - for call in [ - mock.call('blocklist.tasks.upload_filter.reset_collection'), - mock.call('blocklist.tasks.upload_filter.upload_mlbf.base'), - mock.call('blocklist.tasks.upload_filter.upload_mlbf'), - ] - ) + for block_type in block_types: + with 
mlbf.storage.open(mlbf.filter_path(block_type), 'rb') as filter_file: + assert filter_file.name in actual_files + expected_call = mock.call( + { + 'key_format': MLBF.KEY_FORMAT, + 'generation_time': self.generation_time, + 'attachment_type': BLOCKLIST_RECORD_MLBF_BASE(block_type), + }, + ('filter.bin', mock.ANY, 'application/octet-stream'), + ) + assert expected_call in self.mocks['publish_attachment'].call_args_list assert self.mocks['complete_session'].called - assert self.mocks['set_config'].call_args_list == [ - mock.call(MLBF_TIME_CONFIG_KEY, self.generation_time, json_value=True), - mock.call(MLBF_BASE_ID_CONFIG_KEY, self.generation_time, json_value=True), - ] + assert ( + mock.call(MLBF_TIME_CONFIG_KEY, self.generation_time, json_value=True) + ) in self.mocks['set_config'].call_args_list + + for block_type in block_types: + assert ( + mock.call( + MLBF_BASE_ID_CONFIG_KEY(block_type), + self.generation_time, + json_value=True, + ) + ) in self.mocks['set_config'].call_args_list + + def test_upload_blocked_filter(self): + self._test_upload_base_filter(BlockType.BLOCKED) + + def test_upload_soft_blocked_filter(self): + self._test_upload_base_filter(BlockType.SOFT_BLOCKED) + + def test_upload_soft_and_blocked_filter(self): + self._test_upload_base_filter(BlockType.BLOCKED, BlockType.SOFT_BLOCKED) + + def _test_cleanup_old_records( + self, + filter_list: Dict[BlockType, int], + records: List[Dict[str, int]], + expected_calls: List[any], + ): + self._block_version(is_signed=True) + mlbf = MLBF.generate_from_db(self.generation_time) + for block_type, base_id in filter_list.items(): + mlbf.generate_and_write_filter(block_type) + set_config(MLBF_BASE_ID_CONFIG_KEY(block_type), base_id, json_value=True) + + self.mocks['records'].return_value = records + upload_filter.delay( + self.generation_time, + filter_list=[block_type.name for block_type in filter_list], + ) + + assert self.mocks['delete_record'].call_args_list == expected_calls + + if len(filter_list.values()) > 0: + self.mocks['cleanup_old_files.delay'].assert_called_with( + base_filter_id=min(filter_list.values()) + ) + self.mocks['statsd.incr'].assert_called_with( + 'blocklist.tasks.upload_filter.reset_collection' + ) + + def test_skip_cleanup_when_no_filters(self): + self._test_cleanup_old_records( + filter_list={}, + records=[{'id': '0', 'generation_time': self.generation_time}], + expected_calls=[], + ) + + def test_cleanup_old_records(self): + self._test_cleanup_old_records( + filter_list={ + BlockType.BLOCKED: self.generation_time, + }, + records=[ + {'id': '0', 'generation_time': self.generation_time - 1}, + {'id': '1', 'generation_time': self.generation_time}, + {'id': '2', 'generation_time': self.generation_time + 1}, + ], + expected_calls=[mock.call('0')], + ) + + def test_cleanup_oldest_records(self): + self._test_cleanup_old_records( + filter_list={ + BlockType.BLOCKED: self.generation_time + 2, + BlockType.SOFT_BLOCKED: self.generation_time + 1, + }, + records=[ + {'id': '0', 'generation_time': self.generation_time - 1}, + {'id': '1', 'generation_time': self.generation_time}, + {'id': '2', 'generation_time': self.generation_time + 1}, + ], + expected_calls=[mock.call('0'), mock.call('1')], + ) def test_upload_stashed_filter(self): old_mlbf = MLBF.generate_from_db(self.generation_time - 1) @@ -201,9 +298,9 @@ def test_upload_stashed_filter(self): mlbf = MLBF.generate_from_db(self.generation_time) mlbf.generate_and_write_stash(old_mlbf) - upload_filter.delay(self.generation_time, is_base=False) + 
upload_filter.delay(self.generation_time, upload_stash=True) - assert not self.mocks['delete_all_records'].called + assert not self.mocks['delete_record'].called with mlbf.storage.open(mlbf.stash_path, 'rb') as stash_file: actual_stash = self.mocks['publish_record'].call_args_list[0][0][0] stash_data = json.load(stash_file) @@ -233,31 +330,18 @@ def test_upload_stashed_filter(self): def test_raises_when_no_filter_exists(self): with self.assertRaises(FileNotFoundError): - upload_filter.delay(self.generation_time) + upload_filter.delay( + self.generation_time, filter_list=[BlockType.BLOCKED.name] + ) def test_raises_when_no_stash_exists(self): with self.assertRaises(FileNotFoundError): - upload_filter.delay(self.generation_time) - - def test_default_is_base_is_true(self): - MLBF.generate_from_db(self.generation_time).generate_and_write_filter() - upload_filter.delay(self.generation_time) - assert self.mocks['delete_all_records'].called - - def test_raises_missing_stash(self): - mlbf = MLBF.generate_from_db(self.generation_time) - mlbf.generate_and_write_filter() - - with self.assertRaises(FileNotFoundError): - upload_filter.delay(self.generation_time, is_base=False) + upload_filter.delay(self.generation_time, upload_stash=True) + def test_default_is_no_op(self): + MLBF.generate_from_db(self.generation_time).generate_and_write_filter( + BlockType.BLOCKED + ) upload_filter.delay(self.generation_time) - - def test_raises_missing_filter(self): - mlbf = MLBF.generate_from_db(self.generation_time) - mlbf.generate_and_write_stash(mlbf) - - with self.assertRaises(FileNotFoundError): - upload_filter.delay(self.generation_time, is_base=True) - - upload_filter.delay(self.generation_time, is_base=False) + assert not self.mocks['delete_record'].called + assert not self.mocks['publish_record'].called diff --git a/src/olympia/constants/blocklist.py b/src/olympia/constants/blocklist.py index 8cda2e5c9b11..779004bf03da 100644 --- a/src/olympia/constants/blocklist.py +++ b/src/olympia/constants/blocklist.py @@ -1,8 +1,15 @@ # How many guids should there be in the stashes before we make a new base. 
+from olympia.blocklist.models import BlockType + + BASE_REPLACE_THRESHOLD = 5_000 # Config keys used to track recent mlbf ids MLBF_TIME_CONFIG_KEY = 'blocklist_mlbf_generation_time' -MLBF_BASE_ID_CONFIG_KEY = 'blocklist_mlbf_base_id' + + +def MLBF_BASE_ID_CONFIG_KEY(block_type: BlockType): + return f'blocklist_mlbf_base_id_{block_type.name.lower()}' + REMOTE_SETTINGS_COLLECTION_MLBF = 'addons-bloomfilters' diff --git a/src/olympia/lib/remote_settings.py b/src/olympia/lib/remote_settings.py index fdb08a6694dd..4778baa4be20 100644 --- a/src/olympia/lib/remote_settings.py +++ b/src/olympia/lib/remote_settings.py @@ -105,6 +105,14 @@ def publish_attachment(self, data, attachment, legacy_id=None): self._changes = True return response.json().get('data', {}) + def records(self): + url = ( + f'{settings.REMOTE_SETTINGS_WRITER_URL}buckets/{self.bucket}/' + f'collections/{self.collection}/records' + ) + response = requests.get(url, headers=self.headers) + return response.json().get('data', []) + def delete_record(self, legacy_id): url = ( f'{settings.REMOTE_SETTINGS_WRITER_URL}buckets/{self.bucket}/' diff --git a/src/olympia/lib/tests/test_remote_settings.py b/src/olympia/lib/tests/test_remote_settings.py index 45012f38112d..9b677ba319a3 100644 --- a/src/olympia/lib/tests/test_remote_settings.py +++ b/src/olympia/lib/tests/test_remote_settings.py @@ -21,6 +21,19 @@ def test_bucket_not_altered(self): server = RemoteSettings('foo', 'baa') assert server.bucket == 'foo' + def test_records(self): + server = RemoteSettings('foo', 'baa') + + for data in [{'id': 'an-id'}], []: + responses.add( + responses.GET, + settings.REMOTE_SETTINGS_WRITER_URL + + 'buckets/foo/collections/baa/records', + content_type='application/json', + json={'data': data}, + ) + assert server.records() == data + def test_publish_record(self): server = RemoteSettings('foo', 'baa') server._setup_done = True
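
How the per-block-type pieces introduced above fit together, as a minimal illustrative sketch (not part of the patch; the filter_list/upload_stash keywords are taken from the cron call and the new tests rather than from a signature shown in these hunks, and soft blocks remain gated behind the 'enable-soft-blocking' waffle switch):

    from olympia.blocklist.cron import get_generation_time
    from olympia.blocklist.mlbf import MLBF
    from olympia.blocklist.models import BlockType
    from olympia.blocklist.tasks import upload_filter
    from olympia.constants.blocklist import MLBF_BASE_ID_CONFIG_KEY

    # Config keys and filter files are now derived per block type.
    MLBF_BASE_ID_CONFIG_KEY(BlockType.BLOCKED)       # 'blocklist_mlbf_base_id_blocked'
    MLBF_BASE_ID_CONFIG_KEY(BlockType.SOFT_BLOCKED)  # 'blocklist_mlbf_base_id_soft_blocked'

    mlbf = MLBF.generate_from_db(get_generation_time())
    mlbf.generate_and_write_filter(BlockType.BLOCKED)       # writes <storage>/filter-blocked
    mlbf.generate_and_write_filter(BlockType.SOFT_BLOCKED)  # writes <storage>/filter-soft_blocked

    # The cron passes the block types whose base filters were rewritten, plus a
    # flag saying whether a stash should be published for the remaining changes.
    upload_filter.delay(
        mlbf.created_at,
        filter_list=[BlockType.BLOCKED.name, BlockType.SOFT_BLOCKED.name],
        upload_stash=False,
    )

Because each block type now records its own base id, upload_filter can compute the oldest surviving base filter and prune any remote-settings record older than it, replacing the previous delete_all_records() reset of the collection.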
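
The pruning step itself, lifted from the new upload_filter body and wrapped in a hypothetical helper name purely for readability (statsd counters and the surrounding remote-settings session handling omitted; server.records() is the new RemoteSettings helper added in this diff):

    # Hypothetical wrapper around the pruning loop added to upload_filter.
    # 'server' is a RemoteSettings instance; oldest_base_filter_id is the
    # smallest stored MLBF_BASE_ID_CONFIG_KEY(block_type) value among the
    # re-uploaded base filters.
    def prune_stale_records(server, oldest_base_filter_id):
        for record in server.records():
            # Prefer 'stash_time' when present, otherwise fall back to
            # 'generation_time'.
            record_time = (
                record['stash_time']
                if 'stash_time' in record
                else record['generation_time']
            )
            if record_time < oldest_base_filter_id:
                server.delete_record(record['id'])

After pruning, upload_filter schedules cleanup_old_files.delay(base_filter_id=oldest_base_filter_id), which the cron no longer triggers directly.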