From bff0a4dd1002cdb902313a45aea67da4cfa95196 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Thu, 19 Sep 2024 18:22:10 +0200 Subject: [PATCH] Implement #425 In spirit, at least --- backend/bootstrap.py | 6 ++- backend/lib/manager.py | 6 +-- backend/lib/processor.py | 36 ++++++++--------- backend/lib/search.py | 3 +- backend/lib/worker.py | 2 +- backend/workers/api.py | 7 +++- backend/workers/check_updates.py | 7 ++-- backend/workers/cleanup_tempfiles.py | 3 +- backend/workers/datasource_metrics.py | 11 +++--- backend/workers/expire_items.py | 6 ++- backend/workers/restart_4cat.py | 19 +++++---- common/config_manager.py | 7 +++- common/lib/dataset.py | 39 ++++++++++--------- common/lib/helpers.py | 18 +++++++++ common/lib/module_loader.py | 36 ++++++++++------- datasources/audio_to_text/audio_to_text.py | 14 +++---- datasources/bitchute/search_bitchute.py | 2 +- datasources/dmi-tcat/search_tcat.py | 10 ++--- datasources/dmi-tcatv2/search_tcat_v2.py | 10 ++--- datasources/douban/search_douban.py | 4 +- datasources/fourcat_import/import_4cat.py | 4 +- .../fourchan/scrapers/download_images.py | 2 +- .../fourchan/scrapers/scrape_threads.py | 8 ++-- datasources/fourchan/search_4chan.py | 14 +++---- datasources/media_import/import_media.py | 7 ++-- datasources/reddit/search_reddit.py | 27 ++++++------- datasources/telegram/search_telegram.py | 30 +++++++------- datasources/tiktok_urls/search_tiktok_urls.py | 4 +- datasources/tumblr/search_tumblr.py | 21 +++++----- datasources/twitterv2/search_twitter.py | 22 +++++------ datasources/upload/import_csv.py | 4 +- datasources/vk/search_vk.py | 10 ++--- processors/audio/audio_extractor.py | 15 ++++--- processors/conversion/clarifai_to_csv.py | 3 +- processors/conversion/consolidate_urls.py | 6 ++- processors/conversion/csv_to_excel.py | 3 +- processors/conversion/csv_to_json.py | 3 +- processors/conversion/extract_urls.py | 7 +++- processors/conversion/merge_datasets.py | 3 +- processors/conversion/ndjson_to_csv.py | 3 +- 
processors/conversion/split_by_thread.py | 3 +- processors/conversion/stringify.py | 3 +- processors/conversion/tcat_auto_upload.py | 27 ++++++------- .../conversion/twitter_ndjson_to_tcat_json.py | 2 +- processors/conversion/view_metadata.py | 3 +- processors/conversion/vision_api_to_csv.py | 3 +- processors/filtering/accent_fold.py | 9 +++-- processors/filtering/base_filter.py | 3 +- processors/filtering/column_filter.py | 8 ++-- processors/filtering/date_filter.py | 3 +- processors/filtering/lexical_filter.py | 3 +- processors/filtering/random_filter.py | 3 +- processors/filtering/reddit_get_votes.py | 5 ++- processors/filtering/remove_author_info.py | 5 ++- processors/filtering/tiktok_refresh.py | 3 +- processors/filtering/unique_filter.py | 34 ++++++++-------- processors/filtering/wildcard_filter.py | 3 +- processors/filtering/write_annotations.py | 3 +- processors/machine_learning/annotate_text.py | 14 +++---- .../machine_learning/blip2_image_caption.py | 11 +++--- .../clip_categorize_images.py | 11 +++--- .../machine_learning/generate_images.py | 10 ++--- processors/machine_learning/pix-plot.py | 10 ++--- .../machine_learning/text_from_image.py | 6 +-- .../whisper_speech_to_text.py | 11 +++--- processors/metrics/clarifai_api.py | 3 +- processors/metrics/count_posts.py | 5 ++- processors/metrics/debate_metrics.py | 3 +- processors/metrics/google_vision_api.py | 3 +- processors/metrics/hatebase.py | 34 ++++++++-------- processors/metrics/most_quoted.py | 3 +- processors/metrics/overtime-hatebase.py | 3 +- processors/metrics/rank_attribute.py | 34 ++++++++-------- processors/metrics/thread_metadata.py | 3 +- processors/metrics/top_hatebase.py | 3 +- processors/metrics/top_images.py | 7 ++-- processors/metrics/url_titles.py | 9 +++-- processors/metrics/vocabulary_overtime.py | 3 +- processors/metrics/youtube_metadata.py | 3 +- .../networks/clarifai_bipartite_network.py | 3 +- processors/networks/colink_urls.py | 3 +- processors/networks/cotag_network.py | 3 +- 
processors/networks/coword_network.py | 3 +- .../google_vision_bipartite_network.py | 3 +- processors/networks/google_vision_network.py | 3 +- .../networks/hash_similarity_network.py | 5 ++- processors/networks/quote_network.py | 3 +- processors/networks/two-column-network.py | 7 ++-- processors/networks/user_hashtag_network.py | 5 ++- processors/networks/wikipedia_network.py | 3 +- processors/presets/annotate-images.py | 3 +- processors/presets/monthly-histogram.py | 3 +- processors/presets/neologisms.py | 12 +++--- processors/presets/similar-words.py | 3 +- processors/presets/top-hashtags.py | 4 +- processors/presets/upload-to-dmi-tcat.py | 22 +++++------ processors/presets/video-scene-timelines.py | 4 +- processors/text-analysis/collocations.py | 3 +- .../text-analysis/documents_per_topic.py | 3 +- .../text-analysis/generate_embeddings.py | 3 +- processors/text-analysis/post_topic_matrix.py | 9 +++-- processors/text-analysis/similar_words.py | 10 ++--- processors/text-analysis/split_sentences.py | 24 ++++++------ processors/text-analysis/tf_idf.py | 3 +- processors/text-analysis/tokenise.py | 34 ++++++++-------- processors/text-analysis/top_vectors.py | 3 +- processors/text-analysis/topic_modeling.py | 3 +- processors/text-analysis/topic_words.py | 3 +- processors/text-analysis/vectorise.py | 3 +- processors/twitter/aggregate_stats.py | 5 ++- processors/twitter/base_twitter_stats.py | 3 +- processors/twitter/custom_stats.py | 3 +- processors/twitter/hashtag_stats.py | 3 +- processors/twitter/identical_tweets.py | 3 +- processors/twitter/mention_export.py | 6 ++- processors/twitter/source_stats.py | 3 +- processors/twitter/twitter_stats.py | 3 +- processors/twitter/user_stats_individual.py | 3 +- processors/twitter/user_visibility.py | 3 +- .../visualisation/download-telegram-images.py | 11 +++--- .../visualisation/download-telegram-videos.py | 13 ++++--- processors/visualisation/download_images.py | 36 +++++++++-------- 
processors/visualisation/download_tiktok.py | 25 ++++++------ processors/visualisation/download_videos.py | 14 ++++--- processors/visualisation/histwords.py | 11 ++---- .../visualisation/image_category_wall.py | 16 ++++---- processors/visualisation/image_wall.py | 9 +++-- processors/visualisation/image_wall_w_text.py | 16 ++++---- processors/visualisation/isoviz.py | 3 +- processors/visualisation/rankflow.py | 3 +- processors/visualisation/vector_histogram.py | 3 +- processors/visualisation/video_frames.py | 6 ++- processors/visualisation/video_hasher.py | 25 ++++++------ .../visualisation/video_scene_frames.py | 2 +- .../visualisation/video_scene_identifier.py | 2 +- processors/visualisation/video_stack.py | 5 ++- processors/visualisation/video_timelines.py | 3 +- processors/visualisation/word-cloud.py | 5 ++- processors/visualisation/word-trees.py | 4 +- processors/visualisation/youtube_imagewall.py | 3 +- .../visualisation/youtube_thumbnails.py | 3 +- webtool/__init__.py | 11 ++++-- webtool/lib/template_filters.py | 13 +++---- .../templates/components/result-child.html | 6 +-- .../templates/components/result-details.html | 6 +-- .../templates/components/result-metadata.html | 2 +- webtool/views/api_explorer.py | 4 +- webtool/views/api_tool.py | 17 ++++---- webtool/views/views_admin.py | 6 +-- webtool/views/views_dataset.py | 5 +-- webtool/views/views_extensions.py | 4 -- webtool/views/views_misc.py | 4 -- webtool/views/views_restart.py | 5 +-- webtool/views/views_user.py | 3 -- 154 files changed, 704 insertions(+), 582 deletions(-) diff --git a/backend/bootstrap.py b/backend/bootstrap.py index 5e1048439..49f85dfce 100644 --- a/backend/bootstrap.py +++ b/backend/bootstrap.py @@ -8,6 +8,7 @@ from common.lib.queue import JobQueue from common.lib.database import Database +from common.lib.module_loader import ModuleCollector from backend.lib.manager import WorkerManager from common.lib.logger import Logger @@ -66,9 +67,12 @@ def run(as_daemon=True, log_level="INFO"): 
config.with_db(db) config.ensure_database() + # load 4CAT modules and cache the results + modules = ModuleCollector(config=config, write_cache=True) + # make it happen # this is blocking until the back-end is shut down - WorkerManager(logger=log, database=db, queue=queue, as_daemon=as_daemon) + WorkerManager(logger=log, database=db, queue=queue, modules=modules, as_daemon=as_daemon) # clean up pidfile, if running as daemon if as_daemon: diff --git a/backend/lib/manager.py b/backend/lib/manager.py index 94280f588..4d5ef44aa 100644 --- a/backend/lib/manager.py +++ b/backend/lib/manager.py @@ -4,7 +4,6 @@ import signal import time -from common.lib.module_loader import ModuleCollector from common.lib.exceptions import JobClaimedException @@ -22,19 +21,20 @@ class WorkerManager: pool = [] looping = True - def __init__(self, queue, database, logger, as_daemon=True): + def __init__(self, queue, database, logger, modules, as_daemon=True): """ Initialize manager :param queue: Job queue :param database: Database handler :param logger: Logger object + :param modules: Modules cache, a ModuleCollector instance :param bool as_daemon: Whether the manager is being run as a daemon """ self.queue = queue self.db = database self.log = logger - self.modules = ModuleCollector(write_config=True) + self.modules = modules if as_daemon: signal.signal(signal.SIGTERM, self.abort) diff --git a/backend/lib/processor.py b/backend/lib/processor.py index e9e4d85a4..a69bce185 100644 --- a/backend/lib/processor.py +++ b/backend/lib/processor.py @@ -19,7 +19,7 @@ from common.lib.helpers import get_software_commit, remove_nuls, send_email from common.lib.exceptions import (WorkerInterruptedException, ProcessorInterruptedException, ProcessorException, DataSetException, MapItemException) -from common.config_manager import config, ConfigWrapper +from common.config_manager import ConfigWrapper from common.lib.user import User @@ -37,14 +37,14 @@ class BasicProcessor(FourcatModule, BasicWorker, 
metaclass=abc.ABCMeta): useful is another question). To determine whether a processor can process a given dataset, you can - define a `is_compatible_with(FourcatModule module=None, str user=None):) -> bool` class + define a `is_compatible_with(FourcatModule module=None, config=None):) -> bool` class method which takes a dataset as argument and returns a bool that determines if this processor is considered compatible with that dataset. For example: .. code-block:: python @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): return module.type == "linguistic-features" @@ -109,11 +109,10 @@ def work(self): self.job.finish() return - # set up config reader using the worker's DB connection and the dataset - # creator. This ensures that if a value has been overriden for the owner, - # the overridden value is used instead. - config.with_db(self.db) - self.config = ConfigWrapper(config=config, user=User.get_by_name(self.db, self.owner)) + # set up config reader wrapping the worker's config manager, which is + # in turn the one passed to it by the WorkerManager, which is the one + # originally loaded in bootstrap + self.config = ConfigWrapper(config=self.config, user=User.get_by_name(self.db, self.owner)) if self.dataset.data.get("key_parent", None): # search workers never have parents (for now), so we don't need to @@ -170,7 +169,7 @@ def work(self): # get parameters # if possible, fill defaults where parameters are not provided given_parameters = self.dataset.parameters.copy() - all_parameters = self.get_options(self.dataset) + all_parameters = self.get_options(self.dataset, config=self.config) self.parameters = { param: given_parameters.get(param, all_parameters.get(param, {}).get("default")) for param in [*all_parameters.keys(), *given_parameters.keys()] @@ -179,7 +178,7 @@ def work(self): # now the parameters have been loaded into memory, clear any sensitive # ones. 
This has a side-effect that a processor may not run again # without starting from scratch, but this is the price of progress - options = self.get_options(self.dataset.get_parent()) + options = self.get_options(self.dataset.get_parent(), config=self.config) for option, option_settings in options.items(): if option_settings.get("sensitive"): self.dataset.delete_parameter(option) @@ -241,7 +240,7 @@ def after_process(self): next_parameters = next.get("parameters", {}) next_type = next.get("type", "") try: - available_processors = self.dataset.get_available_processors(user=self.dataset.creator) + available_processors = self.dataset.get_available_processors(config=self.config) except ValueError: self.log.info("Trying to queue next processor, but parent dataset no longer exists, halting") break @@ -329,7 +328,7 @@ def after_process(self): self.job.finish() - if config.get('mail.server') and self.dataset.get_parameters().get("email-complete", False): + if self.config.get('mail.server') and self.dataset.get_parameters().get("email-complete", False): owner = self.dataset.get_parameters().get("email-complete", False) # Check that username is email address if re.match(r"[^@]+\@.*?\.[a-zA-Z]+", owner): @@ -340,8 +339,8 @@ def after_process(self): import html2text self.log.debug("Sending email to %s" % owner) - dataset_url = ('https://' if config.get('flask.https') else 'http://') + config.get('flask.server_name') + '/results/' + self.dataset.key - sender = config.get('mail.noreply') + dataset_url = ('https://' if self.config.get('flask.https') else 'http://') + self.config.get('flask.server_name') + '/results/' + self.dataset.key + sender = self.config.get('mail.noreply') message = MIMEMultipart("alternative") message["From"] = sender message["To"] = owner @@ -778,7 +777,7 @@ def is_filter(cls): return hasattr(cls, "category") and cls.category and "filter" in cls.category.lower() @classmethod - def get_options(cls, parent_dataset=None, user=None): + def get_options(cls, 
parent_dataset=None, config=None): """ Get processor options @@ -787,12 +786,11 @@ def get_options(cls, parent_dataset=None, user=None): fine-grained options, e.g. in cases where the availability of options is partially determined by the parent dataset's parameters. + :param config: :param DataSet parent_dataset: An object representing the dataset that the processor would be run on - :param User user: Flask user the options will be displayed for, in - case they are requested for display in the 4CAT web interface. This can - be used to show some options only to privileges users. - """ + + return cls.options if hasattr(cls, "options") else {} @classmethod diff --git a/backend/lib/search.py b/backend/lib/search.py index 15b3982d6..62ffafe2f 100644 --- a/backend/lib/search.py +++ b/backend/lib/search.py @@ -63,7 +63,8 @@ def process(self): query_parameters = self.dataset.get_parameters() results_file = self.dataset.get_results_path() - self.log.info("Querying: %s" % str({k: v for k, v in query_parameters.items() if not self.get_options().get(k, {}).get("sensitive", False)})) + self.log.info("Querying: %s" % str({k: v for k, v in query_parameters.items() if not self.get_options( + config=self.config).get(k, {}).get("sensitive", False)})) # Execute the relevant query (string-based, random, countryflag-based) try: diff --git a/backend/lib/worker.py b/backend/lib/worker.py index 17b1b935c..565bc9d6d 100644 --- a/backend/lib/worker.py +++ b/backend/lib/worker.py @@ -86,7 +86,7 @@ def __init__(self, logger, job, queue=None, manager=None, modules=None): self.manager = manager self.job = job self.init_time = int(time.time()) - self.config = ConfigDummy() + self.config = modules.config # ModuleCollector cannot be easily imported into a worker because it itself # imports all workers, so you get a recursive import that Python (rightly) blocks diff --git a/backend/workers/api.py b/backend/workers/api.py index dc3bf2d22..2569aab1d 100644 --- a/backend/workers/api.py +++ 
b/backend/workers/api.py @@ -15,8 +15,8 @@ class InternalAPI(BasicWorker): ensure_job = {"remote_id": "localhost"} - host = config.get('API_HOST') - port = config.get('API_PORT') + host = None + port = None def work(self): """ @@ -27,6 +27,9 @@ def work(self): :return: """ + self.host = self.config.get('API_HOST') + self.port = self.config.get('API_PORT') + if self.port == 0: # if configured not to listen, just loop until the backend shuts # down we can't return here immediately, since this is a worker, diff --git a/backend/workers/check_updates.py b/backend/workers/check_updates.py index 5dee4a998..bfe05d148 100644 --- a/backend/workers/check_updates.py +++ b/backend/workers/check_updates.py @@ -2,7 +2,6 @@ import requests import json -from common.config_manager import config from common.lib.helpers import add_notification, get_github_version from backend.lib.worker import BasicWorker from pathlib import Path @@ -22,11 +21,11 @@ class UpdateChecker(BasicWorker): max_workers = 1 # check once every three hours - ensure_job = {"remote_id": config.get("4cat.github_url"), "interval": 10800} + ensure_job = {"remote_id": self.config.get("4cat.github_url"), "interval": 10800} def work(self): - versionfile = Path(config.get("PATH_ROOT"), "config/.current-version") - repo_url = config.get("4cat.github_url") + versionfile = Path(self.config.get("PATH_ROOT"), "config/.current-version") + repo_url = self.config.get("4cat.github_url") if not versionfile.exists() or not repo_url: # need something to compare against... 
diff --git a/backend/workers/cleanup_tempfiles.py b/backend/workers/cleanup_tempfiles.py index 51e96fd57..6ba67f0d0 100644 --- a/backend/workers/cleanup_tempfiles.py +++ b/backend/workers/cleanup_tempfiles.py @@ -6,7 +6,6 @@ from pathlib import Path -from common.config_manager import config from backend.lib.worker import BasicWorker from common.lib.dataset import DataSet from common.lib.exceptions import WorkerInterruptedException, DataSetException @@ -34,7 +33,7 @@ def work(self): :return: """ - result_files = Path(config.get('PATH_DATA')).glob("*") + result_files = Path(self.config.get('PATH_DATA')).glob("*") for file in result_files: if file.stem.startswith("."): # skip hidden files diff --git a/backend/workers/datasource_metrics.py b/backend/workers/datasource_metrics.py index dfc5579d8..3c9ef530d 100644 --- a/backend/workers/datasource_metrics.py +++ b/backend/workers/datasource_metrics.py @@ -13,7 +13,6 @@ from datetime import datetime, time, timezone from backend.lib.worker import BasicWorker -from common.config_manager import config class DatasourceMetrics(BasicWorker): @@ -52,9 +51,9 @@ def general_stats(self): this worker instead of on demand. 
""" metrics = { - "size_data": DatasourceMetrics.folder_size(config.get("PATH_DATA")), - "size_logs": DatasourceMetrics.folder_size(config.get("PATH_LOGS")), - "size_db": self.db.fetchone("SELECT pg_database_size(%s) AS num", (config.get("DB_NAME"),))["num"] + "size_data": DatasourceMetrics.folder_size(self.config.get("PATH_DATA")), + "size_logs": DatasourceMetrics.folder_size(self.config.get("PATH_LOGS")), + "size_db": self.db.fetchone("SELECT pg_database_size(%s) AS num", (self.config.get("DB_NAME"),))["num"] } for metric, value in metrics.items(): @@ -95,7 +94,7 @@ def data_stats(self): """) added_datasources = [row["datasource"] for row in self.db.fetchall("SELECT DISTINCT(datasource) FROM metrics")] - enabled_datasources = config.get("datasources.enabled", {}) + enabled_datasources = self.config.get("datasources.enabled", {}) for datasource_id in self.modules.datasources: if datasource_id not in enabled_datasources: @@ -121,7 +120,7 @@ def data_stats(self): elif datasource_id == "8chan": settings_id = "eightchan" - boards = [b for b in config.get(settings_id + "-search.boards", [])] + boards = [b for b in self.config.get(settings_id + "-search.boards", [])] # If a datasource is static (so not updated) and it # is already present in the metrics table, we don't diff --git a/backend/workers/expire_items.py b/backend/workers/expire_items.py index ed4d1cc0f..22e34e058 100644 --- a/backend/workers/expire_items.py +++ b/backend/workers/expire_items.py @@ -3,7 +3,6 @@ """ import datetime import time -import json import re from backend.lib.worker import BasicWorker @@ -11,6 +10,7 @@ from common.lib.exceptions import DataSetNotFoundException, WorkerInterruptedException from common.lib.user import User +from common.config_manager import ConfigWrapper class ThingExpirer(BasicWorker): @@ -58,9 +58,11 @@ def expire_datasets(self): if self.interrupted: raise WorkerInterruptedException("Interrupted while expiring datasets") + # the dataset creator's configuration context 
determines expiration + wrapper = ConfigWrapper(self.config, user=dataset["creator"]) try: dataset = DataSet(key=dataset["key"], db=self.db) - if dataset.is_expired(): + if dataset.is_expired(config=wrapper): self.log.info(f"Deleting dataset {dataset.key} (expired)") dataset.delete() diff --git a/backend/workers/restart_4cat.py b/backend/workers/restart_4cat.py index 1c0538f56..05956dc7f 100644 --- a/backend/workers/restart_4cat.py +++ b/backend/workers/restart_4cat.py @@ -14,7 +14,6 @@ from backend.lib.worker import BasicWorker from common.lib.exceptions import WorkerInterruptedException -from common.config_manager import config class FourcatRestarterAndUpgrader(BasicWorker): @@ -50,11 +49,11 @@ def work(self): # prevent multiple restarts running at the same time which could blow # up really fast - lock_file = Path(config.get("PATH_ROOT")).joinpath("config/restart.lock") + lock_file = Path(self.config.get("PATH_ROOT")).joinpath("config/restart.lock") # this file has the log of the restart worker itself and is checked by # the frontend to see how far we are - log_file_restart = Path(config.get("PATH_ROOT")).joinpath(config.get("PATH_LOGS")).joinpath("restart.log") + log_file_restart = Path(self.config.get("PATH_ROOT")).joinpath(self.config.get("PATH_LOGS")).joinpath("restart.log") log_stream_restart = log_file_restart.open("a") if not is_resuming: @@ -74,7 +73,7 @@ def work(self): if self.job.data["remote_id"].startswith("upgrade"): command = sys.executable + " helper-scripts/migrate.py --repository %s --yes --restart --output %s" % \ - (shlex.quote(config.get("4cat.github_url")), shlex.quote(str(log_file_restart))) + (shlex.quote(self.config.get("4cat.github_url")), shlex.quote(str(log_file_restart))) if self.job.details and self.job.details.get("branch"): # migrate to code in specific branch command += f" --branch {shlex.quote(self.job.details['branch'])}" @@ -100,7 +99,7 @@ def work(self): # restarts and we re-attempt to make a daemon, it will fail # when trying 
to close the stdin file descriptor of the # subprocess (man, that was a fun bug to hunt down) - process = subprocess.Popen(shlex.split(command), cwd=str(config.get("PATH_ROOT")), + process = subprocess.Popen(shlex.split(command), cwd=str(self.config.get("PATH_ROOT")), stdout=log_stream_restart, stderr=log_stream_restart, stdin=subprocess.DEVNULL) @@ -143,20 +142,20 @@ def work(self): # front-end restart or upgrade too self.log.info("Restart worker resumed after restarting 4CAT, restart successful.") log_stream_restart.write("4CAT restarted.\n") - with Path(config.get("PATH_ROOT")).joinpath("config/.current-version").open() as infile: + with Path(self.config.get("PATH_ROOT")).joinpath("config/.current-version").open() as infile: log_stream_restart.write(f"4CAT is now running version {infile.readline().strip()}.\n") # we're gonna use some specific Flask routes to trigger this, i.e. # we're interacting with the front-end through HTTP - api_host = "https://" if config.get("flask.https") else "http://" - if config.get("USING_DOCKER"): + api_host = "https://" if self.config.get("flask.https") else "http://" + if self.config.get("USING_DOCKER"): import os docker_exposed_port = os.environ['PUBLIC_PORT'] api_host += f"host.docker.internal{':' + docker_exposed_port if docker_exposed_port != '80' else ''}" else: - api_host += config.get("flask.server_name") + api_host += self.config.get("flask.server_name") - if self.job.data["remote_id"].startswith("upgrade") and config.get("USING_DOCKER"): + if self.job.data["remote_id"].startswith("upgrade") and self.config.get("USING_DOCKER"): # when using Docker, the front-end needs to update separately log_stream_restart.write("Telling front-end Docker container to upgrade...\n") log_stream_restart.close() # close, because front-end will be writing to it diff --git a/common/config_manager.py b/common/config_manager.py index eb6c846d0..77835f105 100644 --- a/common/config_manager.py +++ b/common/config_manager.py @@ -432,7 +432,12 @@ def 
__init__(self, config, user=None, tags=None, request=None): serve 4CAT with a different configuration based on the proxy server used. """ - self.config = config + if type(config) is ConfigWrapper: + # let's not do nested wrappers + self.config = config.config + else: + self.config = config + self.user = user self.tags = tags self.request = request diff --git a/common/lib/dataset.py b/common/lib/dataset.py index b092d2a4e..fdeab606d 100644 --- a/common/lib/dataset.py +++ b/common/lib/dataset.py @@ -1299,7 +1299,7 @@ def get_breadcrumbs(self): genealogy.append(self.key) return ",".join(genealogy) - def get_compatible_processors(self, user=None): + def get_compatible_processors(self, config=None): """ Get list of processors compatible with this dataset @@ -1308,9 +1308,10 @@ def get_compatible_processors(self, user=None): specify accepted types (via the `is_compatible_with` method), it is assumed it accepts any top-level datasets - :param str|User|None user: User to get compatibility for. If set, - use the user-specific config settings where available. - + :param ConfigManager|None config: Configuration reader to determine + compatibility through. This may not be the same reader the dataset was + instantiated with, e.g. when checking whether some other user should + be able to run processors on this dataset. :return dict: Compatible processors, `name => class` mapping """ processors = self.modules.processors @@ -1324,7 +1325,7 @@ def get_compatible_processors(self, user=None): # method returns True *or* if it has no explicit compatibility # check and this dataset is top-level (i.e. 
has no parent) if (not hasattr(processor, "is_compatible_with") and not self.key_parent) \ - or (hasattr(processor, "is_compatible_with") and processor.is_compatible_with(self, user=user)): + or (hasattr(processor, "is_compatible_with") and processor.is_compatible_with(self, config=config)): available[processor_type] = processor return available @@ -1381,7 +1382,7 @@ def get_own_processor(self): return self.modules.processors.get(processor_type) - def get_available_processors(self, user=None, exclude_hidden=False): + def get_available_processors(self, config=None, exclude_hidden=False): """ Get list of processors that may be run for this dataset @@ -1390,8 +1391,10 @@ def get_available_processors(self, user=None, exclude_hidden=False): run but have options are included so they may be run again with a different configuration - :param str|User|None user: User to get compatibility for. If set, - use the user-specific config settings where available. + :param ConfigManager|None config: Configuration reader to determine + compatibility through. This may not be the same reader the dataset was + instantiated with, e.g. when checking whether some other user should + be able to run processors on this dataset. :param bool exclude_hidden: Exclude processors that should be displayed in the UI? If `False`, all processors are returned. @@ -1402,13 +1405,13 @@ def get_available_processors(self, user=None, exclude_hidden=False): # TODO: could children also have been created? 
Possible bug, but I have not seen anything effected by this return {processor_type: processor for processor_type, processor in self.available_processors.items() if not exclude_hidden or not processor.is_hidden} - processors = self.get_compatible_processors(user=user) + processors = self.get_compatible_processors(config=config) for analysis in self.children: if analysis.type not in processors: continue - if not processors[analysis.type].get_options(): + if not processors[analysis.type].get_options(config=config): del processors[analysis.type] continue @@ -1484,15 +1487,14 @@ def is_top_dataset(self): return False return True - def is_expiring(self, user=None): + def is_expiring(self, config): """ Determine if dataset is set to expire Similar to `is_expired`, but checks if the dataset will be deleted in the future, not if it should be deleted right now. - :param user: User to use for configuration context. Provide to make - sure configuration overrides for this user are taken into account. + :param ConfigManager config: Configuration reader (context-aware) :return bool|int: `False`, or the expiration date as a Unix timestamp. """ # has someone opted out of deleting this? @@ -1504,7 +1506,7 @@ def is_expiring(self, user=None): return self.parameters.get("expires-after") # is the data source configured to have its datasets expire? - expiration = config.get("datasources.expiration", {}, user=user) + expiration = config.get("datasources.expiration", {}) if not expiration.get(self.parameters.get("datasource")): return False @@ -1514,19 +1516,18 @@ def is_expiring(self, user=None): return False - def is_expired(self, user=None): + def is_expired(self, config): """ Determine if dataset should be deleted Datasets can be set to expire, but when they should be deleted depends on a number of factor. This checks them all. - :param user: User to use for configuration context. Provide to make - sure configuration overrides for this user are taken into account. 
+ :param ConfigManager config: Configuration reader (context-aware) :return bool: """ # has someone opted out of deleting this? - if not self.is_expiring(): + if not self.is_expiring(config): return False # is this dataset explicitly marked as expiring after a certain time? @@ -1535,7 +1536,7 @@ def is_expired(self, user=None): return True # is the data source configured to have its datasets expire? - expiration = config.get("datasources.expiration", {}, user=user) + expiration = config.get("datasources.expiration", {}) if not expiration.get(self.parameters.get("datasource")): return False diff --git a/common/lib/helpers.py b/common/lib/helpers.py index 2911044f5..8124ca073 100644 --- a/common/lib/helpers.py +++ b/common/lib/helpers.py @@ -332,6 +332,24 @@ def convert_to_int(value, default=0): return default +def convert_to_float(value, default=0.0): + """ + Convert a value to a float, with a fallback + + The fallback is used if an Error is thrown during conversion to float. + This is a convenience function, but beats putting try-catches everywhere + we're using user input as a float. 
+ + :param value: Value to convert + :param float default: Default value, if conversion not possible + :return float: Converted value + """ + try: + return float(value) + except (ValueError, TypeError): + return default + + def timify_long(number): """ Make a number look like an indication of time diff --git a/common/lib/module_loader.py b/common/lib/module_loader.py index a036bd249..8ba24002f 100644 --- a/common/lib/module_loader.py +++ b/common/lib/module_loader.py @@ -28,6 +28,7 @@ class ModuleCollector: ignore = [] missing_modules = {} log_buffer = None + config = None PROCESSOR = 1 WORKER = 2 @@ -36,15 +37,20 @@ class ModuleCollector: processors = {} datasources = {} - def __init__(self, write_config=False): + def __init__(self, config, write_cache=False): """ Load data sources and workers Datasources are loaded first so that the datasource folders may be scanned for workers subsequently. + + :param config: Configuration manager, shared with the rest of the + context + :param bool write_cache: Write modules to cache file? 
""" # this can be flushed later once the logger is available self.log_buffer = "" + self.config = config self.load_datasources() self.load_modules() @@ -54,17 +60,17 @@ def __init__(self, write_config=False): self.expand_datasources() # cache module-defined config options for use by the config manager - if write_config: + if write_cache: module_config = {} for worker in self.workers.values(): if hasattr(worker, "config") and type(worker.config) is dict: module_config.update(worker.config) - with config.get("PATH_ROOT").joinpath("config/module_config.bin").open("wb") as outfile: + with self.config.get("PATH_ROOT").joinpath("config/module_config.bin").open("wb") as outfile: pickle.dump(module_config, outfile) # load from cache - config.load_user_settings() + self.config.load_user_settings() @staticmethod def is_4cat_class(object, only_processors=False): @@ -98,15 +104,15 @@ def load_modules(self): """ # look for workers and processors in pre-defined folders and datasources - extension_path = Path(config.get('PATH_ROOT'), "extensions") + extension_path = Path(self.config.get('PATH_ROOT'), "extensions") - paths = [Path(config.get('PATH_ROOT'), "processors"), - Path(config.get('PATH_ROOT'), "backend", "workers"), + paths = [Path(self.config.get('PATH_ROOT'), "processors"), + Path(self.config.get('PATH_ROOT'), "backend", "workers"), extension_path, *[self.datasources[datasource]["path"] for datasource in self.datasources]] # extension datasources will be here and the above line... 
- root_match = re.compile(r"^%s" % re.escape(str(config.get('PATH_ROOT')))) - root_path = Path(config.get('PATH_ROOT')) + root_match = re.compile(r"^%s" % re.escape(str(self.config.get('PATH_ROOT')))) + root_path = Path(self.config.get('PATH_ROOT')) for folder in paths: # loop through folders, and files in those folders, recursively @@ -192,7 +198,7 @@ def _load_datasource(subdirectory): Load a single datasource """ # determine module name (path relative to 4CAT root w/ periods) - module_name = ".".join(subdirectory.relative_to(Path(config.get("PATH_ROOT"))).parts) + module_name = ".".join(subdirectory.relative_to(Path(self.config.get("PATH_ROOT"))).parts) try: datasource = importlib.import_module(module_name) except ImportError as e: @@ -206,7 +212,7 @@ def _load_datasource(subdirectory): datasource_id = datasource.DATASOURCE self.datasources[datasource_id] = { - "expire-datasets": config.get("datasources.expiration", {}).get(datasource_id, None), + "expire-datasets": self.config.get("datasources.expiration", {}).get(datasource_id, None), "path": subdirectory, "name": datasource.NAME if hasattr(datasource, "NAME") else datasource_id, "id": subdirectory.parts[-1], @@ -215,13 +221,13 @@ def _load_datasource(subdirectory): } # Load 4CAT core datasources - for subdirectory in Path(config.get('PATH_ROOT'), "datasources").iterdir(): + for subdirectory in Path(self.config.get('PATH_ROOT'), "datasources").iterdir(): if subdirectory.is_dir(): _load_datasource(subdirectory) # Load extension datasources # os.walk is used to allow for the possibility of multiple extensions, with nested "datasources" folders - for root, dirs, files in os.walk(Path(config.get('PATH_ROOT'), "extensions"), followlinks=True): + for root, dirs, files in os.walk(Path(self.config.get('PATH_ROOT'), "extensions"), followlinks=True): if "datasources" in dirs: for subdirectory in Path(root, "datasources").iterdir(): if subdirectory.is_dir(): @@ -243,7 +249,9 @@ def expand_datasources(self): worker = 
self.workers.get("%s-search" % datasource_id) self.datasources[datasource_id]["has_worker"] = bool(worker) self.datasources[datasource_id]["has_options"] = self.datasources[datasource_id]["has_worker"] and \ - bool(self.workers["%s-search" % datasource_id].get_options()) + bool(self.workers[ + "%s-search" % datasource_id].get_options( + config=self.config)) self.datasources[datasource_id]["importable"] = worker and hasattr(worker, "is_from_zeeschuimer") and worker.is_from_zeeschuimer def load_worker_class(self, worker): diff --git a/datasources/audio_to_text/audio_to_text.py b/datasources/audio_to_text/audio_to_text.py index 79ef43ed3..2ae3bd447 100644 --- a/datasources/audio_to_text/audio_to_text.py +++ b/datasources/audio_to_text/audio_to_text.py @@ -17,18 +17,18 @@ class AudioUploadToText(SearchMedia): description = "Upload your own audio and use OpenAI's Whisper model to create transcripts" # description displayed in UI @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): #TODO: False here does not appear to actually remove the datasource from the "Create dataset" page so technically # this method is not necessary; if we can adjust that behavior, it ought to function as intended # Ensure the Whisper model is available - return AudioToText.is_compatible_with(module=module, user=user) + return AudioToText.is_compatible_with(module=module, config=config) @classmethod - def get_options(cls, parent_dataset=None, user=None): + def get_options(cls, *args, **kwargs): # We need both sets of options for this datasource - media_options = SearchMedia.get_options(parent_dataset=parent_dataset, user=user) - whisper_options = AudioToText.get_options(parent_dataset=parent_dataset, user=user) + media_options = SearchMedia.get_options(*args, **kwargs) + whisper_options = AudioToText.get_options(*args, **kwargs) media_options.update(whisper_options) #TODO: there are some odd formatting issues if we use those 
derived options @@ -43,9 +43,9 @@ def get_options(cls, parent_dataset=None, user=None): return media_options @staticmethod - def validate_query(query, request, user): + def validate_query(query, request, config): # We need SearchMedia's validate_query to upload the media - media_query = SearchMedia.validate_query(query, request, user) + media_query = SearchMedia.validate_query(query, request, config) # Here's the real trick: act like a preset and add another processor to the pipeline media_query["next"] = [{"type": "audio-to-text", diff --git a/datasources/bitchute/search_bitchute.py b/datasources/bitchute/search_bitchute.py index c15540a50..d23a15f60 100644 --- a/datasources/bitchute/search_bitchute.py +++ b/datasources/bitchute/search_bitchute.py @@ -582,7 +582,7 @@ def request_from_bitchute(self, session, method, url, headers=None, data=None): return response - def validate_query(query, request, user): + def validate_query(query, request, config): """ Validate BitChute query input diff --git a/datasources/dmi-tcat/search_tcat.py b/datasources/dmi-tcat/search_tcat.py index 227125be6..b874eb07d 100644 --- a/datasources/dmi-tcat/search_tcat.py +++ b/datasources/dmi-tcat/search_tcat.py @@ -161,17 +161,17 @@ def collect_all_bins(cls, force_update=False): pass @classmethod - def get_options(cls, parent_dataset=None, user=None): + def get_options(cls, parent_dataset=None, config=None): """ Get data source options This method takes the pre-defined options, but fills the 'bins' options with bins currently available from the configured TCAT instances. + :param config: :param DataSet parent_dataset: An object representing the dataset that the processor would be run on - :param User user: Flask user the options will be displayed for, in - case they are requested for display in the 4CAT web interface. This can +can be used to show some options only to privileges users. 
""" options = cls.options @@ -507,13 +507,13 @@ def tcat_to_APIv2(tcat_tweet): return APIv2_tweet @staticmethod - def validate_query(query, request, user): + def validate_query(query, request, config): """ Validate DMI-TCAT query input :param dict query: Query parameters, from client-side. :param request: Flask request - :param User user: User object of user who has submitted the query + :param ConfigManager|None config: Configuration reader (context-aware) :return dict: Safe query parameters """ # no query 4 u diff --git a/datasources/dmi-tcatv2/search_tcat_v2.py b/datasources/dmi-tcatv2/search_tcat_v2.py index 362d51f6d..82812ce57 100644 --- a/datasources/dmi-tcatv2/search_tcat_v2.py +++ b/datasources/dmi-tcatv2/search_tcat_v2.py @@ -73,17 +73,17 @@ class SearchWithinTCATBinsV2(Search): } @classmethod - def get_options(cls, parent_dataset=None, user=None): + def get_options(cls, parent_dataset=None, config=None): """ Get data source options This method takes the pre-defined options, but fills the 'bins' options with bins currently available from the configured TCAT instances. + :param config: :param DataSet parent_dataset: An object representing the dataset that the processor would be run on - :param User user: Flask user the options will be displayed for, in - case they are requested for display in the 4CAT web interface. This can +can be used to show some options only to privileges users. """ options = cls.options @@ -381,13 +381,13 @@ def collect_tcat_metadata(cls): return all_bins @staticmethod - def validate_query(query, request, user): + def validate_query(query, request, config): """ Validate DMI-TCAT query input :param dict query: Query parameters, from client-side. 
:param request: Flask request - :param User user: User object of user who has submitted the query + :param ConfigManager|None config: Configuration reader (context-aware) :return dict: Safe query parameters """ # no query 4 u diff --git a/datasources/douban/search_douban.py b/datasources/douban/search_douban.py index 0fb983fbe..0dcec9d94 100644 --- a/datasources/douban/search_douban.py +++ b/datasources/douban/search_douban.py @@ -242,13 +242,13 @@ def get_douban_url(self, url, **kwargs): return requests.get(url, **kwargs) - def validate_query(query, request, user): + def validate_query(query, request, config): """ Validate input for a dataset query on the Douban data source. :param dict query: Query parameters, from client-side. :param request: Flask request - :param User user: User object of user who has submitted the query + :param ConfigManager|None config: Configuration reader (context-aware) :return dict: Safe query parameters """ filtered_query = {} diff --git a/datasources/fourcat_import/import_4cat.py b/datasources/fourcat_import/import_4cat.py index cd231b445..cdd3ada6f 100644 --- a/datasources/fourcat_import/import_4cat.py +++ b/datasources/fourcat_import/import_4cat.py @@ -341,7 +341,7 @@ def fetch_from_4cat(base, dataset_key, api_key, component, datapath=None): return response @staticmethod - def validate_query(query, request, user): + def validate_query(query, request, config): """ Validate custom data input @@ -350,7 +350,7 @@ def validate_query(query, request, user): :param dict query: Query parameters, from client-side. 
:param request: Flask request - :param User user: User object of user who has submitted the query + :param ConfigManager|None config: Configuration reader (context-aware) :return dict: Safe query parameters """ urls = query.get("url") diff --git a/datasources/fourchan/scrapers/download_images.py b/datasources/fourchan/scrapers/download_images.py index fab9f75a1..a429ca311 100644 --- a/datasources/fourchan/scrapers/download_images.py +++ b/datasources/fourchan/scrapers/download_images.py @@ -29,7 +29,7 @@ def work(self): """ try: url = "http://i.4cdn.org/%s/%s%s" % (self.job.details["board"], self.job.details["tim"], self.job.details["ext"]) - image = requests.get(url, timeout=config.get('SCRAPE_TIMEOUT') * 3) + image = requests.get(url, timeout=self.config.get('SCRAPE_TIMEOUT') * 3) except (requests.exceptions.RequestException, ConnectionRefusedError) as e: # something wrong with our internet connection? or blocked by 4chan? # try again in a minute diff --git a/datasources/fourchan/scrapers/scrape_threads.py b/datasources/fourchan/scrapers/scrape_threads.py index af2d0992f..7aeecacdf 100644 --- a/datasources/fourchan/scrapers/scrape_threads.py +++ b/datasources/fourchan/scrapers/scrape_threads.py @@ -220,7 +220,7 @@ def save_post(self, post, thread, first_post): self.log.error("ValueError (%s) during scrape of thread %s" % (e, post["no"])) # Download images (exclude .webm files) - if "filename" in post and post["ext"] != ".webm" and config.get("fourchan-search.save_images"): + if "filename" in post and post["ext"] != ".webm" and self.config.get("fourchan-search.save_images"): self.queue_image(post, thread) return return_value @@ -241,11 +241,11 @@ def queue_image(self, post, thread): md5 = hashlib.md5() md5.update(base64.b64decode(post["md5"])) - image_folder = config.get('PATH_ROOT').joinpath(config.get('PATH_IMAGES')) + image_folder = self.config.get('PATH_ROOT').joinpath(self.config.get('PATH_IMAGES')) image_path = image_folder.joinpath(md5.hexdigest() + 
post["ext"]) - if config.get('PATH_IMAGES') and image_folder.is_dir() and not image_path.is_file(): - claimtime = int(time.time()) + int(config.get("fourchan-search.image_interval")) + if self.config.get('PATH_IMAGES') and image_folder.is_dir() and not image_path.is_file(): + claimtime = int(time.time()) + int(self.config.get("fourchan-search.image_interval")) try: self.queue.add_job("fourchan-image", remote_id=post["md5"], claim_after=claimtime, details={ diff --git a/datasources/fourchan/search_4chan.py b/datasources/fourchan/search_4chan.py index 17694badc..8f5446d3c 100644 --- a/datasources/fourchan/search_4chan.py +++ b/datasources/fourchan/search_4chan.py @@ -564,7 +564,7 @@ def get_items_complex(self, query): postgres_join = "" # Option wether to use sphinx for text searches - use_sphinx = config.get("fourchan.use_sphinx", True) + use_sphinx = self.config.get("fourchan.use_sphinx", True) if query.get("min_date", None): try: @@ -808,9 +808,9 @@ def get_sphinx_handler(self): :return MySQLDatabase: """ return MySQLDatabase( - host=config.get("4cat.sphinx_host"), - user=config.get('DB_USER'), - password=config.get('DB_PASSWORD'), + host=self.config.get("4cat.sphinx_host"), + user=self.config.get('DB_USER'), + password=self.config.get('DB_PASSWORD'), port=9306, logger=self.log ) @@ -831,7 +831,7 @@ def get_thread_sizes(self, thread_ids, min_length): return thread_sizes - def validate_query(query, request, user): + def validate_query(query, request, config): """ Validate input for a dataset query on the 4chan data source. @@ -841,12 +841,12 @@ def validate_query(query, request, user): :param dict query: Query parameters, from client-side. 
:param request: Flask request - :param User user: User object of user who has submitted the query + :param ConfigManager|None config: Configuration reader (context-aware) :return dict: Safe query parameters """ # this is the bare minimum, else we can't narrow down the full data set - if not user.is_admin and not config.get("fourchan-search.can_query_without_keyword", False, user=user) \ + if not config.get("fourchan-search.can_query_without_keyword", False) \ and not query.get("body_match", None) \ and not query.get("subject_match", None) \ and query.get("search_scope", "") != "random-sample" \ diff --git a/datasources/media_import/import_media.py b/datasources/media_import/import_media.py index d45016efd..6d06000ac 100644 --- a/datasources/media_import/import_media.py +++ b/datasources/media_import/import_media.py @@ -30,7 +30,7 @@ class SearchMedia(BasicProcessor): accepted_file_types = ["audio", "video", "image"] @classmethod - def get_options(cls, parent_dataset=None, user=None): + def get_options(cls, parent_dataset=None, config=None): return { "intro": { "type": UserInput.OPTION_INFO, @@ -49,7 +49,7 @@ def get_options(cls, parent_dataset=None, user=None): } @staticmethod - def validate_query(query, request, user): + def validate_query(query, request, config): """ Step 1: Validate query and files @@ -57,8 +57,7 @@ def validate_query(query, request, user): :param dict query: Query parameters, from client-side. :param request: Flask request - :param User user: User object of user who has submitted the query - :return dict: Safe query parameters + :param ConfigManager|None config: Configuration reader (context-aware) """ # do we have uploaded files? 
bad_files = [] diff --git a/datasources/reddit/search_reddit.py b/datasources/reddit/search_reddit.py index 1d64c4a83..3c3113023 100644 --- a/datasources/reddit/search_reddit.py +++ b/datasources/reddit/search_reddit.py @@ -57,15 +57,16 @@ class SearchReddit(Search): after = "after" @classmethod - def get_options(cls, parent_dataset=None, user=None): - """ - Determine if user needs to see the 'careful with wildcard queries!' - warning - - :param parent_dataset: - :param user: - :return dict: Options definition - """ + def get_options(cls, parent_dataset=None, config=None): + """ + Determine if user needs to see the 'careful with wildcard queries!' + warning + + :param config: + :param parent_dataset: + :param user: + :return dict: Options definition + """ options = { "wildcard-warning": { "type": UserInput.OPTION_INFO, @@ -518,7 +519,7 @@ def wait_until_window(self): # clean up timestamps outside of window self.request_timestamps = [timestamp for timestamp in self.request_timestamps if timestamp >= window_start] - def validate_query(query, request, user): + def validate_query(query, request, config): """ Validate input for a dataset query on the 4chan data source. @@ -528,7 +529,7 @@ def validate_query(query, request, user): :param dict query: Query parameters, from client-side. :param request: Flask request - :param User user: User object of user who has submitted the query + :param ConfigManager|None config: Configuration reader (context-aware) :return dict: Safe query parameters """ # we need a board! 
@@ -541,10 +542,10 @@ def validate_query(query, request, user): # ignore leading r/ for boards query["board"] = ",".join(boards) - keywordless_query = config.get("reddit-search.can_query_without_keyword", False, user=user) + keywordless_query = config.get("reddit-search.can_query_without_keyword", False) # this is the bare minimum, else we can't narrow down the full data set - if not user.is_admin and not keywordless_query and not query.get( + if not keywordless_query and not query.get( "body_match", "").strip() and not query.get("subject_match", "").strip() and not query.get( "subject_url", ""): raise QueryParametersException("Please provide a body query or subject query.") diff --git a/datasources/telegram/search_telegram.py b/datasources/telegram/search_telegram.py index 55c3a61b7..69e8a9f35 100644 --- a/datasources/telegram/search_telegram.py +++ b/datasources/telegram/search_telegram.py @@ -80,7 +80,7 @@ class SearchTelegram(Search): } @classmethod - def get_options(cls, parent_dataset=None, user=None): + def get_options(cls, parent_dataset=None, config=None): """ Get processor options @@ -89,11 +89,10 @@ def get_options(cls, parent_dataset=None, user=None): :param DataSet parent_dataset: An object representing the dataset that the processor would be run on - :param User user: Flask user the options will be displayed for, in - case they are requested for display in the 4CAT web interface. This can - be used to show some options only to privileges users. + :param ConfigManager|None config: Configuration reader (context-aware) """ - max_entities = config.get("telegram-search.max_entities", 25, user=user) + + max_entities = config.get("telegram-search.max_entities", 25) options = { "intro": { "type": UserInput.OPTION_INFO, @@ -177,7 +176,7 @@ def get_options(cls, parent_dataset=None, user=None): options["query-intro"]["help"] = (f"You can collect messages from up to **{max_entities:,}** entities " f"(channels or groups) at a time. 
Separate with line breaks or commas.") - all_messages = config.get("telegram-search.can_query_all_messages", False, user=user) + all_messages = config.get("telegram-search.can_query_all_messages", False) if all_messages: if "max" in options["max_posts"]: del options["max_posts"]["max"] @@ -185,7 +184,7 @@ def get_options(cls, parent_dataset=None, user=None): options["max_posts"]["help"] = (f"Messages to collect per entity. You can query up to " f"{options['max_posts']['max']:,} messages per entity.") - if config.get("telegram-search.max_crawl_depth", 0, user=user) > 0: + if config.get("telegram-search.max_crawl_depth", 0) > 0: options["crawl_intro"] = { "type": UserInput.OPTION_INFO, "help": "Optionally, 4CAT can 'discover' new entities via forwarded messages; for example, if a " @@ -263,7 +262,7 @@ async def execute_queries(self): session_id = SearchTelegram.create_session_id(query["api_phone"], query["api_id"], query["api_hash"]) self.dataset.log(f'Telegram session id: {session_id}') - session_path = Path(config.get("PATH_ROOT")).joinpath(config.get("PATH_SESSIONS"), session_id + ".session") + session_path = Path(self.config.get("PATH_ROOT")).joinpath(self.config.get("PATH_SESSIONS"), session_id + ".session") client = None @@ -847,13 +846,15 @@ def serialize_obj(input_obj): return mapped_obj @staticmethod - def validate_query(query, request, user): + def validate_query(query, request, config): """ Validate Telegram query + :param config: :param dict query: Query parameters, from client-side. 
:param request: Flask request :param User user: User object of user who has submitted the query + :param ConfigManager config: Configuration reader (context-aware) :return dict: Safe query parameters """ # no query 4 u @@ -863,10 +864,11 @@ def validate_query(query, request, user): if not query.get("api_id", None) or not query.get("api_hash", None) or not query.get("api_phone", None): raise QueryParametersException("You need to provide valid Telegram API credentials first.") - all_posts = config.get("telegram-search.can_query_all_messages", False, user=user) - max_entities = config.get("telegram-search.max_entities", 25, user=user) + all_posts = config.get("telegram-search.can_query_all_messages", False) + max_entities = config.get("telegram-search.max_entities", 25) - num_items = query.get("max_posts") if all_posts else min(query.get("max_posts"), SearchTelegram.get_options()["max_posts"]["max"]) + num_items = query.get("max_posts") if all_posts else min(query.get("max_posts"), SearchTelegram.get_options( + config=config)["max_posts"]["max"]) # reformat queries to be a comma-separated list with no wrapping # whitespace @@ -889,14 +891,14 @@ def validate_query(query, request, user): min_date, max_date = query.get("daterange") # now check if there is an active API session - if not user or not user.is_authenticated or user.is_anonymous: + if not hasattr(config, "user") or not config.user.is_authenticated or config.user.is_anonymous: raise QueryParametersException("Telegram scraping is only available to logged-in users with personal " "accounts.") # check for the information we need session_id = SearchTelegram.create_session_id(query.get("api_phone"), query.get("api_id"), query.get("api_hash")) - user.set_value("telegram.session", session_id) + config.user.set_value("telegram.session", session_id) session_path = Path(config.get('PATH_ROOT')).joinpath(config.get('PATH_SESSIONS'), session_id + ".session") client = None diff --git 
a/datasources/tiktok_urls/search_tiktok_urls.py b/datasources/tiktok_urls/search_tiktok_urls.py index d8864be91..bfbe82ceb 100644 --- a/datasources/tiktok_urls/search_tiktok_urls.py +++ b/datasources/tiktok_urls/search_tiktok_urls.py @@ -77,13 +77,13 @@ def get_items(self, query): return loop.run_until_complete(tiktok_scraper.request_metadata(query["urls"].split(","))) @staticmethod - def validate_query(query, request, user): + def validate_query(query, request, config): """ Validate TikTok query :param dict query: Query parameters, from client-side. :param request: Flask request - :param User user: User object of user who has submitted the query + :param ConfigManager|None config: Configuration reader (context-aware) :return dict: Safe query parameters """ # reformat queries to be a comma-separated list with no wrapping diff --git a/datasources/tumblr/search_tumblr.py b/datasources/tumblr/search_tumblr.py index 0ce4328dc..506c47370 100644 --- a/datasources/tumblr/search_tumblr.py +++ b/datasources/tumblr/search_tumblr.py @@ -77,9 +77,10 @@ class SearchTumblr(Search): references = ["[Tumblr API documentation](https://www.tumblr.com/docs/en/api/v2)"] @classmethod - def get_options(cls, parent_dataset=None, user=None): + def get_options(cls, parent_dataset=None, config=None): """ Check is Tumbler keys configured and if not, requests from User + :param config: """ options = { "intro": { @@ -114,7 +115,7 @@ def get_options(cls, parent_dataset=None, user=None): } try: - config_keys = SearchTumblr.get_tumbler_keys(user) + config_keys = SearchTumblr.get_tumbler_keys(config) except ConfigException: # No 4CAT set keys for user; let user input their own options["key-info"] = { @@ -666,12 +667,12 @@ def get_post_by_id(self, blog_name, post_id): return result @staticmethod - def get_tumbler_keys(user): + def get_tumbler_keys(config): config_keys = [ - config.get("api.tumblr.consumer_key", user=user), - config.get("api.tumblr.consumer_secret", user=user), - 
config.get("api.tumblr.key", user=user), - config.get("api.tumblr.secret_key", user=user)] + config.get("api.tumblr.consumer_key"), + config.get("api.tumblr.consumer_secret"), + config.get("api.tumblr.key"), + config.get("api.tumblr.secret_key")] if not all(config_keys): raise ConfigException("Not all Tumblr API credentials are configured. Cannot query Tumblr API.") return config_keys @@ -688,7 +689,7 @@ def connect_to_tumblr(self): self.parameters.get("secret_key")] if not all(config_keys): # No user input keys; attempt to use 4CAT config keys - config_keys = self.get_tumbler_keys(self.owner) + config_keys = self.get_tumbler_keys(self.config) self.client = pytumblr.TumblrRestClient(*config_keys) @@ -701,7 +702,7 @@ def connect_to_tumblr(self): return self.client - def validate_query(query, request, user): + def validate_query(query, request, config): """ Validate custom data input @@ -710,7 +711,7 @@ def validate_query(query, request, user): :param dict query: Query parameters, from client-side. 
:param request: Flask request - :param User user: User object of user who has submitted the query + :param ConfigManager|None config: Configuration reader (context-aware) :return dict: Safe query parameters """ # no query 4 u diff --git a/datasources/twitterv2/search_twitter.py b/datasources/twitterv2/search_twitter.py index 999680b6e..f75d2cf4b 100644 --- a/datasources/twitterv2/search_twitter.py +++ b/datasources/twitterv2/search_twitter.py @@ -150,7 +150,7 @@ def get_items(self, query): tweets = 0 for query in queries: - if self.parameters.get("query_type", "query") == "id_lookup" and config.get("twitterv2-search.id_lookup"): + if self.parameters.get("query_type", "query") == "id_lookup" and self.config.get("twitterv2-search.id_lookup"): params['ids'] = query else: params['query'] = query @@ -458,7 +458,7 @@ def fix_tweet_error(self, tweet_error): return modified_tweet @classmethod - def get_options(cls, parent_dataset=None, user=None): + def get_options(cls, parent_dataset=None, config=None): """ Get Twitter data source options @@ -466,12 +466,12 @@ def get_options(cls, parent_dataset=None, user=None): may not need to provide their own API key, and may or may not be able to enter a list of tweet IDs as their query. Hence the method. 
+ :param config: :param parent_dataset: Should always be None - :param user: User to provide options for :return dict: Data source options """ - have_api_key = config.get("twitterv2-search.academic_api_key", user=user) - max_tweets = config.get("twitterv2-search.max_tweets", user=user) + have_api_key = config.get("twitterv2-search.academic_api_key") + max_tweets = config.get("twitterv2-search.max_tweets") if have_api_key: intro_text = ("This data source uses the full-archive search endpoint of the Twitter API (v2) to retrieve " @@ -518,7 +518,7 @@ def get_options(cls, parent_dataset=None, user=None): }, }) - if config.get("twitterv2.id_lookup", user=user): + if config.get("twitterv2.id_lookup"): options.update({ "query_type": { "type": UserInput.OPTION_CHOICE, @@ -562,7 +562,7 @@ def get_options(cls, parent_dataset=None, user=None): return options @staticmethod - def validate_query(query, request, user): + def validate_query(query, request, config): """ Validate input for a dataset query on the Twitter data source. @@ -576,11 +576,11 @@ def validate_query(query, request, user): :param dict query: Query parameters, from client-side. 
:param request: Flask request - :param User user: User object of user who has submitted the query + :param ConfigManager|None config: Configuration reader (context-aware) :return dict: Safe query parameters """ - have_api_key = config.get("twitterv2-search.academic_api_key", user=user) - max_tweets = config.get("twitterv2-search.max_tweets", 10_000_000, user=user) + have_api_key = config.get("twitterv2-search.academic_api_key") + max_tweets = config.get("twitterv2-search.max_tweets", 10_000_000) # this is the bare minimum, else we can't narrow down the full data set if not query.get("query", None): @@ -593,7 +593,7 @@ def validate_query(query, request, user): if len(query.get("query")) > 1024 and query.get("query_type", "query") != "id_lookup": raise QueryParametersException("Twitter API queries cannot be longer than 1024 characters.") - if query.get("query_type", "query") == "id_lookup" and config.get("twitterv2-search.id_lookup", user=user): + if query.get("query_type", "query") == "id_lookup" and config.get("twitterv2-search.id_lookup"): # reformat queries to be a comma-separated list with no wrapping # whitespace whitespace = re.compile(r"\s+") diff --git a/datasources/upload/import_csv.py b/datasources/upload/import_csv.py index 53a91cde0..7e9045a57 100644 --- a/datasources/upload/import_csv.py +++ b/datasources/upload/import_csv.py @@ -185,7 +185,7 @@ def process(self): else: self.dataset.finish(done) - def validate_query(query, request, user): + def validate_query(query, request, config): """ Validate custom data input @@ -194,7 +194,7 @@ def validate_query(query, request, user): :param dict query: Query parameters, from client-side. :param request: Flask request - :param User user: User object of user who has submitted the query + :param ConfigManager|None config: Configuration reader (context-aware) :return dict: Safe query parameters """ # do we have an uploaded file? 
diff --git a/datasources/vk/search_vk.py b/datasources/vk/search_vk.py index d04daba0a..b73303981 100644 --- a/datasources/vk/search_vk.py +++ b/datasources/vk/search_vk.py @@ -34,12 +34,12 @@ class SearchVK(Search): expanded_profile_fields = "id,screen_name,first_name,last_name,name,deactivated,is_closed,is_admin,sex,city,country,photo_200,photo_100,photo_50,followers_count,members_count" # https://vk.com/dev/objects/user & https://vk.com/dev/objects/group @classmethod - def get_options(cls, parent_dataset=None, user=None): + def get_options(cls, parent_dataset=None, config=None): """ Get VK data source options + :param config: :param parent_dataset: Should always be None - :param user: User to provide options for :return dict: Data source options """ @@ -177,7 +177,7 @@ def login(self, username, password): """ vk_session = vk_api.VkApi(username, password, - config_filename=Path(config.get("PATH_ROOT")).joinpath(config.get("PATH_SESSIONS"), username+"-vk_config.json")) + config_filename=Path(self.config.get("PATH_ROOT")).joinpath(self.config.get("PATH_SESSIONS"), username+"-vk_config.json")) vk_session.auth() return vk_session @@ -328,7 +328,7 @@ def expand_profile_fields(dict_of_profile_types): return author_types @staticmethod - def validate_query(query, request, user): + def validate_query(query, request, config): """ Validate input for a dataset query on the VK data source. @@ -337,7 +337,7 @@ def validate_query(query, request, user): :param dict query: Query parameters, from client-side. :param request: Flask request - :param User user: User object of user who has submitted the query + :param ConfigManager|None config: Configuration reader (context-aware) :return dict: Safe query parameters """ # Please provide something... 
diff --git a/processors/audio/audio_extractor.py b/processors/audio/audio_extractor.py index f2d561974..cb446a7d4 100644 --- a/processors/audio/audio_extractor.py +++ b/processors/audio/audio_extractor.py @@ -36,19 +36,22 @@ class AudioExtractor(BasicProcessor): followups = ["audio-to-text"] @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow on videos only + + :param ConfigManager|None config: Configuration reader (context-aware) """ return (module.get_media_type() == "video" or module.type.startswith("video-downloader")) and \ - config.get("video-downloader.ffmpeg_path", user=user) and \ + config.get("video-downloader.ffmpeg_path") and \ shutil.which(config.get("video-downloader.ffmpeg_path")) @classmethod - def get_options(cls, parent_dataset=None, user=None): - """ - Collect maximum number of audio files from configuration and update options accordingly - """ + def get_options(cls, parent_dataset=None, config=None): + """ + Collect maximum number of audio files from configuration and update options accordingly + :param config: + """ options = { "amount": { "type": UserInput.OPTION_TEXT, diff --git a/processors/conversion/clarifai_to_csv.py b/processors/conversion/clarifai_to_csv.py index 23b36eedb..d27fb3cff 100644 --- a/processors/conversion/clarifai_to_csv.py +++ b/processors/conversion/clarifai_to_csv.py @@ -28,11 +28,12 @@ class ConvertClarifaiOutputToCSV(BasicProcessor): extension = "csv" # extension of result file, used internally and in UI @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Determine if processor is compatible :param module: Module determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.type == "clarifai-api" diff --git a/processors/conversion/consolidate_urls.py b/processors/conversion/consolidate_urls.py index 
53e62fc6d..7f5d0d9bc 100644 --- a/processors/conversion/consolidate_urls.py +++ b/processors/conversion/consolidate_urls.py @@ -222,9 +222,10 @@ class ConsolidateURLs(BasicProcessor): } @classmethod - def get_options(cls, parent_dataset=None, user=None): + def get_options(cls, parent_dataset=None, config=None): """ Update "columns" option with parent dataset columns + :param config: """ options = cls.options # Get the columns for the select columns option @@ -238,11 +239,12 @@ def get_options(cls, parent_dataset=None, user=None): return options @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ This is meant to be inherited by other child classes :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.get_extension() in ["csv", "ndjson"] diff --git a/processors/conversion/csv_to_excel.py b/processors/conversion/csv_to_excel.py index fe8139748..e03bec1dd 100644 --- a/processors/conversion/csv_to_excel.py +++ b/processors/conversion/csv_to_excel.py @@ -23,11 +23,12 @@ class ConvertCSVToMacExcel(BasicProcessor): extension = "csv" # extension of result file, used internally and in UI @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Determine if processor is compatible with dataset :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ if module.type == cls.type: return False diff --git a/processors/conversion/csv_to_json.py b/processors/conversion/csv_to_json.py index a967edf07..aef900379 100644 --- a/processors/conversion/csv_to_json.py +++ b/processors/conversion/csv_to_json.py @@ -21,11 +21,12 @@ class ConvertCSVToJSON(BasicProcessor): extension = "json" # extension of result file, used internally and in UI @classmethod - def is_compatible_with(cls, 
module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Determine if processor is compatible with a dataset or processor :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.get_extension() == "csv" diff --git a/processors/conversion/extract_urls.py b/processors/conversion/extract_urls.py index bb76db569..9d6f0ec33 100644 --- a/processors/conversion/extract_urls.py +++ b/processors/conversion/extract_urls.py @@ -190,16 +190,19 @@ class ExtractURLs(BasicProcessor): ) @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ All processor on CSV and NDJSON datasets + + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.get_extension() in ["csv", "ndjson"] @classmethod - def get_options(cls, parent_dataset=None, user=None): + def get_options(cls, parent_dataset=None, config=None): """ Update "columns" option with parent dataset columns + :param config: """ options = cls.options # Get the columns for the select columns option diff --git a/processors/conversion/merge_datasets.py b/processors/conversion/merge_datasets.py index 860c0ddbe..3477cbd7e 100644 --- a/processors/conversion/merge_datasets.py +++ b/processors/conversion/merge_datasets.py @@ -51,11 +51,12 @@ class DatasetMerger(BasicProcessor): } @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor on any top-level CSV or NDJSON file :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.get_extension() in ("csv", "ndjson") and (module.is_from_collector()) diff --git a/processors/conversion/ndjson_to_csv.py b/processors/conversion/ndjson_to_csv.py index c7f46cf3c..efcb0bd07 100644 --- 
a/processors/conversion/ndjson_to_csv.py +++ b/processors/conversion/ndjson_to_csv.py @@ -27,11 +27,12 @@ class ConvertNDJSONtoCSV(BasicProcessor): extension = "csv" # extension of result file, used internally and in UI @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Determine if processor is compatible with dataset :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.get_extension() == "ndjson" diff --git a/processors/conversion/split_by_thread.py b/processors/conversion/split_by_thread.py index 1d7677fc3..02c5c947b 100644 --- a/processors/conversion/split_by_thread.py +++ b/processors/conversion/split_by_thread.py @@ -26,11 +26,12 @@ class ThreadSplitter(BasicProcessor): extension = "zip" # extension of result file, used internally and in UI @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Determine if processor is compatible with dataset :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.parameters.get("datasource") in ("4chan", "8chan", "reddit", "breitbart") diff --git a/processors/conversion/stringify.py b/processors/conversion/stringify.py index 21835037a..f02c75689 100644 --- a/processors/conversion/stringify.py +++ b/processors/conversion/stringify.py @@ -46,11 +46,12 @@ class Stringify(BasicProcessor): } @staticmethod - def is_compatible_with(module=None, user=None): + def is_compatible_with(module=None, config=None): """ Determine compatibility; this processor is only compatible with top datasets in CSV or NDJSON format. 
:param str module: Module ID to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) :return bool: """ return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") diff --git a/processors/conversion/tcat_auto_upload.py b/processors/conversion/tcat_auto_upload.py index 53a6d7be6..e17de21b9 100644 --- a/processors/conversion/tcat_auto_upload.py +++ b/processors/conversion/tcat_auto_upload.py @@ -61,7 +61,7 @@ class FourcatToDmiTcatUploader(BasicProcessor): } @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Determine if processor is compatible with dataset @@ -69,15 +69,16 @@ def is_compatible_with(cls, module=None, user=None): TCAT-compatible file. :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.type == "convert-ndjson-for-tcat" and \ - config.get('tcat-auto-upload.server_url', user=user) and \ - config.get('tcat-auto-upload.token', user=user) and \ - config.get('tcat-auto-upload.username', user=user) and \ - config.get('tcat-auto-upload.password', user=user) + config.get('tcat-auto-upload.server_url') and \ + config.get('tcat-auto-upload.token') and \ + config.get('tcat-auto-upload.username') and \ + config.get('tcat-auto-upload.password') @classmethod - def get_options(cls, parent_dataset=None, user=None): + def get_options(cls, parent_dataset=None, config=None): """ Get processor options @@ -85,20 +86,20 @@ def get_options(cls, parent_dataset=None, user=None): TCAT servers are configured. Otherwise, no options are given since there is nothing to choose. 
+ :param config: :param DataSet parent_dataset: Dataset that will be uploaded - :param User user: User that will be uploading it - :return dict: Option definition - """ - if config.get('tcat-auto-upload.server_url', user=user) \ - and type(config.get('tcat-auto-upload.server_url', user=user)) in (set, list, tuple) \ - and len(config.get('tcat-auto-upload.server_url', user=user)) > 1: + + + if config.get('tcat-auto-upload.server_url') \ + and type(config.get('tcat-auto-upload.server_url')) in (set, list, tuple) \ + and len(config.get('tcat-auto-upload.server_url')) > 1: return { "server": { "type": UserInput.OPTION_CHOICE, "options": { "random": "Choose one based on available capacity", **{ - url: url for url in config.get('tcat-auto-upload.server_url', user=user) + url: url for url in config.get('tcat-auto-upload.server_url') } }, "default": "random", diff --git a/processors/conversion/twitter_ndjson_to_tcat_json.py b/processors/conversion/twitter_ndjson_to_tcat_json.py index 24c4bab9c..96cb82519 100644 --- a/processors/conversion/twitter_ndjson_to_tcat_json.py +++ b/processors/conversion/twitter_ndjson_to_tcat_json.py @@ -23,7 +23,7 @@ class ConvertNDJSONToJSON(BasicProcessor): followups = ["tcat-auto-upload"] @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Determine if processor is compatible with dataset diff --git a/processors/conversion/view_metadata.py b/processors/conversion/view_metadata.py index e8b27c87b..ae465ac0e 100644 --- a/processors/conversion/view_metadata.py +++ b/processors/conversion/view_metadata.py @@ -37,11 +37,12 @@ class ViewMetadata(BasicProcessor): } @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Determine if processor is compatible with dataset :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ 
return module.type.startswith("video-downloader") or module.type.startswith("image-downloader") diff --git a/processors/conversion/vision_api_to_csv.py b/processors/conversion/vision_api_to_csv.py index 1319f16af..b7c40aa9c 100644 --- a/processors/conversion/vision_api_to_csv.py +++ b/processors/conversion/vision_api_to_csv.py @@ -28,11 +28,12 @@ class ConvertVisionOutputToCSV(BasicProcessor): extension = "csv" # extension of result file, used internally and in UI @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Determine if processor is compatible with dataset :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.type == "google-vision-api" diff --git a/processors/filtering/accent_fold.py b/processors/filtering/accent_fold.py index f3803c198..ff6a851c6 100644 --- a/processors/filtering/accent_fold.py +++ b/processors/filtering/accent_fold.py @@ -54,11 +54,12 @@ class AccentFoldingFilter(BasicProcessor): } @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor on iterable files :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.is_top_dataset() and module.get_extension() in ["csv",] @@ -117,7 +118,7 @@ def after_process(self): self.create_standalone() @classmethod - def get_options(cls, parent_dataset=None, user=None): + def get_options(cls, parent_dataset=None, config=None): """ Get processor options @@ -126,10 +127,10 @@ def get_options(cls, parent_dataset=None, user=None): fine-grained options, e.g. in cases where the availability of options is partially determined by the parent dataset's parameters. 
+ :param config: :param DataSet parent_dataset: An object representing the dataset that the processor would be run on - :param User user: Flask user the options will be displayed for, in - case they are requested for display in the 4CAT web interface. This can +can be used to show some options only to privileges users. """ options = cls.options diff --git a/processors/filtering/base_filter.py b/processors/filtering/base_filter.py index 5d9deff71..b9634fb8a 100644 --- a/processors/filtering/base_filter.py +++ b/processors/filtering/base_filter.py @@ -25,11 +25,12 @@ class BaseFilter(BasicProcessor): description = "This should not be available." @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ This is meant to be inherited by other child classes :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return False diff --git a/processors/filtering/column_filter.py b/processors/filtering/column_filter.py index 2dc73b63e..c5f1c87a2 100644 --- a/processors/filtering/column_filter.py +++ b/processors/filtering/column_filter.py @@ -73,16 +73,17 @@ class ColumnFilter(BaseFilter): } @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor on top datasets that are CSV or NDJSON. :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") @classmethod - def get_options(cls, parent_dataset=None, user=None): + def get_options(cls, parent_dataset=None, config=None): options = cls.options if not parent_dataset: @@ -260,11 +261,12 @@ class ColumnProcessorFilter(ColumnFilter): description = "A generic filter that checks whether a value in a selected column matches a custom requirement. 
" @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow on child datasets and do not create a standalone dataset :param module: Dataset or processor to determine compatibility with + :param ConfigManager config: Configuration reader (context-aware) """ return not module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") diff --git a/processors/filtering/date_filter.py b/processors/filtering/date_filter.py index 5e14b03e4..27bbe346a 100644 --- a/processors/filtering/date_filter.py +++ b/processors/filtering/date_filter.py @@ -30,11 +30,12 @@ class DateFilter(BaseFilter): } @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor on NDJSON and CSV files :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") diff --git a/processors/filtering/lexical_filter.py b/processors/filtering/lexical_filter.py index 285d602b8..feb0ecdc3 100644 --- a/processors/filtering/lexical_filter.py +++ b/processors/filtering/lexical_filter.py @@ -66,11 +66,12 @@ class LexicalFilter(BaseFilter): } @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor on NDJSON and CSV files :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") diff --git a/processors/filtering/random_filter.py b/processors/filtering/random_filter.py index b29b5b450..e636ddb33 100644 --- a/processors/filtering/random_filter.py +++ b/processors/filtering/random_filter.py @@ -31,11 +31,12 @@ class RandomFilter(BaseFilter): } @classmethod - def 
is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor on NDJSON and CSV files :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") diff --git a/processors/filtering/reddit_get_votes.py b/processors/filtering/reddit_get_votes.py index 52d46d10c..9e02a74a6 100644 --- a/processors/filtering/reddit_get_votes.py +++ b/processors/filtering/reddit_get_votes.py @@ -46,13 +46,14 @@ class RedditVoteChecker(BasicProcessor): } @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor if dataset is a Reddit dataset :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ - if config.get('api.reddit.client_id', False, user=user) and config.get('api.reddit.secret', False, user=user): + if config.get('api.reddit.client_id', False) and config.get('api.reddit.secret', False): return module.is_top_dataset() and module.type == "reddit-search" and module.num_rows <= 5000 return False diff --git a/processors/filtering/remove_author_info.py b/processors/filtering/remove_author_info.py index 10603ca8c..1b31e8ee6 100644 --- a/processors/filtering/remove_author_info.py +++ b/processors/filtering/remove_author_info.py @@ -36,16 +36,17 @@ class AuthorInfoRemover(BasicProcessor): ] @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor on CSV files :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.is_top_dataset() and module.get_extension() in ["csv", 'ndjson'] @classmethod - def get_options(cls, parent_dataset=None, user=None): 
+ def get_options(cls, parent_dataset=None, config=None): options = { "mode": { "help": "Filtering mode", diff --git a/processors/filtering/tiktok_refresh.py b/processors/filtering/tiktok_refresh.py index 253b001d9..22d3965a1 100644 --- a/processors/filtering/tiktok_refresh.py +++ b/processors/filtering/tiktok_refresh.py @@ -22,11 +22,12 @@ class UpdateTikTok(BasicProcessor): extension = "ndjson" @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor on NDJSON and CSV files :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.type in ["tiktok-search", "tiktok-urls-search"] diff --git a/processors/filtering/unique_filter.py b/processors/filtering/unique_filter.py index 470d2345b..0af66e4a3 100644 --- a/processors/filtering/unique_filter.py +++ b/processors/filtering/unique_filter.py @@ -51,11 +51,12 @@ class UniqueFilter(BaseFilter): } @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor on NDJSON and CSV files :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") @@ -123,21 +124,22 @@ def filter_items(self): processed += 1 @classmethod - def get_options(cls, parent_dataset=None, user=None): - """ - Get processor options - - This method by default returns the class's "options" attribute, or an - empty dictionary. It can be redefined by processors that need more - fine-grained options, e.g. in cases where the availability of options - is partially determined by the parent dataset's parameters. 
- - :param DataSet parent_dataset: An object representing the dataset that - the processor would be run on - :param User user: Flask user the options will be displayed for, in - case they are requested for display in the 4CAT web interface. This can - be used to show some options only to privileges users. - """ + def get_options(cls, parent_dataset=None, config=None): + """ + Get processor options + + This method by default returns the class's "options" attribute, or an + empty dictionary. It can be redefined by processors that need more + fine-grained options, e.g. in cases where the availability of options + is partially determined by the parent dataset's parameters. + + :param config: + :param DataSet parent_dataset: An object representing the dataset that + the processor would be run on + :param User user: Flask user the options will be displayed for, in + case they are requested for display in the 4CAT web interface. This can + be used to show some options only to privileges users. 
+ """ options = cls.options # Get the columns for the select columns option diff --git a/processors/filtering/wildcard_filter.py b/processors/filtering/wildcard_filter.py index a745141dc..467e84c6a 100644 --- a/processors/filtering/wildcard_filter.py +++ b/processors/filtering/wildcard_filter.py @@ -32,11 +32,12 @@ class WildcardFilter(BaseFilter): } @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor on NDJSON and CSV files :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") diff --git a/processors/filtering/write_annotations.py b/processors/filtering/write_annotations.py index 7fb8f2ee4..6d6b2ccab 100644 --- a/processors/filtering/write_annotations.py +++ b/processors/filtering/write_annotations.py @@ -28,11 +28,12 @@ class WriteAnnotations(BasicProcessor): } @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor on CSV files :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") diff --git a/processors/machine_learning/annotate_text.py b/processors/machine_learning/annotate_text.py index 954963de4..e3d6c4b9b 100644 --- a/processors/machine_learning/annotate_text.py +++ b/processors/machine_learning/annotate_text.py @@ -85,15 +85,15 @@ class TextClassifier(BasicProcessor): } @classmethod - def get_options(cls, parent_dataset=None, user=None): + def get_options(cls, parent_dataset=None, config=None): """ Get processor options These are dynamic for this processor: the 'column names' option is populated with the column names from the parent dataset, if available. 
+ :param config: :param DataSet parent_dataset: Parent dataset - :param user: Flask User to which the options are shown, if applicable :return dict: Processor options """ options = cls.options @@ -112,12 +112,12 @@ def get_options(cls, parent_dataset=None, user=None): return options @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow on datasets with columns (from which a prompt can be retrieved) """ - return config.get("dmi-service-manager.stormtrooper_enabled", False, user=user) and \ - config.get("dmi-service-manager.ab_server_address", False, user=user) and \ + return config.get("dmi-service-manager.stormtrooper_enabled", False) and \ + config.get("dmi-service-manager.ab_server_address", False) and \ module.get_columns() def process(self): @@ -247,7 +247,7 @@ def make_filename(id, prompt): self.dataset.finish(len(annotations)) @staticmethod - def validate_query(query, request, user): + def validate_query(query, request, config): """ Validate input for a dataset query on the 4chan data source. @@ -257,7 +257,7 @@ def validate_query(query, request, user): :param dict query: Query parameters, from client-side. 
:param request: Flask request - :param User user: User object of user who has submitted the query + :param ConfigManager config: Configuration reader (context-aware) :return dict: Safe query parameters """ diff --git a/processors/machine_learning/blip2_image_caption.py b/processors/machine_learning/blip2_image_caption.py index 0287c2b5b..3eb265fc8 100644 --- a/processors/machine_learning/blip2_image_caption.py +++ b/processors/machine_learning/blip2_image_caption.py @@ -57,18 +57,19 @@ class CategorizeImagesCLIP(BasicProcessor): } @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow on image archives if enabled in Control Panel """ - return config.get("dmi-service-manager.fc_blip2_enabled", False, user=user) and \ - config.get("dmi-service-manager.ab_server_address", False, user=user) and \ + return config.get("dmi-service-manager.fc_blip2_enabled", False) and \ + config.get("dmi-service-manager.ab_server_address", False) and \ (module.get_media_type() == "image" or module.type.startswith("image-downloader")) @classmethod - def get_options(cls, parent_dataset=None, user=None): + def get_options(cls, parent_dataset=None, config=None): """ Collect maximum number of files from configuration and update options accordingly + :param config: """ options = { "amount": { @@ -92,7 +93,7 @@ def get_options(cls, parent_dataset=None, user=None): } # Update the amount max and help from config - max_number_images = int(config.get("dmi-service-manager.fd_blip2_num_files", 100, user=user)) + max_number_images = int(config.get("dmi-service-manager.fd_blip2_num_files", 100)) if max_number_images == 0: # Unlimited allowed options["amount"]["help"] = "Number of images" options["amount"]["default"] = 100 diff --git a/processors/machine_learning/clip_categorize_images.py b/processors/machine_learning/clip_categorize_images.py index f9be0b08a..3a7538cf2 100644 --- 
a/processors/machine_learning/clip_categorize_images.py +++ b/processors/machine_learning/clip_categorize_images.py @@ -57,18 +57,19 @@ class CategorizeImagesCLIP(BasicProcessor): } @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow on image archives if enabled in Control Panel """ - return config.get("dmi-service-manager.cc_clip_enabled", False, user=user) and \ - config.get("dmi-service-manager.ab_server_address", False, user=user) and \ + return config.get("dmi-service-manager.cc_clip_enabled", False) and \ + config.get("dmi-service-manager.ab_server_address", False) and \ (module.get_media_type() == "image" or module.type.startswith("image-downloader")) @classmethod - def get_options(cls, parent_dataset=None, user=None): + def get_options(cls, parent_dataset=None, config=None): """ Collect maximum number of files from configuration and update options accordingly + :param config: """ options = { "amount": { @@ -101,7 +102,7 @@ def get_options(cls, parent_dataset=None, user=None): } # Update the amount max and help from config - max_number_images = int(config.get("dmi-service-manager.cd_clip_num_files", 100, user=user)) + max_number_images = int(config.get("dmi-service-manager.cd_clip_num_files", 100)) if max_number_images == 0: # Unlimited allowed options["amount"]["help"] = "Number of images" options["amount"]["default"] = 100 diff --git a/processors/machine_learning/generate_images.py b/processors/machine_learning/generate_images.py index ef01453e5..048ca7139 100644 --- a/processors/machine_learning/generate_images.py +++ b/processors/machine_learning/generate_images.py @@ -81,15 +81,15 @@ class StableDiffusionImageGenerator(BasicProcessor): } @classmethod - def get_options(cls, parent_dataset=None, user=None): + def get_options(cls, parent_dataset=None, config=None): """ Get processor options These are dynamic for this processor: the 'column names' option is populated with the 
column names from the parent dataset, if available. + :param config: :param DataSet parent_dataset: Parent dataset - :param user: Flask User to which the options are shown, if applicable :return dict: Processor options """ options = cls.options @@ -123,12 +123,12 @@ def get_options(cls, parent_dataset=None, user=None): return options @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow on datasets with columns (from which a prompt can be retrieved) """ - return config.get("dmi-service-manager.sd_enabled", False, user=user) and \ - config.get("dmi-service-manager.ab_server_address", False, user=user) and \ + return config.get("dmi-service-manager.sd_enabled", False) and \ + config.get("dmi-service-manager.ab_server_address", False) and \ module.get_columns() @staticmethod diff --git a/processors/machine_learning/pix-plot.py b/processors/machine_learning/pix-plot.py index 62186a053..69ded2292 100644 --- a/processors/machine_learning/pix-plot.py +++ b/processors/machine_learning/pix-plot.py @@ -65,7 +65,7 @@ class PixPlotGenerator(BasicProcessor): @classmethod - def get_options(cls, parent_dataset=None, user=None): + def get_options(cls, parent_dataset=None, config=None): # Update the amount max and help from config options = { "amount": { @@ -121,7 +121,7 @@ def get_options(cls, parent_dataset=None, user=None): }, } - max_number_images = int(config.get("dmi-service-manager.dc_pixplot_num_files", 10000, user=user)) + max_number_images = int(config.get("dmi-service-manager.dc_pixplot_num_files", 10000)) if max_number_images == 0: options["amount"]["help"] = options["amount"]["help"] + " (max: all available)" options["amount"]["min"] = 0 @@ -134,15 +134,15 @@ def get_options(cls, parent_dataset=None, user=None): return options @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor on token sets; 
Checks if pix-plot.server_url set :param module: Dataset or processor to determine compatibility with """ - return config.get("dmi-service-manager.db_pixplot_enabled", False, user=user) and \ - config.get("dmi-service-manager.ab_server_address", False, user=user) and \ + return config.get("dmi-service-manager.db_pixplot_enabled", False) and \ + config.get("dmi-service-manager.ab_server_address", False) and \ (module.get_media_type() == "image" or module.type.startswith("image-downloader")) def process(self): diff --git a/processors/machine_learning/text_from_image.py b/processors/machine_learning/text_from_image.py index f8fa4d645..f47f394ed 100644 --- a/processors/machine_learning/text_from_image.py +++ b/processors/machine_learning/text_from_image.py @@ -87,14 +87,14 @@ class ImageTextDetector(BasicProcessor): } @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor on image sets :param module: Module to determine compatibility with """ - return config.get('dmi-service-manager.eb_ocr_enabled', False, user=user) and \ - config.get("dmi-service-manager.ab_server_address", False, user=user) and \ + return config.get('dmi-service-manager.eb_ocr_enabled', False) and \ + config.get("dmi-service-manager.ab_server_address", False) and \ (module.get_media_type() == "image" or module.type.startswith("image-downloader")) def process(self): diff --git a/processors/machine_learning/whisper_speech_to_text.py b/processors/machine_learning/whisper_speech_to_text.py index da9f09c66..424eaebab 100644 --- a/processors/machine_learning/whisper_speech_to_text.py +++ b/processors/machine_learning/whisper_speech_to_text.py @@ -57,18 +57,19 @@ class AudioToText(BasicProcessor): } @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow on audio archives if enabled in Control Panel """ - return 
config.get("dmi-service-manager.bc_whisper_enabled", False, user=user) and \ - config.get("dmi-service-manager.ab_server_address", False, user=user) and \ + return config.get("dmi-service-manager.bc_whisper_enabled", False) and \ + config.get("dmi-service-manager.ab_server_address", False) and \ (module.get_media_type() == 'audio' or module.type.startswith("audio-extractor")) @classmethod - def get_options(cls, parent_dataset=None, user=None): + def get_options(cls, parent_dataset=None, config=None): """ Collect maximum number of audio files from configuration and update options accordingly + :param config: """ options = { "amount": { @@ -114,7 +115,7 @@ def get_options(cls, parent_dataset=None, user=None): } # Update the amount max and help from config - max_number_audio_files = int(config.get("dmi-service-manager.bd_whisper_num_files", 100, user=user)) + max_number_audio_files = int(config.get("dmi-service-manager.bd_whisper_num_files", 100)) if max_number_audio_files == 0: # Unlimited allowed options["amount"]["help"] = "Number of audio files" options["amount"]["default"] = 100 diff --git a/processors/metrics/clarifai_api.py b/processors/metrics/clarifai_api.py index 140a7a1b0..9377a0f38 100644 --- a/processors/metrics/clarifai_api.py +++ b/processors/metrics/clarifai_api.py @@ -42,11 +42,12 @@ class ClarifaiAPIFetcher(BasicProcessor): ] @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor on image sets :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.get_media_type() == "image" or module.type.startswith("image-downloader") or module.type == "video-frames" diff --git a/processors/metrics/count_posts.py b/processors/metrics/count_posts.py index f2c70805a..c83e8d583 100644 --- a/processors/metrics/count_posts.py +++ b/processors/metrics/count_posts.py @@ -38,11 +38,12 @@ class 
CountPosts(BasicProcessor): } @staticmethod - def is_compatible_with(module=None, user=None): + def is_compatible_with(module=None, config=None): """ Determine compatibility :param Dataset module: Module ID to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) :return bool: """ return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") @@ -160,7 +161,7 @@ def process(self): self.write_csv_items_and_finish(rows) @classmethod - def get_options(cls, parent_dataset=None, user=None): + def get_options(cls, parent_dataset=None, config=None): options = cls.options diff --git a/processors/metrics/debate_metrics.py b/processors/metrics/debate_metrics.py index 06309c090..a9bf64642 100644 --- a/processors/metrics/debate_metrics.py +++ b/processors/metrics/debate_metrics.py @@ -33,11 +33,12 @@ class DebateMetrics(BasicProcessor): followups = [] @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor if dataset is a 'top level' dataset :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.parameters.get("datasource") in ("fourchan", "eightchan", "eightkun") diff --git a/processors/metrics/google_vision_api.py b/processors/metrics/google_vision_api.py index 92630e611..2bef79673 100644 --- a/processors/metrics/google_vision_api.py +++ b/processors/metrics/google_vision_api.py @@ -42,11 +42,12 @@ class GoogleVisionAPIFetcher(BasicProcessor): ] @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor on image sets :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.get_media_type() == "image" or module.type.startswith("image-downloader") or module.type == 
"video-frames" diff --git a/processors/metrics/hatebase.py b/processors/metrics/hatebase.py index 6a587519f..784c82637 100644 --- a/processors/metrics/hatebase.py +++ b/processors/metrics/hatebase.py @@ -56,11 +56,12 @@ class HatebaseAnalyser(BasicProcessor): } @staticmethod - def is_compatible_with(module=None, user=None): + def is_compatible_with(module=None, config=None): """ Determine compatibility :param Dataset module: Module ID to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) :return bool: """ return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") @@ -171,21 +172,22 @@ def process(self): self.dataset.finish(processed) @classmethod - def get_options(cls, parent_dataset=None, user=None): - """ - Get processor options - - This method by default returns the class's "options" attribute, or an - empty dictionary. It can be redefined by processors that need more - fine-grained options, e.g. in cases where the availability of options - is partially determined by the parent dataset's parameters. - - :param DataSet parent_dataset: An object representing the dataset that - the processor would be run on - :param User user: Flask user the options will be displayed for, in - case they are requested for display in the 4CAT web interface. This can - be used to show some options only to privileges users. - """ + def get_options(cls, parent_dataset=None, config=None): + """ + Get processor options + + This method by default returns the class's "options" attribute, or an + empty dictionary. It can be redefined by processors that need more + fine-grained options, e.g. in cases where the availability of options + is partially determined by the parent dataset's parameters. 
+ + :param config: + :param DataSet parent_dataset: An object representing the dataset that + the processor would be run on + :param User user: Flask user the options will be displayed for, in + case they are requested for display in the 4CAT web interface. This can + be used to show some options only to privileges users. + """ options = cls.options if parent_dataset and parent_dataset.get_columns(): diff --git a/processors/metrics/most_quoted.py b/processors/metrics/most_quoted.py index f0672330b..40c819266 100644 --- a/processors/metrics/most_quoted.py +++ b/processors/metrics/most_quoted.py @@ -26,11 +26,12 @@ class QuoteRanker(BasicProcessor): followups = [] @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor on chan datasets :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.parameters.get("datasource") in ("fourchan", "eightchan", "eightkun") diff --git a/processors/metrics/overtime-hatebase.py b/processors/metrics/overtime-hatebase.py index 48a7e9ce3..58c900f3f 100644 --- a/processors/metrics/overtime-hatebase.py +++ b/processors/metrics/overtime-hatebase.py @@ -35,13 +35,14 @@ class OvertimeHatefulAnalysis(BasicProcessor): ] @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor on Telegram, Instagram and Reddit datasets Don't quite remember why these three... 
:param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.parameters.get("datasource") in ("telegram", "instagram", "reddit") diff --git a/processors/metrics/rank_attribute.py b/processors/metrics/rank_attribute.py index 0e38757c6..885b26596 100644 --- a/processors/metrics/rank_attribute.py +++ b/processors/metrics/rank_attribute.py @@ -108,11 +108,12 @@ class AttributeRanker(BasicProcessor): } @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor to run on all csv and NDJSON datasets :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.get_extension() in ("csv", "ndjson") @@ -317,21 +318,22 @@ def extract(self, value, look_for): @classmethod - def get_options(cls, parent_dataset=None, user=None): - """ - Get processor options - - This method by default returns the class's "options" attribute, or an - empty dictionary. It can be redefined by processors that need more - fine-grained options, e.g. in cases where the availability of options - is partially determined by the parent dataset's parameters. - - :param DataSet parent_dataset: An object representing the dataset that - the processor would be run on - :param User user: Flask user the options will be displayed for, in - case they are requested for display in the 4CAT web interface. This can - be used to show some options only to privileges users. - """ + def get_options(cls, parent_dataset=None, config=None): + """ + Get processor options + + This method by default returns the class's "options" attribute, or an + empty dictionary. It can be redefined by processors that need more + fine-grained options, e.g. in cases where the availability of options + is partially determined by the parent dataset's parameters. 
+ + :param config: + :param DataSet parent_dataset: An object representing the dataset that + the processor would be run on + :param User user: Flask user the options will be displayed for, in + case they are requested for display in the 4CAT web interface. This can + be used to show some options only to privileges users. + """ options = cls.options if parent_dataset and parent_dataset.get_columns(): diff --git a/processors/metrics/thread_metadata.py b/processors/metrics/thread_metadata.py index 005c1087a..d6c829c71 100644 --- a/processors/metrics/thread_metadata.py +++ b/processors/metrics/thread_metadata.py @@ -26,11 +26,12 @@ class ThreadMetadata(BasicProcessor): followups = [] @staticmethod - def is_compatible_with(module=None, user=None): + def is_compatible_with(module=None, config=None): """ Determine compatibility :param Dataset module: Module ID to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) :return bool: """ return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") diff --git a/processors/metrics/top_hatebase.py b/processors/metrics/top_hatebase.py index eaba31693..7632af9d6 100644 --- a/processors/metrics/top_hatebase.py +++ b/processors/metrics/top_hatebase.py @@ -29,11 +29,12 @@ class HatebaseRanker(BasicProcessor): followups = [] @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor on previous Hatebase analyses :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.type == "hatebase-data" diff --git a/processors/metrics/top_images.py b/processors/metrics/top_images.py index 0f1c619f1..59a591f9e 100644 --- a/processors/metrics/top_images.py +++ b/processors/metrics/top_images.py @@ -32,11 +32,12 @@ class TopImageCounter(BasicProcessor): followups = ["image-downloader"] @classmethod - def 
is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ All top-level datasets, excluding Telegram, which has a different image logic :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.is_top_dataset() and module.type != "telegram-search" and module.get_extension() in ("csv", "ndjson") @@ -175,7 +176,7 @@ def process(self): self.write_csv_items_and_finish(results) @classmethod - def get_options(cls, parent_dataset=None, user=None): + def get_options(cls, parent_dataset=None, config=None): """ Get processor options @@ -183,7 +184,7 @@ def get_options(cls, parent_dataset=None, user=None): only work properly on csv datasets so check the extension before showing it. - :param user: + :param config: :param parent_dataset: Dataset to get options for :return dict: """ diff --git a/processors/metrics/url_titles.py b/processors/metrics/url_titles.py index e32e3538d..4c2cc3716 100644 --- a/processors/metrics/url_titles.py +++ b/processors/metrics/url_titles.py @@ -84,17 +84,18 @@ class URLFetcher(BasicProcessor): } @staticmethod - def is_compatible_with(module=None, user=None): + def is_compatible_with(module=None, config=None): """ Determine compatibility :param Dataset module: Module ID to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) :return bool: """ return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") @classmethod - def get_options(cls, parent_dataset=None, user=None): + def get_options(cls, parent_dataset=None, config=None): """ Get processor options @@ -103,10 +104,10 @@ def get_options(cls, parent_dataset=None, user=None): fine-grained options, e.g. in cases where the availability of options is partially determined by the parent dataset's parameters. 
+ :param config: :param DataSet parent_dataset: An object representing the dataset that the processor would be run on - :param User user: Flask user the options will be displayed for, in - case they are requested for display in the 4CAT web interface. This can +can be used to show some options only to privileges users. """ options = cls.options diff --git a/processors/metrics/vocabulary_overtime.py b/processors/metrics/vocabulary_overtime.py index 9db239873..2e4820f38 100644 --- a/processors/metrics/vocabulary_overtime.py +++ b/processors/metrics/vocabulary_overtime.py @@ -76,11 +76,12 @@ class OvertimeAnalysis(BasicProcessor): } @staticmethod - def is_compatible_with(module=None, user=None): + def is_compatible_with(module=None, config=None): """ Determine compatibility :param Dataset module: Module ID to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) :return bool: """ return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") diff --git a/processors/metrics/youtube_metadata.py b/processors/metrics/youtube_metadata.py index 7ad8fabe3..c7b194fb7 100644 --- a/processors/metrics/youtube_metadata.py +++ b/processors/metrics/youtube_metadata.py @@ -68,11 +68,12 @@ class YouTubeMetadata(BasicProcessor): } @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor on datasets probably containing youtube links :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ # Compatible with every top-level dataset. 
return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") diff --git a/processors/networks/clarifai_bipartite_network.py b/processors/networks/clarifai_bipartite_network.py index 97d715a90..f0457880f 100644 --- a/processors/networks/clarifai_bipartite_network.py +++ b/processors/networks/clarifai_bipartite_network.py @@ -36,11 +36,12 @@ class VisionTagBiPartiteNetworker(BasicProcessor): } @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor to run on Google Vision API data :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.type == "clarifai-api" diff --git a/processors/networks/colink_urls.py b/processors/networks/colink_urls.py index 46fca2567..4ef2fca4b 100644 --- a/processors/networks/colink_urls.py +++ b/processors/networks/colink_urls.py @@ -52,11 +52,12 @@ class URLCoLinker(BasicProcessor): } @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor on top datasets. 
:param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") diff --git a/processors/networks/cotag_network.py b/processors/networks/cotag_network.py index 236e9577f..9e6d8a57c 100644 --- a/processors/networks/cotag_network.py +++ b/processors/networks/cotag_network.py @@ -33,11 +33,12 @@ class CoTaggerPreset(ProcessorPreset): } @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor on datasets containing a tags column :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ usable_columns = {"tags", "hashtags", "groups"} columns = module.get_columns() diff --git a/processors/networks/coword_network.py b/processors/networks/coword_network.py index 4f37afed7..ee5c8dbc8 100644 --- a/processors/networks/coword_network.py +++ b/processors/networks/coword_network.py @@ -23,11 +23,12 @@ class CowordNetworker(ProcessorPreset): extension = "gexf" # extension of result file, used internally and in UI @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor to run on collocations :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.type == "collocations" diff --git a/processors/networks/google_vision_bipartite_network.py b/processors/networks/google_vision_bipartite_network.py index dbc482850..8d0a0896c 100644 --- a/processors/networks/google_vision_bipartite_network.py +++ b/processors/networks/google_vision_bipartite_network.py @@ -51,11 +51,12 @@ class VisionTagBiPartiteNetworker(BasicProcessor): } @classmethod - def is_compatible_with(cls, module=None, user=None): + def 
is_compatible_with(cls, module=None, config=None): """ Allow processor to run on Google Vision API data :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.type == "google-vision-api" diff --git a/processors/networks/google_vision_network.py b/processors/networks/google_vision_network.py index 0f45e3148..8d271c932 100644 --- a/processors/networks/google_vision_network.py +++ b/processors/networks/google_vision_network.py @@ -50,11 +50,12 @@ class VisionTagNetworker(BasicProcessor): } @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor to run on Google Vision API data :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.type == "google-vision-api" diff --git a/processors/networks/hash_similarity_network.py b/processors/networks/hash_similarity_network.py index 554f4388b..781556213 100644 --- a/processors/networks/hash_similarity_network.py +++ b/processors/networks/hash_similarity_network.py @@ -49,9 +49,10 @@ class HashSimilarityNetworker(BasicProcessor): } @classmethod - def get_options(cls, parent_dataset=None, user=None): + def get_options(cls, parent_dataset=None, config=None): """ Update column option with actual columns + :param config: """ options = cls.options @@ -69,7 +70,7 @@ def get_options(cls, parent_dataset=None, user=None): return options @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Currently only allowed on video-hashes, but technically any row of bit hashes will work. Could check for "hash" in columns, but... how to make that a check as a classmethod? 
diff --git a/processors/networks/quote_network.py b/processors/networks/quote_network.py index 5b1fc7401..0ad478fa7 100644 --- a/processors/networks/quote_network.py +++ b/processors/networks/quote_network.py @@ -26,11 +26,12 @@ class QuoteNetworkGrapher(BasicProcessor): extension = "gexf" # extension of result file, used internally and in UI @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor to run on chan datasets :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.parameters.get("datasource") in ("fourchan", "eightchan", "eightkun") diff --git a/processors/networks/two-column-network.py b/processors/networks/two-column-network.py index 0f6045702..d14f37201 100644 --- a/processors/networks/two-column-network.py +++ b/processors/networks/two-column-network.py @@ -88,15 +88,15 @@ class ColumnNetworker(BasicProcessor): } @classmethod - def get_options(cls, parent_dataset=None, user=None): + def get_options(cls, parent_dataset=None, config=None): """ Get processor options These are dynamic for this processor: the 'column names' option is populated with the column names from the parent dataset, if available. 
+ :param config: :param DataSet parent_dataset: Parent dataset - :param user: Flask User to which the options are shown, if applicable :return dict: Processor options """ options = cls.options @@ -123,11 +123,12 @@ def get_options(cls, parent_dataset=None, user=None): return options @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor to run on all csv and NDJSON datasets :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.get_extension() in ("csv", "ndjson") diff --git a/processors/networks/user_hashtag_network.py b/processors/networks/user_hashtag_network.py index 318571d91..d457f500b 100644 --- a/processors/networks/user_hashtag_network.py +++ b/processors/networks/user_hashtag_network.py @@ -22,7 +22,7 @@ class HashtagUserBipartiteGrapherPreset(ProcessorPreset): extension = "gexf" # extension of result file, used internally and in UI @classmethod - def get_options(cls, parent_dataset=None, user=None): + def get_options(cls, parent_dataset=None, config=None): return { "to-lowercase": { "type": UserInput.OPTION_TOGGLE, @@ -33,11 +33,12 @@ def get_options(cls, parent_dataset=None, user=None): } @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor on datasets containing a tags column :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ usable_columns = {"tags", "hashtags", "groups"} columns = module.get_columns() diff --git a/processors/networks/wikipedia_network.py b/processors/networks/wikipedia_network.py index 0426c97d2..dfe48fefe 100644 --- a/processors/networks/wikipedia_network.py +++ b/processors/networks/wikipedia_network.py @@ -28,11 +28,12 @@ class WikiURLCoLinker(BasicProcessor): extension = "gexf" # 
extension of result file, used internally and in UI @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor on top datasets. :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") diff --git a/processors/presets/annotate-images.py b/processors/presets/annotate-images.py index 7e9164e57..1edb684aa 100644 --- a/processors/presets/annotate-images.py +++ b/processors/presets/annotate-images.py @@ -57,11 +57,12 @@ class AnnotateImages(ProcessorPreset): } @staticmethod - def is_compatible_with(module=None, user=None): + def is_compatible_with(module=None, config=None): """ Determine compatibility :param Dataset module: Module ID to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) :return bool: """ return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") diff --git a/processors/presets/monthly-histogram.py b/processors/presets/monthly-histogram.py index 696fc59a2..01fa68e3c 100644 --- a/processors/presets/monthly-histogram.py +++ b/processors/presets/monthly-histogram.py @@ -15,13 +15,14 @@ class MonthlyHistogramCreator(ProcessorPreset): extension = "svg" @staticmethod - def is_compatible_with(module=None, user=None): + def is_compatible_with(module=None, config=None): """ Determine compatibility This preset is compatible with any module that has countable items (via count-posts) :param Dataset module: Module ID to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) :return bool: """ return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") diff --git a/processors/presets/neologisms.py b/processors/presets/neologisms.py index 1cf258503..d0a3615d8 100644 --- 
a/processors/presets/neologisms.py +++ b/processors/presets/neologisms.py @@ -21,10 +21,11 @@ class NeologismExtractor(ProcessorPreset): @classmethod - def get_options(cls, parent_dataset=None, user=None): - """ - Get processor options - """ + def get_options(cls, parent_dataset=None, config=None): + """ + Get processor options + :param config: + """ options = { "timeframe": { "type": UserInput.OPTION_CHOICE, @@ -50,11 +51,12 @@ def get_options(cls, parent_dataset=None, user=None): return options @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor to run on all csv and NDJSON datasets :param module: Dataset or processor to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.get_extension() in ("csv", "ndjson") diff --git a/processors/presets/similar-words.py b/processors/presets/similar-words.py index eed653234..310bbb552 100644 --- a/processors/presets/similar-words.py +++ b/processors/presets/similar-words.py @@ -39,13 +39,14 @@ class SimilarWords(ProcessorPreset): } @staticmethod - def is_compatible_with(module=None, user=None): + def is_compatible_with(module=None, config=None): """ Determine compatibility This preset is compatible with any module that has a "body" column :param Dataset module: Module ID to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) :return bool: """ return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") diff --git a/processors/presets/top-hashtags.py b/processors/presets/top-hashtags.py index f6ac6e0e6..1e32b4a1e 100644 --- a/processors/presets/top-hashtags.py +++ b/processors/presets/top-hashtags.py @@ -32,12 +32,12 @@ class TopHashtags(ProcessorPreset): }) @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Check if dataset 
has a hashtag attribute :param module: Dataset to check - :param user: User trying to run the processor + :param ConfigManager|None config: Configuration reader (context-aware) :return bool: """ columns = module.get_columns() diff --git a/processors/presets/upload-to-dmi-tcat.py b/processors/presets/upload-to-dmi-tcat.py index a2a459422..b87db19b9 100644 --- a/processors/presets/upload-to-dmi-tcat.py +++ b/processors/presets/upload-to-dmi-tcat.py @@ -17,7 +17,7 @@ class FourcatToDmiTcatConverterAndUploader(ProcessorPreset): extension = "html" @classmethod - def get_options(cls, parent_dataset=None, user=None): + def get_options(cls, parent_dataset=None, config=None): """ Get processor options @@ -25,13 +25,12 @@ def get_options(cls, parent_dataset=None, user=None): TCAT servers are configured. Otherwise, no options are given since there is nothing to choose. + :param config: :param DataSet parent_dataset: Dataset that will be uploaded - :param User user: User that will be uploading it - :return dict: Option definition """ - if config.get('tcat-auto-upload.server_url', user=user) \ - and type(config.get('tcat-auto-upload.server_url', user=user)) in (set, list, tuple) \ - and len(config.get('tcat-auto-upload.server_url', user=user)) > 1: + if config.get('tcat-auto-upload.server_url') \ + and type(config.get('tcat-auto-upload.server_url')) in (set, list, tuple) \ + and len(config.get('tcat-auto-upload.server_url')) > 1: return { "server": { "type": UserInput.OPTION_CHOICE, @@ -52,17 +51,18 @@ def get_options(cls, parent_dataset=None, user=None): return {} @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Determine if processor is compatible with dataset :param module: Dataset or processor to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.type == "twitterv2-search" and \ - config.get('tcat-auto-upload.server_url', 
user=user) and \ - config.get('tcat-auto-upload.token', user=user) and \ - config.get('tcat-auto-upload.username', user=user) and \ - config.get('tcat-auto-upload.password', user=user) + config.get('tcat-auto-upload.server_url') and \ + config.get('tcat-auto-upload.token') and \ + config.get('tcat-auto-upload.username') and \ + config.get('tcat-auto-upload.password') def get_processor_pipeline(self): """ diff --git a/processors/presets/video-scene-timelines.py b/processors/presets/video-scene-timelines.py index dbb39d4cb..6857af2ec 100644 --- a/processors/presets/video-scene-timelines.py +++ b/processors/presets/video-scene-timelines.py @@ -20,7 +20,7 @@ class VideoSceneTimelineCreator(ProcessorPreset): extension = "svg" @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Determine compatibility @@ -31,7 +31,7 @@ def is_compatible_with(cls, module=None, user=None): :return bool: """ return (module.get_media_type() == "video" or module.type.startswith("video-downloader")) and \ - config.get("video-downloader.ffmpeg_path", user=user) and \ + config.get("video-downloader.ffmpeg_path") and \ shutil.which(config.get("video-downloader.ffmpeg_path")) def get_processor_pipeline(self): diff --git a/processors/text-analysis/collocations.py b/processors/text-analysis/collocations.py index fb37c1753..d55a161a9 100644 --- a/processors/text-analysis/collocations.py +++ b/processors/text-analysis/collocations.py @@ -25,11 +25,12 @@ class GetCollocations(BasicProcessor): followups = ["preset-coword-network", "wordcloud"] @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor on token sets :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.type == "tokenise-posts" diff --git a/processors/text-analysis/documents_per_topic.py 
b/processors/text-analysis/documents_per_topic.py index 3922fb416..ec8493672 100644 --- a/processors/text-analysis/documents_per_topic.py +++ b/processors/text-analysis/documents_per_topic.py @@ -27,11 +27,12 @@ class TopicModelWordExtractor(BasicProcessor): followups = [] @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor on topic models :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.type == "topic-modeller" diff --git a/processors/text-analysis/generate_embeddings.py b/processors/text-analysis/generate_embeddings.py index 075d9fdc5..571d65611 100644 --- a/processors/text-analysis/generate_embeddings.py +++ b/processors/text-analysis/generate_embeddings.py @@ -105,11 +105,12 @@ class GenerateWordEmbeddings(BasicProcessor): } @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor on token sets :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.type == "tokenise-posts" diff --git a/processors/text-analysis/post_topic_matrix.py b/processors/text-analysis/post_topic_matrix.py index ab32b8dcd..d8fdee4b7 100644 --- a/processors/text-analysis/post_topic_matrix.py +++ b/processors/text-analysis/post_topic_matrix.py @@ -44,7 +44,7 @@ class TopicModelWordExtractor(BasicProcessor): } @classmethod - def get_options(cls, parent_dataset=None, user=None): + def get_options(cls, parent_dataset=None, config=None): """ Get processor options @@ -53,10 +53,10 @@ def get_options(cls, parent_dataset=None, user=None): fine-grained options, e.g. in cases where the availability of options is partially determined by the parent dataset's parameters. 
+ :param config: :param DataSet parent_dataset: An object representing the dataset that the processor would be run on - :param User user: Flask user the options will be displayed for, in - case they are requested for display in the 4CAT web interface. This can +can be used to show some options only to privileges users. """ options = cls.options @@ -73,11 +73,12 @@ def get_options(cls, parent_dataset=None, user=None): return options @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor on topic models :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.type == "topic-modeller" diff --git a/processors/text-analysis/similar_words.py b/processors/text-analysis/similar_words.py index 0bef53b97..a05236f70 100644 --- a/processors/text-analysis/similar_words.py +++ b/processors/text-analysis/similar_words.py @@ -5,7 +5,7 @@ from gensim.models import KeyedVectors -from common.lib.helpers import UserInput, convert_to_int +from common.lib.helpers import UserInput, convert_to_int, convert_to_float from backend.lib.processor import BasicProcessor from common.lib.exceptions import ProcessorInterruptedException @@ -58,11 +58,12 @@ class SimilarWord2VecWords(BasicProcessor): } @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor on word embedding models :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.type == "generate-embeddings" @@ -83,10 +84,7 @@ def process(self): input_words = input_words.split(",") num_words = convert_to_int(self.parameters.get("num-words", 10)) - try: - threshold = float(self.parameters.get("threshold", 0.25)) - except ValueError: - threshold = 
float(self.get_options()["threshold"]["default"]) + threshold = convert_to_float(self.parameters.get("threshold", 0.25), 0.25) threshold = max(-1.0, min(1.0, threshold)) diff --git a/processors/text-analysis/split_sentences.py b/processors/text-analysis/split_sentences.py index dd2be7c2f..cd7dff936 100644 --- a/processors/text-analysis/split_sentences.py +++ b/processors/text-analysis/split_sentences.py @@ -25,16 +25,17 @@ class SplitSentences(BasicProcessor): followups = [] @classmethod - def get_options(cls, parent_dataset=None, user=None): - """ - Get processor options - - :param DataSet parent_dataset: An object representing the dataset that - the processor would be run on - :param User user: Flask user the options will be displayed for, in - case they are requested for display in the 4CAT web interface. This can - be used to show some options only to privileges users. - """ + def get_options(cls, parent_dataset=None, config=None): + """ + Get processor options + + :param config: + :param DataSet parent_dataset: An object representing the dataset that + the processor would be run on + :param User user: Flask user the options will be displayed for, in + case they are requested for display in the 4CAT web interface. This can + be used to show some options only to privileges users. 
+ """ options = { "column": { "type": UserInput.OPTION_TEXT, @@ -84,11 +85,12 @@ def get_options(cls, parent_dataset=None, user=None): return options @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor to run on all csv and NDJSON datasets :param module: Dataset or processor to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.get_extension() in ("csv", "ndjson") diff --git a/processors/text-analysis/tf_idf.py b/processors/text-analysis/tf_idf.py index 2dea96cdc..7b832b42d 100644 --- a/processors/text-analysis/tf_idf.py +++ b/processors/text-analysis/tf_idf.py @@ -91,11 +91,12 @@ class TfIdf(BasicProcessor): ] @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor on token sets :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.type == "tokenise-posts" diff --git a/processors/text-analysis/tokenise.py b/processors/text-analysis/tokenise.py index a104306f1..366845c32 100644 --- a/processors/text-analysis/tokenise.py +++ b/processors/text-analysis/tokenise.py @@ -48,31 +48,33 @@ class Tokenise(BasicProcessor): ] @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor to run on all csv and NDJSON datasets :param module: Dataset or processor to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.get_extension() in ("csv", "ndjson") @classmethod - def get_options(cls, parent_dataset=None, user=None): - """ - Get processor options - - This method by default returns the class's "options" attribute, or an - empty dictionary. 
It can be redefined by processors that need more - fine-grained options, e.g. in cases where the availability of options - is partially determined by the parent dataset's parameters. - - :param DataSet parent_dataset: An object representing the dataset that - the processor would be run on - :param User user: Flask user the options will be displayed for, in - case they are requested for display in the 4CAT web interface. This can - be used to show some options only to privileges users. - """ + def get_options(cls, parent_dataset=None, config=None): + """ + Get processor options + + This method by default returns the class's "options" attribute, or an + empty dictionary. It can be redefined by processors that need more + fine-grained options, e.g. in cases where the availability of options + is partially determined by the parent dataset's parameters. + + :param config: + :param DataSet parent_dataset: An object representing the dataset that + the processor would be run on + :param User user: Flask user the options will be displayed for, in + case they are requested for display in the 4CAT web interface. This can + be used to show some options only to privileges users. 
+ """ with config.get("PATH_ROOT").joinpath("common/assets/stopwords-languages.json").open() as infile: stopwords = json.load(infile) diff --git a/processors/text-analysis/top_vectors.py b/processors/text-analysis/top_vectors.py index 420129cbb..7848888b7 100644 --- a/processors/text-analysis/top_vectors.py +++ b/processors/text-analysis/top_vectors.py @@ -42,11 +42,12 @@ class VectorRanker(BasicProcessor): } @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor on token vectors :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.type == "vectorise-tokens" diff --git a/processors/text-analysis/topic_modeling.py b/processors/text-analysis/topic_modeling.py index fded4460e..3e7b346c2 100644 --- a/processors/text-analysis/topic_modeling.py +++ b/processors/text-analysis/topic_modeling.py @@ -75,11 +75,12 @@ class TopicModeler(BasicProcessor): ] @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor on token sets :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.type == "tokenise-posts" diff --git a/processors/text-analysis/topic_words.py b/processors/text-analysis/topic_words.py index f84199861..ba745918d 100644 --- a/processors/text-analysis/topic_words.py +++ b/processors/text-analysis/topic_words.py @@ -38,11 +38,12 @@ class TopicModelWordExtractor(BasicProcessor): } @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor on topic models :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.type == "topic-modeller" diff --git 
a/processors/text-analysis/vectorise.py b/processors/text-analysis/vectorise.py index 36d666613..50bcd7209 100644 --- a/processors/text-analysis/vectorise.py +++ b/processors/text-analysis/vectorise.py @@ -26,11 +26,12 @@ class Vectorise(BasicProcessor): followups = ["vector-ranker"] @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor on token sets :param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.type == "tokenise-posts" diff --git a/processors/twitter/aggregate_stats.py b/processors/twitter/aggregate_stats.py index 5fb112ae1..85890ce46 100644 --- a/processors/twitter/aggregate_stats.py +++ b/processors/twitter/aggregate_stats.py @@ -57,11 +57,12 @@ class TwitterAggregatedStats(BasicProcessor): @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Determine if processor is compatible with dataset :param module: Dataset or processor to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.type in ["twitterv2-search", "dmi-tcat-search"] @@ -242,7 +243,7 @@ class TwitterAggregatedStatsVis(TwitterAggregatedStats): ] @classmethod - def get_options(cls, parent_dataset=None, user=None): + def get_options(cls, parent_dataset=None, config=None): options = cls.options.copy() options["show_outliers"] = { diff --git a/processors/twitter/base_twitter_stats.py b/processors/twitter/base_twitter_stats.py index 745c6948e..1f1b6de21 100644 --- a/processors/twitter/base_twitter_stats.py +++ b/processors/twitter/base_twitter_stats.py @@ -28,11 +28,12 @@ class TwitterStatsBase(BasicProcessor): @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Determine if processor is compatible with 
dataset :param module: Dataset or processor to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return False diff --git a/processors/twitter/custom_stats.py b/processors/twitter/custom_stats.py index dc0e8e8f1..73b51372f 100644 --- a/processors/twitter/custom_stats.py +++ b/processors/twitter/custom_stats.py @@ -46,11 +46,12 @@ class TwitterCustomStats(TwitterStatsBase): } @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Determine if processor is compatible with dataset :param module: Dataset or processor to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.type in ["twitterv2-search", "dmi-tcat-search"] diff --git a/processors/twitter/hashtag_stats.py b/processors/twitter/hashtag_stats.py index 18735ddf4..85639a5e3 100644 --- a/processors/twitter/hashtag_stats.py +++ b/processors/twitter/hashtag_stats.py @@ -42,11 +42,12 @@ class TwitterHashtagStats(TwitterStatsBase): } @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Determine if processor is compatible with dataset :param module: Dataset or processor to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.type in ["twitterv2-search", "dmi-tcat-search"] diff --git a/processors/twitter/identical_tweets.py b/processors/twitter/identical_tweets.py index 2d505df44..d63a10f05 100644 --- a/processors/twitter/identical_tweets.py +++ b/processors/twitter/identical_tweets.py @@ -33,11 +33,12 @@ class TwitterIdenticalTweets(TwitterStatsBase): } @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Determine if processor is compatible with dataset :param module: Dataset or processor to determine 
compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.type in ["twitterv2-search", "dmi-tcat-search"] diff --git a/processors/twitter/mention_export.py b/processors/twitter/mention_export.py index 52027c9c2..d82d2547d 100644 --- a/processors/twitter/mention_export.py +++ b/processors/twitter/mention_export.py @@ -23,11 +23,12 @@ class TwitterMentionsExport(BasicProcessor): extension = "csv" # extension of result file, used internally and in UI @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Determine if processor is compatible with dataset :param module: Dataset or processor to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.type in ["twitterv2-search"] @@ -150,11 +151,12 @@ class TCATMentionsExport(BasicProcessor): extension = "csv" # extension of result file, used internally and in UI @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Determine if processor is compatible with dataset :param module: Dataset or processor to determine compatibility with + :param ConfigManager config: Configuration reader (context-aware) """ return module.type in ["dmi-tcat-search"] diff --git a/processors/twitter/source_stats.py b/processors/twitter/source_stats.py index 2a2295088..91704d51e 100644 --- a/processors/twitter/source_stats.py +++ b/processors/twitter/source_stats.py @@ -42,11 +42,12 @@ class TwitterHashtagStats(TwitterStatsBase): } @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Determine if processor is compatible with dataset :param module: Dataset or processor to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.type in 
["twitterv2-search", "dmi-tcat-search"] diff --git a/processors/twitter/twitter_stats.py b/processors/twitter/twitter_stats.py index b55376ca2..6906b892a 100644 --- a/processors/twitter/twitter_stats.py +++ b/processors/twitter/twitter_stats.py @@ -37,11 +37,12 @@ class TwitterStats(TwitterStatsBase): } @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Determine if processor is compatible with dataset :param module: Dataset or processor to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.type in ["twitterv2-search", "dmi-tcat-search"] diff --git a/processors/twitter/user_stats_individual.py b/processors/twitter/user_stats_individual.py index 025faef43..e29690afd 100644 --- a/processors/twitter/user_stats_individual.py +++ b/processors/twitter/user_stats_individual.py @@ -42,11 +42,12 @@ class TwitterStats(TwitterStatsBase): } @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Determine if processor is compatible with dataset :param module: Dataset or processor to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.type in ["twitterv2-search", "dmi-tcat-search"] diff --git a/processors/twitter/user_visibility.py b/processors/twitter/user_visibility.py index 73040c385..beed5ffbb 100644 --- a/processors/twitter/user_visibility.py +++ b/processors/twitter/user_visibility.py @@ -35,11 +35,12 @@ class TwitterUserVisibility(BasicProcessor): } @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Determine if processor is compatible with dataset :param module: Dataset or processor to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.type in 
["twitterv2-search", "dmi-tcat-search"] diff --git a/processors/visualisation/download-telegram-images.py b/processors/visualisation/download-telegram-images.py index 99ff5199b..431fb59ec 100644 --- a/processors/visualisation/download-telegram-images.py +++ b/processors/visualisation/download-telegram-images.py @@ -51,7 +51,7 @@ class TelegramImageDownloader(BasicProcessor): } @classmethod - def get_options(cls, parent_dataset=None, user=None): + def get_options(cls, parent_dataset=None, config=None): """ Get processor options @@ -59,11 +59,11 @@ def get_options(cls, parent_dataset=None, user=None): TCAT servers are configured. Otherwise, no options are given since there is nothing to choose. + :param config: :param DataSet parent_dataset: Dataset that will be uploaded - :param User user: User that will be uploading it - :return dict: Option definition """ - max_number_images = int(config.get('image-downloader-telegram.max', 1000, user=user)) + + max_number_images = int(config.get('image-downloader-telegram.max', 1000)) return { "amount": { @@ -88,11 +88,12 @@ def get_options(cls, parent_dataset=None, user=None): @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor on Telegram datasets with required info :param module: Dataset or processor to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ if type(module) is DataSet: # we need these to actually instantiate a telegram client and diff --git a/processors/visualisation/download-telegram-videos.py b/processors/visualisation/download-telegram-videos.py index ef6d44231..592517d5a 100644 --- a/processors/visualisation/download-telegram-videos.py +++ b/processors/visualisation/download-telegram-videos.py @@ -56,7 +56,7 @@ class TelegramVideoDownloader(BasicProcessor): } @classmethod - def get_options(cls, parent_dataset=None, user=None): + def get_options(cls, 
parent_dataset=None, config=None): """ Get processor options @@ -64,11 +64,11 @@ def get_options(cls, parent_dataset=None, user=None): TCAT servers are configured. Otherwise, no options are given since there is nothing to choose. + :param config: :param DataSet parent_dataset: Dataset that will be uploaded - :param User user: User that will be uploading it - :return dict: Option definition """ - max_videos = int(config.get('video-downloader-telegram.max_videos', 100, user=user)) + + max_videos = int(config.get('video-downloader-telegram.max_videos', 100)) return { "amount": { @@ -82,13 +82,14 @@ def get_options(cls, parent_dataset=None, user=None): @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor on Telegram datasets with required info :param module: Dataset or processor to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ - if not config.get("video-downloader-telegram.allow_videos", user=user): + if not config.get("video-downloader-telegram.allow_videos"): return False if type(module) is DataSet: diff --git a/processors/visualisation/download_images.py b/processors/visualisation/download_images.py index c13fd0fca..ac4b484e0 100644 --- a/processors/visualisation/download_images.py +++ b/processors/visualisation/download_images.py @@ -80,25 +80,26 @@ class ImageDownloader(BasicProcessor): } @classmethod - def get_options(cls, parent_dataset=None, user=None): - """ - Get processor options - - This method by default returns the class's "options" attribute, or an - empty dictionary. It can be redefined by processors that need more - fine-grained options, e.g. in cases where the availability of options - is partially determined by the parent dataset's parameters. 
- - :param DataSet parent_dataset: An object representing the dataset that - the processor would be run on - :param User user: Flask user the options will be displayed for, in - case they are requested for display in the 4CAT web interface. This can - be used to show some options only to privileges users. - """ + def get_options(cls, parent_dataset=None, config=None): + """ + Get processor options + + This method by default returns the class's "options" attribute, or an + empty dictionary. It can be redefined by processors that need more + fine-grained options, e.g. in cases where the availability of options + is partially determined by the parent dataset's parameters. + + :param config: + :param DataSet parent_dataset: An object representing the dataset that + the processor would be run on + :param User user: Flask user the options will be displayed for, in + case they are requested for display in the 4CAT web interface. This can + be used to show some options only to privileges users. + """ options = cls.options # Update the amount max and help from config - max_number_images = int(config.get('image-downloader.max', 1000, user=user)) + max_number_images = int(config.get('image-downloader.max', 1000)) options['amount']['max'] = max_number_images options['amount']['help'] = "No. 
of images (max %s)" % max_number_images @@ -122,11 +123,12 @@ def get_options(cls, parent_dataset=None, user=None): return options @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor on top image rankings :param module: Dataset or processor to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return (module.type == "top-images" or module.is_from_collector()) \ and module.type not in ["tiktok-search", "tiktok-urls-search", "telegram-search"] \ diff --git a/processors/visualisation/download_tiktok.py b/processors/visualisation/download_tiktok.py index c02b53bf7..9aba1763d 100644 --- a/processors/visualisation/download_tiktok.py +++ b/processors/visualisation/download_tiktok.py @@ -48,7 +48,7 @@ class TikTokVideoDownloader(BasicProcessor): } @classmethod - def get_options(cls, parent_dataset=None, user=None): + def get_options(cls, parent_dataset=None, config=None): """ Get processor options @@ -57,27 +57,28 @@ def get_options(cls, parent_dataset=None, user=None): fine-grained options, e.g. in cases where the availability of options is partially determined by the parent dataset's parameters. + :param config: :param DataSet parent_dataset: An object representing the dataset that the processor would be run on - :param User user: Flask user the options will be displayed for, in - case they are requested for display in the 4CAT web interface. This can +can be used to show some options only to privileges users. """ options = cls.options # Update the amount max and help from config - max_number_videos = int(config.get('video-downloader.max', 1000, user=user)) + max_number_videos = int(config.get('video-downloader.max', 1000)) options['amount']['max'] = max_number_videos options['amount']['help'] = f"No. 
of videos (max {max_number_videos:,})" return options @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor TikTok datasets :param module: Dataset or processor to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.type in ["tiktok-search", "tiktok-urls-search"] @@ -167,7 +168,7 @@ class TikTokImageDownloader(BasicProcessor): } @classmethod - def get_options(cls, parent_dataset=None, user=None): + def get_options(cls, parent_dataset=None, config=None): """ Get processor options @@ -177,26 +178,26 @@ def get_options(cls, parent_dataset=None, user=None): is partially determined by the parent dataset's parameters. :param DataSet parent_dataset: An object representing the dataset that - the processor would be run on - :param User user: Flask user the options will be displayed for, in - case they are requested for display in the 4CAT web interface. This can - be used to show some options only to privileges users. + the processor would be run on can be used to show some options only to + privileged users. + :param ConfigManager config: Configuration reader (context-aware) """ options = cls.options # Update the amount max and help from config - max_number_images = int(config.get("image-downloader.max", 1000, user=user)) + max_number_images = int(config.get("image-downloader.max", 1000)) options['amount']['max'] = max_number_images options['amount']['help'] = f"No. 
of images (max {max_number_images:,})" return options @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor TikTok datasets :param module: Dataset or processor to determine compatibility with + :param ConfigManager config: Configuration reader (context-aware) """ return module.type in ["tiktok-search", "tiktok-urls-search"] diff --git a/processors/visualisation/download_videos.py b/processors/visualisation/download_videos.py index 2b385ffe7..aa1142b24 100644 --- a/processors/visualisation/download_videos.py +++ b/processors/visualisation/download_videos.py @@ -152,15 +152,16 @@ def __init__(self, logger, job, queue=None, manager=None, modules=None): self.last_post_process_status = None @classmethod - def get_options(cls, parent_dataset=None, user=None): + def get_options(cls, parent_dataset=None, config=None): """ Updating columns with actual columns and setting max_number_videos per the max number of images allowed. + :param config: """ options = cls.options # Update the amount max and help from config - max_number_videos = int(config.get('video-downloader.max', 100, user=user)) + max_number_videos = int(config.get('video-downloader.max', 100)) if max_number_videos == 0: options['amount']['help'] = "No. of videos" options["amount"]["tooltip"] = "Use 0 to download all videos" @@ -169,7 +170,7 @@ def get_options(cls, parent_dataset=None, user=None): options['amount']['help'] = f"No. of videos (max {max_number_videos:,}" # And update the max size and help from config - max_video_size = int(config.get('video-downloader.max-size', 100, user=user)) + max_video_size = int(config.get('video-downloader.max-size', 100)) if max_video_size == 0: # Allow video of any size options["max_video_size"]["tooltip"] = "Set to 0 if all sizes are to be downloaded." 
@@ -205,7 +206,7 @@ def get_options(cls, parent_dataset=None, user=None): # these two options are likely to be unwanted on instances with many # users, so they are behind an admin config options - if config.get("video-downloader.allow-indirect", user=user): + if config.get("video-downloader.allow-indirect"): options["use_yt_dlp"] = { "type": UserInput.OPTION_TOGGLE, "help": "Also attempt to download non-direct video links (such YouTube and other video hosting sites)", @@ -213,7 +214,7 @@ def get_options(cls, parent_dataset=None, user=None): "tooltip": "If False, 4CAT will only download directly linked videos (works with fields like Twitter's \"video\", TikTok's \"video_url\" or Instagram's \"media_url\"), but if True 4CAT uses YT-DLP to download from YouTube and a number of other video hosting sites (see references)." } - if config.get("video-downloader.allow-multiple", user=user): + if config.get("video-downloader.allow-multiple"): options["channel_videos"] = { "type": UserInput.OPTION_TEXT, "help": "Download multiple videos per link? (only works w/ non-direct video links)", @@ -226,7 +227,7 @@ def get_options(cls, parent_dataset=None, user=None): return options @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Determine compatibility @@ -235,6 +236,7 @@ def is_compatible_with(cls, module=None, user=None): dataset anyway. 
:param module: Module to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) :return bool: """ return ((module.type.endswith("-search") or module.is_from_collector()) diff --git a/processors/visualisation/histwords.py b/processors/visualisation/histwords.py index f6ae05261..d0807045d 100644 --- a/processors/visualisation/histwords.py +++ b/processors/visualisation/histwords.py @@ -12,7 +12,7 @@ from gensim.models import KeyedVectors from backend.lib.processor import BasicProcessor -from common.lib.helpers import UserInput, convert_to_int, get_4cat_canvas +from common.lib.helpers import UserInput, convert_to_int, get_4cat_canvas, convert_to_float from common.lib.exceptions import ProcessorInterruptedException from svgwrite.container import SVG @@ -94,11 +94,12 @@ class HistWordsVectorSpaceVisualiser(BasicProcessor): } @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor on token sets :param module: Dataset or processor to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.type == "generate-embeddings" @@ -111,11 +112,7 @@ def process(self): return input_words = input_words.split(",") - - try: - threshold = float(self.parameters.get("threshold")) - except ValueError: - threshold = float(self.get_options()["threshold"]["default"]) + threshold = convert_to_float(self.parameters.get("threshold")) threshold = max(-1.0, min(1.0, threshold)) num_words = convert_to_int(self.parameters.get("num-words")) diff --git a/processors/visualisation/image_category_wall.py b/processors/visualisation/image_category_wall.py index d74d28e40..c4e2d3bc9 100644 --- a/processors/visualisation/image_category_wall.py +++ b/processors/visualisation/image_category_wall.py @@ -58,11 +58,12 @@ class ImageCategoryWallGenerator(BasicProcessor): } @classmethod - def is_compatible_with(cls, 
module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor on CLIP dataset only :param module: Dataset or processor to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.type.startswith("image-to-categories") or \ module.type.startswith("image-downloader") or \ @@ -71,12 +72,13 @@ def is_compatible_with(cls, module=None, user=None): not module.type not in ["image-downloader-screenshots-search"] @classmethod - def get_options(cls, parent_dataset=None, user=None): - """ - Collect maximum number of audio files from configuration and update options accordingly - """ - max_number_images = int(config.get("image-visuals.max_per_cat", 1000, user=user)) - max_pixels = int(config.get("image-visuals.max_pixels_per_image", 300, user=user)) + def get_options(cls, parent_dataset=None, config=None): + """ + Collect maximum number of audio files from configuration and update options accordingly + :param config: + """ + max_number_images = int(config.get("image-visuals.max_per_cat", 1000)) + max_pixels = int(config.get("image-visuals.max_pixels_per_image", 300)) options = { "category": { "type": UserInput.OPTION_TEXT, diff --git a/processors/visualisation/image_wall.py b/processors/visualisation/image_wall.py index 17afa79df..5dedcec3d 100644 --- a/processors/visualisation/image_wall.py +++ b/processors/visualisation/image_wall.py @@ -48,17 +48,18 @@ class ImageWallGenerator(BasicProcessor): } @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor on token sets :param module: Dataset or processor to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.get_media_type() == "image" or module.type.startswith("image-downloader") or module.type == "video-frames" @classmethod - def get_options(cls, parent_dataset=None, 
user=None): - max_number_images = int(config.get("image-visuals.max_images", 1000, user=user)) + def get_options(cls, parent_dataset=None, config=None): + max_number_images = int(config.get("image-visuals.max_images", 1000)) options = { "amount": { "type": UserInput.OPTION_TEXT, @@ -128,7 +129,7 @@ def numpy_to_rgb(numpy_array): # 0 = use as many images as in the archive, up to the max if max_images == 0: - max_images = self.get_options()["amount"]["max"] + max_images = self.config.get("image-visuals.max_images", 1000) # we loop through the images twice - once to reduce them to a value # that can be sorted, and another time to actually copy them to the diff --git a/processors/visualisation/image_wall_w_text.py b/processors/visualisation/image_wall_w_text.py index 70d079d1a..4cbeacb45 100644 --- a/processors/visualisation/image_wall_w_text.py +++ b/processors/visualisation/image_wall_w_text.py @@ -40,22 +40,24 @@ class ImageTextWallGenerator(BasicProcessor): combined_dataset = ["image-downloader-stable-diffusion"] @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor on CLIP dataset only :param module: Dataset or processor to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ image_dataset, text_dataset = cls.identity_dataset_types(module) return image_dataset is not None and text_dataset is not None @classmethod - def get_options(cls, parent_dataset=None, user=None): - """ - Collect maximum number of audio files from configuration and update options accordingly - """ - max_number_images = int(config.get("image-visuals.max_per_cat", 1000, user=user)) - max_pixels = int(config.get("image-visuals.max_pixels_per_image", 500, user=user)) + def get_options(cls, parent_dataset=None, config=None): + """ + Collect maximum number of audio files from configuration and update options accordingly + :param config: + """ + 
max_number_images = int(config.get("image-visuals.max_per_cat", 1000)) + max_pixels = int(config.get("image-visuals.max_pixels_per_image", 500)) options = { "amount": { "type": UserInput.OPTION_TEXT, diff --git a/processors/visualisation/isoviz.py b/processors/visualisation/isoviz.py index 7b0526e80..b74b00f09 100644 --- a/processors/visualisation/isoviz.py +++ b/processors/visualisation/isoviz.py @@ -75,11 +75,12 @@ class IsometricMultigraphRenderer(BasicProcessor): colour_index = 0 @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor on rankable items :param module: Dataset or processor to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ if module.is_dataset(): return module.is_rankable(multiple_items=False) diff --git a/processors/visualisation/rankflow.py b/processors/visualisation/rankflow.py index 37f5ae832..a356b9b10 100644 --- a/processors/visualisation/rankflow.py +++ b/processors/visualisation/rankflow.py @@ -93,11 +93,12 @@ class RankFlowRenderer(BasicProcessor): [0.179, 1.0, 0.475], [0.108, 0.502, 0.914], [0.096, 1.0, 0.502], [0.123, 1.0, 0.69]] @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor on rankable items :param module: Dataset or processor to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.is_rankable() diff --git a/processors/visualisation/vector_histogram.py b/processors/visualisation/vector_histogram.py index 9ba6c52f0..2a7df433e 100644 --- a/processors/visualisation/vector_histogram.py +++ b/processors/visualisation/vector_histogram.py @@ -39,11 +39,12 @@ class SVGHistogramRenderer(BasicProcessor): } @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ 
Allow processor on rankable items :param module: Dataset or processor to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.is_rankable(multiple_items=False) diff --git a/processors/visualisation/video_frames.py b/processors/visualisation/video_frames.py index 64b0c4f34..3cc72d083 100644 --- a/processors/visualisation/video_frames.py +++ b/processors/visualisation/video_frames.py @@ -59,12 +59,14 @@ class VideoFrames(BasicProcessor): } @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow on videos + + :param ConfigManager|None config: Configuration reader (context-aware) """ return (module.get_media_type() == "video" or module.type.startswith("video-downloader")) and \ - config.get("video-downloader.ffmpeg_path", user=user) and \ + config.get("video-downloader.ffmpeg_path") and \ shutil.which(config.get("video-downloader.ffmpeg_path")) def process(self): diff --git a/processors/visualisation/video_hasher.py b/processors/visualisation/video_hasher.py index ff1222bc1..e495f50fa 100644 --- a/processors/visualisation/video_hasher.py +++ b/processors/visualisation/video_hasher.py @@ -37,7 +37,7 @@ class VideoHasherPreset(ProcessorPreset): extension = "gexf" @classmethod - def get_options(cls, parent_dataset=None, user=None): + def get_options(cls, parent_dataset=None, config=None): return { "amount": { "type": UserInput.OPTION_TEXT, @@ -66,7 +66,7 @@ def get_options(cls, parent_dataset=None, user=None): } @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Determine compatibility @@ -77,7 +77,7 @@ def is_compatible_with(cls, module=None, user=None): :return bool: """ return (module.get_media_type() == "video" or module.type.startswith("video-downloader")) and \ - config.get("video-downloader.ffmpeg_path", user=user) and \ + 
config.get("video-downloader.ffmpeg_path") and \ shutil.which(config.get("video-downloader.ffmpeg_path")) def get_processor_pipeline(self): @@ -135,10 +135,11 @@ class VideoHasher(BasicProcessor): followups = ["video-hash-network", "video-hash-similarity-matrix"] @classmethod - def get_options(cls, parent_dataset=None, user=None): - """ - Options for the processor - """ + def get_options(cls, parent_dataset=None, config=None): + """ + Options for the processor + :param config: + """ options = { "amount": { "type": UserInput.OPTION_TEXT, @@ -160,7 +161,7 @@ def get_options(cls, parent_dataset=None, user=None): return options @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow on videos only """ @@ -322,7 +323,7 @@ class VideoHashNetwork(BasicProcessor): ] @classmethod - def get_options(cls, parent_dataset=None, user=None): + def get_options(cls, parent_dataset=None, config=None): return {"percent": { "type": UserInput.OPTION_TEXT, "help": "Percent similar", @@ -332,7 +333,7 @@ def get_options(cls, parent_dataset=None, user=None): }} @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow on video hasher """ @@ -443,7 +444,7 @@ class VideoHashSimilarities(BasicProcessor): ] @classmethod - def get_options(cls, parent_dataset=None, user=None): + def get_options(cls, parent_dataset=None, config=None): return {"percent": { "type": UserInput.OPTION_TEXT, "help": "Percent similar", @@ -453,7 +454,7 @@ def get_options(cls, parent_dataset=None, user=None): }} @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow on video hasher """ diff --git a/processors/visualisation/video_scene_frames.py b/processors/visualisation/video_scene_frames.py index 8edb520ff..a18784825 100644 --- a/processors/visualisation/video_scene_frames.py +++ 
b/processors/visualisation/video_scene_frames.py @@ -52,7 +52,7 @@ class VideoSceneFrames(BasicProcessor): } @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Determine compatibility diff --git a/processors/visualisation/video_scene_identifier.py b/processors/visualisation/video_scene_identifier.py index 634e8c49d..dfd544c81 100644 --- a/processors/visualisation/video_scene_identifier.py +++ b/processors/visualisation/video_scene_identifier.py @@ -140,7 +140,7 @@ class VideoSceneDetector(BasicProcessor): } @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow on videos """ diff --git a/processors/visualisation/video_stack.py b/processors/visualisation/video_stack.py index 87da480b9..2aa7314e1 100644 --- a/processors/visualisation/video_stack.py +++ b/processors/visualisation/video_stack.py @@ -81,11 +81,12 @@ class VideoStack(BasicProcessor): } @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Determine compatibility :param DataSet module: Module ID to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) :return bool: """ if not (module.get_media_type() == "video" or module.type.startswith("video-downloader")): @@ -94,7 +95,7 @@ def is_compatible_with(cls, module=None, user=None): # Only check these if we have a video dataset # also need ffprobe to determine video lengths # is usually installed in same place as ffmpeg - ffmpeg_path = shutil.which(config.get("video-downloader.ffmpeg_path", user=user)) + ffmpeg_path = shutil.which(config.get("video-downloader.ffmpeg_path")) ffprobe_path = shutil.which("ffprobe".join(ffmpeg_path.rsplit("ffmpeg", 1))) if ffmpeg_path else None return ffmpeg_path and ffprobe_path diff --git a/processors/visualisation/video_timelines.py 
b/processors/visualisation/video_timelines.py index f668e6f5e..c01451040 100644 --- a/processors/visualisation/video_timelines.py +++ b/processors/visualisation/video_timelines.py @@ -52,7 +52,7 @@ class VideoTimelines(BasicProcessor): } @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Determine compatibility @@ -61,6 +61,7 @@ def is_compatible_with(cls, module=None, user=None): archive. Each folder will be rendered as a separate timeline. :param str module: Module ID to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) :return bool: """ return module.type in ["video-frames", "video-scene-frames"] diff --git a/processors/visualisation/word-cloud.py b/processors/visualisation/word-cloud.py index bc3b92bcc..c9b9af464 100644 --- a/processors/visualisation/word-cloud.py +++ b/processors/visualisation/word-cloud.py @@ -25,16 +25,17 @@ class MakeWordCloud(BasicProcessor): extension = "svg" @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor on rankable items :param module: Dataset or processor to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.type in ("tfidf", "collocations", "vector-ranker", "similar-word2vec", "topic-model-words", "extract-nouns", "get-entities") @classmethod - def get_options(self, parent_dataset=None, user=None): + def get_options(cls, parent_dataset=None, config=None): options = {} if not parent_dataset: diff --git a/processors/visualisation/word-trees.py b/processors/visualisation/word-trees.py index f7783bcc1..4265db332 100644 --- a/processors/visualisation/word-trees.py +++ b/processors/visualisation/word-trees.py @@ -105,11 +105,12 @@ class MakeWordtree(BasicProcessor): } @classmethod - def is_compatible_with(cls, module=None, user=None): + def 
is_compatible_with(cls, module=None, config=None): """ Allow processor to run on all csv and NDJSON datasets :param module: Dataset or processor to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.get_extension() in ("csv", "ndjson") @@ -163,7 +164,6 @@ def process(self): self.dataset.finish(0) return - window = min(window, self.get_options()["window"]["max"] + 1) window = max(1, window) # determine what tokenisation strategy to use diff --git a/processors/visualisation/youtube_imagewall.py b/processors/visualisation/youtube_imagewall.py index e51f40d5a..3fefca9eb 100644 --- a/processors/visualisation/youtube_imagewall.py +++ b/processors/visualisation/youtube_imagewall.py @@ -47,11 +47,12 @@ class YouTubeImageWall(BasicProcessor): } @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor on YouTube thumbnail sets :param module: Dataset or processor to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.type == "youtube-thumbnails" diff --git a/processors/visualisation/youtube_thumbnails.py b/processors/visualisation/youtube_thumbnails.py index 844df8ac6..fce706486 100644 --- a/processors/visualisation/youtube_thumbnails.py +++ b/processors/visualisation/youtube_thumbnails.py @@ -36,11 +36,12 @@ class YouTubeThumbnails(BasicProcessor): sleep_time = 10 @classmethod - def is_compatible_with(cls, module=None, user=None): + def is_compatible_with(cls, module=None, config=None): """ Allow processor on YouTube metadata sets :param module: Dataset or processor to determine compatibility with + :param ConfigManager|None config: Configuration reader (context-aware) """ return module.type == "youtube-metadata" diff --git a/webtool/__init__.py b/webtool/__init__.py index 4c243d7a7..036af7f10 100644 --- a/webtool/__init__.py +++ b/webtool/__init__.py @@ 
-17,14 +17,14 @@ print("stderr:\n".join([" " + line for line in result.stderr.decode("utf-8").split("\n")])) exit(1) -from flask import Flask -from flask_login import LoginManager +from flask import Flask, request +from flask_login import LoginManager, current_user from flask_limiter import Limiter from flask_limiter.util import get_remote_address from werkzeug.middleware.proxy_fix import ProxyFix from werkzeug import Request -from common.config_manager import config +from common.config_manager import config, ConfigWrapper from common.lib.database import Database from common.lib.logger import Logger from common.lib.queue import JobQueue @@ -36,7 +36,6 @@ # initialize global objects for interacting with all the things login_manager = LoginManager() app = Flask(__name__) -fourcat_modules = ModuleCollector() # this ensures that HTTPS is properly applied to built URLs even if the app # is running behind a proxy @@ -105,6 +104,10 @@ # Set number of form parts to accept (default is 1000; affects number of files that can be uploaded) Request.max_form_parts = config.get("flask.max_form_parts", 1000) +# prepare for views +config = ConfigWrapper(config, user=current_user, request=request) +fourcat_modules = ModuleCollector(config=config) + # import all views import webtool.views.views_admin import webtool.views.views_extensions diff --git a/webtool/lib/template_filters.py b/webtool/lib/template_filters.py index 6ac9272ba..dbd749235 100644 --- a/webtool/lib/template_filters.py +++ b/webtool/lib/template_filters.py @@ -12,9 +12,7 @@ from urllib.parse import urlencode, urlparse from webtool import app, config from common.lib.helpers import timify_long -from common.config_manager import ConfigWrapper -from flask import request from flask_login import current_user @app.template_filter('datetime') @@ -372,9 +370,7 @@ def uniqid(): """ return str(uuid.uuid4()) - wrapped_config = ConfigWrapper(config, user=current_user, request=request) - - cv_path = 
wrapped_config.get("PATH_ROOT").joinpath("config/.current-version") + cv_path = config.get("PATH_ROOT").joinpath("config/.current-version") if cv_path.exists(): with cv_path.open() as infile: version = infile.readline().strip() @@ -383,11 +379,12 @@ def uniqid(): return { - "__has_https": wrapped_config.get("flask.https"), + "__has_https": config.get("flask.https"), "__datenow": datetime.datetime.utcnow(), "__notifications": current_user.get_notifications(), - "__user_config": lambda setting: wrapped_config.get(setting), - "__user_cp_access": any([wrapped_config.get(p) for p in config.config_definition.keys() if p.startswith("privileges.admin")]), + "__user_config": lambda setting: config.get(setting), + "__config": config, + "__user_cp_access": any([config.get(p) for p in config.config_definition.keys() if p.startswith("privileges.admin")]), "__version": version, "uniqid": uniqid } diff --git a/webtool/templates/components/result-child.html b/webtool/templates/components/result-child.html index 36fb64136..4d88a53ce 100644 --- a/webtool/templates/components/result-child.html +++ b/webtool/templates/components/result-child.html @@ -1,5 +1,5 @@ {% set item = child %} -{% set can_process = (current_user.is_authenticated and item.get_available_processors(user=current_user)) %} +{% set can_process = (current_user.is_authenticated and item.get_available_processors(config=__config)) %} {% set deprecated = (item.type not in processors) %} {% set is_filtered = (not deprecated and processors[item.type].is_filter()) %} @@ -143,7 +143,7 @@

{{ processors[item.type].title if not deprecated else "(Deprecated analysis) {# 'More' button to show further analysis and preview #}
- {% if item.is_finished() and (item.children or (item.num_rows > 0 and current_user.is_authenticated and item.get_available_processors(exclude_hidden=True))) %} + {% if item.is_finished() and (item.children or (item.num_rows > 0 and current_user.is_authenticated and item.get_available_processors(exclude_hidden=True, config=__config))) %}