Refactor auto-archiver to use a modular structure for feeders/extractors/enrichers etc. #185

Open · wants to merge 83 commits into base: main

Commits (83)
c41d93a Use already implemented helper to get version (pjrobertson, Jan 21, 2025)
bdfc855 Ignore pylint statements for manifest files (pjrobertson, Jan 21, 2025)
03f3770 Add __manifest__.py for generic_extractor (pjrobertson, Jan 21, 2025)
241b350 Initial changes to move to '__manifest__' format (pjrobertson, Jan 21, 2025)
4830f99 Get parsing of manifest and combining with config file working (pjrobertson, Jan 21, 2025)
7b3a146 Create manifest files for archiver modules. (erinhmclark, Jan 21, 2025)
54995ad Further tweaks based on __manifest__.py files (pjrobertson, Jan 22, 2025)
b6b0858 Switch back to using yaml with dot notation (pjrobertson, Jan 22, 2025)
ade5ea0 Tidy up imports + start on loading modules - program now starts much … (pjrobertson, Jan 22, 2025)
99c8c69 Manifests for databases (erinhmclark, Jan 22, 2025)
c517d35 Merge branch 'load_modules' into more_mainifests (erinhmclark, Jan 22, 2025)
550097a Get module loading working properly (pjrobertson, Jan 22, 2025)
65ef46d Fix loading already loaded modules - don't load them twice (pjrobertson, Jan 22, 2025)
79684f8 Set up feeder manifests (not merged by source yet) (erinhmclark, Jan 23, 2025)
9db26cd Merge branch 'load_modules' into more_mainifests (erinhmclark, Jan 23, 2025)
1274a1b More manifests, base modules and rename from archiver to extractor. (erinhmclark, Jan 23, 2025)
c3403ce Rename storages for clarity (erinhmclark, Jan 23, 2025)
50f4ebc Move storage configs into individual manifests, assert format on useage. (erinhmclark, Jan 23, 2025)
b27bf8f Fix up loading/storing configs + unit tests (pjrobertson, Jan 23, 2025)
06f6e34 Revert changes to orchestrator to avoid merge conflicts (pjrobertson, Jan 23, 2025)
9befb97 Fix loading modules when entry_point isn't set (pjrobertson, Jan 23, 2025)
cbafbfa Revert Dockerfile changes (erinhmclark, Jan 24, 2025)
ba4b330 Merge remote-tracking branch 'origin/more_mainifests' into more_maini… (erinhmclark, Jan 24, 2025)
aa7ca93 Update manifests and modules (erinhmclark, Jan 24, 2025)
0453d95 fix config parsing in manifests (erinhmclark, Jan 24, 2025)
024fe58 fix config parsing in manifests, remove module level configs (erinhmclark, Jan 24, 2025)
1942e8b Gsheets utility revert (erinhmclark, Jan 24, 2025)
f1e9ab6 Merge branch 'main' into load_modules (pjrobertson, Jan 24, 2025)
3fc6ddf Tweaks to logging strings (pjrobertson, Jan 24, 2025)
dd402b4 Fix and add types to manifest (erinhmclark, Jan 24, 2025)
96b35a2 Rm gsheet references in utils (erinhmclark, Jan 24, 2025)
21a7ff0 Fix types in manifests (erinhmclark, Jan 27, 2025)
ebebd27 Fix archiver to extractor naming (erinhmclark, Jan 27, 2025)
0b03f54 Fix up config validation, and allow for custom 'validators' (pjrobertson, Jan 27, 2025)
14e2479 Merge branch 'more_mainifests' into load_modules (pjrobertson, Jan 27, 2025)
7fd9586 Further fixes/changes to loading 'types' for config + manifest edits (pjrobertson, Jan 27, 2025)
f68e272 Refactor loader + step into module, use LazyBaseModule and BaseModule (pjrobertson, Jan 27, 2025)
e307401 Fix loading/saving to orchestration file with comments (pjrobertson, Jan 27, 2025)
e1a9373 Refactoring for new config setup (erinhmclark, Jan 27, 2025)
6c67eff remove name reference in local_storage.py (erinhmclark, Jan 27, 2025)
57b3bec Google sheets feeder and database implemented. (erinhmclark, Jan 27, 2025)
1d2a1d4 Allow framework for config settings that should not be stored in conf… (pjrobertson, Jan 28, 2025)
27b25c5 Validate orchestration.yaml file inputs - so if a user enters invalid… (pjrobertson, Jan 28, 2025)
9635449 more user friendly error logging when config issues are found (pjrobertson, Jan 28, 2025)
7a4871d Fix up unit tests for new structure (pjrobertson, Jan 28, 2025)
dcd5576 set metadata enricher to requires_setup=True (requires exiftool which… (pjrobertson, Jan 28, 2025)
3d37c49 Tidy ups + unit tests: (pjrobertson, Jan 29, 2025)
00a7018 Fix up dependency checking (use 'dependencies' instead of 'external_d… (pjrobertson, Jan 29, 2025)
18ff36c Add ruamel to dependencies (replaces pyyaml) (pjrobertson, Jan 29, 2025)
cddae65 Update modules for new core structure. (erinhmclark, Jan 30, 2025)
b7d9145 Further tidyups + refactoring for new structure (pjrobertson, Jan 30, 2025)
fade68c Fix up unit tests - dataclass + subclasses not having @dataclass was … (pjrobertson, Jan 30, 2025)
5274388 Fix manifests for required configs. (erinhmclark, Jan 30, 2025)
953011f Don't make modules 'dataclasses' (pjrobertson, Jan 30, 2025)
d6b4b7a Further cleanup (pjrobertson, Jan 30, 2025)
d76063c Fix unit tests (pjrobertson, Jan 30, 2025)
c25d5ca Remove ArchivingContext completely (pjrobertson, Jan 30, 2025)
9a8c94b Fix getting/setting folder context for metadata (pjrobertson, Feb 3, 2025)
9c9e9b3 Remove lingering reference to ArchivingContext (pjrobertson, Feb 3, 2025)
7a2be5a Add cookie extraction to 'authentication' options, get generic_extrac… (pjrobertson, Feb 3, 2025)
7ec328a Remove cookie options from generic_extractor - it now uses 'authentic… (pjrobertson, Feb 3, 2025)
c574b69 Set up screenshot enricher to use authentication/cookies (pjrobertson, Feb 3, 2025)
72b5ea9 Restore headless arg (pjrobertson, Feb 3, 2025)
a873e56 Remove old csv_feeder file - now inside a module (pjrobertson, Feb 4, 2025)
b301f60 Fix using validators set in __manifest__.py (pjrobertson, Feb 4, 2025)
78e6418 Unit tests for csv feeder + fix some bugs (pjrobertson, Feb 4, 2025)
034197a Fix typos in csv feeder docs (in manifest) (pjrobertson, Feb 4, 2025)
0633e17 Close the facebook 'login' window if it's there - to allow for proper… (pjrobertson, Feb 4, 2025)
91ca325 Update yt-dlp to latest version + remove code no longer needed from b… (pjrobertson, Feb 4, 2025)
48abb5e Remove dangling screenshot_enricher file. Moved to modules/screenshot… (pjrobertson, Feb 4, 2025)
6ab8fd2 Tidy up setting modules as Orchestrator attributes on startup. (pjrobertson, Feb 5, 2025)
a506f2a Clarify that an extractor's method can also return False if no valid… (pjrobertson, Feb 6, 2025)
63aba6a Fix sphinx-autoapi imports (pjrobertson, Feb 7, 2025)
1fad37f Remove blank file (pjrobertson, Feb 7, 2025)
e9dd321 Fix setting cli_feeder as default feeder on clean install (pjrobertson, Feb 10, 2025)
74207d7 Implementation tests for auto-archiver (pjrobertson, Feb 10, 2025)
f3f6b92 Implementation test cleanup (pjrobertson, Feb 10, 2025)
7c84804 adds better info about wrong/missing modules (msramalho, Feb 10, 2025)
8fb3dc7 fixing telethon extractor to use default entrypoint (msramalho, Feb 10, 2025)
15abf68 decouples s3_storage from hash_enricher (msramalho, Feb 10, 2025)
ab6cf52 fixes bad hash initialization (msramalho, Feb 10, 2025)
12f14cc fixes gsheet feeder<->db connection via context. (msramalho, Feb 10, 2025)
ed81dcd Remove dangling 'b = ' from config.py (pjrobertson, Feb 10, 2025)

Files changed
3 changes: 3 additions & 0 deletions .pylintrc
@@ -0,0 +1,3 @@
[MAIN]

ignore-patterns=(.*tests.*.py, __manifest__.py)
2 changes: 1 addition & 1 deletion README.md
@@ -218,7 +218,7 @@ configurations:
 ## Running on Google Sheets Feeder (gsheet_feeder)
 The `--gsheet_feeder.sheet` property is the name of the Google Sheet to check for URLs.
 This sheet must have been shared with the Google Service account used by `gspread`.
-This sheet must also have specific columns (case-insensitive) in the `header` as specified in [Gsheet.configs](src/auto_archiver/utils/gsheet.py). The default names of these columns and their purpose is:
+This sheet must also have specific columns (case-insensitive) in the `header` as specified in [gsheet_feeder.__manifest__.py](src/auto_archiver/modules/gsheet_feeder/__manifest__.py). The default names of these columns and their purpose is:
 
 Inputs:
204 changes: 167 additions & 37 deletions poetry.lock

Large diffs are not rendered by default.

6 changes: 4 additions & 2 deletions pyproject.toml
@@ -37,7 +37,6 @@ dependencies = [
     "pdqhash (>=0.0.0)",
     "pillow (>=0.0.0)",
     "python-slugify (>=0.0.0)",
-    "pyyaml (>=0.0.0)",
     "dateparser (>=0.0.0)",
     "python-twitter-v2 (>=0.0.0)",
     "instaloader (>=0.0.0)",
@@ -47,7 +46,7 @@ dependencies = [
     "cryptography (>=41.0.0,<42.0.0)",
     "boto3 (>=1.28.0,<2.0.0)",
     "dataclasses-json (>=0.0.0)",
-    "yt-dlp (==2025.1.12)",
+    "yt-dlp (>=2025.1.26,<2026.0.0)",
     "numpy (==2.1.3)",
     "vk-url-scraper (>=0.0.0)",
     "requests[socks] (>=0.0.0)",
@@ -57,11 +56,14 @@ dependencies = [
     "retrying (>=0.0.0)",
     "tsp-client (>=0.0.0)",
     "certvalidator (>=0.0.0)",
+    "rich-argparse (>=1.6.0,<2.0.0)",
+    "ruamel-yaml (>=0.18.10,<0.19.0)",
 ]
 
 [tool.poetry.group.dev.dependencies]
 pytest = "^8.3.4"
 autopep8 = "^2.3.1"
+pytest-loguru = "^0.4.0"
 
 [tool.poetry.group.docs.dependencies]
 sphinx = "^8.1.3"
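
The dependency change above swaps pyyaml for ruamel-yaml (commit 18ff36c), which is what lets commit e307401 load and save the orchestration file without destroying its comments. A minimal sketch of that round-trip behaviour; the steps/feeders keys here are illustrative, not the project's actual schema:

```
# Round-trip YAML editing with ruamel-yaml: comments survive a
# load/modify/dump cycle, which plain pyyaml cannot do.
from io import StringIO

from ruamel.yaml import YAML

source = """\
steps:
  feeders: [cli_feeder]  # default feeder on a clean install
"""

yaml = YAML()  # round-trip mode is the default
data = yaml.load(source)
data["steps"]["extractors"] = ["generic_extractor"]  # programmatic edit

out = StringIO()
yaml.dump(data, out)
print(out.getvalue())  # the inline comment is still present
```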
43 changes: 23 additions & 20 deletions scripts/create_update_gdrive_oauth_token.py
@@ -12,7 +12,7 @@
 # Code below from https://developers.google.com/drive/api/quickstart/python
 # Example invocation: py scripts/create_update_gdrive_oauth_token.py -c secrets/credentials.json -t secrets/gd-token.json
 
-SCOPES = ['https://www.googleapis.com/auth/drive']
+SCOPES = ["https://www.googleapis.com/auth/drive.file"]
 
 
 @click.command(
@@ -23,67 +23,70 @@
     "-c",
     type=click.Path(exists=True),
     help="path to the credentials.json file downloaded from https://console.cloud.google.com/apis/credentials",
-    required=True
+    required=True,
 )
 @click.option(
     "--token",
     "-t",
     type=click.Path(exists=False),
     default="gd-token.json",
     help="file where to place the OAuth token, defaults to gd-token.json which you must then move to where your orchestration file points to, defaults to gd-token.json",
-    required=True
+    required=True,
 )
 def main(credentials, token):
     # The file token.json stores the user's access and refresh tokens, and is
     # created automatically when the authorization flow completes for the first time.
     creds = None
     if os.path.exists(token):
-        with open(token, 'r') as stream:
+        with open(token, "r") as stream:
             creds_json = json.load(stream)
         # creds = Credentials.from_authorized_user_file(creds_json, SCOPES)
-        creds_json['refresh_token'] = creds_json.get("refresh_token", "")
+        creds_json["refresh_token"] = creds_json.get("refresh_token", "")
         creds = Credentials.from_authorized_user_info(creds_json, SCOPES)
 
     # If there are no (valid) credentials available, let the user log in.
     if not creds or not creds.valid:
         if creds and creds.expired and creds.refresh_token:
-            print('Requesting new token')
+            print("Requesting new token")
             creds.refresh(Request())
         else:
-            print('First run through so putting up login dialog')
+            print("First run through so putting up login dialog")
             # credentials.json downloaded from https://console.cloud.google.com/apis/credentials
             flow = InstalledAppFlow.from_client_secrets_file(credentials, SCOPES)
             creds = flow.run_local_server(port=55192)
         # Save the credentials for the next run
-        with open(token, 'w') as token:
-            print('Saving new token')
+        with open(token, "w") as token:
+            print("Saving new token")
             token.write(creds.to_json())
     else:
-        print('Token valid')
+        print("Token valid")
 
     try:
-        service = build('drive', 'v3', credentials=creds)
+        service = build("drive", "v3", credentials=creds)
 
         # About the user
         results = service.about().get(fields="*").execute()
-        emailAddress = results['user']['emailAddress']
+        emailAddress = results["user"]["emailAddress"]
         print(emailAddress)
 
         # Call the Drive v3 API and return some files
-        results = service.files().list(
-            pageSize=10, fields="nextPageToken, files(id, name)").execute()
-        items = results.get('files', [])
+        results = (
+            service.files()
+            .list(pageSize=10, fields="nextPageToken, files(id, name)")
+            .execute()
+        )
+        items = results.get("files", [])
 
         if not items:
-            print('No files found.')
+            print("No files found.")
             return
-        print('Files:')
+        print("Files:")
         for item in items:
-            print(u'{0} ({1})'.format(item['name'], item['id']))
+            print("{0} ({1})".format(item["name"], item["id"]))
 
     except HttpError as error:
-        print(f'An error occurred: {error}')
+        print(f"An error occurred: {error}")
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
29 changes: 29 additions & 0 deletions scripts/telegram_setup.py
@@ -0,0 +1,29 @@
"""
This script is used to create a new session file for the Telegram client.
To do this you must first create a Telegram application at https://my.telegram.org/apps
and store your id and hash in the environment variables TELEGRAM_API_ID and TELEGRAM_API_HASH.
Create a .env file, or add the following to your environment:
```
export TELEGRAM_API_ID=[YOUR_ID_HERE]
export TELEGRAM_API_HASH=[YOUR_HASH_HERE]
```
Then run this script to create a new session file.

You will need to provide your phone number and a 2FA code the first time you run this script.
"""


import os
from telethon.sync import TelegramClient
from loguru import logger


# Create a Telegram client session using the API credentials from the environment
API_ID = os.getenv("TELEGRAM_API_ID")
API_HASH = os.getenv("TELEGRAM_API_HASH")
SESSION_FILE = "secrets/anon-insta"

os.makedirs("secrets", exist_ok=True)
with TelegramClient(SESSION_FILE, API_ID, API_HASH) as client:
    logger.success(f"New session file created: {SESSION_FILE}.session")
7 changes: 0 additions & 7 deletions src/auto_archiver/__init__.py

This file was deleted.

10 changes: 3 additions & 7 deletions src/auto_archiver/__main__.py
@@ -1,13 +1,9 @@
 """ Entry point for the auto_archiver package. """
-from . import Config
-from . import ArchivingOrchestrator
+from auto_archiver.core.orchestrator import ArchivingOrchestrator
+import sys
 
 def main():
-    config = Config()
-    config.parse()
-    orchestrator = ArchivingOrchestrator(config)
-    for r in orchestrator.feed(): pass
-
+    ArchivingOrchestrator().run(sys.argv[1:])
 
 if __name__ == "__main__":
     main()
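
Since the new entry point collapses to a single ArchivingOrchestrator().run(argv) call, the archiver can also be driven programmatically. A sketch under stated assumptions: the import path comes from the diff above, but the "--config" flag and the secrets/orchestration.yaml path are illustrative, not confirmed option names:

```
# Programmatic equivalent of `python -m auto_archiver`, mirroring the new
# __main__.py. run() receives an argv list and parses it exactly as the
# command line would; "--config secrets/orchestration.yaml" is an assumed
# flag for illustration only.
from auto_archiver.core.orchestrator import ArchivingOrchestrator


def archive_with_config(argv: list[str]) -> None:
    ArchivingOrchestrator().run(argv)


if __name__ == "__main__":
    archive_with_config(["--config", "secrets/orchestration.yaml"])
```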
16 changes: 0 additions & 16 deletions src/auto_archiver/archivers/__init__.py

This file was deleted.

1 change: 0 additions & 1 deletion src/auto_archiver/archivers/generic_archiver/__init__.py

This file was deleted.

2 changes: 0 additions & 2 deletions src/auto_archiver/archivers/youtubedl_archiver.py

This file was deleted.

12 changes: 9 additions & 3 deletions src/auto_archiver/core/__init__.py
@@ -3,9 +3,15 @@
 """
 from .metadata import Metadata
 from .media import Media
-from .step import Step
-from .context import ArchivingContext
+from .module import BaseModule
 
 # cannot import ArchivingOrchestrator/Config to avoid circular dep
 # from .orchestrator import ArchivingOrchestrator
-# from .config import Config
+# from .config import Config
+
+from .database import Database
+from .enricher import Enricher
+from .feeder import Feeder
+from .storage import Storage
+from .extractor import Extractor
+from .formatter import Formatter
142 changes: 142 additions & 0 deletions src/auto_archiver/core/base_module.py
@@ -0,0 +1,142 @@

from urllib.parse import urlparse
from typing import Mapping, Any
from abc import ABC
from copy import deepcopy, copy
from tempfile import TemporaryDirectory
from auto_archiver.utils import url as UrlUtil

from loguru import logger

class BaseModule(ABC):

    """
    Base module class. All modules should inherit from this class.

    The exact methods a class implements will depend on the type of module it is,
    however all modules have a .setup(config: dict) method to run any setup code
    (e.g. logging in to a site, spinning up a browser etc.)

    See BaseModule.MODULE_TYPES for the types of modules you can create, noting that
    a subclass can be of multiple types. For example, a module that extracts data from
    a website and stores it in a database would be both an 'extractor' and a 'database' module.

    Each module is a python package, and should have a __manifest__.py file in the
    same directory as the module file. The __manifest__.py specifies the module information
    like name, author, version, dependencies etc. See BaseModule._DEFAULT_MANIFEST for the
    default manifest structure.
    """

    MODULE_TYPES = [
        'feeder',
        'extractor',
        'enricher',
        'database',
        'storage',
        'formatter'
    ]

    _DEFAULT_MANIFEST = {
        'name': '',  # the display name of the module
        'author': 'Bellingcat',  # creator of the module, leave this as Bellingcat or set your own name!
        'type': [],  # the type of the module, can be one or more of BaseModule.MODULE_TYPES
        'requires_setup': True,  # whether or not this module requires additional setup such as setting API Keys or installing additional software
        'description': '',  # a description of the module
        'dependencies': {},  # external dependencies, e.g. python packages or binaries, in dictionary format
        'entry_point': '',  # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName
        'version': '1.0',  # the version of the module
        'configs': {}  # any configuration options this module has, these will be exposed to the user in the config file or via the command line
    }

    config: Mapping[str, Any]
    authentication: Mapping[str, Mapping[str, str]]
    name: str

    # this is set by the orchestrator prior to archiving
    tmp_dir: TemporaryDirectory = None

    @property
    def storages(self) -> list:
        return self.config.get('storages', [])

    def setup(self, config: dict):

        authentication = config.get('authentication', {})
        # extract out concatenated sites
        for key, val in copy(authentication).items():
            if "," in key:
                for site in key.split(","):
                    authentication[site] = val
                del authentication[key]

        # this is important. Each instance is given its own deepcopied config, so modules cannot
        # change values to affect other modules
        config = deepcopy(config)
        authentication = deepcopy(config.pop('authentication', {}))

        self.authentication = authentication
        self.config = config
        for key, val in config.get(self.name, {}).items():
            setattr(self, key, val)

    def auth_for_site(self, site: str, extract_cookies=True) -> Mapping[str, Any]:
        """
        Returns the authentication information for a given site. This is used to authenticate
        with a site before extracting data. The site should be the domain of the site, e.g. 'twitter.com'

        extract_cookies: bool - whether or not to extract cookies from the given browser and return the
        cookie jar (disabling can speed up processing if you don't actually need the cookies jar)

        Currently, the dict can have keys of the following types:
        - username: str - the username to use for login
        - password: str - the password to use for login
        - api_key: str - the API key to use for login
        - api_secret: str - the API secret to use for login
        - cookie: str - a cookie string to use for login (specific to this site)
        - cookies_jar: YoutubeDLCookieJar | http.cookiejar.MozillaCookieJar - a cookie jar compatible with requests (e.g. `requests.get(cookies=cookie_jar)`)
        """
        # TODO: think about if/how we can deal with sites that have multiple domains (main one is x.com/twitter.com)
        # for now the user must enter them both, like "x.com,twitter.com" in their config. Maybe we just hard-code?

        site = UrlUtil.domain_for_url(site)
        # add the 'www' version of the site to the list of sites to check
        authdict = {}

        for to_try in [site, f"www.{site}"]:
            if to_try in self.authentication:
                authdict.update(self.authentication[to_try])
                break

        # do a fuzzy string match just to print a warning - don't use it since it's insecure
        if not authdict:
            for key in self.authentication.keys():
                if key in site or site in key:
                    logger.debug(f"Could not find exact authentication information for site '{site}'. \
did find information for '{key}' which is close, is this what you meant? \
If so, edit your authentication settings to make sure it exactly matches.")

        def get_ytdlp_cookiejar(args):
            import yt_dlp
            from yt_dlp import parse_options
            logger.debug(f"Extracting cookies from settings: {args[1]}")
            # parse_options returns a named tuple as follows, we only need the ydl_options part
            # collections.namedtuple('ParsedOptions', ('parser', 'options', 'urls', 'ydl_opts'))
            ytdlp_opts = getattr(parse_options(args), 'ydl_opts')
            return yt_dlp.YoutubeDL(ytdlp_opts).cookiejar

        # get the cookies jar, prefer the browser cookies over the file
        if 'cookies_from_browser' in self.authentication:
            authdict['cookies_from_browser'] = self.authentication['cookies_from_browser']
            if extract_cookies:
                authdict['cookies_jar'] = get_ytdlp_cookiejar(['--cookies-from-browser', self.authentication['cookies_from_browser']])
        elif 'cookies_file' in self.authentication:
            authdict['cookies_file'] = self.authentication['cookies_file']
            if extract_cookies:
                authdict['cookies_jar'] = get_ytdlp_cookiejar(['--cookies', self.authentication['cookies_file']])

        return authdict

    def repr(self):
        return f"Module<'{self.display_name}' (config: {self.config[self.name]})>"