From 70a8a8a9c3696e7a63aad5a0c11bbc4d8578faa9 Mon Sep 17 00:00:00 2001 From: Kaxil Naik Date: Thu, 23 Jan 2025 00:54:50 +0530 Subject: [PATCH] Remove Scarf tracking (#45865) --- README.md | 3 - RELEASE_NOTES.rst | 1 - .../local_commands/scheduler_command.py | 3 - airflow/config_templates/config.yml | 22 ---- airflow/reproducible_build.yaml | 4 +- airflow/settings.py | 7 - airflow/utils/usage_data_collection.py | 123 ------------------ docs/apache-airflow/faq.rst | 23 ---- .../installation/installing-from-pypi.rst | 6 - tests/core/test_settings.py | 24 ---- tests/utils/test_usage_data_collection.py | 104 --------------- 11 files changed, 2 insertions(+), 318 deletions(-) delete mode 100644 airflow/utils/usage_data_collection.py delete mode 100644 tests/utils/test_usage_data_collection.py diff --git a/README.md b/README.md index 64207dbe70d16..4dcf885a72e8f 100644 --- a/README.md +++ b/README.md @@ -536,6 +536,3 @@ The CI infrastructure for Apache Airflow has been sponsored by: astronomer.io AWS OpenSource - - -Tracking Pixel diff --git a/RELEASE_NOTES.rst b/RELEASE_NOTES.rst index 6bcc06898df7a..09ccb3da1cc41 100644 --- a/RELEASE_NOTES.rst +++ b/RELEASE_NOTES.rst @@ -220,7 +220,6 @@ Scarf based telemetry: Airflow now collect telemetry data (#39510) """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" Airflow integrates Scarf to collect basic usage data during operation. Deployments can opt-out of data collection by setting the ``[usage_data_collection]enabled`` option to ``False``, or the ``SCARF_ANALYTICS=false`` environment variable. -See :ref:`Usage data collection FAQ ` for more information. Datasets no longer trigger inactive DAGs (#38891) """"""""""""""""""""""""""""""""""""""""""""""""" diff --git a/airflow/cli/commands/local_commands/scheduler_command.py b/airflow/cli/commands/local_commands/scheduler_command.py index 05e3de1282e93..35a9ca8703b52 100644 --- a/airflow/cli/commands/local_commands/scheduler_command.py +++ b/airflow/cli/commands/local_commands/scheduler_command.py @@ -32,7 +32,6 @@ from airflow.utils import cli as cli_utils from airflow.utils.providers_configuration_loader import providers_configuration_loaded from airflow.utils.scheduler_health import serve_health_check -from airflow.utils.usage_data_collection import usage_data_collection log = logging.getLogger(__name__) @@ -50,8 +49,6 @@ def scheduler(args: Namespace): """Start Airflow Scheduler.""" print(settings.HEADER) - usage_data_collection() - run_command_with_daemon_option( args=args, process_name="scheduler", diff --git a/airflow/config_templates/config.yml b/airflow/config_templates/config.yml index b9dbc0bec516d..58254a9d92361 100644 --- a/airflow/config_templates/config.yml +++ b/airflow/config_templates/config.yml @@ -2625,28 +2625,6 @@ sensors: type: float example: ~ default: "604800" -usage_data_collection: - description: | - Airflow integrates `Scarf `__ to collect basic platform and usage data - during operation. This data assists Airflow maintainers in better understanding how Airflow is used. - Insights gained from this telemetry are critical for prioritizing patches, minor releases, and - security fixes. Additionally, this information supports key decisions related to the development road map. - Check the FAQ doc for more information on what data is collected. - - Deployments can opt-out of analytics by setting the ``enabled`` option - to ``False``, or the ``SCARF_ANALYTICS=false`` environment variable. - Individual users can easily opt-out of analytics in various ways documented in the - `Scarf Do Not Track docs `__. - - options: - enabled: - description: | - Enable or disable usage data collection and sending. - version_added: 2.10.0 - type: boolean - example: ~ - default: "True" - see_also: ":ref:`Usage data collection FAQ `" dag_bundles: description: | Configuration for the DAG bundles. This allows Airflow to load DAGs from different sources. diff --git a/airflow/reproducible_build.yaml b/airflow/reproducible_build.yaml index 7f7f3298ef46f..2a124b78bbf94 100644 --- a/airflow/reproducible_build.yaml +++ b/airflow/reproducible_build.yaml @@ -1,2 +1,2 @@ -release-notes-hash: f1d91d32ade6da6eedd24362610d5f84 -source-date-epoch: 1734354109 +release-notes-hash: ab7a935709e7a13d5587b7eb727ae2bd +source-date-epoch: 1737531923 diff --git a/airflow/settings.py b/airflow/settings.py index afde6d68d7df6..aae6529c8b129 100644 --- a/airflow/settings.py +++ b/airflow/settings.py @@ -652,13 +652,6 @@ def initialize(): atexit.register(dispose_orm) -def is_usage_data_collection_enabled() -> bool: - """Check if data collection is enabled.""" - return conf.getboolean("usage_data_collection", "enabled", fallback=True) and ( - os.getenv("SCARF_ANALYTICS", "").strip().lower() != "false" - ) - - # Const stuff KILOBYTE = 1024 diff --git a/airflow/utils/usage_data_collection.py b/airflow/utils/usage_data_collection.py deleted file mode 100644 index 3bdfb180fa912..0000000000000 --- a/airflow/utils/usage_data_collection.py +++ /dev/null @@ -1,123 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -This module is for management of Airflow's usage data collection. - -This module is not part of the public interface and is subject to change at any time. - -:meta private: -""" - -from __future__ import annotations - -import os -import platform -from urllib.parse import urlencode - -import httpx -from packaging.version import parse - -from airflow import __version__ as airflow_version, settings -from airflow.configuration import conf - - -def usage_data_collection(): - if not settings.is_usage_data_collection_enabled(): - return - - # Exclude pre-releases and dev versions - if _version_is_prerelease(airflow_version): - return - - # Exclude CI environments - if _is_ci_environ(): - return - - scarf_domain = "https://apacheairflow.gateway.scarf.sh/scheduler" - - try: - platform_sys, arch = get_platform_info() - - params = { - "version": airflow_version, - "python_version": get_python_version(), - "platform": platform_sys, - "arch": arch, - "database": get_database_name(), - "db_version": get_database_version(), - "executor": get_executor(), - } - - query_string = urlencode(params) - scarf_url = f"{scarf_domain}?{query_string}" - - httpx.get(scarf_url, timeout=5.0) - except Exception: - pass - - -def _version_is_prerelease(version: str) -> bool: - return parse(version).is_prerelease - - -def _is_ci_environ() -> bool: - """Return True if running in any known CI environment.""" - if os.getenv("CI") == "true": - # Generic CI variable set by many CI systems (GH Actions, Travis, GitLab, CircleCI, Jenkins, Heroku) - return True - - # Other CI variables set by specific CI systems - ci_env_vars = { - "CIRCLECI", # CircleCI - "CODEBUILD_BUILD_ID", # AWS CodeBuild - "GITHUB_ACTIONS", # GitHub Actions - "GITLAB_CI", # GitLab CI - "JENKINS_URL", # Jenkins - "TF_BUILD", # Azure Pipelines - "TRAVIS", # Travis CI - } - - return any(var in os.environ for var in ci_env_vars) - - -def get_platform_info() -> tuple[str, str]: - return platform.system(), platform.machine() - - -def get_database_version() -> str: - if settings.engine is None: - return "None" - - version_info = settings.engine.dialect.server_version_info - # Example: (1, 2, 3) -> "1.2" (cut only major+minor w/o patch) - return ".".join(map(str, version_info[0:2])) if version_info else "None" - - -def get_database_name() -> str: - if settings.engine is None: - return "None" - return settings.engine.dialect.name - - -def get_executor() -> str: - return conf.get("core", "EXECUTOR") - - -def get_python_version() -> str: - # Cut only major+minor from the python version string (e.g. 3.10.12 --> 3.10) - return ".".join(platform.python_version().split(".")[0:2]) diff --git a/docs/apache-airflow/faq.rst b/docs/apache-airflow/faq.rst index f149856d4adeb..be6bee133e75c 100644 --- a/docs/apache-airflow/faq.rst +++ b/docs/apache-airflow/faq.rst @@ -519,26 +519,3 @@ This means ``explicit_defaults_for_timestamp`` is disabled in your mysql server #. Set ``explicit_defaults_for_timestamp = 1`` under the ``mysqld`` section in your ``my.cnf`` file. #. Restart the Mysql server. - -Does Airflow collect any telemetry data? ----------------------------------------- - -.. _usage-data-collection: - -Airflow integrates `Scarf `__ to collect basic usage data during operation. -This data assists Airflow maintainers in better understanding how Airflow is used. -Insights gained from this data are helpful for prioritizing patches, minor releases, and -security fixes. Additionally, this information supports key decisions related to the development road map. - -Deployments can opt-out of data collection by setting the :ref:`[usage_data_collection] enabled ` -option to ``False``, or the ``SCARF_ANALYTICS=false`` environment variable. -Individual users can easily opt-out of analytics in various ways documented in the -`Scarf Do Not Track docs `__. - -The telemetry data collected is limited to the following: - -- Airflow version -- Python version -- Operating system & machine architecture -- Executor -- Metadata DB type & its version diff --git a/docs/apache-airflow/installation/installing-from-pypi.rst b/docs/apache-airflow/installation/installing-from-pypi.rst index 4b2774de6bd93..c9a434f9812bf 100644 --- a/docs/apache-airflow/installation/installing-from-pypi.rst +++ b/docs/apache-airflow/installation/installing-from-pypi.rst @@ -330,12 +330,6 @@ dependencies compatible with just airflow core at the moment Airflow was release # For example: https://raw.githubusercontent.com/apache/airflow/constraints-|version|/constraints-no-providers-3.9.txt pip install "apache-airflow==${AIRFLOW_VERSION}" --constraint "${CONSTRAINT_URL}" - -.. note:: - - Airflow uses `Scarf `__ to collect basic usage data during operation. - Check the :ref:`Usage data collection FAQ ` for more information about the data collected and how to opt-out. - Troubleshooting ''''''''''''''' diff --git a/tests/core/test_settings.py b/tests/core/test_settings.py index 244be2fc9ee16..35de1acce4b9d 100644 --- a/tests/core/test_settings.py +++ b/tests/core/test_settings.py @@ -27,7 +27,6 @@ import pytest from airflow.exceptions import AirflowClusterPolicyViolation, AirflowConfigException -from airflow.settings import is_usage_data_collection_enabled from tests_common.test_utils.config import conf_vars @@ -294,26 +293,3 @@ def test_encoding_absent_in_v2(is_v1, mock_conf): engine_args = settings.prepare_engine_args() assert "encoding" not in engine_args - - -@pytest.mark.parametrize( - "env_var, conf_setting, is_enabled", - [ - ("false", "True", False), # env forces disable - ("false", "False", False), # Both force disable - ("False ", "False", False), # Both force disable - ("true", "True", True), # Both enable - ("true", "False", False), # Conf forces disable - (None, "True", True), # Default env, conf enables - (None, "False", False), # Default env, conf disables - ], -) -def test_usage_data_collection_disabled(env_var, conf_setting, is_enabled): - conf_patch = conf_vars({("usage_data_collection", "enabled"): conf_setting}) - - if env_var is not None: - with conf_patch, patch.dict(os.environ, {"SCARF_ANALYTICS": env_var}): - assert is_usage_data_collection_enabled() == is_enabled - else: - with conf_patch: - assert is_usage_data_collection_enabled() == is_enabled diff --git a/tests/utils/test_usage_data_collection.py b/tests/utils/test_usage_data_collection.py deleted file mode 100644 index 233ff0b76f524..0000000000000 --- a/tests/utils/test_usage_data_collection.py +++ /dev/null @@ -1,104 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -from __future__ import annotations - -import platform -from unittest import mock - -import pytest - -from airflow import __version__ as airflow_version -from airflow.configuration import conf -from airflow.utils.usage_data_collection import ( - get_database_version, - get_python_version, - usage_data_collection, -) - - -@pytest.mark.parametrize("is_enabled, is_prerelease", [(False, True), (True, True)]) -@mock.patch("httpx.get") -def test_scarf_analytics_disabled(mock_get, is_enabled, is_prerelease): - with ( - mock.patch("airflow.settings.is_usage_data_collection_enabled", return_value=is_enabled), - mock.patch("airflow.utils.usage_data_collection._version_is_prerelease", return_value=is_prerelease), - ): - usage_data_collection() - mock_get.assert_not_called() - - -@mock.patch("airflow.settings.is_usage_data_collection_enabled", return_value=True) -@mock.patch("airflow.utils.usage_data_collection._version_is_prerelease", return_value=False) -@mock.patch("airflow.utils.usage_data_collection._is_ci_environ", return_value=False) -@mock.patch("airflow.utils.usage_data_collection.get_database_version", return_value="12.3") -@mock.patch("airflow.utils.usage_data_collection.get_database_name", return_value="postgres") -@mock.patch("httpx.get") -def test_scarf_analytics( - mock_get, - mock_is_usage_data_collection_enabled, - mock_version_is_ci, - mock_version_is_prerelease, - get_database_version, - get_database_name, -): - platform_sys = platform.system() - platform_machine = platform.machine() - python_version = get_python_version() - executor = conf.get("core", "EXECUTOR") - scarf_endpoint = "https://apacheairflow.gateway.scarf.sh/scheduler" - usage_data_collection() - - expected_scarf_url = ( - f"{scarf_endpoint}?version={airflow_version}" - f"&python_version={python_version}" - f"&platform={platform_sys}" - f"&arch={platform_machine}" - f"&database=postgres" - f"&db_version=12.3" - f"&executor={executor}" - ) - - mock_get.assert_called_once_with(expected_scarf_url, timeout=5.0) - - -@pytest.mark.db_test -@pytest.mark.parametrize( - "version_info, expected_version", - [ - ((1, 2, 3), "1.2"), # Normal version tuple - (None, "None"), # No version info available - ((1,), "1"), # Single element version tuple - ((1, 2, 3, "beta", 4), "1.2"), # Complex version tuple with strings - ], -) -def test_get_database_version(version_info, expected_version): - with mock.patch("airflow.settings.engine.dialect.server_version_info", new=version_info): - assert get_database_version() == expected_version - - -@pytest.mark.parametrize( - "version_info, expected_version", - [ - ("1.2.3", "1.2"), # Normal version - ("4", "4"), # Single element version - ("1.2.3.beta4", "1.2"), # Complex version tuple with strings - ], -) -def test_get_python_version(version_info, expected_version): - with mock.patch("platform.python_version", return_value=version_info): - assert get_python_version() == expected_version