From 1770e6ab05d4e360299a1a82d016ea18e4538f85 Mon Sep 17 00:00:00 2001 From: Andrzej Swatowski Date: Mon, 7 Feb 2022 17:04:17 +0100 Subject: [PATCH] Add dp lint command --- CHANGELOG.md | 4 + data_pipelines_cli/cli.py | 2 + data_pipelines_cli/cli_commands/lint.py | 144 +++++++++++++++ data_pipelines_cli/errors.py | 10 ++ docs/conf.py | 2 +- .../data_pipelines_cli.cli_commands.rst | 9 +- setup.py | 8 +- tests/cli_commands/test_lint.py | 166 ++++++++++++++++++ 8 files changed, 341 insertions(+), 4 deletions(-) create mode 100644 data_pipelines_cli/cli_commands/lint.py create mode 100644 tests/cli_commands/test_lint.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 638588e..15c2074 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,10 @@ ## [Unreleased] +### Added + +- `dp lint` command that uses [SQLFluff](https://github.com/sqlfluff/sqlfluff) to lint models and tests + ## [0.14.0] - 2022-02-02 ## [0.13.0] - 2022-02-01 diff --git a/data_pipelines_cli/cli.py b/data_pipelines_cli/cli.py index 63ce82e..a69c182 100644 --- a/data_pipelines_cli/cli.py +++ b/data_pipelines_cli/cli.py @@ -7,6 +7,7 @@ from .cli_commands.create import create_command from .cli_commands.deploy import deploy_command from .cli_commands.init import init_command +from .cli_commands.lint import lint_command from .cli_commands.prepare_env import prepare_env_command from .cli_commands.publish import publish_command from .cli_commands.run import run_command @@ -36,6 +37,7 @@ def cli() -> None: _cli.add_command(create_command) _cli.add_command(deploy_command) _cli.add_command(init_command) +_cli.add_command(lint_command) _cli.add_command(prepare_env_command) _cli.add_command(publish_command) _cli.add_command(run_command) diff --git a/data_pipelines_cli/cli_commands/lint.py b/data_pipelines_cli/cli_commands/lint.py new file mode 100644 index 0000000..0cef69c --- /dev/null +++ b/data_pipelines_cli/cli_commands/lint.py @@ -0,0 +1,144 @@ +import pathlib +import tempfile +from configparser import ConfigParser +from typing import List + +import click +import yaml + +from ..cli_constants import BUILD_DIR +from ..cli_utils import echo_info, echo_subinfo, echo_warning, subprocess_run +from ..config_generation import ( + generate_profiles_yml, + read_dictionary_from_config_directory, +) +from ..errors import SQLLintError, SubprocessNonZeroExitError + +SQLFLUFF_FIX_NOT_EVERYTHING_ERROR = 1 +SQLFLUFF_LINT_ERROR = 65 # according to `sqlfluff.core.linter.LintingResult.stats` +SQLFLUFF_DIALECT_LOADING_ERROR = 66 # according to `sqlfluff.cli.commands.get_config` + + +def _get_dialect_or_default() -> str: + """Read ``dbt.yml`` config file and return its ``target_type`` or just the ``ansi``.""" + env, dbt_filename = "base", "dbt.yml" + dbt_env_config = read_dictionary_from_config_directory( + BUILD_DIR.joinpath("dag"), env, dbt_filename + ) or read_dictionary_from_config_directory(pathlib.Path.cwd(), env, dbt_filename) + try: + dialect = dbt_env_config["target_type"] + echo_subinfo(f'Found target_type "{dialect}", attempting to use it as the SQL dialect.') + except KeyError: + dialect = "ansi" + echo_warning( + 'Could not find `target_type` in `dbt.yml`. Using the default SQL dialect ("ansi").' + ) + return dialect + + +def _get_source_tests_paths() -> List[pathlib.Path]: + with open(pathlib.Path.cwd().joinpath("dbt_project.yml"), "r") as f: + dbt_project_config = yaml.safe_load(f) + dir_names: List[str] = ( + dbt_project_config.get("source-paths", []) + + dbt_project_config.get("model-paths", []) + + dbt_project_config.get("test-paths", []) + ) + return list(map(lambda dir_name: pathlib.Path.cwd().joinpath(dir_name), dir_names)) + + +def _create_temporary_sqlfluff_config(env: str) -> ConfigParser: + config = ConfigParser() + config["sqlfluff"] = {"templater": "dbt"} + config["sqlfluff:templater:dbt"] = { + "profiles_dir": str(generate_profiles_yml(env, copy_config_dir=True).absolute()) + } + return config + + +def _run_sqlfluff(command: str, dialect: str, env: str, additional_args: List[str]) -> None: + with tempfile.TemporaryDirectory() as tmp_dir: + tmp_config_path = pathlib.Path(tmp_dir).joinpath("sqlfluff.config") + with open(tmp_config_path, "w") as tmp_config: + _create_temporary_sqlfluff_config(env).write(tmp_config) + + def sqlfluff_args(sql_dialect: str) -> List[str]: + return [ + "sqlfluff", + command, + "--dialect", + sql_dialect, + "--config", + str(tmp_config_path), + *additional_args, + *map(str, _get_source_tests_paths()), + ] + + try: + subprocess_run(sqlfluff_args(dialect)) + except SubprocessNonZeroExitError as err: + if err.exit_code == SQLFLUFF_DIALECT_LOADING_ERROR and dialect != "ansi": + subprocess_run(sqlfluff_args("ansi")) + else: + raise err + + +def _run_fix_sqlfluff(dialect: str, env: str) -> None: + try: + echo_subinfo("Attempting to fix SQLs. Not every error can be automatically fixed.") + _run_sqlfluff("fix", dialect, env, ["--force"]) + except SubprocessNonZeroExitError as err: + if err.exit_code != SQLFLUFF_FIX_NOT_EVERYTHING_ERROR: + raise err + + +def _run_lint_sqlfluff(dialect: str, env: str) -> None: + try: + echo_subinfo("Linting SQLs.") + _run_sqlfluff("lint", dialect, env, []) + except SubprocessNonZeroExitError as err: + if err.exit_code == SQLFLUFF_LINT_ERROR: + raise SQLLintError + else: + raise err + + +def lint(fix: bool, env: str) -> None: + """ + Lint and format SQL. + + :param fix: Whether to lint and fix linting errors, or just lint. + :type fix: bool + :param env: Name of the environment + :type env: str + """ + echo_info("Linting SQLs:") + dialect = _get_dialect_or_default() + if fix: + _run_fix_sqlfluff(dialect, env) + _run_lint_sqlfluff(dialect, env) + + +@click.command( + name="lint", + short_help="Lint and format SQL", + help="Lint and format SQL using SQLFluff.\n\n" + "For more information on rules and the workings of SQLFluff, " + "refer to https://docs.sqlfluff.com/", +) +@click.option( + "--no-fix", + is_flag=True, + default=False, + type=bool, + help="Whether to lint and fix linting errors, or just lint.", +) +@click.option( + "--env", + default="local", + type=str, + show_default=True, + help="Name of the environment", +) +def lint_command(no_fix: bool, env: str) -> None: + lint(not no_fix, env) diff --git a/data_pipelines_cli/errors.py b/data_pipelines_cli/errors.py index ac31fa7..f699a54 100644 --- a/data_pipelines_cli/errors.py +++ b/data_pipelines_cli/errors.py @@ -38,8 +38,11 @@ def __init__(self, project_path: str) -> None: class SubprocessNonZeroExitError(DataPipelinesError): """Exception raised if subprocess exits with non-zero exit code""" + exit_code: int + def __init__(self, subprocess_name: str, exit_code: int) -> None: self.message = f"{subprocess_name} has exited with non-zero exit code: {exit_code}" + self.exit_code = exit_code class SubprocessNotFound(DataPipelinesError): @@ -80,3 +83,10 @@ class DockerErrorResponseError(DataPipelinesError): def __init__(self, error_msg: str) -> None: self.message = "Error raised when using Docker.\n" + error_msg + + +class SQLLintError(DataPipelinesError): + """Exception raised if there are linting problems in some files.""" + + def __init__(self) -> None: + self.message = "Fix SQL linting errors." diff --git a/docs/conf.py b/docs/conf.py index 9fbcf77..435b00d 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -9,7 +9,7 @@ import os import sys -sys.path.insert(0, os.path.abspath("../data_pipelines_cli")) +sys.path.insert(0, os.path.abspath("..")) # -- Project information ----------------------------------------------------- diff --git a/docs/source/data_pipelines_cli.cli_commands.rst b/docs/source/data_pipelines_cli.cli_commands.rst index 0163395..300b66c 100644 --- a/docs/source/data_pipelines_cli.cli_commands.rst +++ b/docs/source/data_pipelines_cli.cli_commands.rst @@ -49,6 +49,14 @@ data\_pipelines\_cli.cli\_commands.init module :undoc-members: :show-inheritance: +data\_pipelines\_cli.cli\_commands.lint module +---------------------------------------------- + +.. automodule:: data_pipelines_cli.cli_commands.lint + :members: + :undoc-members: + :show-inheritance: + data\_pipelines\_cli.cli\_commands.prepare\_env module ------------------------------------------------------ @@ -96,4 +104,3 @@ data\_pipelines\_cli.cli\_commands.update module :members: :undoc-members: :show-inheritance: - diff --git a/setup.py b/setup.py index 6fcbcac..1c7d97c 100644 --- a/setup.py +++ b/setup.py @@ -10,10 +10,10 @@ "questionary==1.10.0", "pyyaml>=5.1, <6.0", "types-PyYAML>=6.0", - "copier==5.1.0", - "dbt>=0.21, <0.22", + "dbt==0.21.0", "Jinja2>=2.11,<2.12", "fsspec", + "copier==5.1.0", ] EXTRA_FILESYSTEMS_REQUIRE = { @@ -25,6 +25,10 @@ "docker": ["docker>=5.0"], "datahub": ["acryl-datahub>=0.8.17, <0.8.18"], "git": ["GitPython==3.1.26"], + "lint": [ + "sqlfluff==0.9.0", + "sqlfluff-templater-dbt==0.9.0", + ], "tests": [ "pytest>=6.2.2, <7.0.0", "pytest-cov>=2.8.0, <3.0.0", diff --git a/tests/cli_commands/test_lint.py b/tests/cli_commands/test_lint.py new file mode 100644 index 0000000..f723535 --- /dev/null +++ b/tests/cli_commands/test_lint.py @@ -0,0 +1,166 @@ +import pathlib +import shutil +import tempfile +import unittest +from typing import List +from unittest.mock import MagicMock, patch + +import yaml +from click.testing import CliRunner + +from data_pipelines_cli.cli import _cli +from data_pipelines_cli.cli_commands.lint import ( + _get_dialect_or_default, + _get_source_tests_paths, +) +from data_pipelines_cli.errors import SQLLintError, SubprocessNonZeroExitError + +goldens_dir_path = pathlib.Path(__file__).parent.parent.joinpath("goldens") + + +@patch("data_pipelines_cli.cli_commands.lint.generate_profiles_yml", MagicMock()) +class LintCommandTestCase(unittest.TestCase): + def setUp(self) -> None: + self.linted_sqls = [] + self.subprocess_run_args = [] + + self.dbt_project_tmp_dir = pathlib.Path(tempfile.mkdtemp()) + shutil.copyfile( + goldens_dir_path.joinpath("dbt_project.yml"), + self.dbt_project_tmp_dir.joinpath("dbt_project.yml"), + ) + + def tearDown(self) -> None: + shutil.rmtree(self.dbt_project_tmp_dir) + + def _mock_run(self, args: List[str]): + self.subprocess_run_args.append(args) + + def _mock_run_raise_error(self, args: List[str]): + self.subprocess_run_args.append(args) + raise SubprocessNonZeroExitError("sqlfluff", 65) + + @patch("data_pipelines_cli.cli_commands.lint.BUILD_DIR", pathlib.Path("/a/b/c/d/e/f")) + def test_lint_sqls_with_errors(self): + with patch("pathlib.Path.cwd", lambda: pathlib.Path(self.dbt_project_tmp_dir)), patch( + "data_pipelines_cli.cli_commands.lint.subprocess_run", self._mock_run_raise_error + ): + runner = CliRunner() + result = runner.invoke(_cli, ["lint", "--no-fix"]) + self.assertEqual( + 1, result.exit_code, msg="\n".join([str(result.exception), str(result.output)]) + ) + self.assertIsInstance(result.exception, SQLLintError) + self.assertTrue(any(["lint" in sargs for sargs in self.subprocess_run_args])) + self.assertFalse(any(["fix" in sargs for sargs in self.subprocess_run_args])) + + @patch("data_pipelines_cli.cli_commands.lint.BUILD_DIR", pathlib.Path("/a/b/c/d/e/f")) + def test_lint_sqls_without_errors(self): + with patch("pathlib.Path.cwd", lambda: pathlib.Path(self.dbt_project_tmp_dir)), patch( + "data_pipelines_cli.cli_commands.lint.subprocess_run", self._mock_run + ): + runner = CliRunner() + result = runner.invoke(_cli, ["lint", "--no-fix"]) + self.assertEqual( + 0, result.exit_code, msg="\n".join([str(result.exception), str(result.output)]) + ) + self.assertTrue(any(["lint" in sargs for sargs in self.subprocess_run_args])) + self.assertFalse(any(["fix" in sargs for sargs in self.subprocess_run_args])) + + @patch("data_pipelines_cli.cli_commands.lint.BUILD_DIR", pathlib.Path("/a/b/c/d/e/f")) + def test_fix_sqls(self): + with patch("pathlib.Path.cwd", lambda: pathlib.Path(self.dbt_project_tmp_dir)), patch( + "data_pipelines_cli.cli_commands.lint.subprocess_run", self._mock_run + ): + runner = CliRunner() + result = runner.invoke(_cli, ["lint"]) + self.assertEqual( + 0, result.exit_code, msg="\n".join([str(result.exception), str(result.output)]) + ) + self.assertTrue(any(["lint" in sargs for sargs in self.subprocess_run_args])) + self.assertTrue(any(["fix" in sargs for sargs in self.subprocess_run_args])) + + @patch("data_pipelines_cli.cli_commands.lint.BUILD_DIR", pathlib.Path("/a/b/c/d/e/f")) + @patch("data_pipelines_cli.cli_commands.lint.subprocess_run") + def test_raise_unexpected_error(self, subprocess_mock): + for err in [ + ConnectionAbortedError, + FileNotFoundError, + FileExistsError, + KeyError, + FloatingPointError, + ]: + with self.subTest(exception=err), patch( + "pathlib.Path.cwd", lambda: pathlib.Path(self.dbt_project_tmp_dir) + ): + subprocess_mock.side_effect = err + runner = CliRunner() + result = runner.invoke(_cli, ["lint", "--no-fix"]) + self.assertEqual( + 1, result.exit_code, msg="\n".join([str(result.exception), str(result.output)]) + ) + self.assertIsInstance(result.exception, err) + + @patch("data_pipelines_cli.cli_commands.lint.BUILD_DIR", pathlib.Path("/a/b/c/d/e/f")) + @patch("data_pipelines_cli.cli_commands.lint.subprocess_run") + def test_raise_different_subprocess_error(self, subprocess_mock): + subprocess_mock.side_effect = SubprocessNonZeroExitError("sqlfluff", 248) + + with patch("pathlib.Path.cwd", lambda: pathlib.Path(self.dbt_project_tmp_dir)): + runner = CliRunner() + result = runner.invoke(_cli, ["lint", "--no-fix"]) + self.assertEqual( + 1, result.exit_code, msg="\n".join([str(result.exception), str(result.output)]) + ) + self.assertIsInstance(result.exception, SubprocessNonZeroExitError) + self.assertEqual(248, result.exception.exit_code) + + @patch("data_pipelines_cli.cli_commands.lint.BUILD_DIR", pathlib.Path("/a/b/c/d/e/f")) + @patch("data_pipelines_cli.cli_commands.lint.subprocess_run") + def test_raise_wrong_dialect_error(self, subprocess_mock): + subprocess_mock.side_effect = SubprocessNonZeroExitError("sqlfluff", 66) + + with patch("pathlib.Path.cwd", lambda: pathlib.Path(self.dbt_project_tmp_dir)), patch( + "data_pipelines_cli.cli_commands.lint._get_dialect_or_default", lambda: "some_dialect" + ): + runner = CliRunner() + runner.invoke(_cli, ["lint"]) + self.assertEqual(2, subprocess_mock.call_count) + + +class LintHelperFunctionsTestCase(unittest.TestCase): + def test_get_dialect(self): + build_dir_mock = MagicMock() + build_dir_mock.configure_mock(**{"joinpath": lambda _self, *_args: goldens_dir_path}) + + with patch("data_pipelines_cli.cli_commands.lint.BUILD_DIR", build_dir_mock): + self.assertEqual("bigquery", _get_dialect_or_default()) + + @patch("data_pipelines_cli.cli_commands.lint.BUILD_DIR", pathlib.Path("/a/b/c/d/e/f")) + @patch("pathlib.Path.cwd", lambda: goldens_dir_path) + def test_get_dialect_no_build_dir(self): + self.assertEqual("bigquery", _get_dialect_or_default()) + + @patch("data_pipelines_cli.cli_commands.lint.BUILD_DIR", pathlib.Path("/a/b/c/d/e/f")) + @patch("pathlib.Path.cwd", lambda: pathlib.Path("/a/b/c/d/e/f")) + def test_default_dialect(self): + self.assertEqual("ansi", _get_dialect_or_default()) + + def test_get_source_tests_paths(self): + with tempfile.TemporaryDirectory() as tmp_dir, patch( + "pathlib.Path.cwd", lambda: pathlib.Path(tmp_dir) + ): + with open(goldens_dir_path.joinpath("dbt_project.yml"), "r") as orig_dbt, open( + pathlib.Path(tmp_dir).joinpath("dbt_project.yml"), "w" + ) as tmp_dbt: + dbt_project = yaml.safe_load(orig_dbt) + dbt_project["source-paths"] = ["models", "models2", "models3"] + yaml.dump(dbt_project, tmp_dbt) + + self.assertSetEqual( + { + pathlib.Path(tmp_dir).joinpath(dir_name) + for dir_name in ["models", "models2", "models3", "tests"] + }, + set(_get_source_tests_paths()), + )