diff --git a/DOCS.md b/DOCS.md index cb6dd82..871b672 100644 --- a/DOCS.md +++ b/DOCS.md @@ -41,7 +41,6 @@ $ pyrepositoryminer analyze [OPTIONS] REPOSITORY [METRICS]:[complexity|filecount * `--commits FILENAME` * `--workers INTEGER`: [default: 1] -* `--global-cache / --no-global-cache`: [default: False] * `--help`: Show this message and exit. ## `pyrepositoryminer branch` diff --git a/poetry.lock b/poetry.lock index 6539ce5..e4f2a38 100644 --- a/poetry.lock +++ b/poetry.lock @@ -455,6 +455,19 @@ category = "dev" optional = false python-versions = "*" +[[package]] +name = "uvloop" +version = "0.15.2" +description = "Fast implementation of asyncio event loop on top of libuv" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.extras] +dev = ["Cython (>=0.29.20,<0.30.0)", "pytest (>=3.6.0)", "Sphinx (>=1.7.3,<1.8.0)", "sphinxcontrib-asyncio (>=0.2.0,<0.3.0)", "sphinx-rtd-theme (>=0.2.4,<0.3.0)", "aiohttp", "flake8 (>=3.8.4,<3.9.0)", "psutil", "pycodestyle (>=2.6.0,<2.7.0)", "pyOpenSSL (>=19.0.0,<19.1.0)", "mypy (>=0.800)"] +docs = ["Sphinx (>=1.7.3,<1.8.0)", "sphinxcontrib-asyncio (>=0.2.0,<0.3.0)", "sphinx-rtd-theme (>=0.2.4,<0.3.0)"] +test = ["aiohttp", "flake8 (>=3.8.4,<3.9.0)", "psutil", "pycodestyle (>=2.6.0,<2.7.0)", "pyOpenSSL (>=19.0.0,<19.1.0)", "mypy (>=0.800)"] + [[package]] name = "virtualenv" version = "20.4.4" @@ -496,7 +509,7 @@ testing = ["pytest (>=4.6)", "pytest-checkdocs (>=1.2.3)", "pytest-flake8", "pyt [metadata] lock-version = "1.1" python-versions = "^3.9" -content-hash = "3f653767a9d103b55f612013d42937867a5b6769c74250f75ea7b776d8082730" +content-hash = "db93c9da9b03e1bc43517fcf2e05959f3aaf7b0daf0d42465369a162d93649ae" [metadata.files] appdirs = [ @@ -835,6 +848,18 @@ typing-extensions = [ {file = "typing_extensions-3.10.0.0-py3-none-any.whl", hash = "sha256:779383f6086d90c99ae41cf0ff39aac8a7937a9283ce0a414e5dd782f4c94a84"}, {file = "typing_extensions-3.10.0.0.tar.gz", hash = "sha256:50b6f157849174217d0656f99dc82fe932884fb250826c18350e159ec6cdf342"}, ] +uvloop = [ + {file = "uvloop-0.15.2-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:19fa1d56c91341318ac5d417e7b61c56e9a41183946cc70c411341173de02c69"}, + {file = "uvloop-0.15.2-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:e5e5f855c9bf483ee6cd1eb9a179b740de80cb0ae2988e3fa22309b78e2ea0e7"}, + {file = "uvloop-0.15.2-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:42eda9f525a208fbc4f7cecd00fa15c57cc57646c76632b3ba2fe005004f051d"}, + {file = "uvloop-0.15.2-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:90e56f17755e41b425ad19a08c41dc358fa7bf1226c0f8e54d4d02d556f7af7c"}, + {file = "uvloop-0.15.2-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:7ae39b11a5f4cec1432d706c21ecc62f9e04d116883178b09671aa29c46f7a47"}, + {file = "uvloop-0.15.2-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:b45218c99795803fb8bdbc9435ff7f54e3a591b44cd4c121b02fa83affb61c7c"}, + {file = "uvloop-0.15.2-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:114543c84e95df1b4ff546e6e3a27521580466a30127f12172a3278172ad68bc"}, + {file = "uvloop-0.15.2-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:44cac8575bf168601424302045234d74e3561fbdbac39b2b54cc1d1d00b70760"}, + {file = "uvloop-0.15.2-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:6de130d0cb78985a5d080e323b86c5ecaf3af82f4890492c05981707852f983c"}, + {file = "uvloop-0.15.2.tar.gz", hash = "sha256:2bb0624a8a70834e54dde8feed62ed63b50bad7a1265c40d6403a2ac447bce01"}, +] virtualenv = [ {file = "virtualenv-20.4.4-py2.py3-none-any.whl", hash = "sha256:a935126db63128861987a7d5d30e23e8ec045a73840eeccb467c148514e29535"}, {file = "virtualenv-20.4.4.tar.gz", hash = "sha256:09c61377ef072f43568207dc8e46ddeac6bcdcaf288d49011bda0e7f4d38c4a2"}, diff --git a/pyproject.toml b/pyproject.toml index 0ddda51..0fee934 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pyrepositoryminer" -version = "0.4.0" +version = "0.5.0" description = "Efficient Repository Mining in Python" license = "GPL-3.0-or-later" authors = ["Fabian Heseding <39628987+fabianhe@users.noreply.github.com>"] @@ -17,6 +17,7 @@ python = "^3.9" pygit2 = "^1.5.0" typer = {extras = ["all"], version = "^0.3.2"} radon = "^4.5.0" +uvloop = "^0.15.2" [tool.poetry.dev-dependencies] pytest = "^5.2" diff --git a/pyrepositoryminer/__init__.py b/pyrepositoryminer/__init__.py index 6a9beea..3d18726 100644 --- a/pyrepositoryminer/__init__.py +++ b/pyrepositoryminer/__init__.py @@ -1 +1 @@ -__version__ = "0.4.0" +__version__ = "0.5.0" diff --git a/pyrepositoryminer/analyze.py b/pyrepositoryminer/analyze.py index c70bc5a..f25898e 100644 --- a/pyrepositoryminer/analyze.py +++ b/pyrepositoryminer/analyze.py @@ -2,6 +2,7 @@ Global variables are accessed in the context of a worker. """ +from json import dumps from pathlib import Path from typing import Any, Dict, Iterable, List, Optional, Tuple, TypedDict @@ -15,7 +16,7 @@ tm: Tuple[str, ...] bm: Tuple[str, ...] um: Tuple[str, ...] -cached_oids: Dict[str, bool] +cached_oids: Dict[str, bool] = {} class MetricBase(TypedDict): @@ -41,23 +42,18 @@ class BlobOutput(ObjectOutput): units: List[UnitOutput] -class SignatureOutput(TypedDict): - email: str - name: str - time_offset: int - time: int +SignatureOutput = TypedDict( + "SignatureOutput", {"email": str, "name": str, "time_offset": int, "time": int} +) -class CommitBase(ObjectOutput): +class CommitOutput(ObjectOutput): author: SignatureOutput commit_time: int commit_time_offset: int committer: SignatureOutput message: str parent_ids: List[str] - - -class CommitOutput(CommitBase, total=False): metrics: List[Metric] blobs: List[BlobOutput] @@ -94,28 +90,17 @@ def parse_signature(signature: Signature) -> SignatureOutput: ) -def validate_commit(repository: Repository, commit_id: str) -> bool: - try: - obj = repository.get(commit_id) - except ValueError: - return False - else: - return obj is not None and isinstance(obj, Commit) - - def initialize( repository: Path, - tree_m: Iterable[str], - blob_m: Iterable[str], - unit_m: Iterable[str], - cache: Dict[str, bool], + tree_m: Tuple[str, ...], + blob_m: Tuple[str, ...], + unit_m: Tuple[str, ...], ) -> None: - global repo, tm, bm, um, cached_oids + global repo, tm, bm, um repo = Repository(repository) - tm = tuple(sorted(tree_m)) - bm = tuple(sorted(blob_m)) - um = tuple(sorted(unit_m)) - cached_oids = cache + tm = tree_m + bm = blob_m + um = unit_m def analyze_unit(tree: Tree) -> Iterable[Tuple[str, str, str, Metric]]: @@ -156,7 +141,7 @@ def analyze_tree(tree: Tree) -> Iterable[Metric]: yield metric -def analyze(commit_id: str) -> Optional[CommitOutput]: +def analyze(commit_id: str) -> Optional[str]: global repo try: commit = repo.get(commit_id) @@ -175,7 +160,7 @@ def analyze(commit_id: str) -> Optional[CommitOutput]: d.setdefault(blob_id, {"name": blob_name, "metrics": [], "units": {}})[ "metrics" ].append(metric) - return parse_commit( + output = parse_commit( commit, metrics=analyze_tree(commit.tree), blobs=[ @@ -191,3 +176,4 @@ def analyze(commit_id: str) -> Optional[CommitOutput]: for blob_id, blob in d.items() ], ) + return dumps(output, separators=(",", ":"), indent=None) diff --git a/pyrepositoryminer/main.py b/pyrepositoryminer/main.py index 5aa3690..50fc639 100644 --- a/pyrepositoryminer/main.py +++ b/pyrepositoryminer/main.py @@ -1,7 +1,7 @@ from enum import Enum from itertools import filterfalse, islice -from json import dumps, loads -from multiprocessing import Manager, Pool +from json import loads +from multiprocessing import Pool from pathlib import Path from sys import stdin from typing import Any, Dict, Hashable, Iterable, List, Optional, Set, Tuple, TypeVar @@ -11,7 +11,6 @@ GIT_SORT_REVERSE, GIT_SORT_TIME, GIT_SORT_TOPOLOGICAL, - Commit, Repository, Walker, clone_repository, @@ -36,6 +35,13 @@ class Sort(str, Enum): time = "time" +SORTINGS: Dict[Optional[str], int] = { + "topological": GIT_SORT_TOPOLOGICAL, + "time": GIT_SORT_TIME, + None: GIT_SORT_NONE, +} + + T = TypeVar("T", bound=Hashable) @@ -48,27 +54,30 @@ def iter_distinct(iterable: Iterable[T]) -> Iterable[T]: def validate_metrics( metrics: Optional[List[AvailableMetrics]], -) -> Tuple[Set[str], Set[str], Set[str]]: - if metrics is None: - return (set(), set(), set()) - distinct = {metric.value for metric in metrics} +) -> Tuple[Tuple[str, ...], Tuple[str, ...], Tuple[str, ...]]: + distinct: Set[str] = ( + set() if metrics is None else {metric.value for metric in metrics} + ) return ( - distinct & TreeMetrics.keys(), - distinct & BlobMetrics.keys(), - distinct & UnitMetrics.keys(), + tuple(sorted(distinct & TreeMetrics.keys())), + tuple(sorted(distinct & BlobMetrics.keys())), + tuple(sorted(distinct & UnitMetrics.keys())), ) -def walk_commits( - repo: Repository, branch_name: str, simplify_first_parent: bool -) -> Iterable[Commit]: - branch = repo.branches[branch_name] - walker: Walker = repo.walk( - branch.peel().id, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE +def generate_walkers( + repo: Repository, + branch_names: Iterable[str], + simplify_first_parent: bool, + sorting: int, +) -> Iterable[Walker]: + walkers = tuple( + repo.walk(repo.branches[branch_name.strip()].peel().id, sorting) + for branch_name in branch_names ) - if simplify_first_parent: + for walker in walkers if simplify_first_parent else tuple(): walker.simplify_first_parent() - yield from walker + yield from walkers @app.command() @@ -84,22 +93,15 @@ def commits( limit: Optional[int] = None, ) -> None: """Get the commit ids of a repository.""" - repo = Repository(repository) - sorting = GIT_SORT_NONE - if sort == "topological": - sorting = GIT_SORT_TOPOLOGICAL - elif sort == "time": - sorting = GIT_SORT_TIME - if sort_reverse: - sorting |= GIT_SORT_REVERSE - walkers = [ - repo.walk(repo.branches[branch_name.strip()].peel().id) - for branch_name in (stdin if branches is None else branches) - ] - if simplify_first_parent: - map(lambda walker: walker.simplify_first_parent(), walkers) # type: ignore commit_ids: Iterable[str] = ( - str(commit.id) for walker in walkers for commit in walker + str(commit.id) + for walker in generate_walkers( + Repository(repository), + stdin if branches is None else branches, + simplify_first_parent, + SORTINGS[sort] if not sort_reverse else (SORTINGS[sort] | GIT_SORT_REVERSE), + ) + for commit in walker ) commit_ids = commit_ids if not drop_duplicates else iter_distinct(commit_ids) commit_ids = commit_ids if limit is None else islice(commit_ids, limit) @@ -113,30 +115,15 @@ def analyze( metrics: Optional[List[AvailableMetrics]] = Argument(None, case_sensitive=False), commits: Optional[FileText] = None, workers: int = 1, - global_cache: bool = False, ) -> None: """Analyze commits of a repository.""" - workers = max(workers, 1) - tree_m, blob_m, unit_m = validate_metrics(metrics) - - cache: Dict[str, bool] - if global_cache: - manager = Manager() - cache = manager.dict() - else: - cache = {} + ids = (id.strip() for id in (stdin if commits is None else commits)) with Pool( - max(workers, 1), initialize_worker, (repository, tree_m, blob_m, unit_m, cache) + max(workers, 1), initialize_worker, (repository, *validate_metrics(metrics)) ) as pool: - for result in pool.imap( - analyze_worker, - ( - commit_id.strip() - for commit_id in (stdin if commits is None else commits) - ), - ): - if result is not None: - echo(dumps(result, separators=(",", ":"), indent=None)) + results = (res for res in pool.imap(analyze_worker, ids) if res is not None) + for result in results: + echo(result) @app.command() @@ -152,7 +139,7 @@ def clone( def branch(path: Path, local: bool = True, remote: bool = True) -> None: """Get the branches of a repository.""" repo = Repository(path) - branches: Iterable[str] = tuple() + branches: Iterable[str] if local and remote: branches = repo.branches elif local: diff --git a/pyrepositoryminer/metrics/visitor.py b/pyrepositoryminer/metrics/visitor.py index 9a1317a..910d70a 100644 --- a/pyrepositoryminer/metrics/visitor.py +++ b/pyrepositoryminer/metrics/visitor.py @@ -41,11 +41,11 @@ def analyze_blob(self, blob: VisitableBlob) -> None: @final def visitBlob(self, blob: VisitableBlob) -> TreeVisitor: - if self.is_filtered(blob): - return self - elif self.is_cached(blob): + if self.is_cached(blob): self.handle_cache_hit(blob) return self + elif self.is_filtered(blob): + return self self.cache_blob(blob) self.analyze_blob(blob) return self diff --git a/scripts/profile_analyze.py b/scripts/profile_analyze.py new file mode 100644 index 0000000..0c6e499 --- /dev/null +++ b/scripts/profile_analyze.py @@ -0,0 +1,23 @@ +from cProfile import Profile +from pathlib import Path +from pstats import Stats + +from typer import FileText + +from pyrepositoryminer.main import AvailableMetrics, analyze + + +def main() -> None: + p = Path("/Users/fabian/bare-repos/numpy.git") + with open("commits copy.txt", "rb") as f: + c = FileText(f) + analyze(p, [AvailableMetrics["filecount"]], c) + + +if __name__ == "__main__": + profile = Profile() + profile.enable() + main() + profile.disable() + profile_stats = Stats(profile).sort_stats("tottime") + profile_stats.print_stats() diff --git a/scripts/profile_analyze_worker.py b/scripts/profile_analyze_worker.py new file mode 100644 index 0000000..b0b2249 --- /dev/null +++ b/scripts/profile_analyze_worker.py @@ -0,0 +1,30 @@ +from cProfile import Profile +from pstats import Stats + +from pyrepositoryminer.analyze import analyze + +""" +Set the globals in analyze.py as follows: + +repo: Repository = Repository(".../numpy.git") + +tm: Tuple[str, ...] = tuple([]) +bm: Tuple[str, ...] = tuple(["linecount"]) +um: Tuple[str, ...] = tuple([]) +""" + + +def main() -> None: + with open("commits copy.txt", "r") as f: + commits = tuple(line.strip() for line in f) + profile = Profile() + profile.enable() + for commit in commits[:50]: + analyze(commit) + profile.disable() + profile_stats = Stats(profile).sort_stats("cumtime") + profile_stats.print_stats() + + +if __name__ == "__main__": + main() diff --git a/scripts/profile_subprocess.py b/scripts/profile_subprocess.py new file mode 100644 index 0000000..f2c49e6 --- /dev/null +++ b/scripts/profile_subprocess.py @@ -0,0 +1,44 @@ +import asyncio +from cProfile import Profile +from pstats import Stats +from subprocess import PIPE, Popen + +import uvloop + +DATA = bytes("\n".join(str(i) for i in range(100000)), "utf-8") +ITERATIONS = 2000 + + +async def run_async_wc() -> bytes: + p = await asyncio.create_subprocess_exec( + "wc", "-l", stdin=asyncio.subprocess.PIPE, stdout=asyncio.subprocess.PIPE + ) + stdout, stderr = await p.communicate(DATA) + return stdout + + +def run_sync_wc() -> bytes: + p = Popen(["wc", "-l"], stdin=PIPE, stdout=PIPE) + stdout, stderr = p.communicate(DATA) + return stdout + + +def main2() -> None: + uvloop.install() + asyncio.get_event_loop().run_until_complete( + asyncio.gather(*(run_async_wc() for _ in range(ITERATIONS))) + ) + + +def main() -> None: + for _ in range(ITERATIONS): + run_sync_wc() + + +if __name__ == "__main__": + profile = Profile() + profile.enable() + main2() + profile.disable() + profile_stats = Stats(profile).sort_stats("tottime") + profile_stats.print_stats() diff --git a/tests/test_pyrepositoryminer.py b/tests/test_pyrepositoryminer.py index 4363a98..2593bb1 100644 --- a/tests/test_pyrepositoryminer.py +++ b/tests/test_pyrepositoryminer.py @@ -2,4 +2,4 @@ def test_version() -> None: - assert __version__ == "0.4.0" + assert __version__ == "0.5.0"