Skip to content

Commit

Permalink
Merge pull request #32 from fabianhe/dev
Browse files Browse the repository at this point in the history
Release 0.5.0
  • Loading branch information
fabianhe authored Jun 9, 2021
2 parents 971f058 + 091e656 commit fb3e812
Show file tree
Hide file tree
Showing 11 changed files with 187 additions and 92 deletions.
1 change: 0 additions & 1 deletion DOCS.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ $ pyrepositoryminer analyze [OPTIONS] REPOSITORY [METRICS]:[complexity|filecount

* `--commits FILENAME`
* `--workers INTEGER`: [default: 1]
* `--global-cache / --no-global-cache`: [default: False]
* `--help`: Show this message and exit.

## `pyrepositoryminer branch`
Expand Down
27 changes: 26 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "pyrepositoryminer"
version = "0.4.0"
version = "0.5.0"
description = "Efficient Repository Mining in Python"
license = "GPL-3.0-or-later"
authors = ["Fabian Heseding <[email protected]>"]
Expand All @@ -17,6 +17,7 @@ python = "^3.9"
pygit2 = "^1.5.0"
typer = {extras = ["all"], version = "^0.3.2"}
radon = "^4.5.0"
uvloop = "^0.15.2"

[tool.poetry.dev-dependencies]
pytest = "^5.2"
Expand Down
2 changes: 1 addition & 1 deletion pyrepositoryminer/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.4.0"
__version__ = "0.5.0"
46 changes: 16 additions & 30 deletions pyrepositoryminer/analyze.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Global variables are accessed in the context of a worker.
"""
from json import dumps
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple, TypedDict

Expand All @@ -15,7 +16,7 @@
tm: Tuple[str, ...]
bm: Tuple[str, ...]
um: Tuple[str, ...]
cached_oids: Dict[str, bool]
cached_oids: Dict[str, bool] = {}


class MetricBase(TypedDict):
Expand All @@ -41,23 +42,18 @@ class BlobOutput(ObjectOutput):
units: List[UnitOutput]


class SignatureOutput(TypedDict):
email: str
name: str
time_offset: int
time: int
SignatureOutput = TypedDict(
"SignatureOutput", {"email": str, "name": str, "time_offset": int, "time": int}
)


class CommitBase(ObjectOutput):
class CommitOutput(ObjectOutput):
author: SignatureOutput
commit_time: int
commit_time_offset: int
committer: SignatureOutput
message: str
parent_ids: List[str]


class CommitOutput(CommitBase, total=False):
metrics: List[Metric]
blobs: List[BlobOutput]

Expand Down Expand Up @@ -94,28 +90,17 @@ def parse_signature(signature: Signature) -> SignatureOutput:
)


def validate_commit(repository: Repository, commit_id: str) -> bool:
try:
obj = repository.get(commit_id)
except ValueError:
return False
else:
return obj is not None and isinstance(obj, Commit)


def initialize(
repository: Path,
tree_m: Iterable[str],
blob_m: Iterable[str],
unit_m: Iterable[str],
cache: Dict[str, bool],
tree_m: Tuple[str, ...],
blob_m: Tuple[str, ...],
unit_m: Tuple[str, ...],
) -> None:
global repo, tm, bm, um, cached_oids
global repo, tm, bm, um
repo = Repository(repository)
tm = tuple(sorted(tree_m))
bm = tuple(sorted(blob_m))
um = tuple(sorted(unit_m))
cached_oids = cache
tm = tree_m
bm = blob_m
um = unit_m


def analyze_unit(tree: Tree) -> Iterable[Tuple[str, str, str, Metric]]:
Expand Down Expand Up @@ -156,7 +141,7 @@ def analyze_tree(tree: Tree) -> Iterable[Metric]:
yield metric


def analyze(commit_id: str) -> Optional[CommitOutput]:
def analyze(commit_id: str) -> Optional[str]:
global repo
try:
commit = repo.get(commit_id)
Expand All @@ -175,7 +160,7 @@ def analyze(commit_id: str) -> Optional[CommitOutput]:
d.setdefault(blob_id, {"name": blob_name, "metrics": [], "units": {}})[
"metrics"
].append(metric)
return parse_commit(
output = parse_commit(
commit,
metrics=analyze_tree(commit.tree),
blobs=[
Expand All @@ -191,3 +176,4 @@ def analyze(commit_id: str) -> Optional[CommitOutput]:
for blob_id, blob in d.items()
],
)
return dumps(output, separators=(",", ":"), indent=None)
95 changes: 41 additions & 54 deletions pyrepositoryminer/main.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from enum import Enum
from itertools import filterfalse, islice
from json import dumps, loads
from multiprocessing import Manager, Pool
from json import loads
from multiprocessing import Pool
from pathlib import Path
from sys import stdin
from typing import Any, Dict, Hashable, Iterable, List, Optional, Set, Tuple, TypeVar
Expand All @@ -11,7 +11,6 @@
GIT_SORT_REVERSE,
GIT_SORT_TIME,
GIT_SORT_TOPOLOGICAL,
Commit,
Repository,
Walker,
clone_repository,
Expand All @@ -36,6 +35,13 @@ class Sort(str, Enum):
time = "time"


SORTINGS: Dict[Optional[str], int] = {
"topological": GIT_SORT_TOPOLOGICAL,
"time": GIT_SORT_TIME,
None: GIT_SORT_NONE,
}


T = TypeVar("T", bound=Hashable)


Expand All @@ -48,27 +54,30 @@ def iter_distinct(iterable: Iterable[T]) -> Iterable[T]:

def validate_metrics(
metrics: Optional[List[AvailableMetrics]],
) -> Tuple[Set[str], Set[str], Set[str]]:
if metrics is None:
return (set(), set(), set())
distinct = {metric.value for metric in metrics}
) -> Tuple[Tuple[str, ...], Tuple[str, ...], Tuple[str, ...]]:
distinct: Set[str] = (
set() if metrics is None else {metric.value for metric in metrics}
)
return (
distinct & TreeMetrics.keys(),
distinct & BlobMetrics.keys(),
distinct & UnitMetrics.keys(),
tuple(sorted(distinct & TreeMetrics.keys())),
tuple(sorted(distinct & BlobMetrics.keys())),
tuple(sorted(distinct & UnitMetrics.keys())),
)


def walk_commits(
repo: Repository, branch_name: str, simplify_first_parent: bool
) -> Iterable[Commit]:
branch = repo.branches[branch_name]
walker: Walker = repo.walk(
branch.peel().id, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE
def generate_walkers(
repo: Repository,
branch_names: Iterable[str],
simplify_first_parent: bool,
sorting: int,
) -> Iterable[Walker]:
walkers = tuple(
repo.walk(repo.branches[branch_name.strip()].peel().id, sorting)
for branch_name in branch_names
)
if simplify_first_parent:
for walker in walkers if simplify_first_parent else tuple():
walker.simplify_first_parent()
yield from walker
yield from walkers


@app.command()
Expand All @@ -84,22 +93,15 @@ def commits(
limit: Optional[int] = None,
) -> None:
"""Get the commit ids of a repository."""
repo = Repository(repository)
sorting = GIT_SORT_NONE
if sort == "topological":
sorting = GIT_SORT_TOPOLOGICAL
elif sort == "time":
sorting = GIT_SORT_TIME
if sort_reverse:
sorting |= GIT_SORT_REVERSE
walkers = [
repo.walk(repo.branches[branch_name.strip()].peel().id)
for branch_name in (stdin if branches is None else branches)
]
if simplify_first_parent:
map(lambda walker: walker.simplify_first_parent(), walkers) # type: ignore
commit_ids: Iterable[str] = (
str(commit.id) for walker in walkers for commit in walker
str(commit.id)
for walker in generate_walkers(
Repository(repository),
stdin if branches is None else branches,
simplify_first_parent,
SORTINGS[sort] if not sort_reverse else (SORTINGS[sort] | GIT_SORT_REVERSE),
)
for commit in walker
)
commit_ids = commit_ids if not drop_duplicates else iter_distinct(commit_ids)
commit_ids = commit_ids if limit is None else islice(commit_ids, limit)
Expand All @@ -113,30 +115,15 @@ def analyze(
metrics: Optional[List[AvailableMetrics]] = Argument(None, case_sensitive=False),
commits: Optional[FileText] = None,
workers: int = 1,
global_cache: bool = False,
) -> None:
"""Analyze commits of a repository."""
workers = max(workers, 1)
tree_m, blob_m, unit_m = validate_metrics(metrics)

cache: Dict[str, bool]
if global_cache:
manager = Manager()
cache = manager.dict()
else:
cache = {}
ids = (id.strip() for id in (stdin if commits is None else commits))
with Pool(
max(workers, 1), initialize_worker, (repository, tree_m, blob_m, unit_m, cache)
max(workers, 1), initialize_worker, (repository, *validate_metrics(metrics))
) as pool:
for result in pool.imap(
analyze_worker,
(
commit_id.strip()
for commit_id in (stdin if commits is None else commits)
),
):
if result is not None:
echo(dumps(result, separators=(",", ":"), indent=None))
results = (res for res in pool.imap(analyze_worker, ids) if res is not None)
for result in results:
echo(result)


@app.command()
Expand All @@ -152,7 +139,7 @@ def clone(
def branch(path: Path, local: bool = True, remote: bool = True) -> None:
"""Get the branches of a repository."""
repo = Repository(path)
branches: Iterable[str] = tuple()
branches: Iterable[str]
if local and remote:
branches = repo.branches
elif local:
Expand Down
6 changes: 3 additions & 3 deletions pyrepositoryminer/metrics/visitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,11 @@ def analyze_blob(self, blob: VisitableBlob) -> None:

@final
def visitBlob(self, blob: VisitableBlob) -> TreeVisitor:
if self.is_filtered(blob):
return self
elif self.is_cached(blob):
if self.is_cached(blob):
self.handle_cache_hit(blob)
return self
elif self.is_filtered(blob):
return self
self.cache_blob(blob)
self.analyze_blob(blob)
return self
Expand Down
23 changes: 23 additions & 0 deletions scripts/profile_analyze.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from cProfile import Profile
from pathlib import Path
from pstats import Stats

from typer import FileText

from pyrepositoryminer.main import AvailableMetrics, analyze


def main(
    repository: Path = Path("/Users/fabian/bare-repos/numpy.git"),
    commits_file: str = "commits copy.txt",
) -> None:
    """Run the ``analyze`` command once so it can be profiled.

    Args:
        repository: Path to the (bare) git repository to analyze.
            Defaults to the original hard-coded local path.
        commits_file: Name of the file listing commit ids to analyze.

    Both values were previously hard-coded; they are now parameters with
    the same defaults, so existing ``main()`` calls behave unchanged.
    """
    # FileText wraps a buffered binary stream, hence the "rb" mode.
    with open(commits_file, "rb") as f:
        commits = FileText(f)
        # Profile only the "filecount" metric, as in the original script.
        analyze(repository, [AvailableMetrics["filecount"]], commits)


if __name__ == "__main__":
    # Profile a full run of main() and report hotspots by total time.
    profiler = Profile()
    profiler.enable()
    main()
    profiler.disable()
    stats = Stats(profiler)
    stats.sort_stats("tottime")
    stats.print_stats()
Loading

0 comments on commit fb3e812

Please sign in to comment.