diff --git a/README.md b/README.md index 021fe1e4f..f4431af99 100755 --- a/README.md +++ b/README.md @@ -85,7 +85,9 @@ The following is a list of related repositories that we like and think are usefu ## Build Status -| Build Type | Branch | Status | | Branch | Status | -| --- | --- | --- | --- | --- | --- | -| **Linux CPU** | master | [![Build Status](https://dev.azure.com/best-practices/nlp/_apis/build/status/cpu_integration_tests_linux?branchName=master)](https://dev.azure.com/best-practices/nlp/_build/latest?definitionId=50&branchName=master) | | staging | [![Build Status](https://dev.azure.com/best-practices/nlp/_apis/build/status/cpu_integration_tests_linux?branchName=staging)](https://dev.azure.com/best-practices/nlp/_build/latest?definitionId=50&branchName=staging) | -| **Linux GPU** | master | [![Build Status](https://dev.azure.com/best-practices/nlp/_apis/build/status/gpu_integration_tests_linux?branchName=master)](https://dev.azure.com/best-practices/nlp/_build/latest?definitionId=51&branchName=master) | | staging | [![Build Status](https://dev.azure.com/best-practices/nlp/_apis/build/status/gpu_integration_tests_linux?branchName=staging)](https://dev.azure.com/best-practices/nlp/_build/latest?definitionId=51&branchName=staging) | +| Build | Branch | Status | +| --- | --- | --- | +| **Linux CPU** | master | [![Build Status](https://dev.azure.com/best-practices/nlp/_apis/build/status/cpu_integration_tests_linux?branchName=master)](https://dev.azure.com/best-practices/nlp/_build/latest?definitionId=50&branchName=master) | +| **Linux CPU** | staging | [![Build Status](https://dev.azure.com/best-practices/nlp/_apis/build/status/cpu_integration_tests_linux?branchName=staging)](https://dev.azure.com/best-practices/nlp/_build/latest?definitionId=50&branchName=staging) | +| **Linux GPU** | master | [![Build 
Status](https://dev.azure.com/best-practices/nlp/_apis/build/status/gpu_integration_tests_linux?branchName=master)](https://dev.azure.com/best-practices/nlp/_build/latest?definitionId=51&branchName=master) | +| **Linux GPU** | staging | [![Build Status](https://dev.azure.com/best-practices/nlp/_apis/build/status/gpu_integration_tests_linux?branchName=staging)](https://dev.azure.com/best-practices/nlp/_build/latest?definitionId=51&branchName=staging) | diff --git a/examples/README.md b/examples/README.md index 4edd96c1a..2046febfe 100644 --- a/examples/README.md +++ b/examples/README.md @@ -2,7 +2,6 @@ This folder contains examples and best practices, written in Jupyter notebooks, for building Natural Language Processing systems for the following scenarios. - |Category|Applications|Methods|Languages| |---| ------------------------ | ------------------- |---| |[Text Classification](text_classification)|Topic Classification|BERT, XLNet|en, hi, ar| @@ -14,3 +13,11 @@ This folder contains examples and best practices, written in Jupyter notebooks, |[Annotation](annotation)|Text Annotation|Doccano|| |[Model Explainability](model_explainability)|DNN Layer Explanation|DUUDNM (Guan et al.)| +## Data/Telemetry +The Azure Machine Learning notebooks collect browser usage data and send it to Microsoft to help improve our products and services. Read Microsoft's [privacy statement to learn more](https://privacy.microsoft.com/en-US/privacystatement). 
+ +To opt out of tracking, please go to the raw `.ipynb` files and remove the following line of code (the URL will be slightly different depending on the file): + +```sh + "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/nlp/examples/text_classification/tc_bert_azureml.png)" +``` \ No newline at end of file diff --git a/examples/entailment/entailment_xnli_bert_azureml.ipynb b/examples/entailment/entailment_xnli_bert_azureml.ipynb index 3598ea926..af049f3cc 100644 --- a/examples/entailment/entailment_xnli_bert_azureml.ipynb +++ b/examples/entailment/entailment_xnli_bert_azureml.ipynb @@ -14,6 +14,13 @@ "\n", "**Note: To learn how to do pre-training on your own, please reference the [AzureML-BERT repo](https://github.com/microsoft/AzureML-BERT) created by Microsoft.**" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/nlp/examples/entailment/entailment_xnli_bert_azureml.png)" + ] }, { "cell_type": "code", diff --git a/examples/question_answering/bidaf_aml_deep_dive.ipynb b/examples/question_answering/bidaf_aml_deep_dive.ipynb index de13e4f8c..cf75d5de4 100644 --- a/examples/question_answering/bidaf_aml_deep_dive.ipynb +++ b/examples/question_answering/bidaf_aml_deep_dive.ipynb @@ -15,6 +15,13 @@ "source": [ "# BiDAF Model Deep Dive on AzureML" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/nlp/examples/question_answering/bidaf_aml_deep_dive.png)" + ] }, { "cell_type": "markdown", diff --git a/examples/question_answering/pretrained-BERT-SQuAD-deep-dive-aml.ipynb b/examples/question_answering/pretrained-BERT-SQuAD-deep-dive-aml.ipynb index 093735442..fdc089119 100644 --- a/examples/question_answering/pretrained-BERT-SQuAD-deep-dive-aml.ipynb +++ b/examples/question_answering/pretrained-BERT-SQuAD-deep-dive-aml.ipynb @@ 
-16,6 +16,13 @@ "# Question Answering: Fine-Tune BERT on AzureML (PyTorch)\n", "**BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding** [\\[1\\]](#References)" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/nlp/examples/question_answering/pretrained_BERT_SQuAD_deep_dive_aml.png)" + ] }, { "cell_type": "markdown", diff --git a/examples/question_answering/question_answering_system_bidaf_quickstart.ipynb b/examples/question_answering/question_answering_system_bidaf_quickstart.ipynb index 17812006c..b3ed49196 100644 --- a/examples/question_answering/question_answering_system_bidaf_quickstart.ipynb +++ b/examples/question_answering/question_answering_system_bidaf_quickstart.ipynb @@ -15,6 +15,13 @@ "), [BiDAF](https://www.semanticscholar.org/paper/Bidirectional-Attention-Flow-for-Machine-Seo-Kembhavi/007ab5528b3bd310a80d553cccad4b78dc496b02\n", "), using Azure Container Instances ([ACI](https://azure.microsoft.com/en-us/services/container-instances/))." 
] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/nlp/examples/question_answering/bidaf_quickstart.png)" + ] }, { "cell_type": "markdown", diff --git a/examples/sentence_similarity/automl_local_deployment_aci.ipynb b/examples/sentence_similarity/automl_local_deployment_aci.ipynb index fd1b25b8d..b67c9b55d 100644 --- a/examples/sentence_similarity/automl_local_deployment_aci.ipynb +++ b/examples/sentence_similarity/automl_local_deployment_aci.ipynb @@ -15,6 +15,13 @@ "source": [ "# Local Automated Machine Learning Model with ACI Deployment for Predicting Sentence Similarity" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/nlp/examples/sentence_similarity/automl_local_deployment_aci.png)" + ] }, { "cell_type": "markdown", diff --git a/examples/sentence_similarity/automl_with_pipelines_deployment_aks.ipynb b/examples/sentence_similarity/automl_with_pipelines_deployment_aks.ipynb index ed0826af5..42d85e42a 100644 --- a/examples/sentence_similarity/automl_with_pipelines_deployment_aks.ipynb +++ b/examples/sentence_similarity/automl_with_pipelines_deployment_aks.ipynb @@ -15,6 +15,13 @@ "source": [ "# AzureML Pipeline, AutoML, AKS Deployment for Sentence Similarity" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/nlp/examples/sentence_similarity/automl_with_pipelines_deployment_aks.png)" + ] }, { "cell_type": "markdown", diff --git a/examples/sentence_similarity/bert_senteval.ipynb b/examples/sentence_similarity/bert_senteval.ipynb index c8d6c9996..c423aa715 100644 --- a/examples/sentence_similarity/bert_senteval.ipynb +++ b/examples/sentence_similarity/bert_senteval.ipynb @@ -6,6 +6,13 @@ "source": [ "# Parallel Experimentation with BERT on AzureML" ] + 
}, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/nlp/examples/sentence_similarity/bert_senteval.png)" + ] }, { "cell_type": "markdown", diff --git a/examples/sentence_similarity/gensen_aml_deep_dive.ipynb b/examples/sentence_similarity/gensen_aml_deep_dive.ipynb index 00f48a85d..23c4a0e05 100644 --- a/examples/sentence_similarity/gensen_aml_deep_dive.ipynb +++ b/examples/sentence_similarity/gensen_aml_deep_dive.ipynb @@ -16,6 +16,13 @@ "# Training GenSen on AzureML with SNLI Dataset\n", "**GenSen: Learning General Purpose Distributed Sentence Representations via Large Scale Multi-task Learning** [\\[1\\]](#References)" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/nlp/examples/sentence_similarity/gensen_aml_deep_dive.png)" + ] }, { "cell_type": "markdown", diff --git a/examples/text_classification/tc_bert_azureml.ipynb b/examples/text_classification/tc_bert_azureml.ipynb index f5cfdc5ee..3e6e4aca3 100644 --- a/examples/text_classification/tc_bert_azureml.ipynb +++ b/examples/text_classification/tc_bert_azureml.ipynb @@ -11,6 +11,13 @@ "# Text Classification of MultiNLI Sentences using BERT with Azure ML Pipelines" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/nlp/examples/text_classification/tc_bert_azureml.png)" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/examples/text_classification/tc_mnli_bert.ipynb b/examples/text_classification/tc_mnli_bert.ipynb index fd9e5ccd3..7712416a4 100644 --- a/examples/text_classification/tc_mnli_bert.ipynb +++ b/examples/text_classification/tc_mnli_bert.ipynb @@ -60,8 +60,7 @@ "import torch\n", "import torch.nn as nn\n", "\n", - "from utils_nlp.dataset.multinli import load_pandas_df\n", - "from 
utils_nlp.eval.classification import eval_classification\n", + "from utils_nlp.dataset.multinli import load_pandas_df\n", "from utils_nlp.models.bert.sequence_classification import BERTSequenceClassifier\n", "from utils_nlp.models.bert.common import Language, Tokenizer\n", "from utils_nlp.common.timer import Timer" diff --git a/tests/integration/test_notebooks_text_classification.py b/tests/integration/test_notebooks_text_classification.py index 54f905cf6..07eee510b 100644 --- a/tests/integration/test_notebooks_text_classification.py +++ b/tests/integration/test_notebooks_text_classification.py @@ -49,18 +49,19 @@ def test_tc_dac_bert_ar(notebooks, tmp): NUM_GPUS=1, DATA_FOLDER=tmp, BERT_CACHE_DIR=tmp, - BATCH_SIZE=32, + MAX_LEN=175, + BATCH_SIZE=16, NUM_EPOCHS=1, TRAIN_SIZE=0.8, - NUM_ROWS=15000, + NUM_ROWS=8000, RANDOM_STATE=0, ), ) result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict - assert pytest.approx(result["accuracy"], 0.93, abs=ABS_TOL) - assert pytest.approx(result["precision"], 0.91, abs=ABS_TOL) - assert pytest.approx(result["recall"], 0.91, abs=ABS_TOL) - assert pytest.approx(result["f1"], 0.91, abs=ABS_TOL) + assert result["accuracy"] == pytest.approx(0.871, abs=ABS_TOL) + assert result["precision"] == pytest.approx(0.865, abs=ABS_TOL) + assert result["recall"] == pytest.approx(0.852, abs=ABS_TOL) + assert result["f1"] == pytest.approx(0.845, abs=ABS_TOL) @pytest.mark.gpu diff --git a/tools/repo_metrics/README.md b/tools/repo_metrics/README.md deleted file mode 100755 index 8d14448c2..000000000 --- a/tools/repo_metrics/README.md +++ /dev/null @@ -1,36 +0,0 @@ -# Repository Metrics - -[![Build Status](https://dev.azure.com/best-practices/nlp/_apis/build/status/repo_metrics?branchName=master)](https://dev.azure.com/best-practices/nlp/_build/latest?definitionId=36&branchName=master) - -We developed a script that allows us to track the repo metrics. 
Some of the metrics we can track are listed here: - -* Number of stars -* Number of forks -* Number of clones -* Number of views -* Number of lines of code - -To see the full list of metrics, see [git_stats.py](git_stats.py) - -The first step is to set up the credentials, copy the configuration file and fill up the credentials of GitHub and CosmosDB: - - cp tools/repo_metrics/config_template.py tools/repo_metrics/config.py - -To track the current state of the repository and save it to CosmosDB: - - python tools/repo_metrics/track_metrics.py --github_repo "https://github.com/Microsoft/NLP" --save_to_database - -To track an event related to this repository and save it to CosmosDB: - - python tools/repo_metrics/track_metrics.py --event "Today we did our first blog of the project" --event_date 2018-12-01 --save_to_database - - -### Setting up Azure CosmosDB - -The API that we is used to track the GitHub metrics is the [Mongo API](https://docs.microsoft.com/en-us/azure/cosmos-db/mongodb-introduction). - -The database name and collections name are defined in the [config file](config_template.py). There are two main collections, defined as `COLLECTION_GITHUB_STATS` and `COLLECTION_EVENTS` to store the information defined on the previous section. - -**IMPORTANT NOTE**: If the database and the collections are created directly through the portal, a common partition key should be defined. We recommend to use `date` as partition key. - - diff --git a/tools/repo_metrics/__init__.py b/tools/repo_metrics/__init__.py deleted file mode 100755 index e69de29bb..000000000 diff --git a/tools/repo_metrics/config_template.py b/tools/repo_metrics/config_template.py deleted file mode 100755 index dad9a0950..000000000 --- a/tools/repo_metrics/config_template.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
- -# Github token -# More info: https://help.github.com/articles/creating-a-personal-access-token-for-the-command-line/ -GITHUB_TOKEN = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX" - -# CosmosDB Mongo API -CONNECTION_STRING = "mongodb://XXXXXXXXXXXXXXXXXXXXXXXXX.documents.azure.com:10255/?ssl=true&replicaSet=globaldb" -DATABASE = "nlp_stats" -COLLECTION_GITHUB_STATS = "github_stats" -COLLECTION_EVENTS = "events" - diff --git a/tools/repo_metrics/git_stats.py b/tools/repo_metrics/git_stats.py deleted file mode 100755 index 8faa4366f..000000000 --- a/tools/repo_metrics/git_stats.py +++ /dev/null @@ -1,425 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -# Code based on: https://github.com/miguelgfierro/codebase/blob/master/python/utilities/git_stats.py -# - -import git -import os -import requests -import datetime -from functools import lru_cache -import shutil - - -END_POINT = "https://api.github.com/repos/" -SEARCH_END_POINT = "https://api.github.com/search/" -BASE_URL = "https://github.com/" - - -class Github: - """Github stats class""" - - def __init__(self, token, git_url): - """Initializer. - Args: - token (str): Github token. - git_url (str): URL of github repository. - """ - self.token = token - self.git_url = git_url - self.repo_name = self.git_url.split(BASE_URL)[1] - self.api_url = END_POINT + self.repo_name - self.headers = {"Authorization": "token " + self.token} - - @property - @lru_cache() - def general_stats(self): - """General attributes and statistics of the repo. - Returns: - json: JSON with general stats. - """ - r = requests.get(self.api_url, headers=self.headers) - if r.ok: - return r.json() - else: - return None - - @property - def forks(self): - """Get current number of forks. - Returns: - int: Number of forks. 
- """ - return ( - self.general_stats["forks_count"] - if self.general_stats is not None - else None - ) - - @property - @lru_cache() - def open_issues(self): - """Get current number of open issues. - Returns: - int: Number of issues. - """ - url = ( - SEARCH_END_POINT - + "issues?q=state%3Aopen+repo:" - + self.repo_name - + "+type%3Aissues" - ) - r = requests.get(url, headers=self.headers) - if r.ok: - return r.json()["total_count"] - else: - return None - - @property - @lru_cache() - def open_pull_requests(self): - """Get current number of open PRs. - Returns: - int: Number of PRs. - """ - url = ( - SEARCH_END_POINT - + "issues?q=state%3Aopen+repo:" - + self.repo_name - + "+type%3Apr" - ) - r = requests.get(url, headers=self.headers) - if r.ok: - return r.json()["total_count"] - else: - return None - - @property - def stars(self): - """Get current number of stars. - Returns: - int: Number of stars. - """ - return ( - self.general_stats["stargazers_count"] - if self.general_stats is not None - else None - ) - - @property - def watchers(self): - """Get current number of watchers. - Returns: - int: Number of watchers. - """ - return ( - self.general_stats["stargazers_count"] - if self.general_stats is not None - else None - ) - - @property - @lru_cache() - def last_year_commit_frequency(self): - """Get the commit frequency in every week of the last year. - Returns: - dict: Dictionary of 52 elements (1 per week) with the commits every day - (starting on Sunday), total commit sum and first day of the week. - """ - r = requests.get(self.api_url + "/stats/commit_activity", headers=self.headers) - if r.ok: - resp = r.json() - else: - return None - for id, item in enumerate(resp): - week_str = datetime.datetime.fromtimestamp(item["week"]).strftime( - "%Y-%m-%d" - ) - resp[id]["week"] = week_str - return resp - - @property - @lru_cache() - def top_ten_referrers(self): - """Get the top 10 referrers over the last 14 days. 
- Source: https://developer.github.com/v3/repos/traffic/#list-referrers - Returns: - json: JSON with referrer name, total number of references - and unique number of references. - """ - r = requests.get( - self.api_url + "/traffic/popular/referrers", headers=self.headers - ) - if r.ok: - return r.json() - else: - return None - - @property - def number_total_referrers(self): - """Count the total number of references to the repo. - Returns: - int: Number. - """ - return ( - sum(item["count"] for item in self.top_ten_referrers) - if self.top_ten_referrers is not None - else None - ) - - @property - def number_unique_referrers(self): - """Count the unique number of references to the repo. - Returns: - int: Number. - """ - return ( - sum(item["uniques"] for item in self.top_ten_referrers) - if self.top_ten_referrers is not None - else None - ) - - @property - @lru_cache() - def top_ten_content(self): - """Get the top 10 popular contents within the repo over the last 14 days. - Source: https://developer.github.com/v3/repos/traffic/#list-paths - Returns: - json: JSON with the content link, total and unique views. - """ - r = requests.get(self.api_url + "/traffic/popular/paths", headers=self.headers) - if r.ok: - return r.json() - else: - return None - - @property - @lru_cache() - def views(self): - """Get the total number of views and breakdown per day or week for the - last 14 days. Timestamps are aligned to UTC midnight of the beginning of - the day or week. Week begins on Monday. - Source: https://developer.github.com/v3/repos/traffic/#views - Returns: - json: JSON with daily views. - """ - r = requests.get(self.api_url + "/traffic/views", headers=self.headers) - if r.ok: - return r.json() - else: - return None - - @property - def number_total_views(self): - """Total number of views over the last 14 days - Returns: - int: Views. 
- """ - return self.views["count"] if self.views is not None else None - - @property - def number_unique_views(self): - """Unique number of views over the last 14 days - Returns: - int: Views. - """ - return self.views["uniques"] if self.views is not None else None - - @property - @lru_cache() - def clones(self): - """Get the total number of clones and breakdown per day or week for the last - 14 days. Timestamps are aligned to UTC midnight of the beginning of the day - or week. Week begins on Monday. - Source: https://developer.github.com/v3/repos/traffic/#clones - Returns: - json: JSON with daily clones. - """ - r = requests.get(self.api_url + "/traffic/clones", headers=self.headers) - if r.ok: - return r.json() - else: - return None - - @property - def number_total_clones(self): - """Total number of clones over the last 14 days - Returns: - int: Clones. - """ - return self.clones["count"] if self.clones is not None else None - - @property - def number_unique_clones(self): - """Unique number of clones over the last 14 days - Returns: - int: Clones. - """ - return self.clones["uniques"] if self.clones is not None else None - - @property - def repo_size(self): - """Repo size in Mb - Returns: - int: Size. - """ - return self.general_stats["size"] if self.general_stats is not None else None - - @property - def creation_date(self): - """Date of repository creation - Returns: - str: Date. - """ - return ( - self.general_stats["created_at"] if self.general_stats is not None else None - ) - - @property - @lru_cache() - def languages(self): - """Get the languages in the repo and the lines of code of each. - Source: https://developer.github.com/v3/repos/#list-languages - Returns: - dict: Dictionary of languages and lines of code. 
- """ - r = requests.get(self.api_url + "/languages", headers=self.headers) - if r.ok: - return r.json() - else: - return None - - @property - def number_languages(self): - """Number of different languages - Returns: - int: Number - """ - return len(self.languages) if self.languages is not None else None - - @property - def number_commits(self): - """Get total number of commits. - NOTE: There is no straightforward way of getting the commits with GitHub API - https://blog.notfoss.com/posts/get-total-number-of-commits-for-a-repository-using-the-github-api/ - Returns: - int: Number of commits. - """ - if self._cloned_repo_dir() is None: - return None - os.chdir(self._cloned_repo_dir()) - resp = os.popen("git rev-list HEAD --count").read() - resp = int(resp.split("\n")[0]) - os.chdir("..") - return resp - - @property - def number_contributors(self): - """Count the total number of contributors, based on unique email addresses. - Returns: - int: Number of contributors. - """ - if self._cloned_repo_dir() is None: - return None - os.chdir(self._cloned_repo_dir()) - resp = os.popen('git log --format="%aN" | sort -u | wc -l').read() - os.chdir("..") - resp = int(resp.split("\n")[0]) - return resp - - @property - def number_branches(self): - """Number of current remote branches. - Returns: - int: Number. - """ - if self._cloned_repo_dir() is None: - return None - os.chdir(self._cloned_repo_dir()) - resp = os.popen("git ls-remote --heads origin | wc -l").read() - os.chdir("..") - resp = int(resp.split("\n")[0]) - return resp - - @property - def number_tags(self): - """Number of tags. - Returns: - int: Number. - """ - if self._cloned_repo_dir() is None: - return None - os.chdir(self._cloned_repo_dir()) - resp = os.popen("git tag | wc -l").read() - os.chdir("..") - resp = int(resp.split("\n")[0]) - return resp - - @property - def number_total_lines(self): - """Count total number of lines. - Returns: - int: Number of lines. 
- """ - return sum(self.languages.values()) if self.languages is not None else None - - @property - def number_added_lines(self): - """Count the number of added lines. - Returns: - int: Number of added lines. - """ - if self._cloned_repo_dir() is None: - return None - os.chdir(self._cloned_repo_dir()) - resp = os.popen( - "git log --pretty=tformat: --numstat | awk '{ add += $1 } END { printf \"%s\",add }'" - ).read() - os.chdir("..") - resp = int(resp) - return resp - - @property - def number_deleted_lines(self): - """Get the number of deleted lines. - Returns: - int: Number of deleted lines. - """ - if self._cloned_repo_dir() is None: - return None - os.chdir(self._cloned_repo_dir()) - resp = os.popen( - "git log --pretty=tformat: --numstat | awk '{ add += $1 ; subs += $2 } END { printf \"%s\",subs }'" - ).read() - os.chdir("..") - resp = int(resp) - return resp - - def clean(self): - if self._cloned_repo_dir() is not None: - shutil.rmtree(self._cloned_repo_dir(), ignore_errors=True) - - def _cloned_repo_dir(self): - """Clone a git repo and returns the location. - Returns: - str: Name of the folder name of the repo. - """ - repo_dir = self.git_url.split("/")[-1] - if os.path.isdir(repo_dir): - return repo_dir - try: - git.Repo.clone_from(self.git_url, repo_dir) - except git.GitCommandError: - # try with token in case it is a private repo - private_url = ( - "https://" - + self.token - + ":x-oauth-basic@github.com/" - + self.git_url.split(BASE_URL)[1] - ) - git.Repo.clone_from(private_url, repo_dir) - if os.path.isdir(repo_dir): - return repo_dir - else: - return None diff --git a/tools/repo_metrics/track_metrics.py b/tools/repo_metrics/track_metrics.py deleted file mode 100755 index 5580ec9f0..000000000 --- a/tools/repo_metrics/track_metrics.py +++ /dev/null @@ -1,171 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. - -import sys -import os - -# Need to append a full path instead of relative path. 
-# NOTE this does not affect running directly in the shell. -sys.path.append(os.getcwd()) -import argparse -import traceback -import logging -from datetime import datetime -from dateutil.parser import isoparse -from pymongo import MongoClient -from tools.repo_metrics.git_stats import Github -from tools.repo_metrics.config import ( - GITHUB_TOKEN, - CONNECTION_STRING, - DATABASE, - COLLECTION_GITHUB_STATS, - COLLECTION_EVENTS, -) - -format_str = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)s]: %(message)s" -format_time = "%Y-%m-%d %H:%M:%S" -logging.basicConfig(level=logging.INFO, format=format_str, datefmt=format_time) -log = logging.getLogger() - - -def parse_args(): - """Argument parser. - - Returns: - obj: Parser. - """ - parser = argparse.ArgumentParser( - description="Metrics Tracker", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - parser.add_argument("--github_repo", type=str, help="GitHub repository") - parser.add_argument( - "--event", - type=str, - help="Input a general event that can be saved to the database", - ) - parser.add_argument( - "--save_to_database", - action="store_true", - help="Whether or not to save the information to the database", - ) - parser.add_argument( - "--event_date", - default=datetime.now().isoformat(), - type=isoparse, - help="Date for an event (format: YYYY-MM-DD)", - ) - return parser.parse_args() - - -def connect(uri="mongodb://localhost"): - """Mongo connector. - - Args: - uri (str): Connection string. - - Returns: - obj: Mongo client. - """ - client = MongoClient(uri, serverSelectionTimeoutMS=1000) - - # Send a query to the server to see if the connection is working. - try: - client.server_info() - except Exception: - raise - return client - - -def event_as_dict(event, date): - """Encodes an string event input as a dictionary with the date. - - Args: - event (str): Details of a event. - date (datetime): Date of the event. - - Returns: - dict: Dictionary with the event and the date. 
- """ - return {"date": date.strftime("%b %d %Y %H:%M:%S"), "event": event} - - -def github_stats_as_dict(github): - """Encodes Github statistics as a dictionary with the date. - - Args: - obj: Github object. - - Returns: - dict: Dictionary with Github details and the date. - """ - return { - "date": datetime.now().strftime("%b %d %Y %H:%M:%S"), - "stars": github.stars, - "forks": github.forks, - "watchers": github.watchers, - "open_issues": github.open_issues, - "open_pull_requests": github.open_pull_requests, - "unique_views": github.number_unique_views, - "total_views": github.number_total_views, - "details_views": github.views, - "unique_clones": github.number_unique_clones, - "total_clones": github.number_total_clones, - "details_clones": github.clones, - "last_year_commit_frequency": github.last_year_commit_frequency, - "details_referrers": github.top_ten_referrers, - "total_referrers": github.number_total_referrers, - "unique_referrers": github.number_unique_referrers, - "details_content": github.top_ten_content, - "repo_size": github.repo_size, - "commits": github.number_commits, - "contributors": github.number_contributors, - "branches": github.number_branches, - "tags": github.number_tags, - "total_lines": github.number_total_lines, - "added_lines": github.number_added_lines, - "deleted_lines": github.number_deleted_lines, - } - - -def tracker(args): - """Main function to track metrics. - - Args: - args (obj): Parsed arguments. 
- """ - if args.github_repo: - # if there is an env variable, overwrite it - token = os.environ.get("GITHUB_TOKEN", GITHUB_TOKEN) - g = Github(token, args.github_repo) - git_doc = github_stats_as_dict(g) - log.info("GitHub stats -- {}".format(git_doc)) - g.clean() - - if args.event: - event_doc = event_as_dict(args.event, args.event_date) - log.info("Event -- {}".format(event_doc)) - - if args.save_to_database: - # if there is an env variable, overwrite it - connection = token = os.environ.get("CONNECTION_STRING", CONNECTION_STRING) - cli = connect(connection) - db = cli[DATABASE] - if args.github_repo: - db[COLLECTION_GITHUB_STATS].insert_one(git_doc) - if args.event: - db[COLLECTION_EVENTS].insert_one(event_doc) - - -if __name__ == "__main__": - log.info("Starting routine") - args = parse_args() - try: - log.info("Arguments: {}".format(args)) - tracker(args) - except Exception as e: - trace = traceback.format_exc() - log.error("Traceback: {}".format(trace)) - log.error("Exception: {}".format(e)) - finally: - log.info("Routine finished") diff --git a/utils_nlp/__init__.py b/utils_nlp/__init__.py index fb940c1fc..00b8e0f1f 100755 --- a/utils_nlp/__init__.py +++ b/utils_nlp/__init__.py @@ -16,6 +16,13 @@ COPYRIGHT = __copyright__ # Determine semantic versioning automatically -# from git commits -__version__ = get_version() +# from git commits if the package is installed +# into your environment, otherwise +# we set version to default for development +try: + __version__ = get_version() +except LookupError: + __version__ = "0.0.0" + VERSION = __version__ +