diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..81734aa --- /dev/null +++ b/.dockerignore @@ -0,0 +1,8 @@ +.github/ +.pre-commit-config.yaml + +Dockerfile + +Makefile + +requirements-dev.txt diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 0000000..ba950ad --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,13 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +* @luiyen diff --git a/.github/workflows/test-action.yml b/.github/workflows/test-action.yml new file mode 100644 index 0000000..ce05801 --- /dev/null +++ b/.github/workflows/test-action.yml @@ -0,0 +1,64 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name: "Test Code Review" + +on: + pull_request: + paths-ignore: + - "*.md" + - "LICENSE" + +jobs: + review: + runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: write + steps: + - uses: actions/checkout@v3 + - name: "Get diff of the pull request" + id: get_diff + shell: bash + env: + DEFAULT_BRANCH: "${{ github.event.repository.default_branch }}" + PULL_REQUEST_HEAD_REF: "${{ github.event.pull_request.head.ref }}" + run: |- + # Fetch the default branch + git fetch origin "${{ env.DEFAULT_BRANCH }}" + # Exclude png files from diff + git diff "origin/${{ env.DEFAULT_BRANCH }}" ":(exclude)*.png" > "diff.txt" + # Put multi-line string into an environment variable + # shellcheck disable=SC2086 + { + echo "pull_request_diff<<EOF" + cat "diff.txt" + echo 'EOF' + } >> $GITHUB_OUTPUT + - uses: ./ + name: "Code Review" + id: review + with: + apiKey: ${{ secrets.API_KEY }} + githubToken: ${{ secrets.GITHUB_TOKEN }} + githubRepository: ${{ github.repository }} + githubPullRequestNumber: ${{ github.event.pull_request.number }} + gitCommitHash: ${{ github.event.pull_request.head.sha }} + repoId: "microsoft/codereviewer" + temperature: "0.2" + maxNewTokens: "250" + topK: "50" + topP: "0.95" + pullRequestDiff: |- + ${{ steps.get_diff.outputs.pull_request_diff }} + pullRequestChunkSize: "3500" + logLevel: "DEBUG" diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9dd0287 --- /dev/null +++ b/.gitignore @@ -0,0 +1,136 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec +!dbt_artifacts_parser/parsers/manifest +!dbt_artifacts_parser/resources/manifest + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# VSCode +.vscode +.secrets +.DS_Store diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..35cd722 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,36 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.4.0 + hooks: + - id: end-of-file-fixer + - id: trailing-whitespace + - id: check-yaml + - repo: https://github.com/pycqa/isort + rev: 5.12.0 + hooks: + - id: isort + name: isort (python) + - repo: https://github.com/pycqa/pylint + rev: v2.17.2 + hooks: + - id: pylint + - repo: https://github.com/rhysd/actionlint + rev: v1.6.24 + hooks: + - id: actionlint-docker + - repo: https://github.com/hadolint/hadolint + rev: v2.12.0 + hooks: + - id: hadolint diff --git a/.pylintrc b/.pylintrc new file mode 100644 index 0000000..2984f3a --- /dev/null +++ b/.pylintrc @@ -0,0 +1,432 @@ +# This Pylint rcfile contains a best-effort configuration to uphold the +# best-practices and style described in the Google Python style guide: +# https://google.github.io/styleguide/pyguide.html +# +# Its canonical open-source location is: +# https://google.github.io/styleguide/pylintrc + +[MASTER] + +# Files or directories to be skipped. They should be base names, not paths. +ignore=third_party + +# Files or directories matching the regex patterns are skipped. The regex +# matches against base names, not paths. +ignore-patterns= + +# Pickle collected data for later comparisons. +persistent=no + +# List of plugins (as comma separated values of python modules names) to load, +# usually to register additional checkers. +load-plugins= + +# Use multiple processes to speed up Pylint. +jobs=4 + +# Allow loading of arbitrary C extensions. 
Extensions are imported into the +# active Python interpreter and may run arbitrary code. +unsafe-load-any-extension=no + + +[MESSAGES CONTROL] + +# Only show warnings with the listed confidence levels. Leave empty to show +# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED +confidence= + +# Enable the message, report, category or checker with the given id(s). You can +# either give multiple identifier separated by comma (,) or put this option +# multiple time (only on the command line, not in the configuration file where +# it should appear only once). See also the "--disable" option for examples. +#enable= + +# Disable the message, report, category or checker with the given id(s). You +# can either give multiple identifiers separated by comma (,) or put this +# option multiple times (only on the command line, not in the configuration +# file where it should appear only once).You can also use "--disable=all" to +# disable everything first and then reenable specific checks. For example, if +# you want to run only the similarities checker, you can use "--disable=all +# --enable=similarities". 
If you want to run only the classes checker, but have +# no Warning level messages displayed, use"--disable=all --enable=classes +# --disable=W" +disable=abstract-method, + apply-builtin, + arguments-differ, + attribute-defined-outside-init, + backtick, + bad-option-value, + basestring-builtin, + buffer-builtin, + c-extension-no-member, + consider-using-enumerate, + cmp-builtin, + cmp-method, + coerce-builtin, + coerce-method, + delslice-method, + div-method, + duplicate-code, + eq-without-hash, + execfile-builtin, + file-builtin, + filter-builtin-not-iterating, + fixme, + getslice-method, + global-statement, + hex-method, + idiv-method, + implicit-str-concat, + import-error, + import-self, + import-star-module-level, + inconsistent-return-statements, + input-builtin, + intern-builtin, + invalid-str-codec, + locally-disabled, + long-builtin, + long-suffix, + map-builtin-not-iterating, + misplaced-comparison-constant, + missing-function-docstring, + metaclass-assignment, + next-method-called, + next-method-defined, + no-absolute-import, + no-else-break, + no-else-continue, + no-else-raise, + no-else-return, + no-init, # added + no-member, + no-name-in-module, + no-self-use, + nonzero-method, + oct-method, + old-division, + old-ne-operator, + old-octal-literal, + old-raise-syntax, + parameter-unpacking, + print-statement, + raising-string, + range-builtin-not-iterating, + raw_input-builtin, + rdiv-method, + reduce-builtin, + relative-import, + reload-builtin, + round-builtin, + setslice-method, + signature-differs, + standarderror-builtin, + suppressed-message, + sys-max-int, + too-few-public-methods, + too-many-ancestors, + too-many-arguments, + too-many-boolean-expressions, + too-many-branches, + too-many-instance-attributes, + too-many-locals, + too-many-nested-blocks, + too-many-public-methods, + too-many-return-statements, + too-many-statements, + trailing-newlines, + unichr-builtin, + unicode-builtin, + unnecessary-pass, + unpacking-in-except, + 
useless-else-on-loop, + useless-object-inheritance, + useless-suppression, + using-cmp-argument, + wrong-import-order, + xrange-builtin, + zip-builtin-not-iterating, + missing-module-docstring, + missing-class-docstring, + invalid-name, + + +[REPORTS] + +# Set the output format. Available formats are text, parseable, colorized, msvs +# (visual studio) and html. You can also give a reporter class, eg +# mypackage.mymodule.MyReporterClass. +output-format=text + +# Tells whether to display a full report or only the messages +reports=no + +# Python expression which should return a note less than 10 (10 is the highest +# note). You have access to the variables errors warning, statement which +# respectively contain the number of errors / warnings messages and the total +# number of statements analyzed. This is used by the global evaluation report +# (RP0004). +evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) + +# Template used to display messages. This is a python new-style format string +# used to format the message information. See doc for all details +#msg-template= + + +[BASIC] + +# Good variable names which should always be accepted, separated by a comma +good-names=main,_ + +# Bad variable names which should always be refused, separated by a comma +bad-names= + +# Colon-delimited sets of names that determine each other's naming style when +# the name regexes allow several styles. +name-group= + +# Include a hint for the correct naming format with invalid-name +include-naming-hint=no + +# List of decorators that produce properties, such as abc.abstractproperty. Add +# to this list to register other decorators that produce valid properties. 
+property-classes=abc.abstractproperty,cached_property.cached_property,cached_property.threaded_cached_property,cached_property.cached_property_with_ttl,cached_property.threaded_cached_property_with_ttl + +# Regular expression matching correct function names +function-rgx=^(?:(?P<exempt>setUp|tearDown|setUpModule|tearDownModule)|(?P<camel_case>_?[A-Z][a-zA-Z0-9]*)|(?P<snake_case>_?[a-z][a-z0-9_]*))$ + +# Regular expression matching correct variable names +variable-rgx=^[a-z][a-z0-9_]*$ + +# Regular expression matching correct constant names +const-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$ + +# Regular expression matching correct attribute names +attr-rgx=^_{0,2}[a-z][a-z0-9_]*$ + +# Regular expression matching correct argument names +argument-rgx=^[a-z][a-z0-9_]*$ + +# Regular expression matching correct class attribute names +class-attribute-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$ + +# Regular expression matching correct inline iteration names +inlinevar-rgx=^[a-z][a-z0-9_]*$ + +# Regular expression matching correct class names +class-rgx=^_?[A-Z][a-zA-Z0-9]*$ + +# Regular expression matching correct module names +module-rgx=^(_?[a-z][a-z0-9_]*|__init__)$ + +# Regular expression matching correct method names +method-rgx=(?x)^(?:(?P<exempt>_[a-z0-9_]+__|runTest|setUp|tearDown|setUpTestCase|tearDownTestCase|setupSelf|tearDownClass|setUpClass|(test|assert)_*[A-Z0-9][a-zA-Z0-9_]*|next)|(?P<camel_case>_{0,2}[A-Z][a-zA-Z0-9_]*)|(?P<snake_case>_{0,2}[a-z][a-z0-9_]*))$ + +# Regular expression which should only match function or class names that do +# not require a docstring. +no-docstring-rgx=(__.*__|main|test.*|.*test|.*Test)$ + +# Minimum line length for functions/classes that require docstrings, shorter +# ones are exempt. +docstring-min-length=10 + + +[TYPECHECK] + +# List of decorators that produce context managers, such as +# contextlib.contextmanager. Add to this list to register other decorators that +# produce valid context managers. 
+contextmanager-decorators=contextlib.contextmanager,contextlib2.contextmanager + +# Tells whether missing members accessed in mixin class should be ignored. A +# mixin class is detected if its name ends with "mixin" (case insensitive). +ignore-mixin-members=yes + +# List of module names for which member attributes should not be checked +# (useful for modules/projects where namespaces are manipulated during runtime +# and thus existing member attributes cannot be deduced by static analysis. It +# supports qualified module names, as well as Unix pattern matching. +ignored-modules= + +# List of class names for which member attributes should not be checked (useful +# for classes with dynamically set attributes). This supports the use of +# qualified names. +ignored-classes=optparse.Values,thread._local,_thread._local + +# List of members which are set dynamically and missed by pylint inference +# system, and so shouldn't trigger E1101 when accessed. Python regular +# expressions are accepted. +generated-members= + + +[FORMAT] + +# Maximum number of characters on a single line. +max-line-length=120 + +# TODO(https://github.com/PyCQA/pylint/issues/3352): Direct pylint to exempt +# lines made too long by directives to pytype. + +# Regexp for a line that is allowed to be longer than the limit. +ignore-long-lines=(?x)( + ^\s*(\#\ )?<?https?://\S+>?$| + ^\s*(from\s+\S+\s+)?import\s+.+$) + +# Allow the body of an if to be on the same line as the test if there is no +# else. +single-line-if-stmt=yes + +# Maximum number of lines in a module +max-module-lines=99999 + +# String used as indentation unit. The internal Google style guide mandates 2 +# spaces. Google's externally-published style guide says 4, consistent with +# PEP 8. Here, we use 2 spaces, for conformity with many open-sourced Google +# projects (like TensorFlow). +indent-string=' ' + +# Number of spaces of indent required inside a hanging or continued line. +indent-after-paren=4 + +# Expected format of line ending, e.g. 
empty (any line ending), LF or CRLF. +expected-line-ending-format= + + +[MISCELLANEOUS] + +# List of note tags to take in consideration, separated by a comma. +notes=TODO + + +[STRING] + +# This flag controls whether inconsistent-quotes generates a warning when the +# character used as a quote delimiter is used inconsistently within a module. +check-quote-consistency=yes + + +[VARIABLES] + +# Tells whether we should check for unused import in __init__ files. +init-import=no + +# A regular expression matching the name of dummy variables (i.e. expectedly +# not used). +dummy-variables-rgx=^\*{0,2}(_$|unused_|dummy_) + +# List of additional names supposed to be defined in builtins. Remember that +# you should avoid to define new builtins when possible. +additional-builtins= + +# List of strings which can identify a callback function by name. A callback +# name must start or end with one of those strings. +callbacks=cb_,_cb + +# List of qualified module names which can have objects that can redefine +# builtins. +redefining-builtins-modules=six,six.moves,past.builtins,future.builtins,functools + + +[LOGGING] + +# Logging modules to check that the string format arguments are in logging +# function parameter format +logging-modules=logging,absl.logging,tensorflow.io.logging + + +[SIMILARITIES] + +# Minimum lines number of a similarity. +min-similarity-lines=4 + +# Ignore comments when computing similarities. +ignore-comments=yes + +# Ignore docstrings when computing similarities. +ignore-docstrings=yes + +# Ignore imports when computing similarities. +ignore-imports=no + + +[SPELLING] + +# Spelling dictionary name. Available dictionaries: none. To make it working +# install python-enchant package. +spelling-dict= + +# List of comma separated words that should not be checked. +spelling-ignore-words= + +# A path to a file that contains private dictionary; one word per line. 
+spelling-private-dict-file= + +# Tells whether to store unknown words to indicated private dictionary in +# --spelling-private-dict-file option instead of raising a message. +spelling-store-unknown-words=no + + +[IMPORTS] + +# Deprecated modules which should not be used, separated by a comma +deprecated-modules=regsub, + TERMIOS, + Bastion, + rexec, + sets + +# Create a graph of every (i.e. internal and external) dependencies in the +# given file (report RP0402 must not be disabled) +import-graph= + +# Create a graph of external dependencies in the given file (report RP0402 must +# not be disabled) +ext-import-graph= + +# Create a graph of internal dependencies in the given file (report RP0402 must +# not be disabled) +int-import-graph= + +# Force import order to recognize a module as part of the standard +# compatibility libraries. +known-standard-library= + +# Force import order to recognize a module as part of a third party library. +known-third-party=enchant, absl + +# Analyse import fallback blocks. This can be used to support both Python 2 and +# 3 compatible code, which means that the block might have code that exists +# only in one or another interpreter, leading to false positives when analysed. +analyse-fallback-blocks=no + + +[CLASSES] + +# List of method names used to declare (i.e. assign) instance attributes. +defining-attr-methods=__init__, + __new__, + setUp + +# List of member names, which should be excluded from the protected access +# warning. +exclude-protected=_asdict, + _fields, + _replace, + _source, + _make + +# List of valid names for the first argument in a class method. +valid-classmethod-first-arg=cls, + class_ + +# List of valid names for the first argument in a metaclass class method. +valid-metaclass-classmethod-first-arg=mcs + + +[EXCEPTIONS] + +# Exceptions that will emit a warning when being caught. 
Defaults to +# "Exception" +overgeneral-exceptions=StandardError, + Exception, + BaseException diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..cb5be56 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,24 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +FROM python:3.10-slim + +WORKDIR /app + +# Install python packages +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy the entrypoint script +COPY entrypoint.py . + +ENTRYPOINT ["python", "/app/entrypoint.py"] diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..261eeb9 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..6fbeefa --- /dev/null +++ b/Makefile @@ -0,0 +1,34 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +.PHONY: setup +setup: + pip install --force-reinstall --no-cache pip==23.0.1 setuptools==67.6.1 + pip install --force-reinstall --no-cache -r requirements-dev.txt --use-deprecated=legacy-resolver + +.PHONY: setup-dev +setup-dev: + pip install --force-reinstall --no-cache pip==23.0.1 setuptools==67.6.1 + pip install --force-reinstall --no-cache -r requirements-dev.txt --use-deprecated=legacy-resolver + pre-commit install + +build-docker: + docker build -t gpt-code-review-action . 
+ + +lint: + +lint-python: + pylint --rcfile=.pylintrc gpt_code_review_action + +lint-docker: + hadolint Dockerfile diff --git a/README.md b/README.md new file mode 100644 index 0000000..9c3989e --- /dev/null +++ b/README.md @@ -0,0 +1,81 @@ +# llm-code-review-action +A container GitHub Action to review a pull request using an LLM hosted on HuggingFace. + +If the size of a pull request is over the maximum chunk size of the HuggingFace API, the Action will split the pull request into multiple chunks and generate review comments for each chunk. +The Action then summarizes the review comments and posts a review comment to the pull request. + +## Pre-requisites +We have to set a GitHub Actions secret `API_KEY` that holds the HuggingFace API key so that we can securely pass it to the Action. + +## Inputs + +- `apiKey`: The HuggingFace API key to access the API. +- `githubToken`: The GitHub token to access the GitHub API. +- `githubRepository`: The GitHub repository to post a review comment. +- `githubPullRequestNumber`: The GitHub pull request number to post a review comment. +- `gitCommitHash`: The git commit hash to post a review comment. +- `pullRequestDiff`: The diff of the pull request to generate a review comment. +- `pullRequestChunkSize`: The chunk size of the diff of the pull request to generate a review comment. +- `repoId`: LLM repository id on HuggingFace. +- `temperature`: The temperature to generate a review comment. +- `topP`: The top_p to generate a review comment. +- `topK`: The top_k to generate a review comment. +- `maxNewTokens`: The max_new_tokens to generate a review comment. +- `logLevel`: The log level to print logs. + +As you might know, a HuggingFace model has a limit on the maximum number of input tokens. +So we have to split the diff of a pull request into multiple chunks if the size of the diff is over that limit. +We can tune the chunk size based on the model we use. 
+ +## Example usage +Here is an example to use the Action to review a pull request of the repository. +The actual file is located at [`.github/workflows/test-action.yml`](.github/workflows/test-action.yml). + + +```yaml +name: "Test Code Review" + +on: + pull_request: + paths-ignore: + - "*.md" + - "LICENSE" + +jobs: + review: + runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: write + steps: + - uses: actions/checkout@v3 + - name: "Get diff of the pull request" + id: get_diff + shell: bash + env: + PULL_REQUEST_HEAD_REF: "${{ github.event.pull_request.head.ref }}" + run: |- + git fetch origin "${{ env.PULL_REQUEST_HEAD_REF }}:${{ env.PULL_REQUEST_HEAD_REF }}" + git checkout "${{ env.PULL_REQUEST_HEAD_REF }}" + git diff "origin/${{ env.PULL_REQUEST_HEAD_REF }}" > "diff.txt" + # shellcheck disable=SC2086 + echo "diff=$(cat "diff.txt")" >> $GITHUB_ENV + - uses: luiyen/llm-code-review@v0.0.1 + name: "Code Review" + id: review + with: + apiKey: ${{ secrets.API_KEY }} + githubToken: ${{ secrets.GITHUB_TOKEN }} + githubRepository: ${{ github.repository }} + githubPullRequestNumber: ${{ github.event.pull_request.number }} + gitCommitHash: ${{ github.event.pull_request.head.sha }} + repoId: "microsoft/codereviewer" + temperature: "0.2" + maxNewTokens: "250" + topK: "50" + topP: "0.95" + pullRequestDiff: |- + ${{ steps.get_diff.outputs.pull_request_diff }} + pullRequestChunkSize: "3500" + logLevel: "DEBUG" +``` diff --git a/action.yml b/action.yml new file mode 100644 index 0000000..edf9c0b --- /dev/null +++ b/action.yml @@ -0,0 +1,71 @@ +name: 'LLM Code Review' +description: 'Let LLM model review your code' +author: 'Louis Le (luiyen)' +inputs: + githubToken: + description: 'Github token to access the repo' + required: true + apiKey: + description: 'Huggingface access token from [https://huggingface.co/settings/tokens](https://huggingface.co/settings/tokens)' + required: true + githubRepository: + description: "The GitHub repository to use for the 
action" + required: true + default: "${{ github.repository }}" + githubPullRequestNumber: + description: "The GitHub pull request number to use for the action" + required: true + default: "${{ github.event.pull_request.number }}" + gitCommitHash: + description: "The GitHub commit hash to use for the action" + required: true + default: "${{ github.event.pull_request.head.sha }}" + repoId: + description: "LLM model" + required: true + default: "microsoft/codereviewer" + maxNewTokens: + description: "The amount of new tokens to be generated, this does not include the input length it is a estimate of the size of generated text you want. Each new tokens slows down the request, so look for balance between response times and length of text generated." + required: false + default: "1024" + temperature: + description: "The temperature of the sampling operation. 1 means regular sampling, 0 means always take the highest score, 100.0 is getting closer to uniform probability." + required: false + default: "0.2" + topK: + description: "Integer to define the top tokens considered within the sample operation to create new text" + required: false + default: "50" + topP: + description: "Float to define the tokens that are within the sample operation of text generation. Add tokens in the sample for more probable to least probable until the sum of the probabilities is greater than top_p." 
+ required: false + default: "0.95" + pullRequestDiff: + description: "Pull request diff" + required: true + pullRequestChunkSize: + description: "Pull request chunk size" + required: false + default: "3500" + logLevel: + description: "Log level" + required: false + default: "INFO" +runs: + using: docker + image: Dockerfile + env: + API_KEY: ${{ inputs.apiKey }} + GITHUB_TOKEN: ${{ inputs.githubToken }} + GITHUB_REPOSITORY: ${{ inputs.githubRepository }} + GITHUB_PULL_REQUEST_NUMBER: ${{ inputs.githubPullRequestNumber }} + GIT_COMMIT_HASH: ${{ inputs.gitCommitHash }} + args: + - "--repo-id=${{ inputs.repoId }}" + - "--temperature=${{ inputs.temperature }}" + - "--max-new-tokens=${{ inputs.maxNewTokens }}" + - "--top-p=${{ inputs.topP }}" + - "--top-k=${{ inputs.topK }}" + - "--diff-chunk-size=${{ inputs.pullRequestChunkSize }}" + - "--diff=${{ inputs.pullRequestDiff }}" + - "--log-level=${{ inputs.logLevel }}" diff --git a/entrypoint.py b/entrypoint.py new file mode 100755 index 0000000..ddd5131 --- /dev/null +++ b/entrypoint.py @@ -0,0 +1,187 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
"""Entry point of the LLM code review GitHub Action.

The script splits a pull request diff into fixed-size chunks, asks a model
served through the HuggingFace Hub to review every chunk, summarizes the
per-chunk reviews when there is more than one, and posts the result as a
pull request review through the GitHub REST API.
"""
import json
import os
import sys
from typing import List, Tuple

import click
import requests
from langchain import HuggingFaceHub, LLMChain, PromptTemplate
from loguru import logger

# Environment variables that must be present (and non-empty) before any work starts.
REQUIRED_ENV_VARS = (
    "API_KEY",
    "GITHUB_TOKEN",
    "GITHUB_REPOSITORY",
    "GITHUB_PULL_REQUEST_NUMBER",
    "GIT_COMMIT_HASH",
)

# Upper bound for the GitHub API call so a stalled connection cannot hang the job.
GITHUB_API_TIMEOUT_SECONDS = 30

# Prompt used to review a single chunk of the diff.
CHUNK_REVIEW_TEMPLATE = """Provide a concise summary of the bug found in the code, describing its characteristics,
location, and potential effects on the overall functionality and performance of the application.
Present the potential issues and errors first, following by the most important findings, in your summary
Important: Include block of code / diff in the summary also the line number.
```
{question}
```
"""

# Prompt used to condense the per-chunk reviews into one summary.
SUMMARY_TEMPLATE = """Summarize the following file changed in a pull request submitted by a developer on GitHub,
focusing on major modifications, additions, deletions, and any significant updates within the files.
Do not include the file name in the summary and list the summary with bullet points.
Important: Include block of code / diff in the summary also the line number.
```
{question}
```
"""


def check_required_env_vars():
    """Verify that every required environment variable is set.

    An empty value is treated as missing: an input that was never supplied can
    reach the process as an empty string rather than as an unset variable.

    Raises:
        ValueError: if one or more variables are missing; the message lists them.
    """
    missing = [name for name in REQUIRED_ENV_VARS if not os.getenv(name)]
    if missing:
        raise ValueError(f"{', '.join(missing)} is not set")


def create_a_comment_to_pull_request(
        github_token: str,
        github_repository: str,
        pull_request_number: int,
        git_commit_hash: str,
        body: str) -> requests.Response:
    """Post ``body`` as a ``COMMENT`` review on a pull request.

    Args:
        github_token: Token used as the bearer credential.
        github_repository: Repository in ``owner/name`` form.
        pull_request_number: Number of the pull request to review.
        git_commit_hash: Commit the review is attached to.
        body: Text of the review.

    Returns:
        The HTTP response. The status is not checked here; callers decide how
        to react to a failure.

    Raises:
        requests.RequestException: on connection errors or when the request
            exceeds ``GITHUB_API_TIMEOUT_SECONDS``.
    """
    headers = {
        # JSON media type for the REST API; the ".patch" media type used
        # previously is meant for fetching diffs, not for creating reviews.
        "Accept": "application/vnd.github+json",
        "Content-Type": "application/json",
        "authorization": f"Bearer {github_token}",
    }
    payload = {
        "body": body,
        "commit_id": git_commit_hash,
        "event": "COMMENT",
    }
    url = f"https://api.github.com/repos/{github_repository}/pulls/{pull_request_number}/reviews"
    return requests.post(
        url,
        headers=headers,
        data=json.dumps(payload),
        timeout=GITHUB_API_TIMEOUT_SECONDS,
    )


def chunk_string(input_string: str, chunk_size: int) -> List[str]:
    """Split ``input_string`` into consecutive pieces of at most ``chunk_size`` characters.

    Args:
        input_string: Text to split.
        chunk_size: Maximum length of each piece; must be positive.

    Returns:
        The pieces in order; an empty list for an empty string.

    Raises:
        ValueError: if ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
    return [
        input_string[start:start + chunk_size]
        for start in range(0, len(input_string), chunk_size)
    ]


def _build_chain(llm, template: str) -> LLMChain:
    """Create a chain that fills ``template``'s ``question`` slot and runs it on ``llm``."""
    prompt = PromptTemplate(template=template, input_variables=["question"])
    return LLMChain(prompt=prompt, llm=llm)


def get_review(
        repo_id: str,
        diff: str,
        temperature: float,
        max_new_tokens: int,
        top_p: float,
        top_k: int,
        prompt_chunk_size: int
) -> Tuple[List[str], str]:
    """Review a diff chunk by chunk and summarize the result.

    Args:
        repo_id: Model repository id passed to ``HuggingFaceHub``.
        diff: Pull request diff to review.
        temperature: Sampling temperature forwarded to the model.
        max_new_tokens: Generation length limit forwarded to the model.
        top_p: ``top_p`` value forwarded to the model.
        top_k: ``top_k`` value forwarded to the model.
        prompt_chunk_size: Maximum number of characters of diff per request.

    Returns:
        ``(chunked_reviews, summarized_review)``. With a single chunk the
        summary is that chunk's review; with an empty diff the result is
        ``([], "")`` and the model is never called.
    """
    chunked_diff_list = chunk_string(input_string=diff, chunk_size=prompt_chunk_size)
    if not chunked_diff_list:
        # Nothing to review: avoid sending an empty prompt to the model.
        return [], ""

    llm = HuggingFaceHub(
        repo_id=repo_id,
        model_kwargs={"temperature": temperature,
                      "max_new_tokens": max_new_tokens,
                      "top_p": top_p,
                      "top_k": top_k},
        huggingfacehub_api_token=os.getenv("API_KEY")
    )

    # The prompt and chain do not depend on the chunk, so build them once.
    chunk_chain = _build_chain(llm, CHUNK_REVIEW_TEMPLATE)
    chunked_reviews = [chunk_chain.run(chunked_diff) for chunked_diff in chunked_diff_list]

    # A single review needs no further summarization.
    if len(chunked_reviews) == 1:
        return chunked_reviews, chunked_reviews[0]

    summary_chain = _build_chain(llm, SUMMARY_TEMPLATE)
    summarized_review = summary_chain.run("\n".join(chunked_reviews))
    return chunked_reviews, summarized_review


def format_review_comment(summarized_review: str, chunked_reviews: List[str]) -> str:
    """Build the text of the pull request comment.

    Args:
        summarized_review: Summary of all chunk reviews.
        chunked_reviews: Individual review of every chunk.

    Returns:
        The summary alone when there is exactly one chunk review; otherwise
        the summary followed by all chunk reviews joined by newlines.
    """
    if len(chunked_reviews) == 1:
        return summarized_review
    unioned_reviews = "\n".join(chunked_reviews)
    # NOTE(review): in the copy of the source this was written from, the
    # template contained only these two fields on separate lines; any wrapper
    # markup (for example a collapsible details/summary block) appears to have
    # been lost -- confirm against the repository before relying on the layout.
    return f"{summarized_review}\n\n{unioned_reviews}\n"


@click.command()
@click.option("--diff", type=click.STRING, required=True, help="Pull request diff")
@click.option("--diff-chunk-size", type=click.INT, required=False, default=3500,
              help="Pull request diff chunk size")
# NOTE(review): this default is not the one declared in action.yml
# ("microsoft/codereviewer"); kept unchanged for compatibility -- confirm intent.
@click.option("--repo-id", type=click.STRING, required=False, default="gpt-3.5-turbo", help="Model")
@click.option("--temperature", type=click.FLOAT, required=False, default=0.1, help="Temperature")
@click.option("--max-new-tokens", type=click.INT, required=False, default=250, help="Max new tokens")
@click.option("--top-p", type=click.FLOAT, required=False, default=1.0, help="Top p")
# The default is an int (it used to be the float 1.0 on an INT option).
@click.option("--top-k", type=click.INT, required=False, default=1, help="Top k")
@click.option("--log-level", type=click.STRING, required=False, default="INFO", help="Log level")
def main(
        diff: str,
        diff_chunk_size: int,
        repo_id: str,
        temperature: float,
        max_new_tokens: int,
        top_p: float,
        top_k: int,
        log_level: str
):
    """Review a pull request diff with an LLM and post the review to GitHub."""
    # logger.level() only looks a level up; the sink has to be replaced to
    # actually change which messages are emitted.
    logger.remove()
    logger.add(sys.stderr, level=log_level.upper())

    # Check if necessary environment variables are set or not
    check_required_env_vars()

    # Request a code review
    chunked_reviews, summarized_review = get_review(
        diff=diff,
        repo_id=repo_id,
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        top_k=top_k,
        prompt_chunk_size=diff_chunk_size
    )
    logger.debug("Summarized review: {}", summarized_review)
    logger.debug("Chunked reviews: {}", chunked_reviews)

    # Format reviews
    review_comment = format_review_comment(summarized_review=summarized_review,
                                           chunked_reviews=chunked_reviews)
    if not review_comment.strip():
        logger.warning("No review was generated; skipping the pull request comment")
        return

    # Create a comment to a pull request
    response = create_a_comment_to_pull_request(
        github_token=os.getenv("GITHUB_TOKEN"),
        github_repository=os.getenv("GITHUB_REPOSITORY"),
        pull_request_number=int(os.getenv("GITHUB_PULL_REQUEST_NUMBER")),
        git_commit_hash=os.getenv("GIT_COMMIT_HASH"),
        body=review_comment
    )
    if not response.ok:
        # Surface the failure instead of letting the job report success.
        logger.error("Failed to post the review: {} {}", response.status_code, response.text)
        response.raise_for_status()


if __name__ == "__main__":
    # pylint: disable=no-value-for-parameter
    main()
a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..6f76bfa --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,17 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +-r ./requirements.txt + +pytest==4.3.0 + +pre-commit>=3, <4 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..b526077 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,19 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +click==8.1.3 +loguru==0.7.0 + +requests==2.28.2 + +langchain==0.0.274 +huggingface_hub==0.16.4