diff --git a/.github/workflows/build-and-test.yaml b/.github/workflows/build-and-test.yaml index 960da61bf..bfe87dbdd 100644 --- a/.github/workflows/build-and-test.yaml +++ b/.github/workflows/build-and-test.yaml @@ -2,6 +2,11 @@ name: Build-and-test on: push: + branches: + - main + pull_request: + branches: + - main jobs: build-and-test: @@ -31,6 +36,6 @@ jobs: - id: test name: Test dev docker run: | - docker run --rm codiumai/pr-agent:test pytest -v + docker run --rm codiumai/pr-agent:test pytest -v tests/unittest diff --git a/.github/workflows/code_coverage.yaml b/.github/workflows/code_coverage.yaml new file mode 100644 index 000000000..136ed9a7c --- /dev/null +++ b/.github/workflows/code_coverage.yaml @@ -0,0 +1,54 @@ +name: Code-coverage + +on: + workflow_dispatch: + # push: + # branches: + # - main + pull_request: + branches: + - main + +jobs: + build-and-test: + runs-on: ubuntu-latest + + steps: + - id: checkout + uses: actions/checkout@v2 + + - id: dockerx + name: Setup Docker Buildx + uses: docker/setup-buildx-action@v2 + + - id: build + name: Build dev docker + uses: docker/build-push-action@v2 + with: + context: . + file: ./docker/Dockerfile + push: false + load: true + tags: codiumai/pr-agent:test + cache-from: type=gha,scope=dev + cache-to: type=gha,mode=max,scope=dev + target: test + + - id: code_cov + name: Test dev docker + run: | + docker run --name test_container codiumai/pr-agent:test pytest tests/unittest --cov=pr_agent --cov-report term --cov-report xml:coverage.xml + docker cp test_container:/app/coverage.xml coverage.xml + docker rm test_container + + + - name: Validate coverage report + run: | + if [ ! -f coverage.xml ]; then + echo "Coverage report not found" + exit 1 + fi + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v4.0.1 + with: + token: ${{ secrets.CODECOV_TOKEN }} \ No newline at end of file diff --git a/.github/workflows/docs-ci.yaml b/.github/workflows/docs-ci.yaml new file mode 100644 index 000000000..b260039ec --- /dev/null +++ b/.github/workflows/docs-ci.yaml @@ -0,0 +1,33 @@ +name: docs-ci +on: + push: + branches: + - main + - add-docs-portal + paths: + - docs/** +permissions: + contents: write +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Configure Git Credentials + run: | + git config user.name github-actions[bot] + git config user.email 41898282+github-actions[bot]@users.noreply.github.com + - uses: actions/setup-python@v5 + with: + python-version: 3.x + - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV + - uses: actions/cache@v4 + with: + key: mkdocs-material-${{ env.cache_id }} + path: .cache + restore-keys: | + mkdocs-material- + - run: pip install mkdocs-material + - run: pip install "mkdocs-material[imaging]" + - run: pip install mkdocs-glightbox + - run: mkdocs gh-deploy -f docs/mkdocs.yml --force diff --git a/.github/workflows/e2e_tests.yaml b/.github/workflows/e2e_tests.yaml new file mode 100644 index 000000000..e49bcea39 --- /dev/null +++ b/.github/workflows/e2e_tests.yaml @@ -0,0 +1,46 @@ +name: PR-Agent E2E tests + +on: + workflow_dispatch: +# schedule: +# - cron: '0 0 * * *' # This cron expression runs the workflow every night at midnight UTC + +jobs: + pr_agent_job: + runs-on: ubuntu-latest + name: PR-Agent E2E GitHub App Test + steps: + - name: Checkout repository + uses: actions/checkout@v2 + + - name: Setup Docker Buildx + uses: docker/setup-buildx-action@v2 + + - id: build + name: Build dev docker + uses: docker/build-push-action@v2 + with: + context: . 
+ file: ./docker/Dockerfile + push: false + load: true + tags: codiumai/pr-agent:test + cache-from: type=gha,scope=dev + cache-to: type=gha,mode=max,scope=dev + target: test + + - id: test1 + name: E2E test github app + run: | + docker run -e GITHUB.USER_TOKEN=${{ secrets.TOKEN_GITHUB }} --rm codiumai/pr-agent:test pytest -v tests/e2e_tests/test_github_app.py + + - id: test2 + name: E2E gitlab webhook + run: | + docker run -e gitlab.PERSONAL_ACCESS_TOKEN=${{ secrets.TOKEN_GITLAB }} --rm codiumai/pr-agent:test pytest -v tests/e2e_tests/test_gitlab_webhook.py + + + - id: test3 + name: E2E bitbucket app + run: | + docker run -e BITBUCKET.USERNAME=${{ secrets.BITBUCKET_USERNAME }} -e BITBUCKET.PASSWORD=${{ secrets.BITBUCKET_PASSWORD }} --rm codiumai/pr-agent:test pytest -v tests/e2e_tests/test_bitbucket_app.py \ No newline at end of file diff --git a/.github/workflows/pr-agent-review.yaml b/.github/workflows/pr-agent-review.yaml index 9dcf59b89..aa7a8fe01 100644 --- a/.github/workflows/pr-agent-review.yaml +++ b/.github/workflows/pr-agent-review.yaml @@ -5,8 +5,9 @@ name: PR-Agent on: - pull_request: - issue_comment: +# pull_request: +# issue_comment: + workflow_dispatch: permissions: issues: write @@ -24,4 +25,11 @@ jobs: OPENAI_KEY: ${{ secrets.OPENAI_KEY }} OPENAI_ORG: ${{ secrets.OPENAI_ORG }} # optional GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + PINECONE.API_KEY: ${{ secrets.PINECONE_API_KEY }} + PINECONE.ENVIRONMENT: ${{ secrets.PINECONE_ENVIRONMENT }} + GITHUB_ACTION_CONFIG.AUTO_DESCRIBE: true + GITHUB_ACTION_CONFIG.AUTO_REVIEW: true + GITHUB_ACTION_CONFIG.AUTO_IMPROVE: true + + diff --git a/.gitignore b/.gitignore index 3b50f4d87..9fcb91939 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,11 @@ .idea/ +.lsp/ +.vscode/ venv/ pr_agent/settings/.secrets.toml __pycache__ dist/ *.egg-info/ build/ -review.md +.DS_Store +docs/.cache/ \ No newline at end of file diff --git a/.pr_agent.toml b/.pr_agent.toml new file mode 100644 index 000000000..05e013be7 --- /dev/null +++ b/.pr_agent.toml @@ -0,0 +1,6 @@ +[pr_reviewer] +enable_review_labels_effort = true +enable_auto_approval = true + +[config] +model="claude-3-5-sonnet" diff --git a/CONFIGURATION.md b/CONFIGURATION.md deleted file mode 100644 index 9bbfd9109..000000000 --- a/CONFIGURATION.md +++ /dev/null @@ -1,57 +0,0 @@ -## Configuration - -The different tools and sub-tools used by CodiumAI PR-Agent are adjustable via the **[configuration file](pr_agent/settings/configuration.toml)** - -### Working from CLI -When running from source (CLI), your local configuration file will be initially used. - -Example for invoking the 'review' tools via the CLI: - -``` -python cli.py --pr-url= review -``` -In addition to general configurations, the 'review' tool will use parameters from the `[pr_reviewer]` section (every tool has a dedicated section in the configuration file). - -Note that you can print results locally, without publishing them, by setting in `configuration.toml`: - -``` -[config] -publish_output=true -verbosity_level=2 -``` -This is useful for debugging or experimenting with the different tools. - -### Working from pre-built repo (GitHub Action/GitHub App/Docker) -When running PR-Agent from a pre-built repo, the default configuration file will be loaded. - -To edit the configuration, you have two options: -1. Place a local configuration file in the root of your local repo. The local file will be used instead of the default one. -2. For online usage, just add `--config_path=` to you command, to edit a specific configuration value. 
-For example if you want to edit `pr_reviewer` configurations, you can run: -``` -/review --pr_reviewer.extra_instructions="..." --pr_reviewer.require_score_review=false ... -``` - -Any configuration value in `configuration.toml` file can be similarly edited. - -### General configuration parameters - -#### Changing a model -See [here](pr_agent/algo/__init__.py) for the list of available models. - -To use Llama2 model, for example, set: -``` -[config] -model = "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1" -[replicate] -key = ... -``` -(you can obtain a Llama2 key from [here](https://replicate.com/replicate/llama-2-70b-chat/api)) - -Also review the [AiHandler](pr_agent/algo/ai_handler.py) file for instruction how to set keys for other models. - -#### Extra instructions -All PR-Agent tools have a parameter called `extra_instructions`, that enables to add free-text extra instructions. Example usage: -``` -/update_changelog --pr_update_changelog.extra_instructions="Make sure to update also the version ..." -``` \ No newline at end of file diff --git a/Dockerfile.github_action b/Dockerfile.github_action index d6763f0a8..dd1ae7501 100644 --- a/Dockerfile.github_action +++ b/Dockerfile.github_action @@ -2,7 +2,8 @@ FROM python:3.10 as base WORKDIR /app ADD pyproject.toml . -RUN pip install . && rm pyproject.toml +ADD requirements.txt . +RUN pip install . && rm pyproject.toml requirements.txt ENV PYTHONPATH=/app ADD pr_agent pr_agent ADD github_action/entrypoint.sh / diff --git a/INSTALL.md b/INSTALL.md deleted file mode 100644 index 4589e30fd..000000000 --- a/INSTALL.md +++ /dev/null @@ -1,219 +0,0 @@ - -## Installation - ---- - -#### Method 1: Use Docker image (no installation required) - -To request a review for a PR, or ask a question about a PR, you can run directly from the Docker image. Here's how: - -1. To request a review for a PR, run the following command: - -``` -docker run --rm -it -e OPENAI.KEY= -e GITHUB.USER_TOKEN= codiumai/pr-agent --pr_url review -``` - -2. To ask a question about a PR, run the following command: - -``` -docker run --rm -it -e OPENAI.KEY= -e GITHUB.USER_TOKEN= codiumai/pr-agent --pr_url ask "" -``` - -Possible questions you can ask include: - -- What is the main theme of this PR? -- Is the PR ready for merge? -- What are the main changes in this PR? -- Should this PR be split into smaller parts? -- Can you compose a rhymed song about this PR? - ---- - -#### Method 2: Run as a GitHub Action - -You can use our pre-built Github Action Docker image to run PR-Agent as a Github Action. - -1. Add the following file to your repository under `.github/workflows/pr_agent.yml`: - -```yaml -on: - pull_request: - issue_comment: -jobs: - pr_agent_job: - runs-on: ubuntu-latest - name: Run pr agent on every pull request, respond to user comments - steps: - - name: PR Agent action step - id: pragent - uses: Codium-ai/pr-agent@main - env: - OPENAI_KEY: ${{ secrets.OPENAI_KEY }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} -``` - -2. Add the following secret to your repository under `Settings > Secrets`: - -``` -OPENAI_KEY: -``` - -The GITHUB_TOKEN secret is automatically created by GitHub. - -3. Merge this change to your main branch. -When you open your next PR, you should see a comment from `github-actions` bot with a review of your PR, and instructions on how to use the rest of the tools. - -4. 
You may configure PR-Agent by adding environment variables under the env section corresponding to any configurable property in the [configuration](./CONFIGURATION.md) file. Some examples: -```yaml - env: - # ... previous environment values - OPENAI.ORG: "" - PR_REVIEWER.REQUIRE_TESTS_REVIEW: "false" # Disable tests review - PR_CODE_SUGGESTIONS.NUM_CODE_SUGGESTIONS: 6 # Increase number of code suggestions -``` - ---- - -#### Method 3: Run from source - -1. Clone this repository: - -``` -git clone https://github.com/Codium-ai/pr-agent.git -``` - -2. Install the requirements in your favorite virtual environment: - -``` -pip install -r requirements.txt -``` - -3. Copy the secrets template file and fill in your OpenAI key and your GitHub user token: - -``` -cp pr_agent/settings/.secrets_template.toml pr_agent/settings/.secrets.toml -# Edit .secrets.toml file -``` - -4. Add the pr_agent folder to your PYTHONPATH, then run the cli.py script: - -``` -export PYTHONPATH=[$PYTHONPATH:] -python pr_agent/cli.py --pr_url review -python pr_agent/cli.py --pr_url ask -python pr_agent/cli.py --pr_url describe -python pr_agent/cli.py --pr_url improve -``` - ---- - -#### Method 4: Run as a polling server -Request reviews by tagging your Github user on a PR - -Follow steps 1-3 of method 2. -Run the following command to start the server: - -``` -python pr_agent/servers/github_polling.py -``` - ---- - -#### Method 5: Run as a GitHub App -Allowing you to automate the review process on your private or public repositories. - -1. Create a GitHub App from the [Github Developer Portal](https://docs.github.com/en/developers/apps/creating-a-github-app). - - - Set the following permissions: - - Pull requests: Read & write - - Issue comment: Read & write - - Metadata: Read-only - - Contents: Read-only - - Set the following events: - - Issue comment - - Pull request - -2. Generate a random secret for your app, and save it for later. For example, you can use: - -``` -WEBHOOK_SECRET=$(python -c "import secrets; print(secrets.token_hex(10))") -``` - -3. Acquire the following pieces of information from your app's settings page: - - - App private key (click "Generate a private key" and save the file) - - App ID - -4. Clone this repository: - -``` -git clone https://github.com/Codium-ai/pr-agent.git -``` - -5. Copy the secrets template file and fill in the following: - ``` - cp pr_agent/settings/.secrets_template.toml pr_agent/settings/.secrets.toml - # Edit .secrets.toml file - ``` - - Your OpenAI key. - - Copy your app's private key to the private_key field. - - Copy your app's ID to the app_id field. - - Copy your app's webhook secret to the webhook_secret field. - - Set deployment_type to 'app' in [configuration.toml](./pr_agent/settings/configuration.toml) - -> The .secrets.toml file is not copied to the Docker image by default, and is only used for local development. -> If you want to use the .secrets.toml file in your Docker image, you can add remove it from the .dockerignore file. -> In most production environments, you would inject the secrets file as environment variables or as mounted volumes. -> For example, in order to inject a secrets file as a volume in a Kubernetes environment you can update your pod spec to include the following, -> assuming you have a secret named `pr-agent-settings` with a key named `.secrets.toml`: -``` - volumes: - - name: settings-volume - secret: - secretName: pr-agent-settings -// ... - containers: -// ... 
- volumeMounts: - - mountPath: /app/pr_agent/settings_prod - name: settings-volume -``` - -> Another option is to set the secrets as environment variables in your deployment environment, for example `OPENAI.KEY` and `GITHUB.USER_TOKEN`. - -6. Build a Docker image for the app and optionally push it to a Docker repository. We'll use Dockerhub as an example: - -``` -docker build . -t codiumai/pr-agent:github_app --target github_app -f docker/Dockerfile -docker push codiumai/pr-agent:github_app # Push to your Docker repository -``` - -7. Host the app using a server, serverless function, or container environment. Alternatively, for development and - debugging, you may use tools like smee.io to forward webhooks to your local machine. - You can check [Deploy as a Lambda Function](#deploy-as-a-lambda-function) - -8. Go back to your app's settings, and set the following: - - - Webhook URL: The URL of your app's server or the URL of the smee.io channel. - - Webhook secret: The secret you generated earlier. - -9. Install the app by navigating to the "Install App" tab and selecting your desired repositories. - ---- - -#### Deploy as a Lambda Function - -1. Follow steps 1-5 of [Method 5](#method-5-run-as-a-github-app). -2. Build a docker image that can be used as a lambda function - ```shell - docker buildx build --platform=linux/amd64 . -t codiumai/pr-agent:serverless -f docker/Dockerfile.lambda - ``` -3. Push image to ECR - ```shell - docker tag codiumai/pr-agent:serverless .dkr.ecr..amazonaws.com/codiumai/pr-agent:serverless - docker push .dkr.ecr..amazonaws.com/codiumai/pr-agent:serverless - ``` -4. Create a lambda function that uses the uploaded image. Set the lambda timeout to be at least 3m. -5. Configure the lambda function to have a Function URL. -6. Go back to steps 8-9 of [Method 5](#method-5-run-as-a-github-app) with the function url as your Webhook URL. - The Webhook URL would look like `https:///api/v1/github_webhooks` diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 000000000..ccb173366 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,2 @@ +recursive-include pr_agent *.toml +recursive-exclude pr_agent *.secrets.toml \ No newline at end of file diff --git a/README.md b/README.md index 0ec00ec21..e29a7cafb 100644 --- a/README.md +++ b/README.md @@ -2,180 +2,315 @@
- -
-Making pull requests less painful with an AI agent + + + + + logo + + +
+CodiumAI PR-Agent aims to help you review and handle pull requests more efficiently by providing AI feedback and suggestions
[![GitHub license](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://github.com/Codium-ai/pr-agent/blob/main/LICENSE) +[![Static Badge](https://img.shields.io/badge/Chrome-Extension-violet)](https://chromewebstore.google.com/detail/pr-agent-chrome-extension/ephlnjeghhogofkifjloamocljapahnl) +[![Static Badge](https://img.shields.io/badge/Code-Benchmark-blue)](https://pr-agent-docs.codium.ai/finetuning_benchmark/) [![Discord](https://badgen.net/badge/icon/discord?icon=discord&label&color=purple)](https://discord.com/channels/1057273017547378788/1126104260430528613) +[![Twitter](https://img.shields.io/twitter/follow/codiumai)](https://twitter.com/codiumai) +[![Cheat Sheet](https://img.shields.io/badge/Cheat-Sheet-red)](https://www.codium.ai/images/pr_agent/cheat_sheet.pdf) GitHub -
-CodiumAI `PR-Agent` is an open-source tool aiming to help developers review pull requests faster and more efficiently. It automatically analyzes the pull request and can provide several types of feedback: +### [Documentation](https://pr-agent-docs.codium.ai/) +- See the [Installation Guide](https://pr-agent-docs.codium.ai/installation/) for instructions on installing PR-Agent on different platforms. + +- See the [Usage Guide](https://pr-agent-docs.codium.ai/usage-guide/) for instructions on running PR-Agent tools via different interfaces, such as CLI, PR Comments, or by automatically triggering them when a new PR is opened. + +- See the [Tools Guide](https://pr-agent-docs.codium.ai/tools/) for a detailed description of the different tools, and the available configurations for each tool. + + +## Table of Contents +- [News and Updates](#news-and-updates) +- [Overview](#overview) +- [Example results](#example-results) +- [Try it now](#try-it-now) +- [PR-Agent Pro 💎](https://pr-agent-docs.codium.ai/overview/pr_agent_pro/) +- [How it works](#how-it-works) +- [Why use PR-Agent?](#why-use-pr-agent) + +## News and Updates + +### September 21, 2024 +Need help with PR-Agent? New feature - simply comment `/help "your question"` in a pull request, and PR-Agent will provide you with the [relevant documentation](https://github.com/Codium-ai/pr-agent/pull/1241#issuecomment-2365259334). + + + + +### September 12, 2024 +[Dynamic context](https://pr-agent-docs.codium.ai/core-abilities/dynamic_context/) is now the default option for context extension. +This feature enables PR-Agent to dynamically adjust the relevant context for each code hunk, while avoiding overflowing the model with too much information. + +### September 3, 2024 + +New version of PR-Agent, v0.24, was released. See the [release notes](https://github.com/Codium-ai/pr-agent/releases/tag/v0.24) for more information. + +### August 26, 2024 + +New version of [PR Agent Chrome Extension](https://chromewebstore.google.com/detail/pr-agent-chrome-extension/ephlnjeghhogofkifjloamocljapahnl) was released, with full support for context-aware **PR Chat**. This novel feature is free to use for any open-source repository. See more details [here](https://pr-agent-docs.codium.ai/chrome-extension/#pr-chat). + + + + + + +### August 11, 2024 +Increased PR context size for improved results, and enabled [asymmetric context](https://github.com/Codium-ai/pr-agent/pull/1114/files#diff-9290a3ad9a86690b31f0450b77acd37ef1914b41fabc8a08682d4da433a77f90R69-R70) + +### August 10, 2024 +Added support for [Azure DevOps pipeline](https://pr-agent-docs.codium.ai/installation/azure/) - you can now easily run PR-Agent as an Azure DevOps pipeline, without needing to set up your own server. -**Auto-Description**: Automatically generating PR description - title, type, summary, code walkthrough and PR labels. + +### August 5, 2024 +Added support for [GitLab pipeline](https://pr-agent-docs.codium.ai/installation/gitlab/#run-as-a-gitlab-pipeline) - you can now easily run PR-Agent as a GitLab pipeline, without needing to set up your own server. + +### July 28, 2024 + +(1) Improved support for Bitbucket Server - [auto commands](https://github.com/Codium-ai/pr-agent/pull/1059) and [direct links](https://github.com/Codium-ai/pr-agent/pull/1061) + +(2) Custom models are now [supported](https://pr-agent-docs.codium.ai/usage-guide/changing_a_model/#custom-models) + + + +## Overview +
+ +Supported commands per platform: + +| | | GitHub | Gitlab | Bitbucket | Azure DevOps | +|-------|---------------------------------------------------------------------------------------------------------|:--------------------:|:--------------------:|:--------------------:|:------------:| +| TOOLS | Review | โœ… | โœ… | โœ… | โœ… | +| | โฎ‘ Incremental | โœ… | | | | +| | โฎ‘ [SOC2 Compliance](https://pr-agent-docs.codium.ai/tools/review/#soc2-ticket-compliance) ๐Ÿ’Ž | โœ… | โœ… | โœ… | | +| | Describe | โœ… | โœ… | โœ… | โœ… | +| | โฎ‘ [Inline File Summary](https://pr-agent-docs.codium.ai/tools/describe#inline-file-summary) ๐Ÿ’Ž | โœ… | | | | +| | Improve | โœ… | โœ… | โœ… | โœ… | +| | โฎ‘ Extended | โœ… | โœ… | โœ… | โœ… | +| | Ask | โœ… | โœ… | โœ… | โœ… | +| | โฎ‘ [Ask on code lines](https://pr-agent-docs.codium.ai/tools/ask#ask-lines) | โœ… | โœ… | | | +| | [Custom Prompt](https://pr-agent-docs.codium.ai/tools/custom_prompt/) ๐Ÿ’Ž | โœ… | โœ… | โœ… | | +| | [Test](https://pr-agent-docs.codium.ai/tools/test/) ๐Ÿ’Ž | โœ… | โœ… | | | +| | Reflect and Review | โœ… | โœ… | โœ… | โœ… | +| | Update CHANGELOG.md | โœ… | โœ… | โœ… | โœ… | +| | Find Similar Issue | โœ… | | | | +| | [Add PR Documentation](https://pr-agent-docs.codium.ai/tools/documentation/) ๐Ÿ’Ž | โœ… | โœ… | | | +| | [Custom Labels](https://pr-agent-docs.codium.ai/tools/custom_labels/) ๐Ÿ’Ž | โœ… | โœ… | | | +| | [Analyze](https://pr-agent-docs.codium.ai/tools/analyze/) ๐Ÿ’Ž | โœ… | โœ… | | | +| | [CI Feedback](https://pr-agent-docs.codium.ai/tools/ci_feedback/) ๐Ÿ’Ž | โœ… | | | | +| | [Similar Code](https://pr-agent-docs.codium.ai/tools/similar_code/) ๐Ÿ’Ž | โœ… | | | | +| | | | | | | +| USAGE | CLI | โœ… | โœ… | โœ… | โœ… | +| | App / webhook | โœ… | โœ… | โœ… | โœ… | +| | Tagging bot | โœ… | | | | +| | Actions | โœ… |โœ…| โœ… |โœ…| +| | | | | | | +| CORE | PR compression | โœ… | โœ… | โœ… | โœ… | +| | Repo language prioritization | โœ… | โœ… | โœ… | โœ… | +| | Adaptive and token-aware file patch fitting | โœ… | โœ… | โœ… | โœ… | +| | Multiple models support | โœ… | โœ… | โœ… | โœ… | +| | [Static code analysis](https://pr-agent-docs.codium.ai/core-abilities/#static-code-analysis) ๐Ÿ’Ž | โœ… | โœ… | โœ… | | +| | [Global and wiki configurations](https://pr-agent-docs.codium.ai/usage-guide/configuration_options/) ๐Ÿ’Ž | โœ… | โœ… | โœ… | | +| | [PR interactive actions](https://www.codium.ai/images/pr_agent/pr-actions.mp4) ๐Ÿ’Ž | โœ… | โœ… | | | +- ๐Ÿ’Ž means this feature is available only in [PR-Agent Pro](https://www.codium.ai/pricing/) + +[//]: # (- Support for additional git providers is described in [here](./docs/Full_environments.md)) +___ + +โ€ฃ **Auto Description ([`/describe`](https://pr-agent-docs.codium.ai/tools/describe/))**: Automatically generating PR description - title, type, summary, code walkthrough and labels. +\ +โ€ฃ **Auto Review ([`/review`](https://pr-agent-docs.codium.ai/tools/review/))**: Adjustable feedback about the PR, possible issues, security concerns, review effort and more. \ -**PR Review**: Adjustable feedback about the PR main theme, type, relevant tests, security issues, focus, score, and various suggestions for the PR content. +โ€ฃ **Code Suggestions ([`/improve`](https://pr-agent-docs.codium.ai/tools/improve/))**: Code suggestions for improving the PR. \ -**Question Answering**: Answering free-text questions about the PR. +โ€ฃ **Question Answering ([`/ask ...`](https://pr-agent-docs.codium.ai/tools/ask/))**: Answering free-text questions about the PR. 
\ -**Code Suggestions**: Committable code suggestions for improving the PR. +โ€ฃ **Update Changelog ([`/update_changelog`](https://pr-agent-docs.codium.ai/tools/update_changelog/))**: Automatically updating the CHANGELOG.md file with the PR changes. \ -**Update Changelog**: Automatically updating the CHANGELOG.md file with the PR changes. +โ€ฃ **Find Similar Issue ([`/similar_issue`](https://pr-agent-docs.codium.ai/tools/similar_issues/))**: Automatically retrieves and presents similar issues. +\ +โ€ฃ **Add Documentation ๐Ÿ’Ž ([`/add_docs`](https://pr-agent-docs.codium.ai/tools/documentation/))**: Generates documentation to methods/functions/classes that changed in the PR. +\ +โ€ฃ **Generate Custom Labels ๐Ÿ’Ž ([`/generate_labels`](https://pr-agent-docs.codium.ai/tools/custom_labels/))**: Generates custom labels for the PR, based on specific guidelines defined by the user. +\ +โ€ฃ **Analyze ๐Ÿ’Ž ([`/analyze`](https://pr-agent-docs.codium.ai/tools/analyze/))**: Identify code components that changed in the PR, and enables to interactively generate tests, docs, and code suggestions for each component. +\ +โ€ฃ **Custom Prompt ๐Ÿ’Ž ([`/custom_prompt`](https://pr-agent-docs.codium.ai/tools/custom_prompt/))**: Automatically generates custom suggestions for improving the PR code, based on specific guidelines defined by the user. +\ +โ€ฃ **Generate Tests ๐Ÿ’Ž ([`/test component_name`](https://pr-agent-docs.codium.ai/tools/test/))**: Generates unit tests for a selected component, based on the PR code changes. +\ +โ€ฃ **CI Feedback ๐Ÿ’Ž ([`/checks ci_job`](https://pr-agent-docs.codium.ai/tools/ci_feedback/))**: Automatically generates feedback and analysis for a failed CI job. +\ +โ€ฃ **Similar Code ๐Ÿ’Ž ([`/find_similar_component`](https://pr-agent-docs.codium.ai/tools/similar_code/))**: Retrieves the most similar code components from inside the organization's codebase, or from open-source code. +___ -

Example results:

-
-

/describe:

-
-

- -

+## Example results
-

/review:

+

/describe

- +

-

/reflect_and_review:

+
+ +

/review

- + + +

-

/ask:

+
+ +

/improve

- + + +

-

/improve:

+
+ +

/generate_labels

- +

+ +[//]: # (

/reflect_and_review:

) + +[//]: # (
) + +[//]: # (

) + +[//]: # () + +[//]: # (

) + +[//]: # (
) + +[//]: # (

/ask:

) + +[//]: # (
) + +[//]: # (

) + +[//]: # () + +[//]: # (

) + +[//]: # (
) + +[//]: # (

/improve:

) + +[//]: # (
) + +[//]: # (

) + +[//]: # () + +[//]: # (

) + +[//]: # (
)
-- [Overview](#overview) -- [Try it now](#try-it-now) -- [Installation](#installation) -- [Configuration](./CONFIGURATION.md) -- [How it works](#how-it-works) -- [Why use PR-Agent](#why-use-pr-agent) -- [Roadmap](#roadmap) -- [Similar projects](#similar-projects)
+
-## Overview -`PR-Agent` offers extensive pull request functionalities across various git providers: -| | | GitHub | Gitlab | Bitbucket | -|-------|---------------------------------------------|:------:|:------:|:---------:| -| TOOLS | Review | :white_check_mark: | :white_check_mark: | :white_check_mark: | -| | โฎ‘ Inline review | :white_check_mark: | :white_check_mark: | | -| | Ask | :white_check_mark: | :white_check_mark: | | -| | Auto-Description | :white_check_mark: | :white_check_mark: | | -| | Improve Code | :white_check_mark: | :white_check_mark: | | -| | Reflect and Review | :white_check_mark: | | | -| | Update CHANGELOG.md | :white_check_mark: | | | -| | | | | | -| USAGE | CLI | :white_check_mark: | :white_check_mark: | :white_check_mark: | -| | App / webhook | :white_check_mark: | :white_check_mark: | | -| | Tagging bot | :white_check_mark: | | | -| | Actions | :white_check_mark: | | | -| | | | | | -| CORE | PR compression | :white_check_mark: | :white_check_mark: | :white_check_mark: | -| | Repo language prioritization | :white_check_mark: | :white_check_mark: | :white_check_mark: | -| | Adaptive and token-aware
file patch fitting | :white_check_mark: | :white_check_mark: | :white_check_mark: | -| | Multiple models support | :white_check_mark: | :white_check_mark: | :white_check_mark: | -| | Incremental PR Review | :white_check_mark: | | | - -Examples for invoking the different tools via the CLI: -- **Review**: python cli.py --pr_url= review -- **Describe**: python cli.py --pr_url= describe -- **Improve**: python cli.py --pr_url= improve -- **Ask**: python cli.py --pr_url= ask "Write me a poem about this PR" -- **Reflect**: python cli.py --pr_url= reflect -- **Update Changelog**: python cli.py --pr_url= update_changelog - -"" is the url of the relevant PR (for example: https://github.com/Codium-ai/pr-agent/pull/50). - -In the [configuration](./CONFIGURATION.md) file you can select your git provider (GitHub, Gitlab, Bitbucket), and further configure the different tools. - ## Try it now -Try GPT-4 powered PR-Agent on your public GitHub repository for free. Just mention `@CodiumAI-Agent` and add the desired command in any PR comment! The agent will generate a response based on your command. +Try the GPT-4 powered PR-Agent instantly on _your public GitHub repository_. Just mention `@CodiumAI-Agent` and add the desired command in any PR comment. The agent will generate a response based on your command. +For example, add a comment to any pull request with the following text: +``` +@CodiumAI-Agent /review +``` +and the agent will respond with a review of your PR. -![Review generation process](https://www.codium.ai/images/demo-2.gif) +Note that this is a promotional bot, suitable only for initial experimentation. +It does not have 'edit' access to your repo, for example, so it cannot update the PR description or add labels (`@CodiumAI-Agent /describe` will publish PR description as a comment). In addition, the bot cannot be used on private repositories, as it does not have access to the files there. -To set up your own PR-Agent, see the [Installation](#installation) section ---- +![Review generation process](https://www.codium.ai/images/demo-2.gif) + -## Installation +To set up your own PR-Agent, see the [Installation](https://pr-agent-docs.codium.ai/installation/) section below. +Note that when you set your own PR-Agent or use CodiumAI hosted PR-Agent, there is no need to mention `@CodiumAI-Agent ...`. Instead, directly start with the command, e.g., `/ask ...`. -To get started with PR-Agent quickly, you first need to acquire two tokens: +--- -1. An OpenAI key from [here](https://platform.openai.com/), with access to GPT-4. -2. A GitHub personal access token (classic) with the repo scope. -There are several ways to use PR-Agent: +## PR-Agent Pro ๐Ÿ’Ž +[PR-Agent Pro](https://www.codium.ai/pricing/) is a hosted version of PR-Agent, provided by CodiumAI. It is available for a monthly fee, and provides the following benefits: +1. **Fully managed** - We take care of everything for you - hosting, models, regular updates, and more. Installation is as simple as signing up and adding the PR-Agent app to your GitHub\GitLab\BitBucket repo. +2. **Improved privacy** - No data will be stored or used to train models. PR-Agent Pro will employ zero data retention, and will use an OpenAI account with zero data retention. +3. **Improved support** - PR-Agent Pro users will receive priority support, and will be able to request new features and capabilities. +4. 
**Extra features** -In addition to the benefits listed above, PR-Agent Pro will emphasize more customization, and the usage of static code analysis, in addition to LLM logic, to improve results. +See [here](https://pr-agent-docs.codium.ai/#pr-agent-pro) for a list of features available in PR-Agent Pro. -- [Method 1: Use Docker image (no installation required)](INSTALL.md#method-1-use-docker-image-no-installation-required) -- [Method 2: Run as a GitHub Action](INSTALL.md#method-2-run-as-a-github-action) -- [Method 3: Run from source](INSTALL.md#method-3-run-from-source) -- [Method 4: Run as a polling server](INSTALL.md#method-4-run-as-a-polling-server) - - Request reviews by tagging your GitHub user on a PR -- [Method 5: Run as a GitHub App](INSTALL.md#method-5-run-as-a-github-app) - - Allowing you to automate the review process on your private or public repositories ## How it works The following diagram illustrates PR-Agent tools and their flow: -![PR-Agent Tools](https://www.codium.ai/wp-content/uploads/2023/07/codiumai-diagram-v4.jpg) +![PR-Agent Tools](https://codium.ai/images/pr_agent/diagram-v0.9.png) -Check out the [PR Compression strategy](./PR_COMPRESSION.md) page for more details on how we convert a code diff to a manageable LLM prompt +Check out the [PR Compression strategy](https://pr-agent-docs.codium.ai/core-abilities/#pr-compression-strategy) page for more details on how we convert a code diff to a manageable LLM prompt ## Why use PR-Agent? -A reasonable question that can be asked is: `"Why use PR-Agent? What make it stand out from existing tools?"` +A reasonable question that can be asked is: `"Why use PR-Agent? What makes it stand out from existing tools?"` Here are some advantages of PR-Agent: - We emphasize **real-life practical usage**. Each tool (review, improve, ask, ...) has a single GPT-4 call, no more. We feel that this is critical for realistic team usage - obtaining an answer quickly (~30 seconds) and affordably. -- Our [PR Compression strategy](./PR_COMPRESSION.md) is a core ability that enables to effectively tackle both short and long PRs. -- Our JSON prompting strategy enables to have **modular, customizable tools**. For example, the '/review' tool categories can be controlled via the [configuration](./CONFIGURATION.md) file. Adding additional categories is easy and accessible. +- Our [PR Compression strategy](https://pr-agent-docs.codium.ai/core-abilities/#pr-compression-strategy) is a core ability that enables to effectively tackle both short and long PRs. +- Our JSON prompting strategy enables to have **modular, customizable tools**. For example, the '/review' tool categories can be controlled via the [configuration](pr_agent/settings/configuration.toml) file. Adding additional categories is easy and accessible. - We support **multiple git providers** (GitHub, Gitlab, Bitbucket), **multiple ways** to use the tool (CLI, GitHub Action, GitHub App, Docker, ...), and **multiple models** (GPT-4, GPT-3.5, Anthropic, Cohere, Llama2). -- We are open-source, and welcome contributions from the community. - - -## Roadmap - -- [x] Support additional models, as a replacement for OpenAI (see [here](https://github.com/Codium-ai/pr-agent/pull/172)) -- [ ] Develop additional logic for handling large PRs -- [ ] Add additional context to the prompt. For example, repo (or relevant files) summarization, with tools such a [ctags](https://github.com/universal-ctags/ctags) -- [ ] Adding more tools. 
Possible directions: - - [x] PR description - - [x] Inline code suggestions - - [x] Reflect and review - - [x] Rank the PR (see [here](https://github.com/Codium-ai/pr-agent/pull/89)) - - [ ] Enforcing CONTRIBUTING.md guidelines - - [ ] Performance (are there any performance issues) - - [ ] Documentation (is the PR properly documented) - - [ ] ... - -## Similar Projects - -- [CodiumAI - Meaningful tests for busy devs](https://github.com/Codium-ai/codiumai-vscode-release) -- [Aider - GPT powered coding in your terminal](https://github.com/paul-gauthier/aider) -- [openai-pr-reviewer](https://github.com/coderabbitai/openai-pr-reviewer) -- [CodeReview BOT](https://github.com/anc95/ChatGPT-CodeReview) -- [AI-Maintainer](https://github.com/merwanehamadi/AI-Maintainer) + + +## Data privacy + +### Self-hosted PR-Agent + +- If you host PR-Agent with your OpenAI API key, it is between you and OpenAI. You can read their API data privacy policy here: +https://openai.com/enterprise-privacy + +### CodiumAI-hosted PR-Agent Pro ๐Ÿ’Ž + +- When using PR-Agent Pro ๐Ÿ’Ž, hosted by CodiumAI, we will not store any of your data, nor will we use it for training. You will also benefit from an OpenAI account with zero data retention. + +- For certain clients, CodiumAI-hosted PR-Agent Pro will use CodiumAIโ€™s proprietary models โ€” if this is the case, you will be notified. + +- No passive collection of Code and Pull Requestsโ€™ data โ€” PR-Agent will be active only when you invoke it, and it will then extract and analyze only data relevant to the executed command and queried pull request. + +### PR-Agent Chrome extension + +- The [PR-Agent Chrome extension](https://chromewebstore.google.com/detail/pr-agent-chrome-extension/ephlnjeghhogofkifjloamocljapahnl) serves solely to modify the visual appearance of a GitHub PR screen. It does not transmit any user's repo or pull request code. Code is only sent for processing when a user submits a GitHub comment that activates a PR-Agent tool, in accordance with the standard privacy policy of PR-Agent. 
+ +## Links + +[![Join our Discord community](https://raw.githubusercontent.com/Codium-ai/codiumai-vscode-release/main/media/docs/Joincommunity.png)](https://discord.gg/kG35uSHDBc) + +- Discord community: https://discord.gg/kG35uSHDBc +- CodiumAI site: https://codium.ai +- Blog: https://www.codium.ai/blog/ +- Troubleshooting: https://www.codium.ai/blog/technical-faq-and-troubleshooting/ +- Support: support@codium.ai diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md new file mode 100644 index 000000000..3dab48f92 --- /dev/null +++ b/RELEASE_NOTES.md @@ -0,0 +1,103 @@ +## [Version 0.11] - 2023-12-07 +- codiumai/pr-agent:0.11 +- codiumai/pr-agent:0.11-github_app +- codiumai/pr-agent:0.11-bitbucket-app +- codiumai/pr-agent:0.11-gitlab_webhook +- codiumai/pr-agent:0.11-github_polling +- codiumai/pr-agent:0.11-github_action + +### Added::Algo +- New section in `/describe` tool - [PR changes walkthrough](https://github.com/Codium-ai/pr-agent/pull/509) +- Improving PR Agent [prompts](https://github.com/Codium-ai/pr-agent/pull/501) +- Persistent tools (`/review`, `/describe`) now send an [update message](https://github.com/Codium-ai/pr-agent/pull/499) after finishing +- Add Amazon Bedrock [support](https://github.com/Codium-ai/pr-agent/pull/483) + +### Fixed +- Update [dependencies](https://github.com/Codium-ai/pr-agent/pull/503) in requirements.txt for Python 3.12 + + +## [Version 0.10] - 2023-11-15 +- codiumai/pr-agent:0.10 +- codiumai/pr-agent:0.10-github_app +- codiumai/pr-agent:0.10-bitbucket-app +- codiumai/pr-agent:0.10-gitlab_webhook +- codiumai/pr-agent:0.10-github_polling +- codiumai/pr-agent:0.10-github_action + +### Added::Algo +- Review tool now works with [persistent comments](https://github.com/Codium-ai/pr-agent/pull/451) by default +- Bitbucket now publishes review suggestions with [code links](https://github.com/Codium-ai/pr-agent/pull/428) +- Enabling to limit [max number of tokens](https://github.com/Codium-ai/pr-agent/pull/437/files) +- Support ['gpt-4-1106-preview'](https://github.com/Codium-ai/pr-agent/pull/437/files) model +- Support for Google's [Vertex AI](https://github.com/Codium-ai/pr-agent/pull/436) +- Implementing [thresholds](https://github.com/Codium-ai/pr-agent/pull/423) for incremental PR reviews +- Decoupled custom labels from [PR type](https://github.com/Codium-ai/pr-agent/pull/431) + +### Fixed +- Fixed bug in [parsing quotes](https://github.com/Codium-ai/pr-agent/pull/446) in CLI +- Preserve [user-added labels](https://github.com/Codium-ai/pr-agent/pull/433) in pull requests +- Bug fixes in GitLab and BitBucket + +## [Version 0.9] - 2023-10-29 +- codiumai/pr-agent:0.9 +- codiumai/pr-agent:0.9-github_app +- codiumai/pr-agent:0.9-bitbucket-app +- codiumai/pr-agent:0.9-gitlab_webhook +- codiumai/pr-agent:0.9-github_polling +- codiumai/pr-agent:0.9-github_action + +### Added::Algo +- New tool - [generate_labels](https://github.com/Codium-ai/pr-agent/blob/main/docs/GENERATE_CUSTOM_LABELS.md) +- New ability to use [customize labels](https://github.com/Codium-ai/pr-agent/blob/main/docs/GENERATE_CUSTOM_LABELS.md#how-to-enable-custom-labels) on the `review` and `describe` tools. +- New tool - [add_docs](https://github.com/Codium-ai/pr-agent/blob/main/docs/ADD_DOCUMENTATION.md) +- GitHub Action: Can now use a `.pr_agent.toml` file to control configuration parameters (see [Usage Guide](./Usage.md#working-with-github-action)). 
+- GitHub App: Added ability to trigger tools on [push events](https://github.com/Codium-ai/pr-agent/blob/main/Usage.md#github-app-automatic-tools-for-new-code-pr-push) +- Support custom domain URLs for Azure devops integration (see [link](https://github.com/Codium-ai/pr-agent/pull/381)). +- PR Description default mode is now in [bullet points](https://github.com/Codium-ai/pr-agent/blob/main/pr_agent/settings/configuration.toml#L35). + +### Added::Documentation +Significant documentation updates (see [Installation Guide](https://github.com/Codium-ai/pr-agent/blob/main/INSTALL.md), [Usage Guide](https://github.com/Codium-ai/pr-agent/blob/main/Usage.md), and [Tools Guide](https://github.com/Codium-ai/pr-agent/blob/main/docs/TOOLS_GUIDE.md)) + +### Fixed +- Fixed support for BitBucket pipeline (see [link](https://github.com/Codium-ai/pr-agent/pull/386)) +- Fixed a bug in `review -i` tool +- Added blacklist for specific file extensions in `add_docs` tool (see [link](https://github.com/Codium-ai/pr-agent/pull/385/)) + +## [Version 0.8] - 2023-09-27 +- codiumai/pr-agent:0.8 +- codiumai/pr-agent:0.8-github_app +- codiumai/pr-agent:0.8-bitbucket-app +- codiumai/pr-agent:0.8-gitlab_webhook +- codiumai/pr-agent:0.8-github_polling +- codiumai/pr-agent:0.8-github_action + +### Added::Algo +- GitHub Action: Can control which tools will run automatically when a new PR is created. (see usage guide: https://github.com/Codium-ai/pr-agent/blob/main/Usage.md#working-with-github-action) +- Code suggestion tool: Will try to avoid an 'add comments' suggestion (see https://github.com/Codium-ai/pr-agent/pull/327) + +### Fixed +- Gitlab: Fixed a bug of improper usage of pr_id + + +## [Version 0.7] - 2023-09-20 + +### Docker Tags +- codiumai/pr-agent:0.7 +- codiumai/pr-agent:0.7-github_app +- codiumai/pr-agent:0.7-bitbucket-app +- codiumai/pr-agent:0.7-gitlab_webhook +- codiumai/pr-agent:0.7-github_polling +- codiumai/pr-agent:0.7-github_action + +### Added::Algo +- New tool /similar_issue - Currently on GitHub app and CLI: indexes the issues in the repo, find the most similar issues to the target issue. +- Describe markers: Empower the /describe tool with a templating capability (see more details in https://github.com/Codium-ai/pr-agent/pull/273). +- New feature in the /review tool - added an estimated effort estimation to the review (https://github.com/Codium-ai/pr-agent/pull/306). + +### Added::Infrastructure +- Implementation of a GitLab webhook. +- Implementation of a BitBucket app. + +### Fixed +- Protection against no code suggestions generated. +- Resilience to repositories where the languages cannot be automatically detected. diff --git a/codecov.yml b/codecov.yml new file mode 100644 index 000000000..52bc77a98 --- /dev/null +++ b/codecov.yml @@ -0,0 +1,5 @@ +comment: false +coverage: + status: + patch: false + project: false diff --git a/docker/Dockerfile b/docker/Dockerfile index 61ab74cf7..07f74cb52 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,24 +1,42 @@ -FROM python:3.10 as base +FROM python:3.12.3 AS base WORKDIR /app +ADD docs/chroma_db.zip /app/docs/chroma_db.zip ADD pyproject.toml . -RUN pip install . && rm pyproject.toml +ADD requirements.txt . +RUN pip install . 
&& rm pyproject.toml requirements.txt ENV PYTHONPATH=/app -FROM base as github_app +FROM base AS github_app ADD pr_agent pr_agent -CMD ["python", "pr_agent/servers/github_app.py"] +CMD ["python", "-m", "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "-c", "pr_agent/servers/gunicorn_config.py", "--forwarded-allow-ips", "*", "pr_agent.servers.github_app:app"] -FROM base as github_polling +FROM base AS bitbucket_app +ADD pr_agent pr_agent +CMD ["python", "pr_agent/servers/bitbucket_app.py"] + +FROM base AS bitbucket_server_webhook +ADD pr_agent pr_agent +CMD ["python", "pr_agent/servers/bitbucket_server_webhook.py"] + +FROM base AS github_polling ADD pr_agent pr_agent CMD ["python", "pr_agent/servers/github_polling.py"] -FROM base as test +FROM base AS gitlab_webhook +ADD pr_agent pr_agent +CMD ["python", "pr_agent/servers/gitlab_webhook.py"] + +FROM base AS azure_devops_webhook +ADD pr_agent pr_agent +CMD ["python", "pr_agent/servers/azuredevops_server_webhook.py"] + +FROM base AS test ADD requirements-dev.txt . RUN pip install -r requirements-dev.txt && rm requirements-dev.txt ADD pr_agent pr_agent ADD tests tests -FROM base as cli +FROM base AS cli ADD pr_agent pr_agent ENTRYPOINT ["python", "pr_agent/cli.py"] diff --git a/docker/Dockerfile.lambda b/docker/Dockerfile.lambda index 59e78a54a..54aa13740 100644 --- a/docker/Dockerfile.lambda +++ b/docker/Dockerfile.lambda @@ -1,10 +1,10 @@ FROM public.ecr.aws/lambda/python:3.10 RUN yum update -y && \ - yum install -y gcc python3-devel && \ + yum install -y gcc python3-devel git && \ yum clean all -ADD pyproject.toml . +ADD pyproject.toml requirements.txt . RUN pip install . && rm pyproject.toml RUN pip install mangum==0.17.0 COPY pr_agent/ ${LAMBDA_TASK_ROOT}/pr_agent/ diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 000000000..42154c966 --- /dev/null +++ b/docs/README.md @@ -0,0 +1 @@ +# [Visit Our Docs Portal](https://pr-agent-docs.codium.ai/) diff --git a/docs/chroma_db.zip b/docs/chroma_db.zip new file mode 100644 index 000000000..2726a919a Binary files /dev/null and b/docs/chroma_db.zip differ diff --git a/docs/docs/CNAME b/docs/docs/CNAME new file mode 100644 index 000000000..1cbf97403 --- /dev/null +++ b/docs/docs/CNAME @@ -0,0 +1 @@ +pr-agent-docs.codium.ai diff --git a/docs/docs/assets/favicon.ico b/docs/docs/assets/favicon.ico new file mode 100644 index 000000000..fece6b1e8 Binary files /dev/null and b/docs/docs/assets/favicon.ico differ diff --git a/docs/docs/assets/logo.png b/docs/docs/assets/logo.png new file mode 100644 index 000000000..4c9fec1de Binary files /dev/null and b/docs/docs/assets/logo.png differ diff --git a/docs/docs/assets/logo.svg b/docs/docs/assets/logo.svg new file mode 100644 index 000000000..5de226775 --- /dev/null +++ b/docs/docs/assets/logo.svg @@ -0,0 +1,140 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/docs/chrome-extension/data_privacy.md b/docs/docs/chrome-extension/data_privacy.md new file mode 100644 index 000000000..37052e25d --- /dev/null +++ b/docs/docs/chrome-extension/data_privacy.md @@ -0,0 +1,5 @@ +We take your code's security and privacy seriously: + +- The Chrome extension will not send your code to any external servers. +- For private repositories, we will first validate the user's identity and permissions. 
After authentication, we generate responses using the existing PR-Agent Pro integration. + diff --git a/docs/docs/chrome-extension/features.md b/docs/docs/chrome-extension/features.md new file mode 100644 index 000000000..3db586577 --- /dev/null +++ b/docs/docs/chrome-extension/features.md @@ -0,0 +1,51 @@ + +### PR chat + +The PR-Chat feature allows you to chat freely with your PR code within your GitHub environment. +It will seamlessly use the PR as context for your chat session, and provide AI-powered feedback. + +To enable private chat, simply install the PR-Agent Chrome extension. After installation, each PR's file-changed tab will include a chat box, where you may ask questions about your code. +This chat session is **private**, and won't be visible to other users. + +All open-source repositories are supported. +For private repositories, you will also need to install PR-Agent Pro. After installation, make sure to open at least one new PR to fully register your organization. Once done, you can chat with both new and existing PRs across all installed repositories. + +#### Context-aware PR chat + +PR-Agent constructs a comprehensive context for each pull request, incorporating the PR description, commit messages, and code changes with extended dynamic context. This contextual information, along with additional PR-related data, forms the foundation for an AI-powered chat session. The agent then leverages this rich context to provide intelligent, tailored responses to user inquiries about the pull request. + + + + + +### Toolbar extension +With the PR-Agent Chrome extension, it's [easier than ever](https://www.youtube.com/watch?v=gT5tli7X4H4) to interactively configure and experiment with the different tools and configuration options. + +For private repositories, after you have found the setup that works for you, you can also easily export it as a persistent configuration file, and use it for automatic commands. + + + + + +### PR-Agent filters + +PR-Agent filters is a sidepanel option that allows you to filter the different messages in the conversation tab. + +For example, you can choose to present only messages from PR-Agent, or filter those messages out, focusing only on users' comments. + + + + + + +### Enhanced code suggestions + +The PR-Agent Chrome extension adds the following capabilities to the code suggestions tool's comments: + +- Auto-expand the table when you are viewing a code block, to avoid clipping. +- Adding a "quote-and-reply" button that enables you to address and comment on a specific suggestion (for example, asking the author to fix the issue) + + + + + diff --git a/docs/docs/chrome-extension/index.md b/docs/docs/chrome-extension/index.md new file mode 100644 index 000000000..1261f5ae1 --- /dev/null +++ b/docs/docs/chrome-extension/index.md @@ -0,0 +1,14 @@ +[PR-Agent Chrome extension](https://chromewebstore.google.com/detail/pr-agent-chrome-extension/ephlnjeghhogofkifjloamocljapahnl) is a collection of tools that integrates seamlessly with your GitHub environment, aiming to enhance your Git usage experience and provide AI-powered capabilities to your PRs. + +With a single-click installation you will gain access to a context-aware chat on your pull request's code, a toolbar extension with multiple AI feedback options, PR-Agent filters, and additional abilities. + +The extension is powered by top code models like Claude 3.5 Sonnet and GPT4. All the extension's features are free to use on public repositories.
+ +For private repositories, you will need to install [PR-Agent Pro](https://github.com/apps/codiumai-pr-agent-pro) in addition to the extension (Quick GitHub app setup with a 14-day free trial. No credit card needed). +For a demonstration of how to install PR-Agent Pro and use it with the Chrome extension, please refer to the tutorial video at the provided [link](https://codium.ai/images/pr_agent/private_repos.mp4). + + + +### Supported browsers + +The extension is supported on all Chromium-based browsers, including Google Chrome, Arc, Opera, Brave, and Microsoft Edge. diff --git a/docs/docs/core-abilities/code_oriented_yaml.md b/docs/docs/core-abilities/code_oriented_yaml.md new file mode 100644 index 000000000..32cfee7ff --- /dev/null +++ b/docs/docs/core-abilities/code_oriented_yaml.md @@ -0,0 +1,2 @@ +## Overview +TBD \ No newline at end of file diff --git a/PR_COMPRESSION.md b/docs/docs/core-abilities/compression_strategy.md similarity index 76% rename from PR_COMPRESSION.md rename to docs/docs/core-abilities/compression_strategy.md index 8e3e5fd7f..c09de0dbf 100644 --- a/PR_COMPRESSION.md +++ b/docs/docs/core-abilities/compression_strategy.md @@ -1,42 +1,47 @@ -# Git Patch Logic + +## Overview - PR Compression Strategy There are two scenarios: + 1. The PR is small enough to fit in a single prompt (including system and user prompt) 2. The PR is too large to fit in a single prompt (including system and user prompt) For both scenarios, we first use the following strategy -#### Repo language prioritization strategy +#### Repo language prioritization strategy We prioritize the languages of the repo based on the following criteria: + 1. Exclude binary files and non code files (e.g. images, pdfs, etc) 2. Given the main languages used in the repo -2. We sort the PR files by the most common languages in the repo (in descending order): +3. We sort the PR files by the most common languages in the repo (in descending order): * ```[[file.py, file2.py],[file3.js, file4.jsx],[readme.md]]``` -## Small PR +### Small PR In this case, we can fit the entire PR in a single prompt: 1. Exclude binary files and non code files (e.g. images, pdfs, etc) -2. We Expand the surrounding context of each patch to 6 lines above and below the patch -## Large PR +2. We Expand the surrounding context of each patch to 3 lines above and below the patch + +### Large PR -### Motivation +#### Motivation Pull Requests can be very long and contain a lot of information with varying degree of relevance to the pr-agent. We want to be able to pack as much information as possible in a single LMM prompt, while keeping the information relevant to the pr-agent. - - -#### PR compression strategy +#### Compression strategy We prioritize additions over deletions: - Combine all deleted files into a single list (`deleted files`) - File patches are a list of hunks, remove all hunks of type deletion-only from the hunks in the file patch + #### Adaptive and token-aware file patch fitting We use [tiktoken](https://github.com/openai/tiktoken) to tokenize the patches after the modifications described above, and we use the following strategy to fit the patches into the prompt: + 1. Within each language we sort the files by the number of tokens in the file (in descending order): - * ```[[file2.py, file.py],[file4.jsx, file3.js],[readme.md]]``` + - ```[[file2.py, file.py],[file4.jsx, file3.js],[readme.md]]``` 2. Iterate through the patches in the order described above -2. 
Add the patches to the prompt until the prompt reaches a certain buffer from the max token length -3. If there are still patches left, add the remaining patches as a list called `other modified files` to the prompt until the prompt reaches the max token length (hard stop), skip the rest of the patches. -4. If we haven't reached the max token length, add the `deleted files` to the prompt until the prompt reaches the max token length (hard stop), skip the rest of the patches. +3. Add the patches to the prompt until the prompt reaches a certain buffer from the max token length +4. If there are still patches left, add the remaining patches as a list called `other modified files` to the prompt until the prompt reaches the max token length (hard stop), skip the rest of the patches. +5. If we haven't reached the max token length, add the `deleted files` to the prompt until the prompt reaches the max token length (hard stop), skip the rest of the patches. + +#### Example -### Example -![](https://codium.ai/images/git_patch_logic.png) +![Core Abilities](https://codium.ai/images/git_patch_logic.png){width=768} diff --git a/docs/docs/core-abilities/dynamic_context.md b/docs/docs/core-abilities/dynamic_context.md new file mode 100644 index 000000000..7f07d222a --- /dev/null +++ b/docs/docs/core-abilities/dynamic_context.md @@ -0,0 +1,72 @@ +## TL;DR + +PR-Agent uses an **asymmetric and dynamic context strategy** to improve AI analysis of code changes in pull requests. +It provides more context before changes than after, and dynamically adjusts the context based on code structure (e.g., enclosing functions or classes). +This approach balances providing sufficient context for accurate analysis, while avoiding needle-in-the-haystack information overload that could degrade AI performance or exceed token limits. + +## Introduction + +Pull request code changes are retrieved in a unified diff format, showing three lines of context before and after each modified section, with additions marked by '+' and deletions by '-'. +``` +@@ -12,5 +12,5 @@ def func1(): + code line that already existed in the file... + code line that already existed in the file... + code line that already existed in the file.... +-code line that was removed in the PR ++new code line added in the PR + code line that already existed in the file... + code line that already existed in the file... + code line that already existed in the file... + +@@ -26,2 +26,4 @@ def func2(): +... +``` + +This unified diff format can be challenging for AI models to interpret accurately, as it provides limited context for understanding the full scope of code changes. +The presentation of code using '+', '-', and ' ' symbols to indicate additions, deletions, and unchanged lines respectively also differs from the standard code formatting typically used to train AI models. + + +## Challenges of expanding the context window + +While expanding the context window is technically feasible, it presents a more fundamental trade-off: + +Pros: + +- Enhanced context allows the model to better comprehend and localize the code changes, results (potentially) in more precise analysis and suggestions. Without enough context, the model may struggle to understand the code changes and provide relevant feedback. + +Cons: + +- Excessive context may overwhelm the model with extraneous information, creating a "needle in a haystack" scenario where focusing on the relevant details (the code that actually changed) becomes challenging. 
+LLM quality is known to degrade when the context gets larger.
+Pull requests often encompass multiple changes across many files, potentially spanning hundreds of lines of modified code. This complexity presents a genuine risk of overwhelming the model with excessive context.
+
+- Increased context expands the token count, increasing processing time and cost, and may prevent the model from processing the entire pull request in a single pass.
+
+## Asymmetric and dynamic context
+To address these challenges, PR-Agent employs an **asymmetric** and **dynamic** context strategy, providing the model with more focused and relevant context information for each code change.
+
+**Asymmetric:**
+
+We start by recognizing that the context preceding a code change is typically more crucial for understanding the modification than the context following it.
+Consequently, PR-Agent implements an asymmetric context policy, decoupling the context window into two distinct segments: one for the code before the change and another for the code after.
+
+By independently adjusting each context window, PR-Agent can supply the model with a more tailored and pertinent context for individual code changes.
+
+**Dynamic:**
+
+We also employ a "dynamic" context strategy.
+We start by recognizing that the optimal context for a code change often corresponds to its enclosing code component (e.g., function, class), rather than a fixed number of lines.
+Consequently, we dynamically adjust the context window based on the code's structure, ensuring the model receives the most pertinent information for each modification.
+
+To prevent overwhelming the model with excessive context, we impose a limit on the number of lines searched when identifying the enclosing component.
+This balance allows for comprehensive understanding while maintaining efficiency and limiting context token usage.
+
+## Appendix - relevant configuration options
+```
+[config]
+patch_extension_skip_types =[".md",".txt"] # Skip files with these extensions when trying to extend the context
+allow_dynamic_context=true # Allow dynamic context extension
+max_extra_lines_before_dynamic_context = 8 # will try to include up to X extra lines before the hunk in the patch, until we reach an enclosing function or class
+patch_extra_lines_before = 3 # Number of extra lines (+3 default ones) to include before each hunk in the patch
+patch_extra_lines_after = 1 # Number of extra lines (+3 default ones) to include after each hunk in the patch
+```
\ No newline at end of file
diff --git a/docs/docs/core-abilities/impact_evaluation.md b/docs/docs/core-abilities/impact_evaluation.md
new file mode 100644
index 000000000..327482f68
--- /dev/null
+++ b/docs/docs/core-abilities/impact_evaluation.md
@@ -0,0 +1,44 @@
+# Overview - Impact Evaluation 💎
+
+Demonstrating the return on investment (ROI) of AI-powered initiatives is crucial for modern organizations.
+To address this need, PR-Agent has developed AI impact measurement tools and metrics, providing advanced analytics to help businesses quantify the tangible benefits of AI adoption in their PR review process.
+
+
+## Auto Impact Validator - Real-Time Tracking of Implemented PR-Agent Suggestions
+
+### How It Works
+When a user pushes a new commit to the pull request, PR-Agent automatically compares the updated code against the previous suggestions, marking them as implemented if the changes address these recommendations, whether directly or indirectly:
+
+1. 
**Direct Implementation:** The user directly addresses the suggestion as-is in the PR, either by clicking on the "apply code suggestion" checkbox or by making the changes manually. +2. **Indirect Implementation:** PR-Agent recognizes when a suggestion's intent is fulfilled, even if the exact code changes differ from the original recommendation. It marks these suggestions as implemented, acknowledging that users may achieve the same goal through alternative solutions. + +### Real-Time Visual Feedback +Upon confirming that a suggestion was implemented, PR-Agent automatically adds a โœ… (check mark) to the relevant suggestion, enabling transparent tracking of PR-Agent's impact analysis. +PR-Agent will also add, inside the relevant suggestions, an explanation of how the new code was impacted by each suggestion. + +![Suggestion_checkmark](https://codium.ai/images/pr_agent/auto_suggestion_checkmark.png){width=512} + +### Dashboard Metrics +The dashboard provides macro-level insights into the overall impact of PR-Agent on the pull-request process with key productivity metrics. + +By offering clear, data-driven evidence of PR-Agent's impact, it empowers leadership teams to make informed decisions about the tool's effectiveness and ROI. + +Here are key metrics that the dashboard tracks: + +#### PR-Agent Impacts per 1K Lines +![Dashboard](https://codium.ai/images/pr_agent/impacts_per_1k_llines.png){width=512} +> Explanation: for every 1K lines of code (additions/edits), PR-Agent had on average ~X suggestions implemented. + +**Why This Metric Matters:** + +1. **Standardized and Comparable Measurement:** By measuring impacts per 1K lines of code additions, you create a standardized metric that can be compared across different projects, teams, customers, and time periods. This standardization is crucial for meaningful analysis, benchmarking, and identifying where PR-Agent is most effective. +2. **Accounts for PR Variability and Incentivizes Quality:** This metric addresses the fact that "Not all PRs are created equal." By normalizing against lines of code rather than PR count, you account for the variability in PR sizes and focus on the quality and impact of suggestions rather than just the number of PRs affected. +3. **Quantifies Value and ROI:** The metric directly correlates with the value PR-Agent is providing, showing how frequently it offers improvements relative to the amount of new code being written. This provides a clear, quantifiable way to demonstrate PR-Agent's return on investment to stakeholders. + +#### Suggestion Effectiveness Across Categories +![Impacted_Suggestion_Score](https://codium.ai/images/pr_agent/impact_by_category.png){width=512} +> Explanation: This chart illustrates the distribution of implemented suggestions across different categories, enabling teams to better understand PR-Agent's impact on various aspects of code quality and development practices. + +#### Suggestion Score Distribution +![Impacted_Suggestion_Score](https://codium.ai/images/pr_agent/impacted_score_dist.png){width=512} +> Explanation: The distribution of the suggestion score for the implemented suggestions, ensuring that higher-scored suggestions truly represent more significant improvements. 
diff --git a/docs/docs/core-abilities/index.md b/docs/docs/core-abilities/index.md new file mode 100644 index 000000000..7d9831449 --- /dev/null +++ b/docs/docs/core-abilities/index.md @@ -0,0 +1,12 @@ +# Core Abilities +PR-Agent utilizes a variety of core abilities to provide a comprehensive and efficient code review experience. These abilities include: + +- [Local and global metadata](https://pr-agent-docs.codium.ai/core-abilities/metadata/) +- [Dynamic context](https://pr-agent-docs.codium.ai/core-abilities/dynamic_context/) +- [Self-reflection](https://pr-agent-docs.codium.ai/core-abilities/self_reflection/) +- [Impact evaluation](https://pr-agent-docs.codium.ai/core-abilities/impact_evaluation/) +- [Interactivity](https://pr-agent-docs.codium.ai/core-abilities/interactivity/) +- [Compression strategy](https://pr-agent-docs.codium.ai/core-abilities/compression_strategy/) +- [Code-oriented YAML](https://pr-agent-docs.codium.ai/core-abilities/code_oriented_yaml/) +- [Static code analysis](https://pr-agent-docs.codium.ai/core-abilities/static_code_analysis/) +- [Code fine-tuning benchmark](https://pr-agent-docs.codium.ai/finetuning_benchmark/) \ No newline at end of file diff --git a/docs/docs/core-abilities/interactivity.md b/docs/docs/core-abilities/interactivity.md new file mode 100644 index 000000000..e484d641d --- /dev/null +++ b/docs/docs/core-abilities/interactivity.md @@ -0,0 +1,2 @@ +## Interactive invocation ๐Ÿ’Ž +TBD \ No newline at end of file diff --git a/docs/docs/core-abilities/metadata.md b/docs/docs/core-abilities/metadata.md new file mode 100644 index 000000000..2ad5609a1 --- /dev/null +++ b/docs/docs/core-abilities/metadata.md @@ -0,0 +1,56 @@ +## Local and global metadata injection with multi-stage analysis +(1) +PR-Agent initially retrieves for each PR the following data: + +- PR title and branch name +- PR original description +- Commit messages history +- PR diff patches, in [hunk diff](https://loicpefferkorn.net/2014/02/diff-files-what-are-hunks-and-how-to-extract-them/) format +- The entire content of the files that were modified in the PR + +!!! tip "Tip: Organization-level metadata" + In addition to the inputs above, PR-Agent can incorporate supplementary preferences provided by the user, like [`extra_instructions` and `organization best practices`](https://pr-agent-docs.codium.ai/tools/improve/#extra-instructions-and-best-practices). This information can be used to enhance the PR analysis. + +(2) +By default, the first command that PR-Agent executes is [`describe`](https://pr-agent-docs.codium.ai/tools/describe/), which generates three types of outputs: + +- PR Type (e.g. bug fix, feature, refactor, etc) +- PR Description - a bullet point summary of the PR +- Changes walkthrough - for each modified file, provide a one-line summary followed by a detailed bullet point list of the changes. + +These AI-generated outputs are now considered as part of the PR metadata, and can be used in subsequent commands like `review` and `improve`. +This effectively enables multi-stage chain-of-thought analysis, without doing any additional API calls which will cost time and money. + +For example, when generating code suggestions for different files, PR-Agent can inject the AI-generated ["Changes walkthrough"](https://github.com/Codium-ai/pr-agent/pull/1202#issue-2511546839) file summary in the prompt: + +``` +## File: 'src/file1.py' +### AI-generated file summary: +- edited function `func1` that does X +- Removed function `func2` that was not used +- .... + +@@ ... 
@@ def func1(): +__new hunk__ +11 unchanged code line0 in the PR +12 unchanged code line1 in the PR +13 +new code line2 added in the PR +14 unchanged code line3 in the PR +__old hunk__ + unchanged code line0 + unchanged code line1 +-old code line2 removed in the PR + unchanged code line3 + +@@ ... @@ def func2(): +__new hunk__ +... +__old hunk__ +... +``` + +(3) The entire PR files that were retrieved are also used to expand and enhance the PR context (see [Dynamic Context](https://pr-agent-docs.codium.ai/core-abilities/dynamic-context/)). + + +(4) All the metadata described above represents several level of cumulative analysis - ranging from hunk level, to file level, to PR level, to organization level. +This comprehensive approach enables PR-Agent AI models to generate more precise and contextually relevant suggestions and feedback. \ No newline at end of file diff --git a/docs/docs/core-abilities/self_reflection.md b/docs/docs/core-abilities/self_reflection.md new file mode 100644 index 000000000..f58a730d8 --- /dev/null +++ b/docs/docs/core-abilities/self_reflection.md @@ -0,0 +1,51 @@ +## TL;DR + +PR-Agent implements a **self-reflection** process where the AI model reflects, scores, and re-ranks its own suggestions, eliminating irrelevant or incorrect ones. +This approach improves the quality and relevance of suggestions, saving users time and enhancing their experience. +Configuration options allow users to set a score threshold for further filtering out suggestions. + +## Introduction - Efficient Review with Hierarchical Presentation + + +Given that not all generated code suggestions will be relevant, it is crucial to enable users to review them in a fast and efficient way, allowing quick identification and filtering of non-applicable ones. + +To achieve this goal, PR-Agent offers a dedicated hierarchical structure when presenting suggestions to users: + +- A "category" section groups suggestions by their category, allowing users to quickly dismiss irrelevant suggestions. +- Each suggestion is first described by a one-line summary, which can be expanded to a full description by clicking on a collapsible. +- Upon expanding a suggestion, the user receives a more comprehensive description, and a code snippet demonstrating the recommendation. + +!!! note "Fast Review" + This hierarchical structure is designed to facilitate rapid review of each suggestion, with users spending an average of ~5-10 seconds per item. + +## Self-reflection and Re-ranking + +The AI model is initially tasked with generating suggestions, and outputting them in order of importance. +However, in practice we observe that models often struggle to simultaneously generate high-quality code suggestions and rank them well in a single pass. +Furthermore, the initial set of generated suggestions sometimes contains easily identifiable errors. + +To address these issues, we implemented a "self-reflection" process that refines suggestion ranking and eliminates irrelevant or incorrect proposals. +This process consists of the following steps: + +1. Presenting the generated suggestions to the model in a follow-up call. +2. Instructing the model to score each suggestion on a scale of 0-10 and provide a rationale for the assigned score. +3. Utilizing these scores to re-rank the suggestions and filter out incorrect ones (with a score of 0). +4. Optionally, filtering out all suggestions below a user-defined score threshold. 
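+
+The snippet below is a minimal, illustrative sketch of this filtering and re-ranking step (it is not PR-Agent's actual implementation; the `score` field name and the threshold handling are assumptions for illustration only):
+
+```
+# Illustrative sketch only - not PR-Agent's actual code.
+# Assumes each suggestion is a dict with a model-assigned 'score' in the 0-10 range.
+def rerank_suggestions(suggestions: list[dict], score_threshold: int = 0) -> list[dict]:
+    # Drop incorrect suggestions (score 0) and anything below the user-defined threshold,
+    # then re-rank the remainder by descending score.
+    kept = [s for s in suggestions
+            if s.get("score", 0) > 0 and s.get("score", 0) >= score_threshold]
+    return sorted(kept, key=lambda s: s["score"], reverse=True)
+```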
+
+Note that presenting all generated suggestions simultaneously provides the model with a comprehensive context, enabling it to make more informed decisions compared to evaluating each suggestion individually.
+
+To conclude, the self-reflection process enables PR-Agent to prioritize suggestions based on their importance, eliminate inaccurate or irrelevant proposals, and optionally exclude suggestions that fall below a specified threshold of significance.
+This results in a more refined and valuable set of suggestions for the user, saving time and improving the overall experience.
+
+## Example Results
+
+![self_reflection](https://codium.ai/images/pr_agent/self_reflection1.png){width=768}
+![self_reflection](https://codium.ai/images/pr_agent/self_reflection2.png){width=768}
+
+
+## Appendix - Relevant Configuration Options
+```
+[pr_code_suggestions]
+self_reflect_on_suggestions = true # Enable self-reflection on code suggestions
+suggestions_score_threshold = 0 # Filter out suggestions with a score below this threshold (0-10)
+```
\ No newline at end of file
diff --git a/docs/docs/core-abilities/static_code_analysis.md b/docs/docs/core-abilities/static_code_analysis.md
new file mode 100644
index 000000000..b33c2c5d0
--- /dev/null
+++ b/docs/docs/core-abilities/static_code_analysis.md
@@ -0,0 +1,70 @@
+## Overview - Static Code Analysis 💎
+
+By combining static code analysis with LLM capabilities, PR-Agent can provide a comprehensive analysis of the PR code changes on a component level.
+
+It scans the PR code changes, finds all the code components (methods, functions, classes) that changed, and enables you to interactively generate tests, docs, code suggestions and similar code searches for each component.
+
+!!! note "Languages that are currently supported:"
+    Python, Java, C++, JavaScript, TypeScript, C#.
+
+
+## Capabilities
+
+### Analyze PR
+
+
+The [`analyze`](https://pr-agent-docs.codium.ai/tools/analyze/) tool enables you to interactively generate tests, docs, code suggestions and similar code searches for each component that changed in the PR.
+It can be invoked manually by commenting on any PR:
+```
+/analyze
+```
+
+An example result:
+
+![Analyze 1](https://codium.ai/images/pr_agent/analyze_1.png){width=768}
+
+Clicking on each checkbox will trigger the relevant tool for the selected component.
+
+### Generate Tests
+
+The [`test`](https://pr-agent-docs.codium.ai/tools/test/) tool generates tests for a selected component, based on the PR code changes.
+It can be invoked manually by commenting on any PR:
+```
+/test component_name
+```
+where 'component_name' is the name of a specific component in the PR. Alternatively, it can be triggered interactively by using the `analyze` tool.
+
+![test1](https://codium.ai/images/pr_agent/test1.png){width=768}
+
+### Generate Docs for a Component
+
+The [`add_docs`](https://pr-agent-docs.codium.ai/tools/documentation/) tool scans the PR code changes, and automatically generates docstrings for any code components that changed in the PR.
+It can be invoked manually by commenting on any PR:
+```
+/add_docs component_name
+```
+
+Or it can be triggered interactively by using the `analyze` tool.
+
+![Docs single component](https://codium.ai/images/pr_agent/docs_single_component.png){width=768}
+
+### Generate Code Suggestions for a Component
+The [`improve_component`](https://pr-agent-docs.codium.ai/tools/improve_component/) tool generates code suggestions for a specific code component that changed in the PR. 
+It can be invoked manually by commenting on any PR: +``` +/improve_component component_name +``` + +Or be triggered interactively by using the `analyze` tool. + +![improve_component2](https://codium.ai/images/pr_agent/improve_component2.png){width=768} + +### Find Similar Code + +The [`similar code`](https://pr-agent-docs.codium.ai/tools/similar_code/) tool retrieves the most similar code components from inside the organization's codebase, or from open-source code. + +For example: + +`Global Search` for a method called `chat_completion`: + +![similar code global](https://codium.ai/images/pr_agent/similar_code_global2.png){width=768} diff --git a/docs/docs/css/custom.css b/docs/docs/css/custom.css new file mode 100644 index 000000000..356283186 --- /dev/null +++ b/docs/docs/css/custom.css @@ -0,0 +1,49 @@ + + +:root { + --md-primary-fg-color: #765bfa; + --md-accent-fg-color: #AEA1F1; + } + +.md-nav--primary { + .md-nav__link { + font-size: 18px; /* Change the font size as per your preference */ + } +} + +.md-nav--primary { + position: relative; /* Ensure the element is positioned */ +} + +.md-nav--primary::before { + content: ""; + position: absolute; + top: 0; + right: 10px; /* Move the border 10 pixels to the right */ + width: 2px; + height: 100%; + background-color: #f5f5f5; /* Match the border color */ +} +/*.md-nav__title, .md-nav__link {*/ +/* font-size: 18px;*/ +/* margin-top: 14px; !* Adjust the space as needed *!*/ +/* margin-bottom: 14px; !* Adjust the space as needed *!*/ +/*}*/ + +.md-tabs__link { + font-size: 18px; +} + +.md-header__title { + font-size: 20px; + margin-left: 0px !important; +} + +.md-content img { + border-width: 1px; + border-style: solid; + border-color: black; + outline-width: 1px; + outline-style: solid; + outline-color: darkgray; + } diff --git a/docs/docs/faq/index.md b/docs/docs/faq/index.md new file mode 100644 index 000000000..7536492e2 --- /dev/null +++ b/docs/docs/faq/index.md @@ -0,0 +1,67 @@ +# FAQ + +??? note "Question: Can PR-Agent serve as a substitute for a human reviewer?" + #### Answer:1 + + PR-Agent is designed to assist, not replace, human reviewers. + + Reviewing PRs is a tedious and time-consuming task often seen as a "chore". In addition, the longer the PR โ€“ the shorter the relative feedback, since long PRs can overwhelm reviewers, both in terms of technical difficulty, and the actual review time. + PR-Agent aims to address these pain points, and to assist and empower both the PR author and reviewer. + + However, PR-Agent has built-in safeguards to ensure the developer remains in the driver's seat. For example: + + 1. Preserves user's original PR header + 2. Places user's description above the AI-generated PR description + 3. Cannot approve PRs; approval remains reviewer's responsibility + 4. The code suggestions are optional, and aim to: + - Encourage self-review and self-reflection + - Highlight potential bugs or oversights + - Enhance code quality and promote best practices + + Read more about this issue in our [blog](https://www.codium.ai/blog/understanding-the-challenges-and-pain-points-of-the-pull-request-cycle/) + +___ + +??? note "Question: I received an incorrect or irrelevant suggestion. Why?" + + #### Answer:2 + + - Modern AI models, like Claude 3.5 Sonnet and GPT-4, are improving rapidly but remain imperfect. Users should critically evaluate all suggestions rather than accepting them automatically. + - AI errors are rare, but possible. 
A main value from reviewing the code suggestions lies in their high probability of catching **mistakes or bugs made by the PR author**. We believe it's worth spending 30-60 seconds reviewing suggestions, even if some aren't relevant, as this practice can enhances code quality and prevent bugs in production. + + + - The hierarchical structure of the suggestions is designed to help the user to _quickly_ understand them, and to decide which ones are relevant and which are not: + + - Only if the `Category` header is relevant, the user should move to the summarized suggestion description. + - Only if the summarized suggestion description is relevant, the user should click on the collapsible, to read the full suggestion description with a code preview example. + + - In addition, we recommend to use the [`extra_instructions`](https://pr-agent-docs.codium.ai/tools/improve/#extra-instructions-and-best-practices) field to guide the model to suggestions that are more relevant to the specific needs of the project. + - The interactive [PR chat](https://pr-agent-docs.codium.ai/chrome-extension/) also provides an easy way to get more tailored suggestions and feedback from the AI model. + +___ + +??? note "Question: How can I get more tailored suggestions?" + #### Answer:3 + + See [here](https://pr-agent-docs.codium.ai/tools/improve/#extra-instructions-and-best-practices) for more information on how to use the `extra_instructions` and `best_practices` configuration options, to guide the model to more tailored suggestions. + +___ + +??? note "Question: Will you store my code ? Are you using my code to train models?" + #### Answer:4 + + No. PR-Agent strict privacy policy ensures that your code is not stored or used for training purposes. + + For a detailed overview of our data privacy policy, please refer to [this link](https://pr-agent-docs.codium.ai/overview/data_privacy/) + +___ + +??? note "Question: Can I use my own LLM keys with PR-Agent?" + #### Answer:5 + + When you self-host, you use your own keys. + + PR-Agent Pro with SaaS deployment is a hosted version of PR-Agent, where Codium AI manages the infrastructure and the keys. + For enterprise customers, on-prem deployment is also available. [Contact us](https://www.codium.ai/contact/#pricing) for more information. + +___ \ No newline at end of file diff --git a/docs/docs/finetuning_benchmark/index.md b/docs/docs/finetuning_benchmark/index.md new file mode 100644 index 000000000..d487853b5 --- /dev/null +++ b/docs/docs/finetuning_benchmark/index.md @@ -0,0 +1,93 @@ +# PR-Agent Code Fine-tuning Benchmark + +On coding tasks, the gap between open-source models and top closed-source models such as GPT4 is significant. +
+In practice, open-source models are unsuitable for most real-world code tasks, and require further fine-tuning to produce acceptable results. + +_PR-Agent fine-tuning benchmark_ aims to benchmark open-source models on their ability to be fine-tuned for a coding task. +Specifically, we chose to fine-tune open-source models on the task of analyzing a pull request, and providing useful feedback and code suggestions. + +Here are the results: +
+
+ +**Model performance:** + +| Model name | Model size [B] | Better than gpt-4 rate, after fine-tuning [%] | +|-----------------------------|----------------|----------------------------------------------| +| **DeepSeek 34B-instruct** | **34** | **40.7** | +| DeepSeek 34B-base | 34 | 38.2 | +| Phind-34b | 34 | 38 | +| Granite-34B | 34 | 37.6 | +| Codestral-22B-v0.1 | 22 | 32.7 | +| QWEN-1.5-32B | 32 | 29 | +| | | | +| **CodeQwen1.5-7B** | **7** | **35.4** | +| Llama-3.1-8B-Instruct | 8 | 35.2 | +| Granite-8b-code-instruct | 8 | 34.2 | +| CodeLlama-7b-hf | 7 | 31.8 | +| Gemma-7B | 7 | 27.2 | +| DeepSeek coder-7b-instruct | 7 | 26.8 | +| Llama-3-8B-Instruct | 8 | 26.8 | +| Mistral-7B-v0.1 | 7 | 16.1 | + +
+ +**Fine-tuning impact:** + +| Model name | Model size [B] | Fine-tuned | Better than gpt-4 rate [%] | +|---------------------------|----------------|------------|----------------------------| +| DeepSeek 34B-instruct | 34 | yes | 40.7 | +| DeepSeek 34B-instruct | 34 | no | 3.6 | + +## Results analysis + +- **Fine-tuning is a must** - without fine-tuning, open-source models provide poor results on most real-world code tasks, which include complicated prompt and lengthy context. We clearly see that without fine-tuning, deepseek model was 96.4% of the time inferior to GPT-4, while after fine-tuning, it is better 40.7% of the time. +- **Always start from a code-dedicated model** โ€” When fine-tuning, always start from a code-dedicated model, and not from a general-usage model. The gaps in downstream results are very big. +- **Don't believe the hype** โ€”newer models, or models from big-tech companies (Llama3, Gemma, Mistral), are not always better for fine-tuning. +- **The best large model** - For large 34B code-dedicated models, the gaps when doing proper fine-tuning are small. The current top model is **DeepSeek 34B-instruct** +- **The best small model** - For small 7B code-dedicated models, the gaps when fine-tuning are much larger. **CodeQWEN 1.5-7B** is by far the best model for fine-tuning. +- **Base vs. instruct** - For the top model (deepseek), we saw small advantage when starting from the instruct version. However, we recommend testing both versions on each specific task, as the base model is generally considered more suitable for fine-tuning. + +## The dataset + +### Training dataset + +Our training dataset comprises 25,000 pull requests, aggregated from permissive license repos. For each pull request, we generated responses for the three main tools of PR-Agent: +[Describe](https://pr-agent-docs.codium.ai/tools/describe/), [Review](https://pr-agent-docs.codium.ai/tools/improve/) and [Improve](https://pr-agent-docs.codium.ai/tools/improve/). + +On the raw data collected, we employed various automatic and manual cleaning techniques to ensure the outputs were of the highest quality, and suitable for instruct-tuning. + +Here are the prompts, and example outputs, used as input-output pairs to fine-tune the models: + +| Tool | Prompt | Example output | +|----------|------------------------------------------------------------------------------------------------------------|----------------| +| Describe | [link](https://github.com/Codium-ai/pr-agent/blob/main/pr_agent/settings/pr_description_prompts.toml) | [link](https://github.com/Codium-ai/pr-agent/pull/910#issue-2303989601) | +| Review | [link](https://github.com/Codium-ai/pr-agent/blob/main/pr_agent/settings/pr_reviewer_prompts.toml) | [link](https://github.com/Codium-ai/pr-agent/pull/910#issuecomment-2118761219) | +| Improve | [link](https://github.com/Codium-ai/pr-agent/blob/main/pr_agent/settings/pr_code_suggestions_prompts.toml) | [link](https://github.com/Codium-ai/pr-agent/pull/910#issuecomment-2118761309) | + +### Evaluation dataset + +- For each tool, we aggregated 100 additional examples to be used for evaluation. These examples were not used in the training dataset, and were manually selected to represent diverse real-world use-cases. +- For each test example, we generated two responses: one from the fine-tuned model, and one from the best code model in the world, `gpt-4-turbo-2024-04-09`. + +- We used a third LLM to judge which response better answers the prompt, and will likely be perceived by a human as better response. +
+ +We experimented with three model as judges: `gpt-4-turbo-2024-04-09`, `gpt-4o`, and `claude-3-opus-20240229`. All three produced similar results, with the same ranking order. This strengthens the validity of our testing protocol. +The evaluation prompt can be found [here](https://github.com/Codium-ai/pr-agent/blob/main/pr_agent/settings/pr_evaluate_prompt_response.toml) + +Here is an example of a judge model feedback: + +``` +command: improve +model1_score: 9, +model2_score: 6, +why: | + Response 1 is better because it provides more actionable and specific suggestions that directly + enhance the code's maintainability, performance, and best practices. For example, it suggests + using a variable for reusable widget instances and using named routes for navigation, which + are practical improvements. In contrast, Response 2 focuses more on general advice and less + actionable suggestions, such as changing variable names and adding comments, which are less + critical for immediate code improvement." +``` diff --git a/docs/docs/index.md b/docs/docs/index.md new file mode 100644 index 000000000..e0b5bc413 --- /dev/null +++ b/docs/docs/index.md @@ -0,0 +1,94 @@ +# Overview + +CodiumAI PR-Agent is an open-source tool to help efficiently review and handle pull requests. + +- See the [Installation Guide](./installation/index.md) for instructions on installing and running the tool on different git platforms. + +- See the [Usage Guide](./usage-guide/index.md) for instructions on running the PR-Agent commands via different interfaces, including _CLI_, _online usage_, or by _automatically triggering_ them when a new PR is opened. + +- See the [Tools Guide](./tools/index.md) for a detailed description of the different tools. + + +## PR-Agent Docs Smart Search + +To search the documentation site using natural language: + +1) Comment `/help "your question"` in either: + + - A pull request where PR-Agent is installed + - A [PR Chat](https://pr-agent-docs.codium.ai/chrome-extension/features/#pr-chat) + +2) PR-Agent will respond with an [answer](https://github.com/Codium-ai/pr-agent/pull/1241#issuecomment-2365259334) that includes relevant documentation links. + + +## PR-Agent Features + +PR-Agent offers extensive pull request functionalities across various git providers. 
+ +| | | GitHub | Gitlab | Bitbucket | Azure DevOps | +|-------|-----------------------------------------------------------------------------------------------------------------------|:------:|:------:|:---------:|:------------:| +| TOOLS | Review | โœ… | โœ… | โœ… | โœ… | +| | โฎ‘ Incremental | โœ… | | | | +| | โฎ‘ [SOC2 Compliance](https://pr-agent-docs.codium.ai/tools/review/#soc2-ticket-compliance){:target="_blank"} ๐Ÿ’Ž | โœ… | โœ… | โœ… | | +| | Ask | โœ… | โœ… | โœ… | โœ… | +| | Describe | โœ… | โœ… | โœ… | โœ… | +| | โฎ‘ [Inline file summary](https://pr-agent-docs.codium.ai/tools/describe/#inline-file-summary){:target="_blank"} ๐Ÿ’Ž | โœ… | โœ… | | | +| | Improve | โœ… | โœ… | โœ… | โœ… | +| | โฎ‘ Extended | โœ… | โœ… | โœ… | โœ… | +| | [Custom Prompt](./tools/custom_prompt.md){:target="_blank"} ๐Ÿ’Ž | โœ… | โœ… | โœ… | | +| | Reflect and Review | โœ… | โœ… | โœ… | | +| | Update CHANGELOG.md | โœ… | โœ… | โœ… | ๏ธ | +| | Find Similar Issue | โœ… | | | ๏ธ | +| | [Add PR Documentation](./tools/documentation.md){:target="_blank"} ๐Ÿ’Ž | โœ… | โœ… | | | +| | [Generate Custom Labels](./tools/describe.md#handle-custom-labels-from-the-repos-labels-page-๐Ÿ’Ž){:target="_blank"} ๐Ÿ’Ž | โœ… | โœ… | | | +| | [Analyze PR Components](./tools/analyze.md){:target="_blank"} ๐Ÿ’Ž | โœ… | โœ… | | | +| | | | | | ๏ธ | +| USAGE | CLI | โœ… | โœ… | โœ… | โœ… | +| | App / webhook | โœ… | โœ… | โœ… | โœ… | +| | Actions | โœ… | | | ๏ธ | +| | | | | | +| CORE | PR compression | โœ… | โœ… | โœ… | โœ… | +| | Repo language prioritization | โœ… | โœ… | โœ… | โœ… | +| | Adaptive and token-aware file patch fitting | โœ… | โœ… | โœ… | โœ… | +| | Multiple models support | โœ… | โœ… | โœ… | โœ… | +| | Incremental PR review | โœ… | | | | +| | [Static code analysis](./tools/analyze.md/){:target="_blank"} ๐Ÿ’Ž | โœ… | โœ… | โœ… | | +| | [Multiple configuration options](./usage-guide/configuration_options.md){:target="_blank"} ๐Ÿ’Ž | โœ… | โœ… | โœ… | | + +๐Ÿ’Ž marks a feature available only in [PR-Agent Pro](https://www.codium.ai/pricing/){:target="_blank"} + + +## Example Results +
+ +#### [/describe](https://github.com/Codium-ai/pr-agent/pull/530) +
+![/describe](https://www.codium.ai/images/pr_agent/describe_new_short_main.png){width=512} +
+
+ +#### [/review](https://github.com/Codium-ai/pr-agent/pull/732#issuecomment-1975099151) +
+![/review](https://www.codium.ai/images/pr_agent/review_new_short_main.png){width=512} +
+
+ +#### [/improve](https://github.com/Codium-ai/pr-agent/pull/732#issuecomment-1975099159) +
+![/improve](https://www.codium.ai/images/pr_agent/improve_new_short_main.png){width=512} +
+
+ +#### [/generate_labels](https://github.com/Codium-ai/pr-agent/pull/530) +
+![/generate_labels](https://www.codium.ai/images/pr_agent/geneare_custom_labels_main_short.png){width=300} +
+
+ +## How it Works + +The following diagram illustrates PR-Agent tools and their flow: + +![PR-Agent Tools](https://codium.ai/images/pr_agent/diagram-v0.9.png) + +Check out the [core abilities](core-abilities/index.md) page for a comprehensive overview of the variety of core abilities used by PR-Agent. diff --git a/docs/docs/installation/azure.md b/docs/docs/installation/azure.md new file mode 100644 index 000000000..ba97dbe6f --- /dev/null +++ b/docs/docs/installation/azure.md @@ -0,0 +1,93 @@ +## Azure DevOps Pipeline +You can use a pre-built Action Docker image to run PR-Agent as an Azure devops pipeline. +add the following file to your repository under `azure-pipelines.yml`: +```yaml +# Opt out of CI triggers +trigger: none + +# Configure PR trigger +pr: + branches: + include: + - '*' + autoCancel: true + drafts: false + +stages: +- stage: pr_agent + displayName: 'PR Agent Stage' + jobs: + - job: pr_agent_job + displayName: 'PR Agent Job' + pool: + vmImage: 'ubuntu-latest' + container: + image: codiumai/pr-agent:latest + options: --entrypoint "" + variables: + - group: pr_agent + steps: + - script: | + echo "Running PR Agent action step" + + # Construct PR_URL + PR_URL="${SYSTEM_COLLECTIONURI}${SYSTEM_TEAMPROJECT}/_git/${BUILD_REPOSITORY_NAME}/pullrequest/${SYSTEM_PULLREQUEST_PULLREQUESTID}" + echo "PR_URL=$PR_URL" + + # Extract organization URL from System.CollectionUri + ORG_URL=$(echo "$(System.CollectionUri)" | sed 's/\/$//') # Remove trailing slash if present + echo "Organization URL: $ORG_URL" + + export azure_devops__org="$ORG_URL" + export config__git_provider="azure" + + pr-agent --pr_url="$PR_URL" describe + pr-agent --pr_url="$PR_URL" review + pr-agent --pr_url="$PR_URL" improve + env: + azure_devops__pat: $(azure_devops_pat) + openai__key: $(OPENAI_KEY) + displayName: 'Run PR Agent' +``` +This script will run PR-Agent on every new merge request, with the `improve`, `review`, and `describe` commands. +Note that you need to export the `azure_devops__pat` and `OPENAI_KEY` variables in the Azure DevOps pipeline settings (Pipelines -> Library -> + Variable group): +![PR Agent Pro](https://codium.ai/images/pr_agent/azure_devops_pipeline_secrets.png){width=468} + +Make sure to give pipeline permissions to the `pr_agent` variable group. + + +## Azure DevOps from CLI + +To use Azure DevOps provider use the following settings in configuration.toml: +``` +[config] +git_provider="azure" +``` + +Azure DevOps provider supports [PAT token](https://learn.microsoft.com/en-us/azure/devops/organizations/accounts/use-personal-access-tokens-to-authenticate?view=azure-devops&tabs=Windows) or [DefaultAzureCredential](https://learn.microsoft.com/en-us/azure/developer/python/sdk/authentication-overview#authentication-in-server-environments) authentication. +PAT is faster to create, but has build in expiration date, and will use the user identity for API calls. +Using DefaultAzureCredential you can use managed identity or Service principle, which are more secure and will create separate ADO user identity (via AAD) to the agent. + +If PAT was chosen, you can assign the value in .secrets.toml. +If DefaultAzureCredential was chosen, you can assigned the additional env vars like AZURE_CLIENT_SECRET directly, +or use managed identity/az cli (for local development) without any additional configuration. 
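+
+For example, when authenticating with a service principal, DefaultAzureCredential can pick up the standard Azure identity environment variables. The snippet below is an illustrative sketch only; the values are placeholders, not taken from this guide:
+
+```
+# Illustrative example - replace the placeholders with your own service principal details
+export AZURE_CLIENT_ID="<client-id>"
+export AZURE_TENANT_ID="<tenant-id>"
+export AZURE_CLIENT_SECRET="<client-secret>"
+```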
+in any case, 'org' value must be assigned in .secrets.toml: +``` +[azure_devops] +org = "https://dev.azure.com/YOUR_ORGANIZATION/" +# pat = "YOUR_PAT_TOKEN" needed only if using PAT for authentication +``` + +### Azure DevOps Webhook + +To trigger from an Azure webhook, you need to manually [add a webhook](https://learn.microsoft.com/en-us/azure/devops/service-hooks/services/webhooks?view=azure-devops). +Use the "Pull request created" type to trigger a review, or "Pull request commented on" to trigger any supported comment with / comment on the relevant PR. Note that for the "Pull request commented on" trigger, only API v2.0 is supported. + + +For webhook security, create a sporadic username/password pair and configure the webhook username and password on both the server and Azure DevOps webhook. These will be sent as basic Auth data by the webhook with each request: +``` +[azure_devops_server] +webhook_username = "" +webhook_password = "" +``` +> :warning: **Ensure that the webhook endpoint is only accessible over HTTPS** to mitigate the risk of credential interception when using basic authentication. diff --git a/docs/docs/installation/bitbucket.md b/docs/docs/installation/bitbucket.md new file mode 100644 index 000000000..9023399bb --- /dev/null +++ b/docs/docs/installation/bitbucket.md @@ -0,0 +1,70 @@ +## Run as a Bitbucket Pipeline + + +You can use the Bitbucket Pipeline system to run PR-Agent on every pull request open or update. + +1. Add the following file in your repository bitbucket_pipelines.yml + +```yaml +pipelines: + pull-requests: + '**': + - step: + name: PR Agent Review + image: python:3.10 + services: + - docker + script: + - docker run -e CONFIG.GIT_PROVIDER=bitbucket -e OPENAI.KEY=$OPENAI_API_KEY -e BITBUCKET.BEARER_TOKEN=$BITBUCKET_BEARER_TOKEN codiumai/pr-agent:latest --pr_url=https://bitbucket.org/$BITBUCKET_WORKSPACE/$BITBUCKET_REPO_SLUG/pull-requests/$BITBUCKET_PR_ID review +``` + +2. Add the following secure variables to your repository under Repository settings > Pipelines > Repository variables. +OPENAI_API_KEY: `` +BITBUCKET_BEARER_TOKEN: `` + +You can get a Bitbucket token for your repository by following Repository Settings -> Security -> Access Tokens. + +Note that comments on a PR are not supported in Bitbucket Pipeline. + + +## Run using CodiumAI-hosted Bitbucket app ๐Ÿ’Ž + +Please contact visit [PR-Agent pro](https://www.codium.ai/pricing/) if you're interested in a hosted BitBucket app solution that provides full functionality including PR reviews and comment handling. It's based on the [bitbucket_app.py](https://github.com/Codium-ai/pr-agent/blob/main/pr_agent/git_providers/bitbucket_provider.py) implementation. + + +## Bitbucket Server and Data Center + +Login into your on-prem instance of Bitbucket with your service account username and password. +Navigate to `Manage account`, `HTTP Access tokens`, `Create Token`. +Generate the token and add it to .secret.toml under `bitbucket_server` section + +```toml +[bitbucket_server] +bearer_token = "" +``` + +### Run it as CLI + +Modify `configuration.toml`: + +```toml +git_provider="bitbucket_server" +``` + +and pass the Pull request URL: +```shell +python cli.py --pr_url https://git.onpreminstanceofbitbucket.com/projects/PROJECT/repos/REPO/pull-requests/1 review +``` + +### Run it as service + +To run pr-agent as webhook, build the docker image: +``` +docker build . 
-t codiumai/pr-agent:bitbucket_server_webhook --target bitbucket_server_webhook -f docker/Dockerfile +docker push codiumai/pr-agent:bitbucket_server_webhook # Push to your Docker repository +``` + +Navigate to `Projects` or `Repositories`, `Settings`, `Webhooks`, `Create Webhook`. +Fill the name and URL, Authentication None select the Pull Request Opened checkbox to receive that event as webhook. + +The URL should end with `/webhook`, for example: https://domain.com/webhook diff --git a/docs/docs/installation/github.md b/docs/docs/installation/github.md new file mode 100644 index 000000000..83cd09813 --- /dev/null +++ b/docs/docs/installation/github.md @@ -0,0 +1,255 @@ +## Run as a GitHub Action + +You can use our pre-built Github Action Docker image to run PR-Agent as a Github Action. + +1) Add the following file to your repository under `.github/workflows/pr_agent.yml`: + +```yaml +on: + pull_request: + types: [opened, reopened, ready_for_review] + issue_comment: +jobs: + pr_agent_job: + if: ${{ github.event.sender.type != 'Bot' }} + runs-on: ubuntu-latest + permissions: + issues: write + pull-requests: write + contents: write + name: Run pr agent on every pull request, respond to user comments + steps: + - name: PR Agent action step + id: pragent + uses: Codium-ai/pr-agent@main + env: + OPENAI_KEY: ${{ secrets.OPENAI_KEY }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} +``` + + +if you want to pin your action to a specific release (v0.23 for example) for stability reasons, use: +```yaml +... + steps: + - name: PR Agent action step + id: pragent + uses: docker://codiumai/pr-agent:0.23-github_action +... +``` + +For enhanced security, you can also specify the Docker image by its [digest](https://hub.docker.com/repository/docker/codiumai/pr-agent/tags): +```yaml +... + steps: + - name: PR Agent action step + id: pragent + uses: docker://codiumai/pr-agent@sha256:14165e525678ace7d9b51cda8652c2d74abb4e1d76b57c4a6ccaeba84663cc64 +... +``` + +2) Add the following secret to your repository under `Settings > Secrets and variables > Actions > New repository secret > Add secret`: + +``` +Name = OPENAI_KEY +Secret = +``` + +The GITHUB_TOKEN secret is automatically created by GitHub. + +3) Merge this change to your main branch. +When you open your next PR, you should see a comment from `github-actions` bot with a review of your PR, and instructions on how to use the rest of the tools. + +4) You may configure PR-Agent by adding environment variables under the env section corresponding to any configurable property in the [configuration](https://github.com/Codium-ai/pr-agent/blob/main/pr_agent/settings/configuration.toml) file. Some examples: +```yaml + env: + # ... previous environment values + OPENAI.ORG: "" + PR_REVIEWER.REQUIRE_TESTS_REVIEW: "false" # Disable tests review + PR_CODE_SUGGESTIONS.NUM_CODE_SUGGESTIONS: 6 # Increase number of code suggestions +``` +See detailed usage instructions in the [USAGE GUIDE](https://pr-agent-docs.codium.ai/usage-guide/automations_and_usage/#github-action) + +--- + +## Run as a GitHub App +Allowing you to automate the review process on your private or public repositories. + +1) Create a GitHub App from the [Github Developer Portal](https://docs.github.com/en/developers/apps/creating-a-github-app). 
+ + - Set the following permissions: + - Pull requests: Read & write + - Issue comment: Read & write + - Metadata: Read-only + - Contents: Read-only + - Set the following events: + - Issue comment + - Pull request + - Push (if you need to enable triggering on PR update) + +2) Generate a random secret for your app, and save it for later. For example, you can use: + +``` +WEBHOOK_SECRET=$(python -c "import secrets; print(secrets.token_hex(10))") +``` + +3) Acquire the following pieces of information from your app's settings page: + + - App private key (click "Generate a private key" and save the file) + - App ID + +4) Clone this repository: + +``` +git clone https://github.com/Codium-ai/pr-agent.git +``` + +5) Copy the secrets template file and fill in the following: + +``` +cp pr_agent/settings/.secrets_template.toml pr_agent/settings/.secrets.toml +# Edit .secrets.toml file +``` + + - Your OpenAI key. + - Copy your app's private key to the private_key field. + - Copy your app's ID to the app_id field. + - Copy your app's webhook secret to the webhook_secret field. + - Set deployment_type to 'app' in [configuration.toml](https://github.com/Codium-ai/pr-agent/blob/main/pr_agent/settings/configuration.toml) + + > The .secrets.toml file is not copied to the Docker image by default, and is only used for local development. + > If you want to use the .secrets.toml file in your Docker image, you can add remove it from the .dockerignore file. + > In most production environments, you would inject the secrets file as environment variables or as mounted volumes. + > For example, in order to inject a secrets file as a volume in a Kubernetes environment you can update your pod spec to include the following, + > assuming you have a secret named `pr-agent-settings` with a key named `.secrets.toml`: + ``` + volumes: + - name: settings-volume + secret: + secretName: pr-agent-settings + // ... + containers: + // ... + volumeMounts: + - mountPath: /app/pr_agent/settings_prod + name: settings-volume + ``` + + > Another option is to set the secrets as environment variables in your deployment environment, for example `OPENAI.KEY` and `GITHUB.USER_TOKEN`. + +6) Build a Docker image for the app and optionally push it to a Docker repository. We'll use Dockerhub as an example: + + ``` + docker build . -t codiumai/pr-agent:github_app --target github_app -f docker/Dockerfile + docker push codiumai/pr-agent:github_app # Push to your Docker repository + ``` + +7. Host the app using a server, serverless function, or container environment. Alternatively, for development and + debugging, you may use tools like smee.io to forward webhooks to your local machine. + You can check [Deploy as a Lambda Function](#deploy-as-a-lambda-function) + +8. Go back to your app's settings, and set the following: + + - Webhook URL: The URL of your app's server or the URL of the smee.io channel. + - Webhook secret: The secret you generated earlier. + +9. Install the app by navigating to the "Install App" tab and selecting your desired repositories. + +> **Note:** When running PR-Agent from GitHub App, the default configuration file (configuration.toml) will be loaded. +> However, you can override the default tool parameters by uploading a local configuration file `.pr_agent.toml` +> For more information please check out the [USAGE GUIDE](../usage-guide/automations_and_usage.md#github-app) +--- + +## Deploy as a Lambda Function + +Note that since AWS Lambda env vars cannot have "." in the name, you can replace each "." 
in an env variable with "__".
+For example: `GITHUB.WEBHOOK_SECRET` --> `GITHUB__WEBHOOK_SECRET` + +1. Follow steps 1-5 from [here](#run-as-a-github-app). +2. Build a docker image that can be used as a lambda function + ```shell + docker buildx build --platform=linux/amd64 . -t codiumai/pr-agent:serverless -f docker/Dockerfile.lambda + ``` +3. Push image to ECR + ```shell + docker tag codiumai/pr-agent:serverless .dkr.ecr..amazonaws.com/codiumai/pr-agent:serverless + docker push .dkr.ecr..amazonaws.com/codiumai/pr-agent:serverless + ``` +4. Create a lambda function that uses the uploaded image. Set the lambda timeout to be at least 3m. +5. Configure the lambda function to have a Function URL. +6. In the environment variables of the Lambda function, specify `AZURE_DEVOPS_CACHE_DIR` to a writable location such as /tmp. (see [link](https://github.com/Codium-ai/pr-agent/pull/450#issuecomment-1840242269)) +7. Go back to steps 8-9 of [Method 5](#run-as-a-github-app) with the function url as your Webhook URL. + The Webhook URL would look like `https:///api/v1/github_webhooks` + +--- + +## AWS CodeCommit Setup + +Not all features have been added to CodeCommit yet. As of right now, CodeCommit has been implemented to run the pr-agent CLI on the command line, using AWS credentials stored in environment variables. (More features will be added in the future.) The following is a set of instructions to have pr-agent do a review of your CodeCommit pull request from the command line: + +1. Create an IAM user that you will use to read CodeCommit pull requests and post comments + * Note: That user should have CLI access only, not Console access +2. Add IAM permissions to that user, to allow access to CodeCommit (see IAM Role example below) +3. Generate an Access Key for your IAM user +4. Set the Access Key and Secret using environment variables (see Access Key example below) +5. Set the `git_provider` value to `codecommit` in the `pr_agent/settings/configuration.toml` settings file +6. 
Set the `PYTHONPATH` to include your `pr-agent` project directory + * Option A: Add `PYTHONPATH="/PATH/TO/PROJECTS/pr-agent` to your `.env` file + * Option B: Set `PYTHONPATH` and run the CLI in one command, for example: + * `PYTHONPATH="/PATH/TO/PROJECTS/pr-agent python pr_agent/cli.py [--ARGS]` + +--- + + +#### AWS CodeCommit IAM Role Example + +Example IAM permissions to that user to allow access to CodeCommit: + +* Note: The following is a working example of IAM permissions that has read access to the repositories and write access to allow posting comments +* Note: If you only want pr-agent to review your pull requests, you can tighten the IAM permissions further, however this IAM example will work, and allow the pr-agent to post comments to the PR +* Note: You may want to replace the `"Resource": "*"` with your list of repos, to limit access to only those repos + +``` +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "codecommit:BatchDescribe*", + "codecommit:BatchGet*", + "codecommit:Describe*", + "codecommit:EvaluatePullRequestApprovalRules", + "codecommit:Get*", + "codecommit:List*", + "codecommit:PostComment*", + "codecommit:PutCommentReaction", + "codecommit:UpdatePullRequestDescription", + "codecommit:UpdatePullRequestTitle" + ], + "Resource": "*" + } + ] +} +``` + +#### AWS CodeCommit Access Key and Secret + +Example setting the Access Key and Secret using environment variables + +```sh +export AWS_ACCESS_KEY_ID="XXXXXXXXXXXXXXXX" +export AWS_SECRET_ACCESS_KEY="XXXXXXXXXXXXXXXX" +export AWS_DEFAULT_REGION="us-east-1" +``` + +#### AWS CodeCommit CLI Example + +After you set up AWS CodeCommit using the instructions above, here is an example CLI run that tells pr-agent to **review** a given pull request. +(Replace your specific PYTHONPATH and PR URL in the example) + +```sh +PYTHONPATH="/PATH/TO/PROJECTS/pr-agent" python pr_agent/cli.py \ + --pr_url https://us-east-1.console.aws.amazon.com/codesuite/codecommit/repositories/MY_REPO_NAME/pull-requests/321 \ + review +``` diff --git a/docs/docs/installation/gitlab.md b/docs/docs/installation/gitlab.md new file mode 100644 index 000000000..07ed75fc1 --- /dev/null +++ b/docs/docs/installation/gitlab.md @@ -0,0 +1,62 @@ +## Run as a GitLab Pipeline +You can use a pre-built Action Docker image to run PR-Agent as a GitLab pipeline. This is a simple way to get started with PR-Agent without setting up your own server. + +(1) Add the following file to your repository under `.gitlab-ci.yml`: +```yaml +stages: + - pr_agent + +pr_agent_job: + stage: pr_agent + image: + name: codiumai/pr-agent:latest + entrypoint: [""] + script: + - cd /app + - echo "Running PR Agent action step" + - export MR_URL="$CI_MERGE_REQUEST_PROJECT_URL/merge_requests/$CI_MERGE_REQUEST_IID" + - echo "MR_URL=$MR_URL" + - export gitlab__url=$CI_SERVER_PROTOCOL://$CI_SERVER_FQDN + - export gitlab__PERSONAL_ACCESS_TOKEN=$GITLAB_PERSONAL_ACCESS_TOKEN + - export config__git_provider="gitlab" + - export openai__key=$OPENAI_KEY + - python -m pr_agent.cli --pr_url="$MR_URL" describe + - python -m pr_agent.cli --pr_url="$MR_URL" review + - python -m pr_agent.cli --pr_url="$MR_URL" improve + rules: + - if: '$CI_PIPELINE_SOURCE == "merge_request_event"' +``` +This script will run PR-Agent on every new merge request. You can modify the `rules` section to run PR-Agent on different events. +You can also modify the `script` section to run different PR-Agent commands, or with different parameters by exporting different environment variables. 
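+
+For example, to pass extra, tool-specific configuration to one of the commands, you can export additional values in the same `section__key` format used above and re-run the command. This is an illustrative sketch only; check the configuration file for the exact option names:
+
+```
+export pr_reviewer__extra_instructions="Focus on potential security issues"
+python -m pr_agent.cli --pr_url="$MR_URL" review
+```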
+
+
+(2) Add the following masked variables to your GitLab repository (CI/CD -> Variables):
+
+- `GITLAB_PERSONAL_ACCESS_TOKEN`: Your GitLab personal access token.
+
+- `OPENAI_KEY`: Your OpenAI key.
+
+Note that if your base branches are not protected, don't set the variables as `protected`, since the pipeline will not have access to them.
+
+
+
+## Run a GitLab webhook server
+
+1. From the GitLab workspace or group, create an access token. Enable the "api" scope only.
+
+2. Generate a random secret for your app, and save it for later. For example, you can use:
+
+```
+WEBHOOK_SECRET=$(python -c "import secrets; print(secrets.token_hex(10))")
+```
+3. Follow the instructions to build the Docker image, set up a secrets file and deploy on your own server from [here](https://pr-agent-docs.codium.ai/installation/github/#run-as-a-github-app) steps 4-7.
+
+4. In the secrets file, fill in the following:
+    - Your OpenAI key.
+    - In the [gitlab] section, fill in personal_access_token and shared_secret. The access token can be a personal access token, or a group or project access token.
+    - Set deployment_type to 'gitlab' in [configuration.toml](https://github.com/Codium-ai/pr-agent/blob/main/pr_agent/settings/configuration.toml)
+
+5. Create a webhook in GitLab. Set the URL to ```http[s]:///webhook```. Set the secret token to the generated secret from step 2.
+In the "Trigger" section, check the 'comments' and 'merge request events' boxes.
+
+6. Test your installation by opening a merge request or commenting on a merge request using one of CodiumAI's commands.
diff --git a/docs/docs/installation/index.md b/docs/docs/installation/index.md
new file mode 100644
index 000000000..905b86b3a
--- /dev/null
+++ b/docs/docs/installation/index.md
@@ -0,0 +1,21 @@
+# Installation
+
+## Self-hosted PR-Agent
+If you choose to host your own PR-Agent, you first need to acquire two tokens:
+
+1. An OpenAI key from [here](https://platform.openai.com/api-keys), with access to GPT-4 (or a key for other [language models](https://pr-agent-docs.codium.ai/usage-guide/changing_a_model/), if you prefer).
+2. A GitHub\GitLab\BitBucket personal access token (classic) with the repo scope (for GitHub, you can generate one [here](https://github.com/settings/tokens)).
+
+There are several ways to use self-hosted PR-Agent:
+
+- [Locally](./locally.md)
+- [GitHub](./github.md)
+- [GitLab](./gitlab.md)
+- [BitBucket](./bitbucket.md)
+- [Azure DevOps](./azure.md)
+
+## PR-Agent Pro 💎
+PR-Agent Pro, an app hosted by CodiumAI for GitHub\GitLab\BitBucket, is also available.
+
+With PR-Agent Pro, installation is as simple as signing up and adding the PR-Agent app to your relevant repo. +See [here](https://pr-agent-docs.codium.ai/installation/pr_agent_pro/) for more details. \ No newline at end of file diff --git a/docs/docs/installation/locally.md b/docs/docs/installation/locally.md new file mode 100644 index 000000000..123a6c033 --- /dev/null +++ b/docs/docs/installation/locally.md @@ -0,0 +1,121 @@ +## Using pip package + +Install the package: + +``` +pip install pr-agent +``` + +Then run the relevant tool with the script below. +
+Make sure to fill in the required parameters (`user_token`, `openai_key`, `pr_url`, `command`): + +```python +from pr_agent import cli +from pr_agent.config_loader import get_settings + +def main(): + # Fill in the following values + provider = "github" # GitHub provider + user_token = "..." # GitHub user token + openai_key = "..." # OpenAI key + pr_url = "..." # PR URL, for example 'https://github.com/Codium-ai/pr-agent/pull/809' + command = "/review" # Command to run (e.g. '/review', '/describe', '/ask="What is the purpose of this PR?"', ...) + + # Setting the configurations + get_settings().set("CONFIG.git_provider", provider) + get_settings().set("openai.key", openai_key) + get_settings().set("github.user_token", user_token) + + # Run the command. Feedback will appear in GitHub PR comments + cli.run_command(pr_url, command) + + +if __name__ == '__main__': + main() +``` + +## Using Docker image + +A list of the relevant tools can be found in the [tools guide](../tools/ask.md). + +To invoke a tool (for example `review`), you can run directly from the Docker image. Here's how: + +- For GitHub: +``` +docker run --rm -it -e OPENAI.KEY= -e GITHUB.USER_TOKEN= codiumai/pr-agent:latest --pr_url review +``` + +- For GitLab: +``` +docker run --rm -it -e OPENAI.KEY= -e CONFIG.GIT_PROVIDER=gitlab -e GITLAB.PERSONAL_ACCESS_TOKEN= codiumai/pr-agent:latest --pr_url review +``` + +Note: If you have a dedicated GitLab instance, you need to specify the custom url as variable: +``` +docker run --rm -it -e OPENAI.KEY= -e CONFIG.GIT_PROVIDER=gitlab -e GITLAB.PERSONAL_ACCESS_TOKEN= -e GITLAB.URL= codiumai/pr-agent:latest --pr_url review +``` + +- For BitBucket: +``` +docker run --rm -it -e CONFIG.GIT_PROVIDER=bitbucket -e OPENAI.KEY=$OPENAI_API_KEY -e BITBUCKET.BEARER_TOKEN=$BITBUCKET_BEARER_TOKEN codiumai/pr-agent:latest --pr_url= review +``` + +For other git providers, update CONFIG.GIT_PROVIDER accordingly, and check the `pr_agent/settings/.secrets_template.toml` file for the environment variables expected names and values. + +--- + + +If you want to ensure you're running a specific version of the Docker image, consider using the image's digest: +```bash +docker run --rm -it -e OPENAI.KEY= -e GITHUB.USER_TOKEN= codiumai/pr-agent@sha256:71b5ee15df59c745d352d84752d01561ba64b6d51327f97d46152f0c58a5f678 --pr_url review +``` + +Or you can run a [specific released versions](https://github.com/Codium-ai/pr-agent/blob/main/RELEASE_NOTES.md) of pr-agent, for example: +``` +codiumai/pr-agent@v0.9 +``` + +--- + +## Run from source + +1. Clone this repository: + +``` +git clone https://github.com/Codium-ai/pr-agent.git +``` + +2. Navigate to the `/pr-agent` folder and install the requirements in your favorite virtual environment: + +``` +pip install -e . +``` + +*Note: If you get an error related to Rust in the dependency installation then make sure Rust is installed and in your `PATH`, instructions: https://rustup.rs* + +3. Copy the secrets template file and fill in your OpenAI key and your GitHub user token: + +``` +cp pr_agent/settings/.secrets_template.toml pr_agent/settings/.secrets.toml +chmod 600 pr_agent/settings/.secrets.toml +# Edit .secrets.toml file +``` + +4. Run the cli.py script: + +``` +python3 -m pr_agent.cli --pr_url review +python3 -m pr_agent.cli --pr_url ask +python3 -m pr_agent.cli --pr_url describe +python3 -m pr_agent.cli --pr_url improve +python3 -m pr_agent.cli --pr_url add_docs +python3 -m pr_agent.cli --pr_url generate_labels +python3 -m pr_agent.cli --issue_url similar_issue +... 
+``` + +[Optional]ย Add the pr_agent folder to your PYTHONPATH +``` +export PYTHONPATH=$PYTHONPATH: +``` \ No newline at end of file diff --git a/docs/docs/installation/pr_agent_pro.md b/docs/docs/installation/pr_agent_pro.md new file mode 100644 index 000000000..44c9c0279 --- /dev/null +++ b/docs/docs/installation/pr_agent_pro.md @@ -0,0 +1,68 @@ + +## Getting Started with PR-Agent Pro + +PR-Agent Pro is a versatile application compatible with GitHub, GitLab, and BitBucket, hosted by CodiumAI. +See [here](https://pr-agent-docs.codium.ai/#pr-agent-pro) for more details about the benefits of using PR-Agent Pro. + +Interested parties can subscribe to PR-Agent Pro through the following [link](https://www.codium.ai/pricing/). +After subscribing, you are granted the ability to easily install the application across any of your repositories. + +![PR Agent Pro](https://codium.ai/images/pr_agent/pr_agent_pro_install.png){width=468} + +Each user who wants to use PR-Agent pro needs to buy a seat. +Initially, CodiumAI offers a two-week trial period at no cost, after which continued access requires each user to secure a personal seat. +Once a user acquires a seat, they gain the flexibility to use PR-Agent Pro across any repository where it was enabled. + +Users without a purchased seat who interact with a repository featuring PR-Agent Pro are entitled to receive up to five complimentary feedbacks. +Beyond this limit, PR-Agent Pro will cease to respond to their inquiries unless a seat is purchased. + +## Install PR-Agent Pro for GitHub Enterprise Server +You can install PR-Agent Pro application on your GitHub Enterprise Server, and enjoy two weeks of free trial. +After the trial period, to continue using PR-Agent Pro, you will need to contact us for an [Enterprise license](https://www.codium.ai/pricing/). + + +## Install PR-Agent Pro for GitLab (Teams & Enterprise) + +Since GitLab platform does not support apps, installing PR-Agent Pro for GitLab is a bit more involved, and requires the following steps: + +### Step 1 + +Acquire a personal, project or group level access token. Enable the โ€œapiโ€ scope in order to allow PR-Agent to read pull requests, comment and respond to requests. + +
+![Step 1](https://www.codium.ai/images/pr_agent/gitlab_pro_pat.png){width=750} +
+ +Store the token in a safe place; you won't be able to access it again after it is generated. + +### Step 2 + +Generate a shared secret and link it to the access token. Browse to [https://register.gitlab.pr-agent.codium.ai](https://register.gitlab.pr-agent.codium.ai). +Fill in your generated GitLab token and your company or personal name in the appropriate fields and click "Submit". + +You should see "Success!" displayed above the Submit button, and a shared secret will be generated. Store it in a safe place; you won't be able to access it again after it is generated. + +### Step 3 + +Install a webhook for your repository or group by clicking "Webhooks" in the settings menu, then click the "Add new webhook" button. + +
+![Step 3.1](https://www.codium.ai/images/pr_agent/gitlab_pro_add_webhook.png) +
+ +In the webhook definition form, fill in the following fields: + +- **URL**: https://pro.gitlab.pr-agent.codium.ai/webhook +- **Secret token**: your CodiumAI shared secret (generated in Step 2) +- **Trigger**: check the 'Comments' and 'Merge request events' boxes +- **Enable SSL verification**: check the box + +
+![Step 3.2](https://www.codium.ai/images/pr_agent/gitlab_pro_webhooks.png){width=750} +
+ +### Step 4 + +Youโ€™re all set! + +Open a new merge request or add a MR comment with one of PR-Agentโ€™s commands such as /review, /describe or /improve. \ No newline at end of file diff --git a/docs/docs/overview/data_privacy.md b/docs/docs/overview/data_privacy.md new file mode 100644 index 000000000..dff5ca601 --- /dev/null +++ b/docs/docs/overview/data_privacy.md @@ -0,0 +1,16 @@ +## Self-hosted PR-Agent + +- If you self-host PR-Agent with your OpenAI (or other LLM provider) API key, it is between you and the provider. We don't send your code data to PR-Agent servers. + +## PR-Agent Pro ๐Ÿ’Ž + +- When using PR-Agent Pro ๐Ÿ’Ž, hosted by CodiumAI, we will not store any of your data, nor will we use it for training. You will also benefit from an OpenAI account with zero data retention. + +- For certain clients, CodiumAI-hosted PR-Agent Pro will use CodiumAIโ€™s proprietary models. If this is the case, you will be notified. + +- No passive collection of Code and Pull Requestsโ€™ data โ€” PR-Agent will be active only when you invoke it, and it will then extract and analyze only data relevant to the executed command and queried pull request. + + +## PR-Agent Chrome extension + +- The [PR-Agent Chrome extension](https://chromewebstore.google.com/detail/pr-agent-chrome-extension/ephlnjeghhogofkifjloamocljapahnl) will not send your code to any external servers. diff --git a/docs/docs/overview/index.md b/docs/docs/overview/index.md new file mode 100644 index 000000000..19480a601 --- /dev/null +++ b/docs/docs/overview/index.md @@ -0,0 +1,94 @@ +# Overview + +CodiumAI PR-Agent is an open-source tool to help efficiently review and handle pull requests. + +- See the [Installation Guide](./installation/index.md) for instructions on installing and running the tool on different git platforms. + +- See the [Usage Guide](./usage-guide/index.md) for instructions on running the PR-Agent commands via different interfaces, including _CLI_, _online usage_, or by _automatically triggering_ them when a new PR is opened. + +- See the [Tools Guide](./tools/index.md) for a detailed description of the different tools. + + +## PR-Agent Docs Smart Search + +To search the documentation site using natural language: + +1) Comment `/help "your question"` in either: + + - A pull request where PR-Agent is installed + - A [PR Chat](https://pr-agent-docs.codium.ai/chrome-extension/features/#pr-chat) + +2) PR-Agent will respond with an [answer](https://github.com/Codium-ai/pr-agent/pull/1241#issuecomment-2365259334) that includes relevant documentation links. + + +## PR-Agent Features + +PR-Agent offers extensive pull request functionalities across various git providers. 
+ +| | | GitHub | Gitlab | Bitbucket | Azure DevOps | +|-------|-----------------------------------------------------------------------------------------------------------------------|:------:|:------:|:---------:|:------------:| +| TOOLS | Review | โœ… | โœ… | โœ… | โœ… | +| | โฎ‘ Incremental | โœ… | | | | +| | โฎ‘ [SOC2 Compliance](https://pr-agent-docs.codium.ai/tools/review/#soc2-ticket-compliance){:target="_blank"} ๐Ÿ’Ž | โœ… | โœ… | โœ… | โœ… | +| | Ask | โœ… | โœ… | โœ… | โœ… | +| | Describe | โœ… | โœ… | โœ… | โœ… | +| | โฎ‘ [Inline file summary](https://pr-agent-docs.codium.ai/tools/describe/#inline-file-summary){:target="_blank"} ๐Ÿ’Ž | โœ… | โœ… | | โœ… | +| | Improve | โœ… | โœ… | โœ… | โœ… | +| | โฎ‘ Extended | โœ… | โœ… | โœ… | โœ… | +| | [Custom Prompt](./tools/custom_prompt.md){:target="_blank"} ๐Ÿ’Ž | โœ… | โœ… | โœ… | โœ… | +| | Reflect and Review | โœ… | โœ… | โœ… | โœ… | +| | Update CHANGELOG.md | โœ… | โœ… | โœ… | ๏ธ | +| | Find Similar Issue | โœ… | | | ๏ธ | +| | [Add PR Documentation](./tools/documentation.md){:target="_blank"} ๐Ÿ’Ž | โœ… | โœ… | | โœ… | +| | [Generate Custom Labels](./tools/describe.md#handle-custom-labels-from-the-repos-labels-page-๐Ÿ’Ž){:target="_blank"} ๐Ÿ’Ž | โœ… | โœ… | | โœ… | +| | [Analyze PR Components](./tools/analyze.md){:target="_blank"} ๐Ÿ’Ž | โœ… | โœ… | | โœ… | +| | | | | | ๏ธ | +| USAGE | CLI | โœ… | โœ… | โœ… | โœ… | +| | App / webhook | โœ… | โœ… | โœ… | โœ… | +| | Actions | โœ… | | | ๏ธ | +| | | | | | +| CORE | PR compression | โœ… | โœ… | โœ… | โœ… | +| | Repo language prioritization | โœ… | โœ… | โœ… | โœ… | +| | Adaptive and token-aware file patch fitting | โœ… | โœ… | โœ… | โœ… | +| | Multiple models support | โœ… | โœ… | โœ… | โœ… | +| | Incremental PR review | โœ… | | | | +| | [Static code analysis](./tools/analyze.md/){:target="_blank"} ๐Ÿ’Ž | โœ… | โœ… | โœ… | โœ… | +| | [Multiple configuration options](./usage-guide/configuration_options.md){:target="_blank"} ๐Ÿ’Ž | โœ… | โœ… | โœ… | โœ… | + +๐Ÿ’Ž marks a feature available only in [PR-Agent Pro](https://www.codium.ai/pricing/){:target="_blank"} + + +## Example Results +
+ +#### [/describe](https://github.com/Codium-ai/pr-agent/pull/530) +
+![/describe](https://www.codium.ai/images/pr_agent/describe_new_short_main.png){width=512} +
+
+ +#### [/review](https://github.com/Codium-ai/pr-agent/pull/732#issuecomment-1975099151) +
+![/review](https://www.codium.ai/images/pr_agent/review_new_short_main.png){width=512} +
+
+ +#### [/improve](https://github.com/Codium-ai/pr-agent/pull/732#issuecomment-1975099159) +
+![/improve](https://www.codium.ai/images/pr_agent/improve_new_short_main.png){width=512} +
+
+ +#### [/generate_labels](https://github.com/Codium-ai/pr-agent/pull/530) +
+![/generate_labels](https://www.codium.ai/images/pr_agent/geneare_custom_labels_main_short.png){width=300} +
+
+ +## How it Works + +The following diagram illustrates PR-Agent tools and their flow: + +![PR-Agent Tools](https://codium.ai/images/pr_agent/diagram-v0.9.png) + +Check out the [PR Compression strategy](core-abilities/index.md) page for more details on how we convert a code diff to a manageable LLM prompt \ No newline at end of file diff --git a/docs/docs/overview/pr_agent_pro.md b/docs/docs/overview/pr_agent_pro.md new file mode 100644 index 000000000..962bd7d5e --- /dev/null +++ b/docs/docs/overview/pr_agent_pro.md @@ -0,0 +1,52 @@ +### Overview + +[PR-Agent Pro](https://www.codium.ai/pricing/) is a hosted version of PR-Agent, provided by CodiumAI. A complimentary two-week trial is offered, followed by a monthly subscription fee. +PR-Agent Pro is designed for companies and teams that require additional features and capabilities. It provides the following benefits: + +1. **Fully managed** - We take care of everything for you - hosting, models, regular updates, and more. Installation is as simple as signing up and adding the PR-Agent app to your GitHub\GitLab\BitBucket repo. + +2. **Improved privacy** - No data will be stored or used to train models. PR-Agent Pro will employ zero data retention, and will use an OpenAI and Claude accounts with zero data retention. + +3. **Improved support** - PR-Agent Pro users will receive priority support, and will be able to request new features and capabilities. + +4. **Supporting self-hosted git servers** - PR-Agent Pro can be installed on GitHub Enterprise Server, GitLab, and BitBucket. For more information, see the [installation guide](https://pr-agent-docs.codium.ai/installation/pr_agent_pro/). + +5. **PR Chat** - PR-Agent Pro allows you to engage in [private chat](https://pr-agent-docs.codium.ai/chrome-extension/features/#pr-chat) about your pull requests on private repositories. + +### Additional features + +Here are some of the additional features and capabilities that PR-Agent Pro offers: + +| Feature | Description | +|----------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| [**Model selection**](https://pr-agent-docs.codium.ai/usage-guide/PR_agent_pro_models/#pr-agent-pro-models) | Choose the model that best fits your needs, among top models like `GPT4` and `Claude-Sonnet-3.5` +| [**Global and wiki configuration**](https://pr-agent-docs.codium.ai/usage-guide/configuration_options/) | Control configurations for many repositories from a single location;
Edit configuration of a single repo without commiting code | +| [**Apply suggestions**](https://pr-agent-docs.codium.ai/tools/improve/#overview) | Generate commitable code from the relevant suggestions interactively by clicking on a checkbox | +| [**Suggestions impact**](https://pr-agent-docs.codium.ai/tools/improve/#assessing-impact) | Automatically mark suggestions that were implemented by the user (either directly in GitHub, or indirectly in the IDE) to enable tracking of the impact of the suggestions | +| [**CI feedback**](https://pr-agent-docs.codium.ai/tools/ci_feedback/) | Automatically analyze failed CI checks on GitHub and provide actionable feedback in the PR conversation, helping to resolve issues quickly | +| [**Advanced usage statistics**](https://www.codium.ai/contact/#/) | PR-Agent Pro offers detailed statistics at user, repository, and company levels, including metrics about PR-Agent usage, and also general statistics and insights | +| [**Incorporating companies' best practices**](https://pr-agent-docs.codium.ai/tools/improve/#best-practices) | Use the companies' best practices as reference to increase the effectiveness and the relevance of the code suggestions | +| [**Interactive triggering**](https://pr-agent-docs.codium.ai/tools/analyze/#example-usage) | Interactively apply different tools via the `analyze` command | +| [**SOC2 compliance check**](https://pr-agent-docs.codium.ai/tools/review/#configuration-options) | Ensures the PR contains a ticket to a project management system (e.g., Jira, Asana, Trello, etc.) +| [**Custom labels**](https://pr-agent-docs.codium.ai/tools/describe/#handle-custom-labels-from-the-repos-labels-page) | Define custom labels for PR-Agent to assign to the PR | + +### Additional tools + +Here are additional tools that are available only for PR-Agent Pro users: + +| Feature | Description | +|---------|-------------| +| [**Custom Prompt Suggestions**](https://pr-agent-docs.codium.ai/tools/custom_prompt/) | Generate code suggestions based on custom prompts from the user | +| [**Analyze PR components**](https://pr-agent-docs.codium.ai/tools/analyze/) | Identify the components that changed in the PR, and enable to interactively apply different tools to them | +| [**Tests**](https://pr-agent-docs.codium.ai/tools/test/) | Generate tests for code components that changed in the PR | +| [**PR documentation**](https://pr-agent-docs.codium.ai/tools/documentation/) | Generate docstring for code components that changed in the PR | +| [**Improve Component**](https://pr-agent-docs.codium.ai/tools/improve_component/) | Generate code suggestions for code components that changed in the PR | +| [**Similar code search**](https://pr-agent-docs.codium.ai/tools/similar_code/) | Search for similar code in the repository, organization, or entire GitHub | + + +### Supported languages + +PR-Agent Pro leverages the world's leading code models - Claude 3.5 Sonnet and GPT-4. +As a result, its primary tools such as `describe`, `review`, and `improve`, as well as the PR-chat feature, support virtually all programming languages. + +For specialized commands that require static code analysis, PR-Agent Pro offers support for specific languages. For more details about features that require static code analysis, please refer to the [documentation](https://pr-agent-docs.codium.ai/tools/analyze/#overview). 
\ No newline at end of file diff --git a/docs/docs/tools/analyze.md b/docs/docs/tools/analyze.md new file mode 100644 index 000000000..05458b132 --- /dev/null +++ b/docs/docs/tools/analyze.md @@ -0,0 +1,19 @@ +## Overview +The `analyze` tool combines advanced static code analysis with LLM capabilities to provide a comprehensive analysis of the PR code changes. + +The tool scans the PR code changes, finds the code components (methods, functions, classes) that changed, and enables to interactively generate tests, docs, code suggestions and similar code search for each component. + +It can be invoked manually by commenting on any PR: +``` +/analyze +``` + +## Example usage + +An example result: + +![Analyze 1](https://codium.ai/images/pr_agent/analyze_1.png){width=750} + +**Notes** + +- Language that are currently supported: Python, Java, C++, JavaScript, TypeScript, C#. \ No newline at end of file diff --git a/docs/docs/tools/ask.md b/docs/docs/tools/ask.md new file mode 100644 index 000000000..fca5b7254 --- /dev/null +++ b/docs/docs/tools/ask.md @@ -0,0 +1,59 @@ +## Overview + +The `ask` tool answers questions about the PR, based on the PR code changes. Make sure to be specific and clear in your questions. +It can be invoked manually by commenting on any PR: +``` +/ask "..." +``` + +## Example usage + +![Ask Comment](https://codium.ai/images/pr_agent/ask_comment.png){width=512} + +![Ask](https://codium.ai/images/pr_agent/ask.png){width=512} + +## Ask lines + +You can run `/ask` on specific lines of code in the PR from the PR's diff view. The tool will answer questions based on the code changes in the selected lines. +- Click on the '+' sign next to the line number to select the line. +- To select multiple lines, click on the '+' sign of the first line and then hold and drag to select the rest of the lines. +- write `/ask "..."` in the comment box and press `Add single comment` button. + +![Ask Line](https://codium.ai/images/pr_agent/Ask_line.png){width=512} + +Note that the tool does not have "memory" of previous questions, and answers each question independently. + +## Ask on images + +You can also ask questions about images that appear in the comment, where the entire PR code will be used as context. +
+The basic syntax is: +``` +/ask "..." + +[Image](https://real_link_to_image) +``` +where `https://real_link_to_image` is the direct link to the image. + +Note that GitHub has a built-in mechanism of pasting images in comments. However, pasted image does not provide a direct link. +To get a direct link to an image, we recommend using the following scheme: + +1) First, post a comment that contains **only** the image: + +![Ask image1](https://codium.ai/images/pr_agent/ask_images1.png){width=512} + +2) Quote reply to that comment: + +![Ask image2](https://codium.ai/images/pr_agent/ask_images2.png){width=512} + +3) In the screen opened, type the question below the image: + +![Ask image3](https://codium.ai/images/pr_agent/ask_images3.png){width=512} +![Ask image4](https://codium.ai/images/pr_agent/ask_images4.png){width=512} + +4) Post the comment, and receive the answer: + +![Ask image5](https://codium.ai/images/pr_agent/ask_images5.png){width=512} + + +See a full video tutorial [here](https://codium.ai/images/pr_agent/ask_image_video.mov) \ No newline at end of file diff --git a/docs/docs/tools/ci_feedback.md b/docs/docs/tools/ci_feedback.md new file mode 100644 index 000000000..10f024fa4 --- /dev/null +++ b/docs/docs/tools/ci_feedback.md @@ -0,0 +1,39 @@ +## Overview + +The CI feedback tool (`/checks)` automatically triggers when a PR has a failed check. +The tool analyzes the failed checks and provides several feedbacks: + +- Failed stage +- Failed test name +- Failure summary +- Relevant error logs + +## Example usage + +![Failed Check 1](https://www.codium.ai/images/pr_agent/failed_check1.png){width=768} + +→ +![Failed Check 2](https://www.codium.ai/images/pr_agent/failed_check2.png){width=768} + +___ + +In addition to being automatically triggered, the tool can also be invoked manually by commenting on a PR: +``` +/checks "https://github.com/{repo_name}/actions/runs/{run_number}/job/{job_number}" +``` +where `{repo_name}` is the name of the repository, `{run_number}` is the run number of the failed check, and `{job_number}` is the job number of the failed check. + +## Disabling the tool from running automatically + +If you wish to disable the tool from running automatically, you can do so by adding the following configuration to the configuration file: +``` +[checks] +enable_auto_checks_feedback = false +``` + +## Configuration options +- `enable_auto_checks_feedback` - if set to true, the tool will automatically provide feedback when a check is failed. Default is true. +- `excluded_checks_list` - a list of checks to exclude from the feedback, for example: ["check1", "check2"]. Default is an empty list. +- `persistent_comment` - if set to true, the tool will overwrite a previous checks comment with the new feedback. Default is true. +- `enable_help_text=true` - if set to true, the tool will provide a help message when a user comments "/checks" on a PR. Default is true. +- `final_update_message` - if `persistent_comment` is true and updating a previous checks message, the tool will also create a new message: "Persistent checks updated to latest commit". Default is true. diff --git a/docs/docs/tools/custom_labels.md b/docs/docs/tools/custom_labels.md new file mode 100644 index 000000000..b9d85fa2d --- /dev/null +++ b/docs/docs/tools/custom_labels.md @@ -0,0 +1,60 @@ +## Overview +The `generate_labels` tool scans the PR code changes, and given a list of labels and their descriptions, it automatically suggests labels that match the PR code changes. 
+ +It can be invoked manually by commenting on any PR: +``` +/generate_labels +``` + +## Example usage + +If we wish to detect changes to SQL queries in a given PR, we can add the following custom label along with its description: + +![Custom labels list](https://codium.ai/images/pr_agent/custom_labels_list.png){width=768} + +When running the `generate_labels` tool on a PR that includes changes in SQL queries, it will automatically suggest the custom label: + +![Custom labels published](https://codium.ai/images/pr_agent/custom_label_published.png){width=768} + +Note that in addition to the dedicated tool `generate_labels`, the custom labels will also be used by the `describe` tool. + +### How to enable custom labels +There are 3 ways to enable custom labels: + +#### 1. CLI (local configuration file) +When working from CLI, you need to apply the [configuration changes](#configuration-options) to the [custom_labels file](https://github.com/Codium-ai/pr-agent/blob/main/pr_agent/settings/custom_labels.toml). + +#### 2. Repo configuration file +To enable custom labels, you need to apply the [configuration changes](#configuration-options) to the local `.pr_agent.toml` file in your repository. + +#### 3. Handle custom labels from the Repo's labels page 💎 +> This feature is available only in PR-Agent Pro + +* GitHub : `https://github.com/{owner}/{repo}/labels`, or click on the "Labels" tab in the issues or PRs page. +* GitLab : `https://gitlab.com/{owner}/{repo}/-/labels`, or click on "Manage" -> "Labels" on the left menu. + +Then add/edit the custom labels. Each label should be formatted as follows: +* Label name: The name of the custom label. +* Description: Start the description with the prefix `pr_agent:`, for example: `pr_agent: Description of when AI should suggest this label`.
+The description should be comprehensive and detailed, indicating when to add the desired label. + +![Add native custom labels](https://codium.ai/images/pr_agent/add_native_custom_labels.png){width=880} + +c. Now the custom labels will be included in the `generate_labels` tool. + +> This feature is supported in GitHub and GitLab. + +## Configuration options + - Change `enable_custom_labels` to True: This will turn off the default labels and enable the custom labels provided in the custom_labels.toml file. + - Add the custom labels. It should be formatted as follows: + +``` +[config] +enable_custom_labels=true + +[custom_labels."Custom Label Name"] +description = "Description of when AI should suggest this label" + +[custom_labels."Custom Label 2"] +description = "Description of when AI should suggest this label 2" +``` \ No newline at end of file diff --git a/docs/docs/tools/custom_prompt.md b/docs/docs/tools/custom_prompt.md new file mode 100644 index 000000000..675bed695 --- /dev/null +++ b/docs/docs/tools/custom_prompt.md @@ -0,0 +1,58 @@ +## Overview +The `custom_prompt` tool scans the PR code changes, and automatically generates suggestions for improving the PR code. +It shares similarities with the `improve` tool, but with one main difference: the `custom_prompt` tool will **only propose suggestions that follow specific guidelines defined by the prompt** in: `pr_custom_prompt.prompt` configuration. + +The tool can be triggered [automatically](../usage-guide/automations_and_usage.md#github-app-automatic-tools-when-a-new-pr-is-opened) every time a new PR is opened, or can be invoked manually by commenting on a PR. + +When commenting, use the following template: + +``` +/custom_prompt --pr_custom_prompt.prompt=" +The code suggestions should focus only on the following: +- ... +- ... + +" +``` + +With a [configuration file](../usage-guide/automations_and_usage.md#github-app), use the following template: + +``` +[pr_custom_prompt] +prompt="""\ +The suggestions should focus only on the following: +-... +-... + +""" +``` + +Remember - with this tool, you are the prompter. Be specific, clear, and concise in the instructions. Specify relevant aspects that you want the model to focus on. \ +You might benefit from several trial-and-error iterations, until you get the correct prompt for your use case. + +## Example usage + +Here is an example of a possible prompt, defined in the configuration file: +``` +[pr_custom_prompt] +prompt="""\ +The code suggestions should focus only on the following: +- look for edge cases when implementing a new function +- make sure every variable has a meaningful name +- make sure the code is efficient +""" +``` + +(The instructions above are just an example. We want to emphasize that the prompt should be specific and clear, and be tailored to the needs of your project) + +Results obtained with the prompt above: + +![Custom prompt results](https://codium.ai/images/pr_agent/custom_suggestions_result.png){width=768} + +## Configuration options + +`prompt`: the prompt for the tool. It should be a multi-line string. + +`num_code_suggestions`: number of code suggestions provided by the 'custom_prompt' tool. Default is 4. + +`enable_help_text`: if set to true, the tool will display a help text in the comment. Default is true. 
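+
+For reference, the options above can also be grouped in a single repo-level configuration block. The snippet below is only an illustrative sketch, assuming all three options live under the same `[pr_custom_prompt]` section; the values are examples, not recommended defaults:
+
+```
+[pr_custom_prompt]
+prompt="""\
+The code suggestions should focus only on the following:
+- ...
+- ...
+"""
+num_code_suggestions = 4
+enable_help_text = true
+```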
\ No newline at end of file diff --git a/docs/docs/tools/describe.md b/docs/docs/tools/describe.md new file mode 100644 index 000000000..9fe50e668 --- /dev/null +++ b/docs/docs/tools/describe.md @@ -0,0 +1,224 @@ +## Overview +The `describe` tool scans the PR code changes, and generates a description for the PR - title, type, summary, walkthrough and labels. + +The tool can be triggered automatically every time a new PR is [opened](../usage-guide/automations_and_usage.md#github-app-automatic-tools-when-a-new-pr-is-opened), or it can be invoked manually by commenting on any PR: +``` +/describe +``` + +## Example usage + +### Manual triggering + +Invoke the tool manually by commenting `/describe` on any PR: + +![Describe comment](https://codium.ai/images/pr_agent/describe_comment.png){width=512} + +After ~30 seconds, the tool will generate a description for the PR: + +![Describe New](https://codium.ai/images/pr_agent/describe_new.png){width=512} + +If you want to edit [configurations](#configuration-options), add the relevant ones to the command: +``` +/describe --pr_description.some_config1=... --pr_description.some_config2=... +``` + +### Automatic triggering + +To run the `describe` automatically when a PR is opened, define in a [configuration file](https://pr-agent-docs.codium.ai/usage-guide/configuration_options/#wiki-configuration-file): +``` +[github_app] +pr_commands = [ + "/describe", + ... +] + +[pr_description] +publish_labels = ... +... +``` + +- The `pr_commands` lists commands that will be executed automatically when a PR is opened. +- The `[pr_description]` section contains the configurations for the `describe` tool you want to edit (if any). + + +## Configuration options + +!!! example "Possible configurations" + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| Parameter | Description |
+|---|---|
+| `publish_labels` | If set to true, the tool will publish the labels to the PR. Default is true. |
+| `publish_description_as_comment` | If set to true, the tool will publish the description as a comment to the PR. If false, it will overwrite the original description. Default is false. |
+| `publish_description_as_comment_persistent` | If set to true and `publish_description_as_comment` is true, the tool will publish the description as a persistent comment to the PR. Default is true. |
+| `add_original_user_description` | If set to true, the tool will add the original user description to the generated description. Default is true. |
+| `generate_ai_title` | If set to true, the tool will also generate an AI title for the PR. Default is false. |
+| `extra_instructions` | Optional extra instructions to the tool. For example: "focus on the changes in the file X. Ignore change in ..." |
+| `enable_pr_type` | If set to false, it will not show the `PR type` as a text value in the description content. Default is true. |
+| `final_update_message` | If set to true, it will add a comment message [`PR Description updated to latest commit...`](https://github.com/Codium-ai/pr-agent/pull/499#issuecomment-1837412176) after finishing calling `/describe`. Default is false. |
+| `enable_semantic_files_types` | If set to true, "Changes walkthrough" section will be generated. Default is true. |
+| `collapsible_file_list` | If set to true, the file list in the "Changes walkthrough" section will be collapsible. If set to "adaptive", the file list will be collapsible only if there are more than 8 files. Default is "adaptive". |
+| `enable_large_pr_handling` | Pro feature. If set to true, in case of a large PR the tool will make several calls to the AI and combine them to be able to cover more files. Default is true. |
+| `enable_help_text` | If set to true, the tool will display a help text in the comment. Default is false. |
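+
+For example, a few of these parameters could be set together in a configuration file. This is only an illustrative sketch; the values shown are examples rather than recommended settings:
+
+```
+[pr_description]
+publish_labels = true
+generate_ai_title = false
+collapsible_file_list = "adaptive"
+extra_instructions = "focus on the changes in the file X. Ignore change in ..."
+```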
+ + +## Inline file summary ๐Ÿ’Ž + +This feature enables you to copy the `changes walkthrough` table to the "Files changed" tab, so you can quickly understand the changes in each file while reviewing the code changes (diff view). + +To copy the `changes walkthrough` table to the "Files changed" tab, you can click on the checkbox that appears PR Description status message below the main PR Description: + +![Add table checkbox](https://codium.ai/images/pr_agent/add_table_checkbox.png){width=512} + +If you prefer to have the file summaries appear in the "Files changed" tab on every PR, change the `pr_description.inline_file_summary` parameter in the configuration file, possible values are: + +- `'table'`: File changes walkthrough table will be displayed on the top of the "Files changed" tab, in addition to the "Conversation" tab. + +![Diffview table](https://codium.ai/images/pr_agent/diffview-table.png){width=512} + +- `true`: A collapsible file comment with changes title and a changes summary for each file in the PR. + +![Diffview changes](https://codium.ai/images/pr_agent/diffview_changes.png){width=512} + +- `false` (`default`): File changes walkthrough will be added only to the "Conversation" tab. + +**Note**: that this feature is currently available only for GitHub. + + +## Markers template + +To enable markers, set `pr_description.use_description_markers=true`. +Markers enable to easily integrate user's content and auto-generated content, with a template-like mechanism. + +For example, if the PR original description was: +``` +User content... + +## PR Type: +pr_agent:type + +## PR Description: +pr_agent:summary + +## PR Walkthrough: +pr_agent:walkthrough +``` +The marker `pr_agent:type` will be replaced with the PR type, `pr_agent:summary` will be replaced with the PR summary, and `pr_agent:walkthrough` will be replaced with the PR walkthrough. + +![Describe markers before](https://codium.ai/images/pr_agent/describe_markers_before.png){width=512} + +→ + +![Describe markers after](https://codium.ai/images/pr_agent/describe_markers_after.png){width=512} + + +**Configuration params**: + +- `use_description_markers`: if set to true, the tool will use markers template. It replaces every marker of the form `pr_agent:marker_name` with the relevant content. Default is false. +- `include_generated_by_header`: if set to true, the tool will add a dedicated header: 'Generated by PR Agent at ...' to any automatic content. Default is true. + +## Custom labels +The default labels of the describe tool are quite generic, since they are meant to be used in any repo: [`Bug fix`, `Tests`, `Enhancement`, `Documentation`, `Other`]. + +You can define custom labels that are relevant for your repo and use cases. +Custom labels can be defined in a [configuration file](https://pr-agent-docs.codium.ai/tools/custom_labels/#configuration-options), or directly in the repo's [labels page](#handle-custom-labels-from-the-repos-labels-page). + +Make sure to provide proper title, and a detailed and well-phrased description for each label, so the tool will know when to suggest it. +Each label description should be a **conditional statement**, that indicates if to add the label to the PR or not, according to the PR content. 
+ +### Handle custom labels from a configuration file +Example of a custom labels setup in a configuration file: +``` +[config] +enable_custom_labels=true + + +[custom_labels."sql_changes"] +description = "Use when a PR contains changes to SQL queries" + +[custom_labels."test"] +description = "use when a PR primarily contains new tests" + +... +``` + +### Handle custom labels from the Repo's labels page 💎 + +You can also control the custom labels that will be suggested by the `describe` tool from the repo's labels page: + +* GitHub : go to `https://github.com/{owner}/{repo}/labels` (or click on the "Labels" tab in the issues or PRs page) +* GitLab : go to `https://gitlab.com/{owner}/{repo}/-/labels` (or click on "Manage" -> "Labels" on the left menu) + +Now add/edit the custom labels. They should be formatted as follows: + +* Label name: The name of the custom label. +* Description: Start the description with the prefix `pr_agent:`, for example: `pr_agent: Description of when AI should suggest this label`.
+ +Examples for custom labels: + + - `Main topic:performance` - pr_agent:The main topic of this PR is performance + - `New endpoint` - pr_agent:A new endpoint was added in this PR + - `SQL query` - pr_agent:A new SQL query was added in this PR + - `Dockerfile changes` - pr_agent:The PR contains changes in the Dockerfile + - ... + +The description should be comprehensive and detailed, indicating when to add the desired label. For example: +![Add native custom labels](https://codium.ai/images/pr_agent/add_native_custom_labels.png){width=768} + + +## Usage Tips + +!!! tip "Automation" + - When you first install PR-Agent app, the [default mode](../usage-guide/automations_and_usage.md#github-app) for the describe tool is: + ``` + pr_commands = ["/describe", ...] + ``` + meaning the `describe` tool will run automatically on every PR, with the default configurations. + + + - Markers are an alternative way to control the generated description, to give maximal control to the user. If you set: + ``` + pr_commands = ["/describe --pr_description.use_description_markers=true", ...] + ``` + the tool will replace every marker of the form `pr_agent:marker_name` in the PR description with the relevant content, where `marker_name` is one of the following: + * `type`: the PR type. + * `summary`: the PR summary. + * `walkthrough`: the PR walkthrough. + + - Note that when markers are enabled, if the original PR description does not contain any markers, the tool will not alter the description at all. diff --git a/docs/docs/tools/documentation.md b/docs/docs/tools/documentation.md new file mode 100644 index 000000000..bb1330d16 --- /dev/null +++ b/docs/docs/tools/documentation.md @@ -0,0 +1,33 @@ +## Overview +The `add_docs` tool scans the PR code changes, and automatically suggests documentation for any code components that changed in the PR (functions, classes, etc.). + +It can be invoked manually by commenting on any PR: +``` +/add_docs +``` + +## Example usage + +Invoke the tool manually by commenting `/add_docs` on any PR: + +![Docs command](https://codium.ai/images/pr_agent/docs_command.png){width=768} + +The tool will generate documentation for all the components that changed in the PR: + +![Docs component](https://codium.ai/images/pr_agent/docs_components.png){width=768} + +![Docs single component](https://codium.ai/images/pr_agent/docs_single_component.png){width=768} + +You can state a name of a specific component in the PR to get documentation only for that component: +``` +/add_docs component_name +``` + +## Configuration options + - `docs_style`: The exact style of the documentation (for python docstring). you can choose between: `google`, `numpy`, `sphinx`, `restructuredtext`, `plain`. Default is `sphinx`. + - `extra_instructions`: Optional extra instructions to the tool. For example: "focus on the changes in the file X. Ignore change in ...". + +**Notes** + +- Language that are currently fully supported: Python, Java, C++, JavaScript, TypeScript, C#. +- This tool can also be triggered interactively by using the [`analyze`](./analyze.md) tool. \ No newline at end of file diff --git a/docs/docs/tools/help.md b/docs/docs/tools/help.md new file mode 100644 index 000000000..5cfc1e81f --- /dev/null +++ b/docs/docs/tools/help.md @@ -0,0 +1,17 @@ +## Overview +The `help` tool provides a list of all the available tools and their descriptions. +For PR-Agent Pro users, it also enables to trigger each tool by checking the relevant box. 
+ +It can be invoked manually by commenting on any PR: +``` +/help +``` + +## Example usage +An example [result](https://github.com/Codium-ai/pr-agent/pull/546#issuecomment-1868524805): + +![Help 1](https://codium.ai/images/pr_agent/help1.png){width=750} + +→ + +![Analyze 2](https://codium.ai/images/pr_agent/help2.png){width=750} diff --git a/docs/docs/tools/improve.md b/docs/docs/tools/improve.md new file mode 100644 index 000000000..c0ba56970 --- /dev/null +++ b/docs/docs/tools/improve.md @@ -0,0 +1,269 @@ +## Overview +The `improve` tool scans the PR code changes, and automatically generates [meaningful](https://github.com/Codium-ai/pr-agent/blob/main/pr_agent/settings/pr_code_suggestions_prompts.toml#L41) suggestions for improving the PR code. +The tool can be triggered automatically every time a new PR is [opened](../usage-guide/automations_and_usage.md#github-app-automatic-tools-when-a-new-pr-is-opened), or it can be invoked manually by commenting on any PR: +``` +/improve +``` + +![code_suggestions_as_comment_closed.png](https://codium.ai/images/pr_agent/code_suggestions_as_comment_closed.png){width=512} + +![code_suggestions_as_comment_open.png](https://codium.ai/images/pr_agent/code_suggestions_as_comment_open.png){width=512} + +Note that the `Apply this suggestion` checkbox, which interactively converts a suggestion into a commitable code comment, is available only for PR-Agent Pro ๐Ÿ’Ž users. + + +## Example usage + +### Manual triggering + +Invoke the tool manually by commenting `/improve` on any PR. The code suggestions by default are presented as a single comment: + +To edit [configurations](#configuration-options) related to the improve tool, use the following template: +``` +/improve --pr_code_suggestions.some_config1=... --pr_code_suggestions.some_config2=... +``` + +For example, you can choose to present all the suggestions as commitable code comments, by running the following command: +``` +/improve --pr_code_suggestions.commitable_code_suggestions=true +``` + +![improve](https://codium.ai/images/pr_agent/improve.png){width=512} + + +As can be seen, a single table comment has a significantly smaller PR footprint. We recommend this mode for most cases. +Also note that collapsible are not supported in _Bitbucket_. Hence, the suggestions can only be presented in Bitbucket as code comments. + +### Automatic triggering + +To run the `improve` automatically when a PR is opened, define in a [configuration file](https://pr-agent-docs.codium.ai/usage-guide/configuration_options/#wiki-configuration-file): +``` +[github_app] +pr_commands = [ + "/improve", + ... +] + +[pr_code_suggestions] +num_code_suggestions_per_chunk = ... +... +``` + +- The `pr_commands` lists commands that will be executed automatically when a PR is opened. +- The `[pr_code_suggestions]` section contains the configurations for the `improve` tool you want to edit (if any) + +### Assessing Impact ๐Ÿ’Ž + +Note that PR-Agent pro tracks two types of implementations: + +- Direct implementation - when the user directly applies the suggestion by clicking the `Apply` checkbox. +- Indirect implementation - when the user implements the suggestion in their IDE environment. In this case, PR-Agent will utilize, after each commit, a dedicated logic to identify if a suggestion was implemented, and will mark it as implemented. 
+ +![code_suggestions_asses_impact](https://codium.ai/images/pr_agent/code_suggestions_asses_impact.png){width=512} + +In post-process, PR-Agent counts the number of suggestions that were implemented, and provides general statistics and insights about the suggestions' impact on the PR process. + +![code_suggestions_asses_impact_stats_1](https://codium.ai/images/pr_agent/code_suggestions_asses_impact_stats_1.png){width=512} + +![code_suggestions_asses_impact_stats_2](https://codium.ai/images/pr_agent/code_suggestions_asses_impact_stats_2.png){width=512} + + +## Usage Tips + +### Self-review +If you set in a configuration file: +``` +[pr_code_suggestions] +demand_code_suggestions_self_review = true +``` +The `improve` tool will add a checkbox below the suggestions, prompting user to acknowledge that they have reviewed the suggestions. +You can set the content of the checkbox text via: +``` +[pr_code_suggestions] +code_suggestions_self_review_text = "... (your text here) ..." +``` + +![self_review_1](https://codium.ai/images/pr_agent/self_review_1.png){width=512} + + + + +!!! tip "Tip - demanding self-review from the PR author ๐Ÿ’Ž" + + By setting: + ``` + [pr_code_suggestions] + approve_pr_on_self_review = true + ``` + the tool can automatically add an approval when the PR author clicks the self-review checkbox. + + + - If you set the number of required reviewers for a PR to 2, this effectively means that the PR author must click the self-review checkbox before the PR can be merged (in addition to a human reviewer). + + ![self_review_2](https://codium.ai/images/pr_agent/self_review_2.png){width=512} + + - If you keep the number of required reviewers for a PR to 1 and enable this configuration, this effectively means that the PR author can approve the PR by actively clicking the self-review checkbox. + + To prevent unauthorized approvals, this configuration defaults to false, and cannot be altered through online comments; enabling requires a direct update to the configuration file and a commit to the repository. This ensures that utilizing the feature demands a deliberate documented decision by the repository owner. + + +### How many code suggestions are generated? +PR-Agent uses a dynamic strategy to generate code suggestions based on the size of the pull request (PR). Here's how it works: + +1) Chunking large PRs: + +- PR-Agent divides large PRs into 'chunks'. +- Each chunk contains up to `pr_code_suggestions.max_context_tokens` tokens (default: 14,000). + + +2) Generating suggestions: + +- For each chunk, PR-Agent generates up to `pr_code_suggestions.num_code_suggestions_per_chunk` suggestions (default: 4). + + +This approach has two main benefits: + +- Scalability: The number of suggestions scales with the PR size, rather than being fixed. +- Quality: By processing smaller chunks, the AI can maintain higher quality suggestions, as larger contexts tend to decrease AI performance. + +Note: Chunking is primarily relevant for large PRs. For most PRs (up to 500 lines of code), PR-Agent will be able to process the entire code in a single call. + + +### 'Extra instructions' and 'best practices' + +#### Extra instructions + +>`Platforms supported: GitHub, GitLab, Bitbucket` + +You can use the `extra_instructions` configuration option to give the AI model additional instructions for the `improve` tool. +Be specific, clear, and concise in the instructions. With extra instructions, you are the prompter. Specify relevant aspects that you want the model to focus on. 
+ +Examples for possible instructions: +``` +[pr_code_suggestions] +extra_instructions="""\ +(1) Answer in japanese +(2) Don't suggest to add try-excpet block +(3) Ignore changes in toml files +... +""" +``` +Use triple quotes to write multi-line instructions. Use bullet points or numbers to make the instructions more readable. + +#### Best practices ๐Ÿ’Ž + +>`Platforms supported: GitHub, GitLab` + +Another option to give additional guidance to the AI model is by creating a dedicated [**wiki page**](https://github.com/Codium-ai/pr-agent/wiki) called `best_practices.md`. +This page can contain a list of best practices, coding standards, and guidelines that are specific to your repo/organization. + +The AI model will use this wiki page as a reference, and in case the PR code violates any of the guidelines, it will suggest improvements accordingly, with a dedicated label: `Organization +best practice`. + +Example for a `best_practices.md` content can be found [here](https://github.com/Codium-ai/pr-agent/blob/main/docs/docs/usage-guide/EXAMPLE_BEST_PRACTICE.md) (adapted from Google's [pyguide](https://google.github.io/styleguide/pyguide.html)). +This file is only an example. Since it is used as a prompt for an AI model, we want to emphasize the following: + +- It should be written in a clear and concise manner +- If needed, it should give short relevant code snippets as examples +- Recommended to limit the text to 800 lines or fewer. Hereโ€™s why: + + 1) Extremely long best practices documents may not be fully processed by the AI model. + + 2) A lengthy file probably represent a more "**generic**" set of guidelines, which the AI model is already familiar with. The objective is to focus on a more targeted set of guidelines tailored to the specific needs of this project. + +##### Local and global best practices +By default, PR-Agent will look for a local `best_practices.md` wiki file in the root of the relevant local repo. + +If you want to enable also a global `best_practices.md` wiki file, set first in the global configuration file: + +``` +[best_practices] +enable_global_best_practices = true +``` + +Then, create a `best_practices.md` wiki file in the root of [global](https://pr-agent-docs.codium.ai/usage-guide/configuration_options/#global-configuration-file) configuration repository, `pr-agent-settings`. + +##### Example results + +![best_practice](https://codium.ai/images/pr_agent/org_best_practice.png){width=512} + + +#### How to combine `extra instructions` and `best practices` + +The `extra instructions` configuration is more related to the `improve` tool prompt. It can be used, for example, to avoid specific suggestions ("Don't suggest to add try-except block", "Ignore changes in toml files", ...) or to emphasize specific aspects or formats ("Answer in Japanese", "Give only short suggestions", ...) + +In contrast, the `best_practices.md` file is a general guideline for the way code should be written in the repo. + +Using a combination of both can help the AI model to provide relevant and tailored suggestions. + +## Configuration options + +??? example "General options" + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| Parameter | Description |
+|---|---|
+| `extra_instructions` | Optional extra instructions to the tool. For example: "focus on the changes in the file X. Ignore change in ...". |
+| `commitable_code_suggestions` | If set to true, the tool will display the suggestions as commitable code comments. Default is false. |
+| `persistent_comment` | If set to true, the improve comment will be persistent, meaning that every new improve request will edit the previous one. Default is false. |
+| `self_reflect_on_suggestions` | If set to true, the improve tool will calculate an importance score for each suggestion [1-10], and sort the suggestion labels group based on this score. Default is true. |
+| `suggestions_score_threshold` | Any suggestion with importance score less than this threshold will be removed. Default is 0. Highly recommend not to set this value above 7-8, since above it may clip relevant suggestions that can be useful. |
+| `apply_suggestions_checkbox` | Enable the checkbox to create a committable suggestion. Default is true. |
+| `enable_help_text` | If set to true, the tool will display a help text in the comment. Default is true. |
+| `enable_chat_text` | If set to true, the tool will display a reference to the PR chat in the comment. Default is true. |
+ +??? example "Params for number of suggestions and AI calls" + + + + + + + + + + + + + + + + + + +
+| Parameter | Description |
+|---|---|
+| `auto_extended_mode` | Enable chunking the PR code and running the tool on each chunk. Default is true. |
+| `num_code_suggestions_per_chunk` | Number of code suggestions provided by the 'improve' tool, per chunk. Default is 4. |
+| `max_number_of_calls` | Maximum number of chunks. Default is 3. |
+| `rank_extended_suggestions` | If set to true, the tool will rank the suggestions, based on importance. Default is true. |
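+
+As an illustrative sketch, several of the options from the tables above could be combined in a configuration file. The values are examples only, not recommended defaults:
+
+```
+[pr_code_suggestions]
+commitable_code_suggestions = false
+auto_extended_mode = true
+num_code_suggestions_per_chunk = 4
+suggestions_score_threshold = 0
+```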
+ +## A note on code suggestions quality + +- AI models for code are getting better and better (Sonnet-3.5 and GPT-4), but they are not flawless. Not all the suggestions will be perfect, and a user should not accept all of them automatically. Critical reading and judgment are required. +- While mistakes of the AI are rare but can happen, a real benefit from the suggestions of the `improve` (and [`review`](https://pr-agent-docs.codium.ai/tools/review/)) tool is to catch, with high probability, **mistakes or bugs done by the PR author**, when they happen. So, it's a good practice to spend the needed ~30-60 seconds to review the suggestions, even if not all of them are always relevant. +- The hierarchical structure of the suggestions is designed to help the user to _quickly_ understand them, and to decide which ones are relevant and which are not: + + - Only if the `Category` header is relevant, the user should move to the summarized suggestion description + - Only if the summarized suggestion description is relevant, the user should click on the collapsible, to read the full suggestion description with a code preview example. + +- In addition, we recommend to use the [`extra_instructions`](https://pr-agent-docs.codium.ai/tools/improve/#extra-instructions-and-best-practices) field to guide the model to suggestions that are more relevant to the specific needs of the project. +- The interactive [PR chat](https://pr-agent-docs.codium.ai/chrome-extension/) also provides an easy way to get more tailored suggestions and feedback from the AI model. diff --git a/docs/docs/tools/improve_component.md b/docs/docs/tools/improve_component.md new file mode 100644 index 000000000..4e0c88902 --- /dev/null +++ b/docs/docs/tools/improve_component.md @@ -0,0 +1,29 @@ +## Overview +The `improve_component` tool generates code suggestions for a specific code component that changed in the PR. +it can be invoked manually by commenting on any PR: +``` +/improve_component component_name +``` + +To get a list of the components that changed in the PR and choose the relevant component interactively, use the [`analyze`](./analyze.md) tool. + + +## Example usage + +Invoke the tool manually by commenting `/improve_component` on any PR: + +![improve_component1](https://codium.ai/images/pr_agent/improve_component1.png){width=768} + +The tool will generate code suggestions for the selected component (if no component is stated, it will generate code suggestions for the largest component): + +![improve_component2](https://codium.ai/images/pr_agent/improve_component2.png){width=768} + +**Notes** +- Language that are currently supported by the tool: Python, Java, C++, JavaScript, TypeScript, C#. +- This tool can also be triggered interactively by using the [`analyze`](./analyze.md) tool. + +## Configuration options +- `num_code_suggestions`: number of code suggestions to provide. Default is 4 +- `extra_instructions`: Optional extra instructions to the tool. For example: "focus on ...". +- `file`: in case there are several components with the same name, you can specify the relevant file. +- `class_name`: in case there are several methods with the same name in the same file, you can specify the relevant class name. 
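+
+If you prefer to set these options in a configuration file rather than inline, a sketch could look like the block below. The section name `[pr_improve_component]` is an assumption here; verify the exact section and key names against `pr_agent/settings/configuration.toml` before relying on it:
+
+```
+[pr_improve_component]  # assumed section name
+num_code_suggestions = 4
+extra_instructions = "focus on ..."
+```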
\ No newline at end of file diff --git a/docs/docs/tools/index.md b/docs/docs/tools/index.md new file mode 100644 index 000000000..10aaf88ab --- /dev/null +++ b/docs/docs/tools/index.md @@ -0,0 +1,22 @@ +# Tools + +Here is a list of PR-Agent tools, each with a dedicated page that explains how to use it: + +| Tool | Description | +|------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------| +| **[PR Description (`/describe`](./describe.md))** | Automatically generating PR description - title, type, summary, code walkthrough and labels | +| **[PR Review (`/review`](./review.md))** | Adjustable feedback about the PR, possible issues, security concerns, review effort and more | +| **[Code Suggestions (`/improve`](./improve.md))** | Code suggestions for improving the PR | +| **[Question Answering (`/ask ...`](./ask.md))** | Answering free-text questions about the PR, or on specific code lines | +| **[Update Changelog (`/update_changelog`](./update_changelog.md))** | Automatically updating the CHANGELOG.md file with the PR changes | +| **[Find Similar Issue (`/similar_issue`](./similar_issues.md))** | Automatically retrieves and presents similar issues | +| **[Help (`/help`](./help.md))** | Provides a list of all the available tools. Also enables to trigger them interactively (๐Ÿ’Ž) | +| **๐Ÿ’Ž [Add Documentation (`/add_docs`](./documentation.md))** | Generates documentation to methods/functions/classes that changed in the PR | +| **๐Ÿ’Ž [Generate Custom Labels (`/generate_labels`](./custom_labels.md))** | Generates custom labels for the PR, based on specific guidelines defined by the user | +| **๐Ÿ’Ž [Analyze (`/analyze`](./analyze.md))** | Identify code components that changed in the PR, and enables to interactively generate tests, docs, and code suggestions for each component | +| **๐Ÿ’Ž [Custom Prompt (`/custom_prompt`](./custom_prompt.md))** | Automatically generates custom suggestions for improving the PR code, based on specific guidelines defined by the user | +| **๐Ÿ’Ž [Generate Tests (`/test component_name`](./test.md))** | Automatically generates unit tests for a selected component, based on the PR code changes | +| **๐Ÿ’Ž [Improve Component (`/improve_component component_name`](./improve_component.md))** | Generates code suggestions for a specific code component that changed in the PR | +| **๐Ÿ’Ž [CI Feedback (`/checks ci_job`](./ci_feedback.md))** | Automatically generates feedback and analysis for a failed CI job | + +Note that the tools marked with ๐Ÿ’Ž are available only for PR-Agent Pro users. \ No newline at end of file diff --git a/docs/docs/tools/review.md b/docs/docs/tools/review.md new file mode 100644 index 000000000..27af7d93d --- /dev/null +++ b/docs/docs/tools/review.md @@ -0,0 +1,272 @@ +## Overview +The `review` tool scans the PR code changes, and generates a list of feedbacks about the PR, aiming to aid the reviewing process. +
+The tool can be triggered automatically every time a new PR is [opened](../usage-guide/automations_and_usage.md#github-app-automatic-tools-when-a-new-pr-is-opened), or can be invoked manually by commenting on any PR: +``` +/review +``` + +Note that the main purpose of the `review` tool is to provide the **PR reviewer** with useful feedbacks and insights. The PR author, in contrast, may prefer to save time and focus on the output of the [improve](./improve.md) tool, which provides actionable code suggestions. + +(Read more about the different personas in the PR process and how PR-Agent aims to assist them in our [blog](https://www.codium.ai/blog/understanding-the-challenges-and-pain-points-of-the-pull-request-cycle/)) + + +## Example usage + +### Manual triggering + +Invoke the tool manually by commenting `/review` on any PR: + +![review comment](https://codium.ai/images/pr_agent/review_comment.png){width=512} + +After ~30 seconds, the tool will generate a review for the PR: + +![review](https://codium.ai/images/pr_agent/review3.png){width=512} + +If you want to edit [configurations](#configuration-options), add the relevant ones to the command: +``` +/review --pr_reviewer.some_config1=... --pr_reviewer.some_config2=... +``` + +### Automatic triggering + +To run the `review` automatically when a PR is opened, define in a [configuration file](https://pr-agent-docs.codium.ai/usage-guide/configuration_options/#wiki-configuration-file): +``` +[github_app] +pr_commands = [ + "/review", + ... +] + +[pr_reviewer] +num_code_suggestions = ... +... +``` + +- The `pr_commands` lists commands that will be executed automatically when a PR is opened. +- The `[pr_reviewer]` section contains the configurations for the `review` tool you want to edit (if any). + +[//]: # () +[//]: # (### Incremental Mode) + +[//]: # (Incremental review only considers changes since the last PR-Agent review. This can be useful when working on the PR in an iterative manner, and you want to focus on the changes since the last review instead of reviewing the entire PR again.) + +[//]: # (For invoking the incremental mode, the following command can be used:) + +[//]: # (```) + +[//]: # (/review -i) + +[//]: # (```) + +[//]: # (Note that the incremental mode is only available for GitHub.) + +[//]: # () +[//]: # (![incremental review](https://codium.ai/images/pr_agent/incremental_review_2.png){width=512}) + +[//]: # (### PR Reflection) + +[//]: # () +[//]: # (By invoking:) + +[//]: # (```) + +[//]: # (/reflect_and_review) + +[//]: # (```) + +[//]: # (The tool will first ask the author questions about the PR, and will guide the review based on their answers.) + +[//]: # () +[//]: # (![reflection questions](https://codium.ai/images/pr_agent/reflection_questions.png){width=512}) + +[//]: # () +[//]: # (![reflection answers](https://codium.ai/images/pr_agent/reflection_answers.png){width=512}) + +[//]: # () +[//]: # (![reflection insights](https://codium.ai/images/pr_agent/reflection_insights.png){width=512}) + + + +## Configuration options + +!!! example "General options" + + + + + + + + + + + + + + + + + + + + + + +
- `num_code_suggestions`: Number of code suggestions provided by the 'review' tool. Default is 0, meaning no code suggestions will be provided by the `review` tool.
- `inline_code_comments`: If set to true, the tool will publish the code suggestions as comments on the code diff. Default is false. Note that you need to set `num_code_suggestions`>0 to get code suggestions.
- `persistent_comment`: If set to true, the review comment will be persistent, meaning that every new review request will edit the previous one. Default is true.
- `extra_instructions`: Optional extra instructions to the tool. For example: "focus on the changes in the file X. Ignore change in ...".
- `enable_help_text`: If set to true, the tool will display a help text in the comment. Default is true.
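For example, a minimal configuration-file entry that overrides a couple of these defaults could look like the sketch below (the chosen values are purely illustrative):
```
[pr_reviewer]
persistent_comment = false
enable_help_text = false
```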
!!! example "Enable\\disable specific sub-sections"
- `require_score_review`: If set to true, the tool will add a section that scores the PR. Default is false.
- `require_tests_review`: If set to true, the tool will add a section that checks if the PR contains tests. Default is true.
- `require_estimate_effort_to_review`: If set to true, the tool will add a section that estimates the effort needed to review the PR. Default is true.
- `require_can_be_split_review`: If set to true, the tool will add a section that checks if the PR contains several themes, and can be split into smaller PRs. Default is false.
- `require_security_review`: If set to true, the tool will add a section that checks if the PR contains a possible security or vulnerability issue. Default is true.
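For instance, the sub-sections that are off by default can be switched on in the same `[pr_reviewer]` section (illustrative values):
```
[pr_reviewer]
require_score_review = true
require_can_be_split_review = true
```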
+ +!!! example "SOC2 ticket compliance ๐Ÿ’Ž" + +This sub-tool checks if the PR description properly contains a ticket to a project management system (e.g., Jira, Asana, Trello, etc.), as required by SOC2 compliance. If not, it will add a label to the PR: "Missing SOC2 ticket". + + + + + + + + + + +
- `require_soc2_ticket`: If set to true, the SOC2 ticket checker sub-tool will be enabled. Default is false.
- `soc2_ticket_prompt`: The prompt for the SOC2 ticket review. Default is: `Does the PR description include a link to ticket in a project management system (e.g., Jira, Asana, Trello, etc.) ?`. Edit this field if your compliance requirements are different.
+ +!!! example "Adding PR labels" + +You can enable\disable the `review` tool to add specific labels to the PR: + + + + + + + + + + +
- `enable_review_labels_security`: If set to true, the tool will publish a 'possible security issue' label if it detects a security issue. Default is true.
- `enable_review_labels_effort`: If set to true, the tool will publish a 'Review effort [1-5]: x' label. Default is true.
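As an illustrative sketch, keeping the effort label while dropping the security label would be:
```
[pr_reviewer]
enable_review_labels_security = false
enable_review_labels_effort = true
```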
+ +!!! example "Auto-approval" + +If enabled, the `review` tool can approve a PR when a specific comment, `/review auto_approve`, is invoked. + + + + + + + + + + +
- `enable_auto_approval`: If set to true, the tool will approve the PR when invoked with the 'auto_approve' command. Default is false. This flag can be changed only from a configuration file.
- `maximal_review_effort`: Maximal effort level for auto-approval. If the PR's estimated review effort is above this threshold, the auto-approval will not run. Default is 5.
+ +## Usage Tips + +!!! tip "General guidelines" + + The `review` tool provides a collection of configurable feedbacks about a PR. + It is recommended to review the [Configuration options](#configuration-options) section, and choose the relevant options for your use case. + + Some of the features that are disabled by default are quite useful, and should be considered for enabling. For example: + `require_score_review`, `require_soc2_ticket`, and more. + + On the other hand, if you find one of the enabled features to be irrelevant for your use case, disable it. No default configuration can fit all use cases. + +!!! tip "Automation" + When you first install PR-Agent app, the [default mode](../usage-guide/automations_and_usage.md#github-app-automatic-tools-when-a-new-pr-is-opened) for the `review` tool is: + ``` + pr_commands = ["/review --pr_reviewer.num_code_suggestions=0", ...] + ``` + Meaning the `review` tool will run automatically on every PR, without providing code suggestions. + Edit this field to enable/disable the tool, or to change the configurations used. + +!!! tip "Possible labels from the review tool" + + The `review` tool can auto-generate two specific types of labels for a PR: + + - a `possible security issue` label that detects if a possible [security issue](https://github.com/Codium-ai/pr-agent/blob/tr/user_description/pr_agent/settings/pr_reviewer_prompts.toml#L136) exists in the PR code (`enable_review_labels_security` flag) + - a `Review effort [1-5]: x` label, where x is the estimated effort to review the PR (`enable_review_labels_effort` flag) + + Both modes are useful, and we recommended to enable them. + +!!! tip "Extra instructions" + + Extra instructions are important. + The `review` tool can be configured with extra instructions, which can be used to guide the model to a feedback tailored to the needs of your project. + + Be specific, clear, and concise in the instructions. With extra instructions, you are the prompter. Specify the relevant sub-tool, and the relevant aspects of the PR that you want to emphasize. + + Examples of extra instructions: + ``` + [pr_reviewer] + extra_instructions="""\ + In the code feedback section, emphasize the following: + - Does the code logic cover relevant edge cases? + - Is the code logic clear and easy to understand? + - Is the code logic efficient? + ... + """ + ``` + Use triple quotes to write multi-line instructions. Use bullet points to make the instructions more readable. + + +!!! tip "Auto-approval" + + PR-Agent can approve a PR when a specific comment is invoked. + + To ensure safety, the auto-approval feature is disabled by default. To enable auto-approval, you need to actively set in a pre-defined configuration file the following: + ``` + [pr_reviewer] + enable_auto_approval = true + ``` + (this specific flag cannot be set with a command line argument, only in the configuration file, committed to the repository) + + + After enabling, by commenting on a PR: + ``` + /review auto_approve + ``` + PR-Agent will automatically approve the PR, and add a comment with the approval. + + + You can also enable auto-approval only if the PR meets certain requirements, such as that the `estimated_review_effort` label is equal or below a certain threshold, by adjusting the flag: + ``` + [pr_reviewer] + maximal_review_effort = 5 + ``` + +[//]: # (!!! tip "Code suggestions") + +[//]: # () +[//]: # ( If you set `num_code_suggestions`>0 , the `review` tool will also provide code suggestions.) 
[//]: # ( )
[//]: # ( Notice: If you are interested **only** in the code suggestions, it is recommended to use the [`improve`](./improve.md) feature instead, since it is dedicated only to code suggestions, and usually gives better results.)

[//]: # ( Use the `review` tool if you want to get more comprehensive feedback, which includes code suggestions as well.)

diff --git a/docs/docs/tools/similar_code.md b/docs/docs/tools/similar_code.md new file mode 100644 index 000000000..5f2af5c8b --- /dev/null +++ b/docs/docs/tools/similar_code.md @@ -0,0 +1,63 @@

## Overview
The similar code tool retrieves the most similar code components from inside the organization's codebase, or from open-source code.

For example:

`Global Search` for a method called `chat_completion`:

![similar code global](https://codium.ai/images/pr_agent/similar_code_global2.png){width=768}

PR-Agent will examine the code component and extract the most relevant keywords to search for similar code:

- `extracted keywords`: the keywords that were extracted from the code by PR-Agent. The link will open a search page with the extracted keywords, allowing the user to modify the search if needed.
- `search context`: the context in which the search will be performed, either the organization's codebase or open-source code (Global).
- `similar code`: the most similar code components found. The link will open the code component in the relevant file.
- `relevant repositories`: the open-source repositories that are relevant to the searched code component and its keywords.

Search result link example:

![code search result single](https://codium.ai/images/pr_agent/code_search_result_single.png){width=768}

`Organization Search`:

![similar code org](https://codium.ai/images/pr_agent/similar_code_org.png){width=768}

## How to use
### Manually
To invoke the `similar code` tool manually, comment on the PR:
```
/find_similar_component COMPONENT_NAME
```
Where `COMPONENT_NAME` should be the name of a code component in the PR (class, method, function).

If there is a name ambiguity, there are two configurations that will help the tool find the correct component:

- `--pr_find_similar_component.file`: in case there are several components with the same name, you can specify the relevant file.
- `--pr_find_similar_component.class_name`: in case there are several methods with the same name in the same file, you can specify the relevant class name.

Example:
```
/find_similar_component COMPONENT_NAME --pr_find_similar_component.file=FILE_NAME
```

### Automatically (via the Analyze table)
The tool can also be invoked automatically from the analyze table, which can be accessed by commenting:
```
/analyze
```
Choose the components you want to find similar code for, and click on the `similar` checkbox.

![analyze similar](https://codium.ai/images/pr_agent/analyze_similar.png){width=768}

If you are looking to search for similar code in the organization's codebase, you can click on the `Organization` checkbox, and it will invoke a new search command just for the organization's codebase.

![similar code global](https://codium.ai/images/pr_agent/similar_code_global.png){width=768}

## Configuration options

- `search_from_org`: if set to true, the tool will search for similar code in the organization's codebase. Default is false.
- `number_of_keywords`: number of keywords to use for the search. Default is 5.
- `number_of_results`: the maximum number of results to present. Default is 5.
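The page does not show which configuration-file section these options belong to; assuming it follows the `pr_find_similar_component` prefix used by the CLI flags above, a sketch of a configuration entry would be:
```
[pr_find_similar_component]  # section name assumed from the CLI prefix above
search_from_org = false
number_of_keywords = 5
number_of_results = 5
```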
diff --git a/docs/docs/tools/similar_issues.md b/docs/docs/tools/similar_issues.md new file mode 100644 index 000000000..4f0351e0d --- /dev/null +++ b/docs/docs/tools/similar_issues.md @@ -0,0 +1,43 @@

## Overview
The similar issue tool retrieves the issues that are most similar to the current issue.
It can be invoked manually by commenting on any PR:
```
/similar_issue
```

## Example usage

![similar_issue_original_issue](https://codium.ai/images/pr_agent/similar_issue_original_issue.png){width=768}

![similar_issue_comment](https://codium.ai/images/pr_agent/similar_issue_comment.png){width=768}

![similar_issue](https://codium.ai/images/pr_agent/similar_issue.png){width=768}

Note that to perform retrieval, the `similar_issue` tool indexes all of the repo's previous issues (once).

**Select a VectorDB** by changing the `pr_similar_issue` parameter in the `configuration.toml` file.

Two VectorDBs are available to switch between:

1. LanceDB
2. Pinecone

To enable usage of the '**similar issue**' tool with Pinecone, you need to set the following keys in `.secrets.toml` (or in the relevant environment variables):

```
[pinecone]
api_key = "..."
environment = "..."
```
These parameters can be obtained by registering with [Pinecone](https://app.pinecone.io/?sessionType=signup/).

## How to use
- To invoke the 'similar issue' tool from the **CLI**, run:
`python3 cli.py --issue_url=... similar_issue`

- To invoke the 'similar issue' tool via online usage, [comment](https://github.com/Codium-ai/pr-agent/issues/178#issuecomment-1716934893) on a PR:
`/similar_issue`

- You can also enable the 'similar issue' tool to run automatically when a new issue is opened, by adding it to the [pr_commands list in the github_app section](https://github.com/Codium-ai/pr-agent/blob/main/pr_agent/settings/configuration.toml#L66)
\ No newline at end of file
diff --git a/docs/docs/tools/test.md b/docs/docs/tools/test.md new file mode 100644 index 000000000..b5a143f06 --- /dev/null +++ b/docs/docs/tools/test.md @@ -0,0 +1,32 @@

## Overview
By combining LLM abilities with static code analysis, the `test` tool generates tests for a selected component, based on the PR code changes.
It can be invoked manually by commenting on any PR:
```
/test component_name
```
where 'component_name' is the name of a specific component in the PR.
To get a list of the components that changed in the PR and choose the relevant component interactively, use the [`analyze`](./analyze.md) tool.

## Example usage

Invoke the tool manually by commenting `/test` on any PR.
The tool will generate tests for the selected component (if no component is stated, it will generate tests for the largest component):

![test1](https://codium.ai/images/pr_agent/test1.png){width=768}

(Example taken from [here](https://github.com/Codium-ai/pr-agent/pull/598#issuecomment-1913679429).)

**Notes**

- Languages currently supported by the tool: Python, Java, C++, JavaScript, TypeScript, C#.
- This tool can also be triggered interactively by using the [`analyze`](./analyze.md) tool.

## Configuration options
- `num_tests`: number of tests to generate. Default is 3.
- `testing_framework`: the testing framework to use. If not set, for Python it will use `pytest`, for Java it will use `JUnit`, for C++ it will use `Catch2`, and for JavaScript and TypeScript it will use `jest`.
- `avoid_mocks`: if set to true, the tool will try to avoid using mocks in the generated tests.
Note that even if this option is set to true, the tool might still use mocks if it cannot generate a test without them. Default is true. +- `extra_instructions`: Optional extra instructions to the tool. For example: "use the following mock injection scheme: ...". +- `file`: in case there are several components with the same name, you can specify the relevant file. +- `class_name`: in case there are several methods with the same name in the same file, you can specify the relevant class name. +- `enable_help_text`: if set to true, the tool will add a help text to the PR comment. Default is true. \ No newline at end of file diff --git a/docs/docs/tools/update_changelog.md b/docs/docs/tools/update_changelog.md new file mode 100644 index 000000000..0c1bfda55 --- /dev/null +++ b/docs/docs/tools/update_changelog.md @@ -0,0 +1,19 @@ +## Overview +The `update_changelog` tool automatically updates the CHANGELOG.md file with the PR changes. +It can be invoked manually by commenting on any PR: +``` +/update_changelog +``` + +## Example usage + +![update_changelog_comment](https://codium.ai/images/pr_agent/update_changelog_comment.png){width=768} + +![update_changelog](https://codium.ai/images/pr_agent/update_changelog.png){width=768} + +## Configuration options + +Under the section `pr_update_changelog`, the [configuration file](https://github.com/Codium-ai/pr-agent/blob/main/pr_agent/settings/configuration.toml#L50) contains options to customize the 'update changelog' tool: + +- `push_changelog_changes`: whether to push the changes to CHANGELOG.md, or just print them. Default is false (print only). +- `extra_instructions`: Optional extra instructions to the tool. For example: "focus on the changes in the file X. Ignore change in ... \ No newline at end of file diff --git a/docs/docs/usage-guide/EXAMPLE_BEST_PRACTICE.md b/docs/docs/usage-guide/EXAMPLE_BEST_PRACTICE.md new file mode 100644 index 000000000..0ec886a65 --- /dev/null +++ b/docs/docs/usage-guide/EXAMPLE_BEST_PRACTICE.md @@ -0,0 +1,189 @@ +## Recommend Python Best Practices +This document outlines a series of recommended best practices for Python development. These guidelines aim to improve code quality, maintainability, and readability. + +### Imports + +Use `import` statements for packages and modules only, not for individual types, classes, or functions. + +#### Definition + +Reusability mechanism for sharing code from one module to another. + +#### Decision + +- Use `import x` for importing packages and modules. +- Use `from x import y` where `x` is the package prefix and `y` is the module name with no prefix. +- Use `from x import y as z` in any of the following circumstances: + - Two modules named `y` are to be imported. + - `y` conflicts with a top-level name defined in the current module. + - `y` conflicts with a common parameter name that is part of the public API (e.g., `features`). + - `y` is an inconveniently long name, or too generic in the context of your code +- Use `import y as z` only when `z` is a standard abbreviation (e.g., `import numpy as np`). + +For example the module `sound.effects.echo` may be imported as follows: + +``` +from sound.effects import echo +... +echo.EchoFilter(input, output, delay=0.7, atten=4) + +``` + +Do not use relative names in imports. Even if the module is in the same package, use the full package name. This helps prevent unintentionally importing a package twice. 
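For instance, using the `sound.effects.echo` module from the example above, the two forms compare as follows (the relative form under `No:` is shown only to illustrate what to avoid):

```
Yes: from sound.effects import echo  # full package path; the origin of `echo` is explicit

No:  from . import echo              # relative import; depends on where the importing module lives
```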
+ +##### Exemptions + +Exemptions from this rule: + +- Symbols from the following modules are used to support static analysis and type checking: + - [`typing` module](https://google.github.io/styleguide/pyguide.html#typing-imports) + - [`collections.abc` module](https://google.github.io/styleguide/pyguide.html#typing-imports) + - [`typing_extensions` module](https://github.com/python/typing_extensions/blob/main/README.md) +- Redirects from the [six.moves module](https://six.readthedocs.io/#module-six.moves). + +### Packages + +Import each module using the full pathname location of the module. + +#### Decision + +All new code should import each module by its full package name. + +Imports should be as follows: + +``` +Yes: + # Reference absl.flags in code with the complete name (verbose). + import absl.flags + from doctor.who import jodie + + _FOO = absl.flags.DEFINE_string(...) + +``` + +``` +Yes: + # Reference flags in code with just the module name (common). + from absl import flags + from doctor.who import jodie + + _FOO = flags.DEFINE_string(...) + +``` + +_(assume this file lives in `doctor/who/` where `jodie.py` also exists)_ + +``` +No: + # Unclear what module the author wanted and what will be imported. The actual + # import behavior depends on external factors controlling sys.path. + # Which possible jodie module did the author intend to import? + import jodie + +``` + +The directory the main binary is located in should not be assumed to be in `sys.path` despite that happening in some environments. This being the case, code should assume that `import jodie` refers to a third-party or top-level package named `jodie`, not a local `jodie.py`. + +### Default Iterators and Operators +Use default iterators and operators for types that support them, like lists, dictionaries, and files. + +#### Definition + +Container types, like dictionaries and lists, define default iterators and membership test operators (โ€œinโ€ and โ€œnot inโ€). + +#### Decision + +Use default iterators and operators for types that support them, like lists, dictionaries, and files. The built-in types define iterator methods, too. Prefer these methods to methods that return lists, except that you should not mutate a container while iterating over it. + +``` +Yes: for key in adict: ... + if obj in alist: ... + for line in afile: ... + for k, v in adict.items(): ... +``` + +``` +No: for key in adict.keys(): ... + for line in afile.readlines(): ... +``` + +### Lambda Functions + +Okay for one-liners. Prefer generator expressions over `map()` or `filter()` with a `lambda`. + +#### Decision + +Lambdas are allowed. If the code inside the lambda function spans multiple lines or is longer than 60-80 chars, it might be better to define it as a regular [nested function](https://google.github.io/styleguide/pyguide.html#lexical-scoping). + +For common operations like multiplication, use the functions from the `operator` module instead of lambda functions. For example, prefer `operator.mul` to `lambda x, y: x * y`. + +### Default Argument Values + +Okay in most cases. + +#### Definition + +You can specify values for variables at the end of a functionโ€™s parameter list, e.g., `def foo(a, b=0):`. If `foo` is called with only one argument, `b` is set to 0. If it is called with two arguments, `b` has the value of the second argument. + +#### Decision + +Okay to use with the following caveat: + +Do not use mutable objects as default values in the function or method definition. 
+ +``` +Yes: def foo(a, b=None): + if b is None: + b = [] +Yes: def foo(a, b: Sequence | None = None): + if b is None: + b = [] +Yes: def foo(a, b: Sequence = ()): # Empty tuple OK since tuples are immutable. + ... +``` + +``` +from absl import flags +_FOO = flags.DEFINE_string(...) + +No: def foo(a, b=[]): + ... +No: def foo(a, b=time.time()): # Is `b` supposed to represent when this module was loaded? + ... +No: def foo(a, b=_FOO.value): # sys.argv has not yet been parsed... + ... +No: def foo(a, b: Mapping = {}): # Could still get passed to unchecked code. + ... +``` + +### True/False Evaluations + + +Use the โ€œimplicitโ€ false if possible, e.g., `if foo:` rather than `if foo != []:` + +### Lexical Scoping + +Okay to use. + +An example of the use of this feature is: + +``` +def get_adder(summand1: float) -> Callable[[float], float]: + """Returns a function that adds numbers to a given number.""" + def adder(summand2: float) -> float: + return summand1 + summand2 + + return adder +``` +#### Decision + +Okay to use. + + +### Threading + +Do not rely on the atomicity of built-in types. + +While Pythonโ€™s built-in data types such as dictionaries appear to have atomic operations, there are corner cases where they arenโ€™t atomic (e.g. if `__hash__` or `__eq__` are implemented as Python methods) and their atomicity should not be relied upon. Neither should you rely on atomic variable assignment (since this in turn depends on dictionaries). + +Use the `queue` moduleโ€™s `Queue` data type as the preferred way to communicate data between threads. Otherwise, use the `threading` module and its locking primitives. Prefer condition variables and `threading.Condition` instead of using lower-level locks. \ No newline at end of file diff --git a/docs/docs/usage-guide/PR_agent_pro_models.md b/docs/docs/usage-guide/PR_agent_pro_models.md new file mode 100644 index 000000000..c707a7cc1 --- /dev/null +++ b/docs/docs/usage-guide/PR_agent_pro_models.md @@ -0,0 +1,18 @@ +## PR-Agent Pro Models + +The default models used by PR-Agent Pro are a combination of Claude-3.5-sonnet and OpenAI's GPT-4 models. + +Users can configure PR-Agent Pro to use solely a specific model by editing the [configuration](https://pr-agent-docs.codium.ai/usage-guide/configuration_options/) file. + +For example, to restrict PR-Agent Pro to using only `Claude-3.5-sonnet`, add this setting: + +``` +[config] +model="claude-3-5-sonnet" +``` + +Or to restrict PR-Agent Pro to using only `GPT-4o`, add this setting: +``` +[config] +model="gpt-4o" +``` diff --git a/docs/docs/usage-guide/additional_configurations.md b/docs/docs/usage-guide/additional_configurations.md new file mode 100644 index 000000000..78ca4113c --- /dev/null +++ b/docs/docs/usage-guide/additional_configurations.md @@ -0,0 +1,162 @@ +## Show possible configurations +The possible configurations of pr-agent are stored in [here](https://github.com/Codium-ai/pr-agent/blob/main/pr_agent/settings/configuration.toml). +In the [tools](https://pr-agent-docs.codium.ai/tools/) page you can find explanations on how to use these configurations for each tool. + +To print all the available configurations as a comment on your PR, you can use the following command: +``` +/config +``` + +![possible_config1](https://codium.ai/images/pr_agent/possible_config1.png){width=512} + + +To view the **actual** configurations used for a specific tool, after all the user settings are applied, you can add for each tool a `--config.output_relevant_configurations=true` suffix. 
+For example: +``` +/improve --config.output_relevant_configurations=true +``` +Will output an additional field showing the actual configurations used for the `improve` tool. + +![possible_config2](https://codium.ai/images/pr_agent/possible_config2.png){width=512} + + +## Ignoring files from analysis + +In some cases, you may want to exclude specific files or directories from the analysis performed by CodiumAI PR-Agent. This can be useful, for example, when you have files that are generated automatically or files that shouldn't be reviewed, like vendor code. + +You can ignore files or folders using the following methods: + - `IGNORE.GLOB` + - `IGNORE.REGEX` + +which you can edit to ignore files or folders based on glob or regex patterns. + +### Example usage + +Let's look at an example where we want to ignore all files with `.py` extension from the analysis. + +To ignore Python files in a PR with online usage, comment on a PR: +`/review --ignore.glob="['*.py']"` + + +To ignore Python files in all PRs using `glob` pattern, set in a configuration file: +``` +[ignore] +glob = ['*.py'] +``` + +And to ignore Python files in all PRs using `regex` pattern, set in a configuration file: +``` +[regex] +regex = ['.*\.py$'] +``` + +## Extra instructions + +All PR-Agent tools have a parameter called `extra_instructions`, that enables to add free-text extra instructions. Example usage: +``` +/update_changelog --pr_update_changelog.extra_instructions="Make sure to update also the version ..." +``` + +## Working with large PRs + +The default mode of CodiumAI is to have a single call per tool, using GPT-4, which has a token limit of 8000 tokens. +This mode provides a very good speed-quality-cost tradeoff, and can handle most PRs successfully. +When the PR is above the token limit, it employs a [PR Compression strategy](../core-abilities/index.md). + +However, for very large PRs, or in case you want to emphasize quality over speed and cost, there are two possible solutions: +1) [Use a model](https://codium-ai.github.io/Docs-PR-Agent/usage-guide/#changing-a-model) with larger context, like GPT-32K, or claude-100K. This solution will be applicable for all the tools. +2) For the `/improve` tool, there is an ['extended' mode](https://codium-ai.github.io/Docs-PR-Agent/tools/#improve) (`/improve --extended`), +which divides the PR into chunks, and processes each chunk separately. With this mode, regardless of the model, no compression will be done (but for large PRs, multiple model calls may occur) + + + +## Patch Extra Lines + +By default, around any change in your PR, git patch provides three lines of context above and below the change. +``` +@@ -12,5 +12,5 @@ def func1(): + code line that already existed in the file... + code line that already existed in the file... + code line that already existed in the file.... +-code line that was removed in the PR ++new code line added in the PR + code line that already existed in the file... + code line that already existed in the file... + code line that already existed in the file... +``` + +PR-Agent will try to increase the number of lines of context, via the parameter: +``` +[config] +patch_extra_lines_before=3 +patch_extra_lines_after=1 +``` + +Increasing this number provides more context to the model, but will also increase the token budget, and may overwhelm the model with too much information, unrelated to the actual PR code changes. 
+ +If the PR is too large (see [PR Compression strategy](https://github.com/Codium-ai/pr-agent/blob/main/PR_COMPRESSION.md)), PR-Agent may automatically set this number to 0, and will use the original git patch. + + +## Editing the prompts + +The prompts for the various PR-Agent tools are defined in the `pr_agent/settings` folder. +In practice, the prompts are loaded and stored as a standard setting object. +Hence, editing them is similar to editing any other configuration value - just place the relevant key in `.pr_agent.toml`file, and override the default value. + +For example, if you want to edit the prompts of the [describe](https://github.com/Codium-ai/pr-agent/blob/main/pr_agent/settings/pr_description_prompts.toml) tool, you can add the following to your `.pr_agent.toml` file: +``` +[pr_description_prompt] +system=""" +... +""" +user=""" +... +""" +``` +Note that the new prompt will need to generate an output compatible with the relevant [post-process function](https://github.com/Codium-ai/pr-agent/blob/main/pr_agent/tools/pr_description.py#L137). + +## Integrating with Logging Observability Platforms + +Various logging observability tools can be used out-of-the box when using the default LiteLLM AI Handler. Simply configure the LiteLLM callback settings in `configuration.toml` and set environment variables according to the LiteLLM [documentation](https://docs.litellm.ai/docs/). + +For example, to use [LangSmith](https://www.langchain.com/langsmith) you can add the following to your `configuration.toml` file: +``` +[litellm] +enable_callbacks = true +success_callback = ["langsmith"] +failure_callback = ["langsmith"] +service_callback = [] +``` + +Then set the following environment variables: + +``` +LANGSMITH_API_KEY= +LANGSMITH_PROJECT= +LANGSMITH_BASE_URL= +``` + +## Ignoring automatic commands in PRs + +In some cases, you may want to automatically ignore specific PRs . PR-Agent enables you to ignore PR with a specific title, or from/to specific branches (regex matching). + +To ignore PRs with a specific title such as "[Bump]: ...", you can add the following to your `configuration.toml` file: + +``` +[config] +ignore_pr_title = ["\\[Bump\\]"] +``` + +Where the `ignore_pr_title` is a list of regex patterns to match the PR title you want to ignore. Default is `ignore_pr_title = ["^\\[Auto\\]", "^Auto"]`. + + +To ignore PRs from specific source or target branches, you can add the following to your `configuration.toml` file: + +``` +[config] +ignore_pr_source_branches = ['develop', 'main', 'master', 'stage'] +ignore_pr_target_branches = ["qa"] +``` + +Where the `ignore_pr_source_branches` and `ignore_pr_target_branches` are lists of regex patterns to match the source and target branches you want to ignore. +They are not mutually exclusive, you can use them together or separately. diff --git a/docs/docs/usage-guide/automations_and_usage.md b/docs/docs/usage-guide/automations_and_usage.md new file mode 100644 index 000000000..0f780d80d --- /dev/null +++ b/docs/docs/usage-guide/automations_and_usage.md @@ -0,0 +1,236 @@ +## Local repo (CLI) +When running from your locally cloned PR-Agent repo (CLI), your local configuration file will be used. 
+Examples of invoking the different tools via the CLI: + +- **Review**: `python -m pr_agent.cli --pr_url= review` +- **Describe**: `python -m pr_agent.cli --pr_url= describe` +- **Improve**: `python -m pr_agent.cli --pr_url= improve` +- **Ask**: `python -m pr_agent.cli --pr_url= ask "Write me a poem about this PR"` +- **Reflect**: `python -m pr_agent.cli --pr_url= reflect` +- **Update Changelog**: `python -m pr_agent.cli --pr_url= update_changelog` + +`` is the url of the relevant PR (for example: [#50](https://github.com/Codium-ai/pr-agent/pull/50)). + +**Notes:** + +(1) in addition to editing your local configuration file, you can also change any configuration value by adding it to the command line: +``` +python -m pr_agent.cli --pr_url= /review --pr_reviewer.extra_instructions="focus on the file: ..." +``` + +(2) You can print results locally, without publishing them, by setting in `configuration.toml`: +``` +[config] +publish_output=false +verbosity_level=2 +``` +This is useful for debugging or experimenting with different tools. + +(3) + +**git provider**: The [git_provider](https://github.com/Codium-ai/pr-agent/blob/main/pr_agent/settings/configuration.toml#L5) field in a configuration file determines the GIT provider that will be used by PR-Agent. Currently, the following providers are supported: +` +"github", "gitlab", "bitbucket", "azure", "codecommit", "local", "gerrit" +` + +Default is "github". + + + +### Online usage + +Online usage means invoking PR-Agent tools by [comments](https://github.com/Codium-ai/pr-agent/pull/229#issuecomment-1695021901) on a PR. +Commands for invoking the different tools via comments: + +- **Review**: `/review` +- **Describe**: `/describe` +- **Improve**: `/improve` (or `/improve_code` for bitbucket, since `/improve` is sometimes reserved) +- **Ask**: `/ask "..."` +- **Reflect**: `/reflect` +- **Update Changelog**: `/update_changelog` + + +To edit a specific configuration value, just add `--config_path=` to any command. +For example, if you want to edit the `review` tool configurations, you can run: +``` +/review --pr_reviewer.extra_instructions="..." --pr_reviewer.require_score_review=false +``` +Any configuration value in [configuration file](https://github.com/Codium-ai/pr-agent/blob/main/pr_agent/settings/configuration.toml) file can be similarly edited. Comment `/config` to see the list of available configurations. + + +## GitHub App + +!!! note "Configurations for PR-Agent Pro" + PR-Agent Pro for GitHub is an App, hosted by CodiumAI. So all the instructions below are relevant also for PR-Agent Pro users. + Same goes for [GitLab webhook](#gitlab-webhook) and [BitBucket App](#bitbucket-app) sections. + +### GitHub app automatic tools when a new PR is opened + +The [github_app](https://github.com/Codium-ai/pr-agent/blob/main/pr_agent/settings/configuration.toml#L108) section defines GitHub app specific configurations. + +The configuration parameter `pr_commands` defines the list of tools that will be **run automatically** when a new PR is opened. +``` +[github_app] +pr_commands = [ + "/describe --pr_description.final_update_message=false", + "/review --pr_reviewer.num_code_suggestions=0", + "/improve", +] +``` +This means that when a new PR is opened/reopened or marked as ready for review, PR-Agent will run the `describe`, `review` and `improve` tools. +For the `review` tool, for example, the `num_code_suggestions` parameter will be set to 0. 
+ +You can override the default tool parameters by using one the three options for a [configuration file](https://codium-ai.github.io/Docs-PR-Agent/usage-guide/#configuration-options): **wiki**, **local**, or **global**. +For example, if your local `.pr_agent.toml` file contains: +``` +[pr_description] +generate_ai_title = true +``` +Every time you run the `describe` tool, including automatic runs, the PR title will be generated by the AI. + +To cancel the automatic run of all the tools, set: +``` +[github_app] +pr_commands = [] +``` + +### GitHub app automatic tools for push actions (commits to an open PR) + +In addition to running automatic tools when a PR is opened, the GitHub app can also respond to new code that is pushed to an open PR. + +The configuration toggle `handle_push_trigger` can be used to enable this feature. +The configuration parameter `push_commands` defines the list of tools that will be **run automatically** when new code is pushed to the PR. +``` +[github_app] +handle_push_trigger = true +push_commands = [ + "/describe", + "/review --pr_reviewer.num_code_suggestions=0 --pr_reviewer.final_update_message=false", +] +``` +This means that when new code is pushed to the PR, the PR-Agent will run the `describe` and `review` tools, with the specified parameters. + +## GitHub Action +`GitHub Action` is a different way to trigger PR-Agent tools, and uses a different configuration mechanism than `GitHub App`.
+You can configure settings for `GitHub Action` by adding environment variables under the env section in `.github/workflows/pr_agent.yml` file. +Specifically, start by setting the following environment variables: +```yaml + env: + OPENAI_KEY: ${{ secrets.OPENAI_KEY }} # Make sure to add your OpenAI key to your repo secrets + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Make sure to add your GitHub token to your repo secrets + github_action_config.auto_review: "true" # enable\disable auto review + github_action_config.auto_describe: "true" # enable\disable auto describe + github_action_config.auto_improve: "true" # enable\disable auto improve + github_action_config.pr_actions: ["opened", "reopened", "ready_for_review", "review_requested"] +``` +`github_action_config.auto_review`, `github_action_config.auto_describe` and `github_action_config.auto_improve` are used to enable/disable automatic tools that run when a new PR is opened. +If not set, the default configuration is for all three tools to run automatically when a new PR is opened. + +`github_action_config.pr_actions` is used to configure which `pull_requests` events will trigger the enabled auto flags +If not set, the default configuration is `["opened", "reopened", "ready_for_review", "review_requested"]` + +`github_action_config.enable_output` are used to enable/disable github actions [output parameter](https://docs.github.com/en/actions/creating-actions/metadata-syntax-for-github-actions#outputs-for-docker-container-and-javascript-actions) (default is `true`). +Review result is output as JSON to `steps.{step-id}.outputs.review` property. +The JSON structure is equivalent to the yaml data structure defined in [pr_reviewer_prompts.toml](https://github.com/idubnori/pr-agent/blob/main/pr_agent/settings/pr_reviewer_prompts.toml). + +Note that you can give additional config parameters by adding environment variables to `.github/workflows/pr_agent.yml`, or by using a `.pr_agent.toml` [configuration file](https://pr-agent-docs.codium.ai/usage-guide/configuration_options/#global-configuration-file) in the root of your repo + +For example, you can set an environment variable: `pr_description.publish_labels=false`, or add a `.pr_agent.toml` file with the following content: +``` +[pr_description] +publish_labels = false +``` +to prevent PR-Agent from publishing labels when running the `describe` tool. + +## GitLab Webhook +After setting up a GitLab webhook, to control which commands will run automatically when a new MR is opened, you can set the `pr_commands` parameter in the configuration file, similar to the GitHub App: +``` +[gitlab] +pr_commands = [ + "/describe", + "/review --pr_reviewer.num_code_suggestions=0", + "/improve", +] +``` + +the GitLab webhook can also respond to new code that is pushed to an open MR. +The configuration toggle `handle_push_trigger` can be used to enable this feature. +The configuration parameter `push_commands` defines the list of tools that will be **run automatically** when new code is pushed to the MR. +``` +[gitlab] +handle_push_trigger = true +push_commands = [ + "/describe", + "/review --pr_reviewer.num_code_suggestions=0 --pr_reviewer.final_update_message=false", +] +``` + +Note that to use the 'handle_push_trigger' feature, you need to give the gitlab webhook also the "Push events" scope. 
## BitBucket App
Similar to the GitHub App, when running PR-Agent from the BitBucket App, the default [configuration file](https://github.com/Codium-ai/pr-agent/blob/main/pr_agent/settings/configuration.toml) from the pre-built docker image will be loaded initially.

By uploading a local `.pr_agent.toml` file to the root of the repo's main branch, you can edit and customize any configuration parameter. Note that you need to upload `.pr_agent.toml` prior to creating a PR, in order for the configuration to take effect.

For example, if your local `.pr_agent.toml` file contains:
```
[pr_reviewer]
extra_instructions = "Answer in Japanese"
```

Each time you invoke the `/review` tool, it will use the extra instructions you set in the local configuration file.

Note that among other limitations, BitBucket provides relatively low rate-limits for applications (up to 1000 requests per hour), and does not provide an API to track the actual rate-limit usage.
If you experience a lack of responses from PR-Agent, you might want to set `bitbucket_app.avoid_full_files=true` in your configuration file.
This will prevent PR-Agent from acquiring the full file content, and will only use the diff content. This reduces the number of requests made to BitBucket, at the cost of a small decrease in accuracy, as dynamic context will not be applicable.

### BitBucket Self-Hosted App automatic tools

To control which commands will run automatically when a new PR is opened, set the `pr_commands` parameter in the configuration file.
Specifically, set the following values:

```
[bitbucket_app]
pr_commands = [
    "/review --pr_reviewer.num_code_suggestions=0",
    "/improve --pr_code_suggestions.commitable_code_suggestions=true --pr_code_suggestions.suggestions_score_threshold=7",
]
```
Note that for BitBucket we specifically recommend using `--pr_code_suggestions.suggestions_score_threshold=7`, and this is the default value we set for BitBucket.
Since this platform only supports inline code suggestions, we want to limit the number of suggestions presented.

## Azure DevOps provider

To use the Azure DevOps provider, use the following settings in `configuration.toml`:
```
[config]
git_provider="azure"
```

The Azure DevOps provider supports [PAT token](https://learn.microsoft.com/en-us/azure/devops/organizations/accounts/use-personal-access-tokens-to-authenticate?view=azure-devops&tabs=Windows) or [DefaultAzureCredential](https://learn.microsoft.com/en-us/azure/developer/python/sdk/authentication-overview#authentication-in-server-environments) authentication.
A PAT is faster to create, but has a built-in expiration date, and will use the user's identity for API calls.
With DefaultAzureCredential you can use a managed identity or a service principal, which are more secure and create a separate ADO user identity (via AAD) for the agent.

If PAT was chosen, you can assign its value in `.secrets.toml`.
If DefaultAzureCredential was chosen, you can set the additional environment variables (such as `AZURE_CLIENT_SECRET`) directly,
or use a managed identity / the Azure CLI (for local development) without any additional configuration.
+in any case, 'org' value must be assigned in .secrets.toml: +``` +[azure_devops] +org = "https://dev.azure.com/YOUR_ORGANIZATION/" +# pat = "YOUR_PAT_TOKEN" needed only if using PAT for authentication +``` + +### Azure DevOps Webhook + +To control which commands will run automatically when a new PR is opened, you can set the `pr_commands` parameter in the configuration file, similar to the GitHub App: +``` +[azure_devops_server] +pr_commands = [ + "/describe", + "/review --pr_reviewer.num_code_suggestions=0", + "/improve", +] +``` diff --git a/docs/docs/usage-guide/changing_a_model.md b/docs/docs/usage-guide/changing_a_model.md new file mode 100644 index 000000000..9d994d173 --- /dev/null +++ b/docs/docs/usage-guide/changing_a_model.md @@ -0,0 +1,189 @@ +## Changing a model + +See [here](https://github.com/Codium-ai/pr-agent/blob/main/pr_agent/algo/__init__.py) for a list of available models. +To use a different model than the default (GPT-4), you need to edit in the [configuration file](https://github.com/Codium-ai/pr-agent/blob/main/pr_agent/settings/configuration.toml#L2) the fields: +``` +[config] +model = "..." +model_turbo = "..." +fallback_models = ["..."] +``` + +For models and environments not from OpenAI, you might need to provide additional keys and other parameters. +You can give parameters via a configuration file (see below for instructions), or from environment variables. See [litellm documentation](https://litellm.vercel.app/docs/proxy/quick_start#supported-llms) for the environment variables relevant per model. + +### Azure + +To use Azure, set in your `.secrets.toml` (working from CLI), or in the GitHub `Settings > Secrets and variables` (working from GitHub App or GitHub Action): +``` +[openai] +key = "" # your azure api key +api_type = "azure" +api_version = '2023-05-15' # Check Azure documentation for the current API version +api_base = "" # The base URL for your Azure OpenAI resource. e.g. "https://.openai.azure.com" +deployment_id = "" # The deployment name you chose when you deployed the engine +``` + +and set in your configuration file: +``` +[config] +model="" # the OpenAI model you've deployed on Azure (e.g. gpt-3.5-turbo) +model_turbo="" # the OpenAI model you've deployed on Azure (e.g. gpt-3.5-turbo) +fallback_models=["..."] # the OpenAI model you've deployed on Azure (e.g. gpt-3.5-turbo) +``` + +### Hugging Face + +**Local** +You can run Hugging Face models locally through either [VLLM](https://docs.litellm.ai/docs/providers/vllm) or [Ollama](https://docs.litellm.ai/docs/providers/ollama) + +E.g. to use a new Hugging Face model locally via Ollama, set: +``` +[__init__.py] +MAX_TOKENS = { + "model-name-on-ollama": +} +e.g. +MAX_TOKENS={ + ..., + "ollama/llama2": 4096 +} + + +[config] # in configuration.toml +model = "ollama/llama2" +model_turbo = "ollama/llama2" +fallback_models=["ollama/llama2"] + +[ollama] # in .secrets.toml +api_base = ... # the base url for your Hugging Face inference endpoint +# e.g. if running Ollama locally, you may use: +api_base = "http://localhost:11434/" +``` + +### Inference Endpoints + +To use a new model with Hugging Face Inference Endpoints, for example, set: +``` +[__init__.py] +MAX_TOKENS = { + "model-name-on-huggingface": +} +e.g. 
+MAX_TOKENS={ + ..., + "meta-llama/Llama-2-7b-chat-hf": 4096 +} +[config] # in configuration.toml +model = "huggingface/meta-llama/Llama-2-7b-chat-hf" +model_turbo = "huggingface/meta-llama/Llama-2-7b-chat-hf" +fallback_models=["huggingface/meta-llama/Llama-2-7b-chat-hf"] + +[huggingface] # in .secrets.toml +key = ... # your Hugging Face api key +api_base = ... # the base url for your Hugging Face inference endpoint +``` +(you can obtain a Llama2 key from [here](https://replicate.com/replicate/llama-2-70b-chat/api)) + +### Replicate + +To use Llama2 model with Replicate, for example, set: +``` +[config] # in configuration.toml +model = "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1" +model_turbo = "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1" +fallback_models=["replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1"] +[replicate] # in .secrets.toml +key = ... +``` +(you can obtain a Llama2 key from [here](https://replicate.com/replicate/llama-2-70b-chat/api)) + + +Also, review the [AiHandler](https://github.com/Codium-ai/pr-agent/blob/main/pr_agent/algo/ai_handler.py) file for instructions on how to set keys for other models. + +### Groq + +To use Llama3 model with Groq, for example, set: +``` +[config] # in configuration.toml +model = "llama3-70b-8192" +model_turbo = "llama3-70b-8192" +fallback_models = ["groq/llama3-70b-8192"] +[groq] # in .secrets.toml +key = ... # your Groq api key +``` +(you can obtain a Groq key from [here](https://console.groq.com/keys)) + +### Vertex AI + +To use Google's Vertex AI platform and its associated models (chat-bison/codechat-bison) set: + +``` +[config] # in configuration.toml +model = "vertex_ai/codechat-bison" +model_turbo = "vertex_ai/codechat-bison" +fallback_models="vertex_ai/codechat-bison" + +[vertexai] # in .secrets.toml +vertex_project = "my-google-cloud-project" +vertex_location = "" +``` + +Your [application default credentials](https://cloud.google.com/docs/authentication/application-default-credentials) will be used for authentication so there is no need to set explicit credentials in most environments. + +If you do want to set explicit credentials, then you can use the `GOOGLE_APPLICATION_CREDENTIALS` environment variable set to a path to a json credentials file. + +### Anthropic + +To use Anthropic models, set the relevant models in the configuration section of the configuration file: +``` +[config] +model="anthropic/claude-3-opus-20240229" +model_turbo="anthropic/claude-3-opus-20240229" +fallback_models=["anthropic/claude-3-opus-20240229"] +``` + +And also set the api key in the .secrets.toml file: +``` +[anthropic] +KEY = "..." +``` + +### Amazon Bedrock + +To use Amazon Bedrock and its foundational models, add the below configuration: + +``` +[config] # in configuration.toml +model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0" +model_turbo="bedrock/anthropic.claude-3-sonnet-20240229-v1:0" +fallback_models=["bedrock/anthropic.claude-v2:1"] +``` + +Note that you have to add access to foundational models before using them. Please refer to [this document](https://docs.aws.amazon.com/bedrock/latest/userguide/setting-up.html) for more details. + +If you are using the claude-3 model, please configure the following settings as there are parameters incompatible with claude-3. 
+``` +[litellm] +drop_params = true +``` + +AWS session is automatically authenticated from your environment, but you can also explicitly set `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY` and `AWS_REGION_NAME` environment variables. Please refer to [this document](https://litellm.vercel.app/docs/providers/bedrock) for more details. + +### Custom models + +If the relevant model doesn't appear [here](https://github.com/Codium-ai/pr-agent/blob/main/pr_agent/algo/__init__.py), you can still use it as a custom model: + +(1) Set the model name in the configuration file: +``` +[config] +model="custom_model_name" +model_turbo="custom_model_name" +fallback_models=["custom_model_name"] +``` +(2) Set the maximal tokens for the model: +``` +[config] +custom_model_max_tokens= ... +``` +(3) Go to [litellm documentation](https://litellm.vercel.app/docs/proxy/quick_start#supported-llms), find the model you want to use, and set the relevant environment variables. diff --git a/docs/docs/usage-guide/configuration_options.md b/docs/docs/usage-guide/configuration_options.md new file mode 100644 index 000000000..70e9f9f8f --- /dev/null +++ b/docs/docs/usage-guide/configuration_options.md @@ -0,0 +1,71 @@ +The different tools and sub-tools used by CodiumAI PR-Agent are adjustable via the **[configuration file](https://github.com/Codium-ai/pr-agent/blob/main/pr_agent/settings/configuration.toml)**. + +In addition to general configuration options, each tool has its own configurations. For example, the `review` tool will use parameters from the [pr_reviewer](https://github.com/Codium-ai/pr-agent/blob/main/pr_agent/settings/configuration.toml#L16) section in the configuration file. +See the [Tools Guide](https://codium-ai.github.io/Docs-PR-Agent/tools/) for a detailed description of the different tools and their configurations. + +There are three ways to set persistent configurations: + +1. Wiki configuration page ๐Ÿ’Ž +2. Local configuration file +3. Global configuration file ๐Ÿ’Ž + +In terms of precedence, wiki configurations will override local configurations, and local configurations will override global configurations. + +!!! tip "Tip1: edit only what you need" + Your configuration file should be minimal, and edit only the relevant values. Don't copy the entire configuration options, since it can lead to legacy problems when something changes. +!!! tip "Tip2: show relevant configurations" + If you set `config.output_relevant_configurations=true`, each tool will also output in a collapsible section its relevant configurations. This can be useful for debugging, or getting to know the configurations better. + +## Wiki configuration file ๐Ÿ’Ž + +`Platforms supported: GitHub, GitLab` + +With PR-Agent-Pro, you can set configurations by creating a page called `.pr_agent.toml` in the [wiki](https://github.com/Codium-ai/pr-agent/wiki/pr_agent.toml) of the repo. +The advantage of this method is that it allows to set configurations without needing to commit new content to the repo - just edit the wiki page and **save**. + + +![wiki_configuration](https://codium.ai/images/pr_agent/wiki_configuration.png){width=512} + +Click [here](https://codium.ai/images/pr_agent/wiki_configuration_pr_agent.mp4) to see a short instructional video. We recommend surrounding the configuration content with triple-quotes (or \`\`\`toml), to allow better presentation when displayed in the wiki as markdown. 
+An example content: + +```toml +[pr_description] +generate_ai_title=true +``` + +PR-Agent will know to remove the surrounding quotes when reading the configuration content. + +## Local configuration file + +`Platforms supported: GitHub, GitLab, Bitbucket, Azure DevOps` + + +By uploading a local `.pr_agent.toml` file to the root of the repo's main branch, you can edit and customize any configuration parameter. Note that you need to upload `.pr_agent.toml` prior to creating a PR, in order for the configuration to take effect. + +For example, if you set in `.pr_agent.toml`: + +``` +[pr_reviewer] +extra_instructions="""\ +- instruction a +- instruction b +... +""" +``` + +Then you can give a list of extra instructions to the `review` tool. + + +## Global configuration file ๐Ÿ’Ž + +`Platforms supported: GitHub, GitLab, Bitbucket` + +If you create a repo called `pr-agent-settings` in your **organization**, it's configuration file `.pr_agent.toml` will be used as a global configuration file for any other repo that belongs to the same organization. +Parameters from a local `.pr_agent.toml` file, in a specific repo, will override the global configuration parameters. + +For example, in the GitHub organization `Codium-ai`: + +- The file [`https://github.com/Codium-ai/pr-agent-settings/.pr_agent.toml`](https://github.com/Codium-ai/pr-agent-settings/blob/main/.pr_agent.toml) serves as a global configuration file for all the repos in the GitHub organization `Codium-ai`. + +- The repo [`https://github.com/Codium-ai/pr-agent`](https://github.com/Codium-ai/pr-agent/blob/main/.pr_agent.toml) inherits the global configuration file from `pr-agent-settings`. diff --git a/docs/docs/usage-guide/index.md b/docs/docs/usage-guide/index.md new file mode 100644 index 000000000..637048c19 --- /dev/null +++ b/docs/docs/usage-guide/index.md @@ -0,0 +1,26 @@ +# Usage guide + +This page provides a detailed guide on how to use PR-Agent. +It includes information on how to adjust PR-Agent configurations, define which tools will run automatically, and other advanced configurations. 
+ + +- [Introduction](./introduction.md) +- [Configuration File](./configuration_options.md) +- [Usage and Automation](./automations_and_usage.md) + - [Local Repo (CLI)](./automations_and_usage.md#local-repo-cli) + - [Online Usage](./automations_and_usage.md#online-usage) + - [GitHub App](./automations_and_usage.md#github-app) + - [GitHub Action](./automations_and_usage.md#github-action) + - [GitLab Webhook](./automations_and_usage.md#gitlab-webhook) + - [BitBucket App](./automations_and_usage.md#bitbucket-app) + - [Azure DevOps Provider](./automations_and_usage.md#azure-devops-provider) +- [Managing Mail Notifications](./mail_notifications.md) +- [Changing a Model](./changing_a_model.md) +- [Additional Configurations Walkthrough](./additional_configurations.md) + - [Ignoring files from analysis](./additional_configurations.md#ignoring-files-from-analysis) + - [Extra instructions](./additional_configurations.md#extra-instructions) + - [Working with large PRs](./additional_configurations.md#working-with-large-prs) + - [Changing a model](./additional_configurations.md#changing-a-model) + - [Patch Extra Lines](./additional_configurations.md#patch-extra-lines) + - [Editing the prompts](./additional_configurations.md#editing-the-prompts) +- [PR-Agent Pro Models](./PR_agent_pro_models.md) \ No newline at end of file diff --git a/docs/docs/usage-guide/introduction.md b/docs/docs/usage-guide/introduction.md new file mode 100644 index 000000000..6fa6855d4 --- /dev/null +++ b/docs/docs/usage-guide/introduction.md @@ -0,0 +1,13 @@ + +After [installation](https://pr-agent-docs.codium.ai/installation/), there are three basic ways to invoke CodiumAI PR-Agent: + +1. Locally running a CLI command +2. Online usage - by [commenting](https://github.com/Codium-ai/pr-agent/pull/229#issuecomment-1695021901) on a PR +3. Enabling PR-Agent tools to run automatically when a new PR is opened + + +Specifically, CLI commands can be issued by invoking a pre-built [docker image](https://pr-agent-docs.codium.ai/installation/locally/#using-docker-image), or by invoking a [locally cloned repo](https://pr-agent-docs.codium.ai/installation/locally/#run-from-source). + +For online usage, you will need to setup either a [GitHub App](https://pr-agent-docs.codium.ai/installation/github/#run-as-a-github-app) or a [GitHub Action](https://pr-agent-docs.codium.ai/installation/github/#run-as-a-github-action) (GitHub), a [GitLab webhook](https://pr-agent-docs.codium.ai/installation/gitlab/#run-a-gitlab-webhook-server) (GitLab), or a [BitBucket App](https://pr-agent-docs.codium.ai/installation/bitbucket/#run-using-codiumai-hosted-bitbucket-app) (BitBucket). +These platforms also enable to run PR-Agent specific tools automatically when a new PR is opened, or on each push to a branch. + diff --git a/docs/docs/usage-guide/mail_notifications.md b/docs/docs/usage-guide/mail_notifications.md new file mode 100644 index 000000000..f25c7eaa0 --- /dev/null +++ b/docs/docs/usage-guide/mail_notifications.md @@ -0,0 +1,18 @@ + +Unfortunately, it is not possible in GitHub to disable mail notifications from a specific user. 
+If you are subscribed to notifications for a repo with PR-Agent, we recommend turning off notifications for PR comments, to avoid lengthy emails: + +![notifications](https://codium.ai/images/pr_agent/notifications.png){width=512} + +As an alternative, you can filter the notifications from the PR-Agent bot in your mail provider, [see how](https://www.quora.com/How-can-you-filter-emails-for-specific-people-in-Gmail#:~:text=On%20the%20Filters%20and%20Blocked,the%20body%20of%20the%20email). + +![filter_mail_notifications](https://codium.ai/images/pr_agent/filter_mail_notifications.png){width=512} + + +Another option to reduce the email overload, while still receiving notifications on PR-Agent tools, is to disable the collapsible help section in PR-Agent bot comments. +This can be done by setting `enable_help_text=false` for the relevant tool in the configuration file. +For example, to disable the help text for the `pr_reviewer` tool, set: +```toml +[pr_reviewer] +enable_help_text = false +``` \ No newline at end of file diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml new file mode 100644 index 000000000..1e0ce4d35 --- /dev/null +++ b/docs/mkdocs.yml @@ -0,0 +1,156 @@ +site_name: PR-Agent Documentation +repo_url: https://github.com/Codium-ai/pr-agent +repo_name: Codium-ai/pr-agent + +nav: + - Overview: + - 'index.md' + - 💎 PR-Agent Pro: 'overview/pr_agent_pro.md' + - Data Privacy: 'overview/data_privacy.md' + - Installation: + - 'installation/index.md' + - Locally: 'installation/locally.md' + - GitHub: 'installation/github.md' + - GitLab: 'installation/gitlab.md' + - BitBucket: 'installation/bitbucket.md' + - Azure DevOps: 'installation/azure.md' + - 💎 PR-Agent Pro: 'installation/pr_agent_pro.md' + - Usage Guide: + - 'usage-guide/index.md' + - Introduction: 'usage-guide/introduction.md' + - Configuration File: 'usage-guide/configuration_options.md' + - Usage and Automation: 'usage-guide/automations_and_usage.md' + - Managing Mail Notifications: 'usage-guide/mail_notifications.md' + - Changing a Model: 'usage-guide/changing_a_model.md' + - Additional Configurations: 'usage-guide/additional_configurations.md' + - 💎 PR-Agent Pro Models: 'usage-guide/PR_agent_pro_models' + - Tools: + - 'tools/index.md' + - Describe: 'tools/describe.md' + - Review: 'tools/review.md' + - Improve: 'tools/improve.md' + - Ask: 'tools/ask.md' + - Update Changelog: 'tools/update_changelog.md' + - Similar Issues: 'tools/similar_issues.md' + - Help: 'tools/help.md' + - 💎 Analyze: 'tools/analyze.md' + - 💎 Test: 'tools/test.md' + - 💎 Improve Component: 'tools/improve_component.md' + - 💎 Documentation: 'tools/documentation.md' + - 💎 Custom Labels: 'tools/custom_labels.md' + - 💎 Custom Prompt: 'tools/custom_prompt.md' + - 💎 CI Feedback: 'tools/ci_feedback.md' + - 💎 Similar Code: 'tools/similar_code.md' + - Core Abilities: + - 'core-abilities/index.md' + - Local and global metadata: 'core-abilities/metadata.md' + - Dynamic context: 'core-abilities/dynamic_context.md' + - Self-reflection: 'core-abilities/self_reflection.md' + - Impact evaluation: 'core-abilities/impact_evaluation.md' + - Interactivity: 'core-abilities/interactivity.md' + - Compression strategy: 'core-abilities/compression_strategy.md' + - Code-oriented YAML: 'core-abilities/code_oriented_yaml.md' + - Static code analysis: 'core-abilities/static_code_analysis.md' + - Code Fine-tuning Benchmark: 'finetuning_benchmark/index.md' + - Chrome Extension: + - PR-Agent Chrome Extension: 'chrome-extension/index.md' + - Features:
'chrome-extension/features.md' + - Data Privacy: 'chrome-extension/data_privacy.md' + - FAQ: + - FAQ: 'faq/index.md' +# - Code Fine-tuning Benchmark: 'finetuning_benchmark/index.md' + +theme: + logo: assets/logo.svg + favicon: assets/favicon.ico + name: material + icon: + repo: fontawesome/brands/github + features: + - navigation.tabs + - navigation.expand + - navigation.path + - navigation.top + - navigation.tracking + - navigation.indexes + - search.suggest + - search.highlight + - content.tabs.link + - content.code.annotation + - content.code.copy + - content.tabs.link + language: en + custom_dir: overrides + + palette: + - media: "(prefers-color-scheme)" + toggle: + icon: material/brightness-auto + name: Switch to light mode + - media: "(prefers-color-scheme: light)" + scheme: default + toggle: + icon: material/toggle-switch-off-outline + name: Switch to dark mode + primary: custom + accent: custom + - media: "(prefers-color-scheme: dark)" + scheme: slate + toggle: + icon: material/toggle-switch + name: Switch to light mode + primary: custom + accent: custom + +plugins: + - social + - search + - glightbox + +extra: + generator: false + social: + - icon: fontawesome/brands/github + link: https://github.com/Codium-ai + - icon: fontawesome/brands/discord + link: https://discord.com/invite/SgSxuQ65GF + - icon: fontawesome/brands/youtube + link: https://www.youtube.com/@Codium-AI + - icon: fontawesome/brands/linkedin + link: https://www.linkedin.com/company/codiumai + - icon: fontawesome/brands/twitter + link: https://twitter.com/CodiumAI + - icon: fontawesome/brands/instagram + link: https://www.instagram.com/codiumai/ + analytics: + provider: custom + property: ${{ secrets.GOOGLE_ANALYTICS_ID }} + +extra_css: + - css/custom.css + +markdown_extensions: + - pymdownx.highlight: + anchor_linenums: true + - pymdownx.inlinehilite + - pymdownx.snippets + - admonition + - pymdownx.arithmatex: + generic: true + - footnotes + - pymdownx.details + - pymdownx.superfences + - pymdownx.mark + - md_in_html + - attr_list + - pymdownx.emoji: + emoji_index: !!python/name:material.extensions.emoji.twemoji + emoji_generator: !!python/name:material.extensions.emoji.to_svg + - toc: + title: On this page + toc_depth: 3 + permalink: true + + +copyright: | + © 2024 CodiumAI diff --git a/docs/overrides/main.html b/docs/overrides/main.html new file mode 100644 index 000000000..93a67950d --- /dev/null +++ b/docs/overrides/main.html @@ -0,0 +1,10 @@ +{% extends "base.html" %} + +{% block scripts %} + {{ super() }} + + + + +{% endblock %} \ No newline at end of file diff --git a/docs/overrides/partials/footer.html b/docs/overrides/partials/footer.html new file mode 100644 index 000000000..b3235a694 --- /dev/null +++ b/docs/overrides/partials/footer.html @@ -0,0 +1,115 @@ + + + + + +Footer + + + + + + + + diff --git a/docs/overrides/partials/integrations/analytics/custom.html b/docs/overrides/partials/integrations/analytics/custom.html new file mode 100644 index 000000000..9a0785d2a --- /dev/null +++ b/docs/overrides/partials/integrations/analytics/custom.html @@ -0,0 +1,7 @@ + + + \ No newline at end of file diff --git a/pics/logo-dark.png b/pics/logo-dark.png deleted file mode 100644 index 852fdb1ae..000000000 Binary files a/pics/logo-dark.png and /dev/null differ diff --git a/pics/logo-light.png b/pics/logo-light.png deleted file mode 100644 index e329742f2..000000000 Binary files a/pics/logo-light.png and /dev/null differ diff --git a/pr_agent/agent/pr_agent.py b/pr_agent/agent/pr_agent.py index
2ab13d69f..8bf6cff75 100644 --- a/pr_agent/agent/pr_agent.py +++ b/pr_agent/agent/pr_agent.py @@ -1,20 +1,28 @@ -import logging -import os import shlex -import tempfile +from functools import partial + +from pr_agent.algo.ai_handlers.base_ai_handler import BaseAiHandler +from pr_agent.algo.ai_handlers.litellm_ai_handler import LiteLLMAIHandler from pr_agent.algo.utils import update_settings_from_args from pr_agent.config_loader import get_settings -from pr_agent.git_providers import get_git_provider +from pr_agent.git_providers.utils import apply_repo_settings +from pr_agent.log import get_logger +from pr_agent.tools.pr_add_docs import PRAddDocs from pr_agent.tools.pr_code_suggestions import PRCodeSuggestions +from pr_agent.tools.pr_config import PRConfig from pr_agent.tools.pr_description import PRDescription +from pr_agent.tools.pr_generate_labels import PRGenerateLabels +from pr_agent.tools.pr_help_message import PRHelpMessage from pr_agent.tools.pr_information_from_user import PRInformationFromUser +from pr_agent.tools.pr_line_questions import PR_LineQuestions from pr_agent.tools.pr_questions import PRQuestions from pr_agent.tools.pr_reviewer import PRReviewer +from pr_agent.tools.pr_similar_issue import PRSimilarIssue from pr_agent.tools.pr_update_changelog import PRUpdateChangelog -from pr_agent.tools.pr_config import PRConfig command2class = { + "auto_review": PRReviewer, "answer": PRReviewer, "review": PRReviewer, "review_pr": PRReviewer, @@ -26,54 +34,66 @@ "improve_code": PRCodeSuggestions, "ask": PRQuestions, "ask_question": PRQuestions, + "ask_line": PR_LineQuestions, "update_changelog": PRUpdateChangelog, "config": PRConfig, "settings": PRConfig, + "help": PRHelpMessage, + "similar_issue": PRSimilarIssue, + "add_docs": PRAddDocs, + "generate_labels": PRGenerateLabels, } commands = list(command2class.keys()) + class PRAgent: - def __init__(self): - pass + def __init__(self, ai_handler: partial[BaseAiHandler,] = LiteLLMAIHandler): + self.ai_handler = ai_handler # will be initialized in run_action + self.forbidden_cli_args = ['enable_auto_approval'] async def handle_request(self, pr_url, request, notify=None) -> bool: # First, apply repo specific settings if exists - if get_settings().config.use_repo_settings_file: - repo_settings_file = None - try: - git_provider = get_git_provider()(pr_url) - repo_settings = git_provider.get_repo_settings() - if repo_settings: - repo_settings_file = None - fd, repo_settings_file = tempfile.mkstemp(suffix='.toml') - os.write(fd, repo_settings) - get_settings().load_file(repo_settings_file) - finally: - if repo_settings_file: - try: - os.remove(repo_settings_file) - except Exception as e: - logging.error(f"Failed to remove temporary settings file {repo_settings_file}", e) + apply_repo_settings(pr_url) # Then, apply user specific settings if exists - request = request.replace("'", "\\'") - lexer = shlex.shlex(request, posix=True) - lexer.whitespace_split = True - action, *args = list(lexer) + if isinstance(request, str): + request = request.replace("'", "\\'") + lexer = shlex.shlex(request, posix=True) + lexer.whitespace_split = True + action, *args = list(lexer) + else: + action, *args = request + + if args: + for forbidden_arg in self.forbidden_cli_args: + for arg in args: + if forbidden_arg in arg: + get_logger().error( + f"CLI argument for param '{forbidden_arg}' is forbidden. Use instead a configuration file." 
+ ) + return False args = update_settings_from_args(args) action = action.lstrip("/").lower() - if action == "reflect_and_review" and not get_settings().pr_reviewer.ask_and_reflect: - action = "review" - if action == "answer": - if notify: - notify() - await PRReviewer(pr_url, is_answer=True, args=args).run() - elif action in command2class: - if notify: - notify() - await command2class[action](pr_url, args=args).run() - else: + if action not in command2class: + get_logger().debug(f"Unknown command: {action}") return False - return True + with get_logger().contextualize(command=action, pr_url=pr_url): + get_logger().info("PR-Agent request handler started", analytics=True) + if action == "reflect_and_review": + get_settings().pr_reviewer.ask_and_reflect = True + if action == "answer": + if notify: + notify() + await PRReviewer(pr_url, is_answer=True, args=args, ai_handler=self.ai_handler).run() + elif action == "auto_review": + await PRReviewer(pr_url, is_auto=True, args=args, ai_handler=self.ai_handler).run() + elif action in command2class: + if notify: + notify() + + await command2class[action](pr_url, ai_handler=self.ai_handler, args=args).run() + else: + return False + return True diff --git a/pr_agent/algo/__init__.py b/pr_agent/algo/__init__.py index 798fc6c5e..41ee47b67 100644 --- a/pr_agent/algo/__init__.py +++ b/pr_agent/algo/__init__.py @@ -1,14 +1,67 @@ MAX_TOKENS = { - 'gpt-3.5-turbo': 4000, + 'text-embedding-ada-002': 8000, + 'gpt-3.5-turbo': 16000, + 'gpt-3.5-turbo-0125': 16000, 'gpt-3.5-turbo-0613': 4000, - 'gpt-3.5-turbo-0301': 4000, + 'gpt-3.5-turbo-1106': 16000, 'gpt-3.5-turbo-16k': 16000, 'gpt-3.5-turbo-16k-0613': 16000, 'gpt-4': 8000, 'gpt-4-0613': 8000, 'gpt-4-32k': 32000, + 'gpt-4-1106-preview': 128000, # 128K, but may be limited by config.max_model_tokens + 'gpt-4-0125-preview': 128000, # 128K, but may be limited by config.max_model_tokens + 'gpt-4o': 128000, # 128K, but may be limited by config.max_model_tokens + 'gpt-4o-2024-05-13': 128000, # 128K, but may be limited by config.max_model_tokens + 'gpt-4-turbo-preview': 128000, # 128K, but may be limited by config.max_model_tokens + 'gpt-4-turbo-2024-04-09': 128000, # 128K, but may be limited by config.max_model_tokens + 'gpt-4-turbo': 128000, # 128K, but may be limited by config.max_model_tokens + 'gpt-4o-mini': 128000, # 128K, but may be limited by config.max_model_tokens + 'gpt-4o-mini-2024-07-18': 128000, # 128K, but may be limited by config.max_model_tokens + 'gpt-4o-2024-08-06': 128000, # 128K, but may be limited by config.max_model_tokens + 'o1-mini': 128000, # 128K, but may be limited by config.max_model_tokens + 'o1-mini-2024-09-12': 128000, # 128K, but may be limited by config.max_model_tokens + 'o1-preview': 128000, # 128K, but may be limited by config.max_model_tokens + 'o1-preview-2024-09-12': 128000, # 128K, but may be limited by config.max_model_tokens 'claude-instant-1': 100000, 'claude-2': 100000, 'command-nightly': 4096, 'replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1': 4096, + 'meta-llama/Llama-2-7b-chat-hf': 4096, + 'vertex_ai/codechat-bison': 6144, + 'vertex_ai/codechat-bison-32k': 32000, + 'vertex_ai/claude-3-haiku@20240307': 100000, + 'vertex_ai/claude-3-sonnet@20240229': 100000, + 'vertex_ai/claude-3-opus@20240229': 100000, + 'vertex_ai/claude-3-5-sonnet@20240620': 100000, + 'vertex_ai/gemini-1.5-pro': 1048576, + 'vertex_ai/gemini-1.5-flash': 1048576, + 'vertex_ai/gemma2': 8200, + 'codechat-bison': 6144, + 'codechat-bison-32k': 32000, + 
'anthropic.claude-instant-v1': 100000, + 'anthropic.claude-v1': 100000, + 'anthropic.claude-v2': 100000, + 'anthropic/claude-3-opus-20240229': 100000, + 'anthropic/claude-3-5-sonnet-20240620': 100000, + 'bedrock/anthropic.claude-instant-v1': 100000, + 'bedrock/anthropic.claude-v2': 100000, + 'bedrock/anthropic.claude-v2:1': 100000, + 'bedrock/anthropic.claude-3-sonnet-20240229-v1:0': 100000, + 'bedrock/anthropic.claude-3-haiku-20240307-v1:0': 100000, + 'bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0': 100000, + 'claude-3-5-sonnet': 100000, + 'groq/llama3-8b-8192': 8192, + 'groq/llama3-70b-8192': 8192, + 'groq/mixtral-8x7b-32768': 32768, + 'groq/llama-3.1-8b-instant': 131072, + 'groq/llama-3.1-70b-versatile': 131072, + 'groq/llama-3.1-405b-reasoning': 131072, + 'ollama/llama3': 4096, + 'watsonx/meta-llama/llama-3-8b-instruct': 4096, + "watsonx/meta-llama/llama-3-70b-instruct": 4096, + "watsonx/meta-llama/llama-3-405b-instruct": 16384, + "watsonx/ibm/granite-13b-chat-v2": 8191, + "watsonx/ibm/granite-34b-code-instruct": 8191, + "watsonx/mistralai/mistral-large": 32768, } diff --git a/pr_agent/algo/ai_handler.py b/pr_agent/algo/ai_handler.py deleted file mode 100644 index 27e9533a4..000000000 --- a/pr_agent/algo/ai_handler.py +++ /dev/null @@ -1,98 +0,0 @@ -import logging - -import litellm -import openai -from litellm import acompletion -from openai.error import APIError, RateLimitError, Timeout, TryAgain -from retry import retry - -from pr_agent.config_loader import get_settings - -OPENAI_RETRIES = 5 - - -class AiHandler: - """ - This class handles interactions with the OpenAI API for chat completions. - It initializes the API key and other settings from a configuration file, - and provides a method for performing chat completions using the OpenAI ChatCompletion API. - """ - - def __init__(self): - """ - Initializes the OpenAI API key and other settings from a configuration file. - Raises a ValueError if the OpenAI key is missing. - """ - try: - openai.api_key = get_settings().openai.key - litellm.openai_key = get_settings().openai.key - self.azure = False - if get_settings().get("OPENAI.ORG", None): - litellm.organization = get_settings().openai.org - self.deployment_id = get_settings().get("OPENAI.DEPLOYMENT_ID", None) - if get_settings().get("OPENAI.API_TYPE", None): - if get_settings().openai.api_type == "azure": - self.azure = True - litellm.azure_key = get_settings().openai.key - if get_settings().get("OPENAI.API_VERSION", None): - litellm.api_version = get_settings().openai.api_version - if get_settings().get("OPENAI.API_BASE", None): - litellm.api_base = get_settings().openai.api_base - if get_settings().get("ANTHROPIC.KEY", None): - litellm.anthropic_key = get_settings().anthropic.key - if get_settings().get("COHERE.KEY", None): - litellm.cohere_key = get_settings().cohere.key - if get_settings().get("REPLICATE.KEY", None): - litellm.replicate_key = get_settings().replicate.key - except AttributeError as e: - raise ValueError("OpenAI key is required") from e - - @retry(exceptions=(APIError, Timeout, TryAgain, AttributeError, RateLimitError), - tries=OPENAI_RETRIES, delay=2, backoff=2, jitter=(1, 3)) - async def chat_completion(self, model: str, temperature: float, system: str, user: str): - """ - Performs a chat completion using the OpenAI ChatCompletion API. - Retries in case of API errors or timeouts. - - Args: - model (str): The model to use for chat completion. - temperature (float): The temperature parameter for chat completion. 
- system (str): The system message for chat completion. - user (str): The user message for chat completion. - - Returns: - tuple: A tuple containing the response and finish reason from the API. - - Raises: - TryAgain: If the API response is empty or there are no choices in the response. - APIError: If there is an error during OpenAI inference. - Timeout: If there is a timeout during OpenAI inference. - TryAgain: If there is an attribute error during OpenAI inference. - """ - try: - response = await acompletion( - model=model, - deployment_id=self.deployment_id, - messages=[ - {"role": "system", "content": system}, - {"role": "user", "content": user} - ], - temperature=temperature, - azure=self.azure, - force_timeout=get_settings().config.ai_timeout - ) - except (APIError, Timeout, TryAgain) as e: - logging.error("Error during OpenAI inference: ", e) - raise - except (RateLimitError) as e: - logging.error("Rate limit error during OpenAI inference: ", e) - raise - except (Exception) as e: - logging.error("Unknown error during OpenAI inference: ", e) - raise TryAgain from e - if response is None or len(response["choices"]) == 0: - raise TryAgain - resp = response["choices"][0]['message']['content'] - finish_reason = response["choices"][0]["finish_reason"] - print(resp, finish_reason) - return resp, finish_reason diff --git a/pr_agent/algo/ai_handlers/base_ai_handler.py b/pr_agent/algo/ai_handlers/base_ai_handler.py new file mode 100644 index 000000000..e3274eac4 --- /dev/null +++ b/pr_agent/algo/ai_handlers/base_ai_handler.py @@ -0,0 +1,28 @@ +from abc import ABC, abstractmethod + + +class BaseAiHandler(ABC): + """ + This class defines the interface for an AI handler to be used by the PR Agents. + """ + + @abstractmethod + def __init__(self): + pass + + @property + @abstractmethod + def deployment_id(self): + pass + + @abstractmethod + async def chat_completion(self, model: str, system: str, user: str, temperature: float = 0.2, img_path: str = None): + """ + This method should be implemented to return a chat completion from the AI model. 
+ Args: + model (str): the name of the model to use for the chat completion + system (str): the system message string to use for the chat completion + user (str): the user message string to use for the chat completion + temperature (float): the temperature to use for the chat completion + """ + pass diff --git a/pr_agent/algo/ai_handlers/langchain_ai_handler.py b/pr_agent/algo/ai_handlers/langchain_ai_handler.py new file mode 100644 index 000000000..2f3b88c13 --- /dev/null +++ b/pr_agent/algo/ai_handlers/langchain_ai_handler.py @@ -0,0 +1,76 @@ +try: + from langchain_openai import ChatOpenAI, AzureChatOpenAI + from langchain_core.messages import SystemMessage, HumanMessage +except: # we don't enforce langchain as a dependency, so if it's not installed, just move on + pass + +from pr_agent.algo.ai_handlers.base_ai_handler import BaseAiHandler +from pr_agent.config_loader import get_settings +from pr_agent.log import get_logger + +from openai import APIError, RateLimitError, Timeout +from retry import retry +import functools + +OPENAI_RETRIES = 5 + + +class LangChainOpenAIHandler(BaseAiHandler): + def __init__(self): + # Initialize OpenAIHandler specific attributes here + super().__init__() + self.azure = get_settings().get("OPENAI.API_TYPE", "").lower() == "azure" + + # Create a default unused chat object to trigger early validation + self._create_chat(self.deployment_id) + + def chat(self, messages: list, model: str, temperature: float): + chat = self._create_chat(self.deployment_id) + return chat.invoke(input=messages, model=model, temperature=temperature) + + @property + def deployment_id(self): + """ + Returns the deployment ID for the OpenAI API. + """ + return get_settings().get("OPENAI.DEPLOYMENT_ID", None) + + @retry(exceptions=(APIError, Timeout, AttributeError, RateLimitError), + tries=OPENAI_RETRIES, delay=2, backoff=2, jitter=(1, 3)) + async def chat_completion(self, model: str, system: str, user: str, temperature: float = 0.2): + try: + messages = [SystemMessage(content=system), HumanMessage(content=user)] + + # get a chat completion from the formatted messages + resp = self.chat(messages, model=model, temperature=temperature) + finish_reason = "completed" + return resp.content, finish_reason + + except (Exception) as e: + get_logger().error("Unknown error during OpenAI inference: ", e) + raise e + + def _create_chat(self, deployment_id=None): + try: + if self.azure: + # using a partial function so we can set the deployment_id later to support fallback_deployments + # but still need to access the other settings now so we can raise a proper exception if they're missing + return AzureChatOpenAI( + openai_api_key=get_settings().openai.key, + openai_api_version=get_settings().openai.api_version, + azure_deployment=deployment_id, + azure_endpoint=get_settings().openai.api_base, + ) + else: + # for llms that compatible with openai, should use custom api base + openai_api_base = get_settings().get("OPENAI.API_BASE", None) + if openai_api_base is None or len(openai_api_base) == 0: + return ChatOpenAI(openai_api_key=get_settings().openai.key) + else: + return ChatOpenAI(openai_api_key=get_settings().openai.key, openai_api_base=openai_api_base) + except AttributeError as e: + if getattr(e, "name"): + raise ValueError(f"OpenAI {e.name} is required") from e + else: + raise e + diff --git a/pr_agent/algo/ai_handlers/litellm_ai_handler.py b/pr_agent/algo/ai_handlers/litellm_ai_handler.py new file mode 100644 index 000000000..438123862 --- /dev/null +++ 
b/pr_agent/algo/ai_handlers/litellm_ai_handler.py @@ -0,0 +1,241 @@ +import os +import requests +import litellm +import openai +from litellm import acompletion +from tenacity import retry, retry_if_exception_type, stop_after_attempt + +from pr_agent.algo.ai_handlers.base_ai_handler import BaseAiHandler +from pr_agent.config_loader import get_settings +from pr_agent.log import get_logger + +OPENAI_RETRIES = 5 + + +class LiteLLMAIHandler(BaseAiHandler): + """ + This class handles interactions with the OpenAI API for chat completions. + It initializes the API key and other settings from a configuration file, + and provides a method for performing chat completions using the OpenAI ChatCompletion API. + """ + + def __init__(self): + """ + Initializes the OpenAI API key and other settings from a configuration file. + Raises a ValueError if the OpenAI key is missing. + """ + self.azure = False + self.api_base = None + self.repetition_penalty = None + if get_settings().get("OPENAI.KEY", None): + openai.api_key = get_settings().openai.key + litellm.openai_key = get_settings().openai.key + elif 'OPENAI_API_KEY' not in os.environ: + litellm.api_key = "dummy_key" + if get_settings().get("aws.AWS_ACCESS_KEY_ID"): + assert get_settings().aws.AWS_SECRET_ACCESS_KEY and get_settings().aws.AWS_REGION_NAME, "AWS credentials are incomplete" + os.environ["AWS_ACCESS_KEY_ID"] = get_settings().aws.AWS_ACCESS_KEY_ID + os.environ["AWS_SECRET_ACCESS_KEY"] = get_settings().aws.AWS_SECRET_ACCESS_KEY + os.environ["AWS_REGION_NAME"] = get_settings().aws.AWS_REGION_NAME + if get_settings().get("litellm.use_client"): + litellm_token = get_settings().get("litellm.LITELLM_TOKEN") + assert litellm_token, "LITELLM_TOKEN is required" + os.environ["LITELLM_TOKEN"] = litellm_token + litellm.use_client = True + if get_settings().get("LITELLM.DROP_PARAMS", None): + litellm.drop_params = get_settings().litellm.drop_params + if get_settings().get("LITELLM.SUCCESS_CALLBACK", None): + litellm.success_callback = get_settings().litellm.success_callback + if get_settings().get("LITELLM.FAILURE_CALLBACK", None): + litellm.failure_callback = get_settings().litellm.failure_callback + if get_settings().get("LITELLM.SERVICE_CALLBACK", None): + litellm.service_callback = get_settings().litellm.service_callback + if get_settings().get("OPENAI.ORG", None): + litellm.organization = get_settings().openai.org + if get_settings().get("OPENAI.API_TYPE", None): + if get_settings().openai.api_type == "azure": + self.azure = True + litellm.azure_key = get_settings().openai.key + if get_settings().get("OPENAI.API_VERSION", None): + litellm.api_version = get_settings().openai.api_version + if get_settings().get("OPENAI.API_BASE", None): + litellm.api_base = get_settings().openai.api_base + if get_settings().get("ANTHROPIC.KEY", None): + litellm.anthropic_key = get_settings().anthropic.key + if get_settings().get("COHERE.KEY", None): + litellm.cohere_key = get_settings().cohere.key + if get_settings().get("GROQ.KEY", None): + litellm.api_key = get_settings().groq.key + if get_settings().get("REPLICATE.KEY", None): + litellm.replicate_key = get_settings().replicate.key + if get_settings().get("HUGGINGFACE.KEY", None): + litellm.huggingface_key = get_settings().huggingface.key + if get_settings().get("HUGGINGFACE.API_BASE", None) and 'huggingface' in get_settings().config.model: + litellm.api_base = get_settings().huggingface.api_base + self.api_base = get_settings().huggingface.api_base + if get_settings().get("OLLAMA.API_BASE", None): + litellm.api_base = 
get_settings().ollama.api_base + self.api_base = get_settings().ollama.api_base + if get_settings().get("HUGGINGFACE.REPETITION_PENALTY", None): + self.repetition_penalty = float(get_settings().huggingface.repetition_penalty) + if get_settings().get("VERTEXAI.VERTEX_PROJECT", None): + litellm.vertex_project = get_settings().vertexai.vertex_project + litellm.vertex_location = get_settings().get( + "VERTEXAI.VERTEX_LOCATION", None + ) + def prepare_logs(self, response, system, user, resp, finish_reason): + response_log = response.dict().copy() + response_log['system'] = system + response_log['user'] = user + response_log['output'] = resp + response_log['finish_reason'] = finish_reason + if hasattr(self, 'main_pr_language'): + response_log['main_pr_language'] = self.main_pr_language + else: + response_log['main_pr_language'] = 'unknown' + return response_log + + def add_litellm_callbacks(selfs, kwargs) -> dict: + captured_extra = [] + + def capture_logs(message): + # Parsing the log message and context + record = message.record + log_entry = {} + if record.get('extra', None).get('command', None) is not None: + log_entry.update({"command": record['extra']["command"]}) + if record.get('extra', {}).get('pr_url', None) is not None: + log_entry.update({"pr_url": record['extra']["pr_url"]}) + + # Append the log entry to the captured_logs list + captured_extra.append(log_entry) + + # Adding the custom sink to Loguru + handler_id = get_logger().add(capture_logs) + get_logger().debug("Capturing logs for litellm callbacks") + get_logger().remove(handler_id) + + context = captured_extra[0] if len(captured_extra) > 0 else None + + command = context.get("command", "unknown") + pr_url = context.get("pr_url", "unknown") + git_provider = get_settings().config.git_provider + + metadata = dict() + callbacks = litellm.success_callback + litellm.failure_callback + litellm.service_callback + if "langfuse" in callbacks: + metadata.update({ + "trace_name": command, + "tags": [git_provider, command], + "trace_metadata": { + "command": command, + "pr_url": pr_url, + }, + }) + if "langsmith" in callbacks: + metadata.update({ + "run_name": command, + "tags": [git_provider, command], + "extra": { + "metadata": { + "command": command, + "pr_url": pr_url, + } + }, + }) + + # Adding the captured logs to the kwargs + kwargs["metadata"] = metadata + + return kwargs + + @property + def deployment_id(self): + """ + Returns the deployment ID for the OpenAI API. + """ + return get_settings().get("OPENAI.DEPLOYMENT_ID", None) + + @retry( + retry=retry_if_exception_type((openai.APIError, openai.APIConnectionError, openai.APITimeoutError)), # No retry on RateLimitError + stop=stop_after_attempt(OPENAI_RETRIES) + ) + async def chat_completion(self, model: str, system: str, user: str, temperature: float = 0.2, img_path: str = None): + try: + resp, finish_reason = None, None + deployment_id = self.deployment_id + if self.azure: + model = 'azure/' + model + if 'claude' in model and not system: + system = "No system prompt provided" + get_logger().warning( + "Empty system prompt for claude model. 
Adding a newline character to prevent OpenAI API error.") + messages = [{"role": "system", "content": system}, {"role": "user", "content": user}] + if img_path: + try: + # check if the image link is alive + r = requests.head(img_path, allow_redirects=True) + if r.status_code == 404: + error_msg = f"The image link is not [alive](img_path).\nPlease repost the original image as a comment, and send the question again with 'quote reply' (see [instructions](https://pr-agent-docs.codium.ai/tools/ask/#ask-on-images-using-the-pr-code-as-context))." + get_logger().error(error_msg) + return f"{error_msg}", "error" + except Exception as e: + get_logger().error(f"Error fetching image: {img_path}", e) + return f"Error fetching image: {img_path}", "error" + messages[1]["content"] = [{"type": "text", "text": messages[1]["content"]}, + {"type": "image_url", "image_url": {"url": img_path}}] + + kwargs = { + "model": model, + "deployment_id": deployment_id, + "messages": messages, + "temperature": temperature, + "timeout": get_settings().config.ai_timeout, + "api_base": self.api_base, + } + + if get_settings().litellm.get("enable_callbacks", False): + kwargs = self.add_litellm_callbacks(kwargs) + + seed = get_settings().config.get("seed", -1) + if temperature > 0 and seed >= 0: + raise ValueError(f"Seed ({seed}) is not supported with temperature ({temperature}) > 0") + elif seed >= 0: + get_logger().info(f"Using fixed seed of {seed}") + kwargs["seed"] = seed + + if self.repetition_penalty: + kwargs["repetition_penalty"] = self.repetition_penalty + + get_logger().debug("Prompts", artifact={"system": system, "user": user}) + + if get_settings().config.verbosity_level >= 2: + get_logger().info(f"\nSystem prompt:\n{system}") + get_logger().info(f"\nUser prompt:\n{user}") + + response = await acompletion(**kwargs) + except (openai.APIError, openai.APITimeoutError) as e: + get_logger().warning(f"Error during LLM inference: {e}") + raise + except (openai.RateLimitError) as e: + get_logger().error(f"Rate limit error during LLM inference: {e}") + raise + except (Exception) as e: + get_logger().warning(f"Unknown error during LLM inference: {e}") + raise openai.APIError from e + if response is None or len(response["choices"]) == 0: + raise openai.APIError + else: + resp = response["choices"][0]['message']['content'] + finish_reason = response["choices"][0]["finish_reason"] + get_logger().debug(f"\nAI response:\n{resp}") + + # log the full response for debugging + response_log = self.prepare_logs(response, system, user, resp, finish_reason) + get_logger().debug("Full_response", artifact=response_log) + + # for CLI debugging + if get_settings().config.verbosity_level >= 2: + get_logger().info(f"\nAI response:\n{resp}") + + return resp, finish_reason diff --git a/pr_agent/algo/ai_handlers/openai_ai_handler.py b/pr_agent/algo/ai_handlers/openai_ai_handler.py new file mode 100644 index 000000000..999f3d3fb --- /dev/null +++ b/pr_agent/algo/ai_handlers/openai_ai_handler.py @@ -0,0 +1,68 @@ +from pr_agent.algo.ai_handlers.base_ai_handler import BaseAiHandler +import openai +from openai.error import APIError, RateLimitError, Timeout, TryAgain +from retry import retry + +from pr_agent.config_loader import get_settings +from pr_agent.log import get_logger + +OPENAI_RETRIES = 5 + + +class OpenAIHandler(BaseAiHandler): + def __init__(self): + # Initialize OpenAIHandler specific attributes here + try: + super().__init__() + openai.api_key = get_settings().openai.key + if get_settings().get("OPENAI.ORG", None): + openai.organization 
= get_settings().openai.org + if get_settings().get("OPENAI.API_TYPE", None): + if get_settings().openai.api_type == "azure": + self.azure = True + openai.azure_key = get_settings().openai.key + if get_settings().get("OPENAI.API_VERSION", None): + openai.api_version = get_settings().openai.api_version + if get_settings().get("OPENAI.API_BASE", None): + openai.api_base = get_settings().openai.api_base + + except AttributeError as e: + raise ValueError("OpenAI key is required") from e + + @property + def deployment_id(self): + """ + Returns the deployment ID for the OpenAI API. + """ + return get_settings().get("OPENAI.DEPLOYMENT_ID", None) + + @retry(exceptions=(APIError, Timeout, TryAgain, AttributeError, RateLimitError), + tries=OPENAI_RETRIES, delay=2, backoff=2, jitter=(1, 3)) + async def chat_completion(self, model: str, system: str, user: str, temperature: float = 0.2): + try: + deployment_id = self.deployment_id + get_logger().info("System: ", system) + get_logger().info("User: ", user) + messages = [{"role": "system", "content": system}, {"role": "user", "content": user}] + + chat_completion = await openai.ChatCompletion.acreate( + model=model, + deployment_id=deployment_id, + messages=messages, + temperature=temperature, + ) + resp = chat_completion["choices"][0]['message']['content'] + finish_reason = chat_completion["choices"][0]["finish_reason"] + usage = chat_completion.get("usage") + get_logger().info("AI response", response=resp, messages=messages, finish_reason=finish_reason, + model=model, usage=usage) + return resp, finish_reason + except (APIError, Timeout, TryAgain) as e: + get_logger().error("Error during OpenAI inference: ", e) + raise + except (RateLimitError) as e: + get_logger().error("Rate limit error during OpenAI inference: ", e) + raise + except (Exception) as e: + get_logger().error("Unknown error during OpenAI inference: ", e) + raise TryAgain from e diff --git a/pr_agent/algo/file_filter.py b/pr_agent/algo/file_filter.py new file mode 100644 index 000000000..5c575eefc --- /dev/null +++ b/pr_agent/algo/file_filter.py @@ -0,0 +1,65 @@ +import fnmatch +import re + +from pr_agent.config_loader import get_settings + + +def filter_ignored(files, platform = 'github'): + """ + Filter out files that match the ignore patterns. 
+ """ + + try: + # load regex patterns, and translate glob patterns to regex + patterns = get_settings().ignore.regex + if isinstance(patterns, str): + patterns = [patterns] + glob_setting = get_settings().ignore.glob + if isinstance(glob_setting, str): # --ignore.glob=[.*utils.py], --ignore.glob=.*utils.py + glob_setting = glob_setting.strip('[]').split(",") + patterns += [fnmatch.translate(glob) for glob in glob_setting] + + # compile all valid patterns + compiled_patterns = [] + for r in patterns: + try: + compiled_patterns.append(re.compile(r)) + except re.error: + pass + + # keep filenames that _don't_ match the ignore regex + if files and isinstance(files, list): + for r in compiled_patterns: + if platform == 'github': + files = [f for f in files if (f.filename and not r.match(f.filename))] + elif platform == 'bitbucket': + # files = [f for f in files if (f.new.path and not r.match(f.new.path))] + files_o = [] + for f in files: + if hasattr(f, 'new'): + if f.new and f.new.path and not r.match(f.new.path): + files_o.append(f) + continue + if hasattr(f, 'old'): + if f.old and f.old.path and not r.match(f.old.path): + files_o.append(f) + continue + files = files_o + elif platform == 'gitlab': + # files = [f for f in files if (f['new_path'] and not r.match(f['new_path']))] + files_o = [] + for f in files: + if 'new_path' in f and f['new_path'] and not r.match(f['new_path']): + files_o.append(f) + continue + if 'old_path' in f and f['old_path'] and not r.match(f['old_path']): + files_o.append(f) + continue + files = files_o + elif platform == 'azure': + files = [f for f in files if not r.match(f)] + + except Exception as e: + print(f"Could not filter file list: {e}") + + return files diff --git a/pr_agent/algo/git_patch_processing.py b/pr_agent/algo/git_patch_processing.py index 1aec00066..180d7489e 100644 --- a/pr_agent/algo/git_patch_processing.py +++ b/pr_agent/algo/git_patch_processing.py @@ -1,30 +1,63 @@ from __future__ import annotations -import logging import re +import traceback from pr_agent.config_loader import get_settings +from pr_agent.algo.types import EDIT_TYPE, FilePatchInfo +from pr_agent.log import get_logger -def extend_patch(original_file_str, patch_str, num_lines) -> str: - """ - Extends the given patch to include a specified number of surrounding lines. - - Args: - original_file_str (str): The original file to which the patch will be applied. - patch_str (str): The patch to be applied to the original file. - num_lines (int): The number of surrounding lines to include in the extended patch. - - Returns: - str: The extended patch string. 
- """ - if not patch_str or num_lines == 0: +def extend_patch(original_file_str, patch_str, patch_extra_lines_before=0, + patch_extra_lines_after=0, filename: str = "") -> str: + if not patch_str or (patch_extra_lines_before == 0 and patch_extra_lines_after == 0) or not original_file_str: + return patch_str + + original_file_str = decode_if_bytes(original_file_str) + if not original_file_str: + return patch_str + + if should_skip_patch(filename): return patch_str - if type(original_file_str) == bytes: - original_file_str = original_file_str.decode('utf-8') + try: + extended_patch_str = process_patch_lines(patch_str, original_file_str, + patch_extra_lines_before, patch_extra_lines_after) + except Exception as e: + get_logger().warning(f"Failed to extend patch: {e}", artifact={"traceback": traceback.format_exc()}) + return patch_str + + return extended_patch_str + + +def decode_if_bytes(original_file_str): + if isinstance(original_file_str, bytes): + try: + return original_file_str.decode('utf-8') + except UnicodeDecodeError: + encodings_to_try = ['iso-8859-1', 'latin-1', 'ascii', 'utf-16'] + for encoding in encodings_to_try: + try: + return original_file_str.decode(encoding) + except UnicodeDecodeError: + continue + return "" + return original_file_str + + +def should_skip_patch(filename): + patch_extension_skip_types = get_settings().config.patch_extension_skip_types + if patch_extension_skip_types and filename: + return any(filename.endswith(skip_type) for skip_type in patch_extension_skip_types) + return False + + +def process_patch_lines(patch_str, original_file_str, patch_extra_lines_before, patch_extra_lines_after): + allow_dynamic_context = get_settings().config.allow_dynamic_context + patch_extra_lines_before_dynamic = get_settings().config.max_extra_lines_before_dynamic_context original_lines = original_file_str.splitlines() + len_original_lines = len(original_lines) patch_lines = patch_str.splitlines() extended_patch_lines = [] @@ -35,43 +68,100 @@ def extend_patch(original_file_str, patch_str, num_lines) -> str: for line in patch_lines: if line.startswith('@@'): match = RE_HUNK_HEADER.match(line) + # identify hunk header if match: - # finish previous hunk - if start1 != -1: - extended_patch_lines.extend( - original_lines[start1 + size1 - 1:start1 + size1 - 1 + num_lines]) - - try: - start1, size1, start2, size2 = map(int, match.groups()[:4]) - except: # '@@ -0,0 +1 @@' case - start1, size1, size2 = map(int, match.groups()[:3]) - start2 = 0 - section_header = match.groups()[4] - extended_start1 = max(1, start1 - num_lines) - extended_size1 = size1 + (start1 - extended_start1) + num_lines - extended_start2 = max(1, start2 - num_lines) - extended_size2 = size2 + (start2 - extended_start2) + num_lines + # finish processing previous hunk + if start1 != -1 and patch_extra_lines_after > 0: + delta_lines = [f' {line}' for line in original_lines[start1 + size1 - 1:start1 + size1 - 1 + patch_extra_lines_after]] + extended_patch_lines.extend(delta_lines) + + section_header, size1, size2, start1, start2 = extract_hunk_headers(match) + + if patch_extra_lines_before > 0 or patch_extra_lines_after > 0: + def _calc_context_limits(patch_lines_before): + extended_start1 = max(1, start1 - patch_lines_before) + extended_size1 = size1 + (start1 - extended_start1) + patch_extra_lines_after + extended_start2 = max(1, start2 - patch_lines_before) + extended_size2 = size2 + (start2 - extended_start2) + patch_extra_lines_after + if extended_start1 - 1 + extended_size1 > len_original_lines: + # we cannot 
extend beyond the original file + delta_cap = extended_start1 - 1 + extended_size1 - len_original_lines + extended_size1 = max(extended_size1 - delta_cap, size1) + extended_size2 = max(extended_size2 - delta_cap, size2) + return extended_start1, extended_size1, extended_start2, extended_size2 + + if allow_dynamic_context: + extended_start1, extended_size1, extended_start2, extended_size2 = \ + _calc_context_limits(patch_extra_lines_before_dynamic) + lines_before = original_lines[extended_start1 - 1:start1 - 1] + found_header = False + for i, line, in enumerate(lines_before): + if section_header in line: + found_header = True + # Update start and size in one line each + extended_start1, extended_start2 = extended_start1 + i, extended_start2 + i + extended_size1, extended_size2 = extended_size1 - i, extended_size2 - i + # get_logger().debug(f"Found section header in line {i} before the hunk") + section_header = '' + break + if not found_header: + # get_logger().debug(f"Section header not found in the extra lines before the hunk") + extended_start1, extended_size1, extended_start2, extended_size2 = \ + _calc_context_limits(patch_extra_lines_before) + else: + extended_start1, extended_size1, extended_start2, extended_size2 = \ + _calc_context_limits(patch_extra_lines_before) + + delta_lines = [f' {line}' for line in original_lines[extended_start1 - 1:start1 - 1]] + + # logic to remove section header if its in the extra delta lines (in dynamic context, this is also done) + if section_header and not allow_dynamic_context: + for line in delta_lines: + if section_header in line: + section_header = '' # remove section header if it is in the extra delta lines + break + else: + extended_start1 = start1 + extended_size1 = size1 + extended_start2 = start2 + extended_size2 = size2 + delta_lines = [] + extended_patch_lines.append('') extended_patch_lines.append( f'@@ -{extended_start1},{extended_size1} ' f'+{extended_start2},{extended_size2} @@ {section_header}') - extended_patch_lines.extend( - original_lines[extended_start1 - 1:start1 - 1]) # one to zero based + extended_patch_lines.extend(delta_lines) # one to zero based continue extended_patch_lines.append(line) except Exception as e: - if get_settings().config.verbosity_level >= 2: - logging.error(f"Failed to extend patch: {e}") + get_logger().warning(f"Failed to extend patch: {e}", artifact={"traceback": traceback.format_exc()}) return patch_str - # finish previous hunk - if start1 != -1: - extended_patch_lines.extend( - original_lines[start1 + size1 - 1:start1 + size1 - 1 + num_lines]) + # finish processing last hunk + if start1 != -1 and patch_extra_lines_after > 0: + delta_lines = original_lines[start1 + size1 - 1:start1 + size1 - 1 + patch_extra_lines_after] + # add space at the beginning of each extra line + delta_lines = [f' {line}' for line in delta_lines] + extended_patch_lines.extend(delta_lines) extended_patch_str = '\n'.join(extended_patch_lines) return extended_patch_str +def extract_hunk_headers(match): + res = list(match.groups()) + for i in range(len(res)): + if res[i] is None: + res[i] = 0 + try: + start1, size1, start2, size2 = map(int, res[:4]) + except: # '@@ -0,0 +1 @@' case + start1, size1, size2 = map(int, res[:3]) + start2 = 0 + section_header = res[4] + return section_header, size1, size2, start1, start2 + + def omit_deletion_hunks(patch_lines) -> str: """ Omit deletion hunks from the patch and return the modified patch. 
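As an aside on the context-extension logic added in this hunk: the range arithmetic of `_calc_context_limits` can be pictured with a small standalone sketch (simplified to the original-file side only, with made-up numbers; this is not the function from the patch itself).

```python
def calc_extended_hunk(start1: int, size1: int, len_original_lines: int,
                       extra_before: int, extra_after: int) -> tuple[int, int]:
    """Extend a hunk's original-file range by extra context lines, clamped to the file length."""
    extended_start1 = max(1, start1 - extra_before)                  # never start before line 1
    extended_size1 = size1 + (start1 - extended_start1) + extra_after
    overflow = extended_start1 - 1 + extended_size1 - len_original_lines
    if overflow > 0:                                                 # never run past the last line
        extended_size1 = max(extended_size1 - overflow, size1)
    return extended_start1, extended_size1


# A hunk at line 40 spanning 6 lines, in a 50-line file, with 3 extra lines before and after
print(calc_extended_hunk(40, 6, 50, 3, 3))   # (37, 12): the extended hunk covers lines 37..48
# The same hunk near the end of a 45-line file gets capped
print(calc_extended_hunk(40, 6, 45, 3, 3))   # (37, 9): capped so it stops at line 45
```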
@@ -101,9 +191,10 @@ def omit_deletion_hunks(patch_lines) -> str: inside_hunk = True else: temp_hunk.append(line) - edit_type = line[0] - if edit_type == '+': - add_hunk = True + if line: + edit_type = line[0] + if edit_type == '+': + add_hunk = True if inside_hunk and add_hunk: added_patched.extend(temp_hunk) @@ -111,7 +202,7 @@ def omit_deletion_hunks(patch_lines) -> str: def handle_patch_deletions(patch: str, original_file_content_str: str, - new_file_content_str: str, file_name: str) -> str: + new_file_content_str: str, file_name: str, edit_type: EDIT_TYPE = EDIT_TYPE.UNKNOWN) -> str: """ Handle entire file or deletion patches. @@ -128,17 +219,17 @@ def handle_patch_deletions(patch: str, original_file_content_str: str, str: The modified patch with deletion hunks omitted. """ - if not new_file_content_str: + if not new_file_content_str and (edit_type == EDIT_TYPE.DELETED or edit_type == EDIT_TYPE.UNKNOWN): # logic for handling deleted files - don't show patch, just show that the file was deleted if get_settings().config.verbosity_level > 0: - logging.info(f"Processing file: {file_name}, minimizing deletion file") + get_logger().info(f"Processing file: {file_name}, minimizing deletion file") patch = None # file was deleted else: patch_lines = patch.splitlines() patch_new = omit_deletion_hunks(patch_lines) if patch != patch_new: if get_settings().config.verbosity_level > 0: - logging.info(f"Processing file: {file_name}, hunks were deleted") + get_logger().info(f"Processing file: {file_name}, hunks were deleted") patch = patch_new return patch @@ -157,7 +248,7 @@ def convert_to_hunks_with_lines_numbers(patch: str, file) -> str: example output: ## src/file.ts ---new hunk-- +__new hunk__ 881 line1 882 line2 883 line3 @@ -166,7 +257,7 @@ def convert_to_hunks_with_lines_numbers(patch: str, file) -> str: 889 line6 890 line7 ... ---old hunk-- +__old hunk__ line1 line2 - line3 @@ -175,9 +266,11 @@ def convert_to_hunks_with_lines_numbers(patch: str, file) -> str: line6 ... """ - - patch_with_lines_str = f"## {file.filename}\n" - import re + # if the file was deleted, return a message indicating that the file was deleted + if hasattr(file, 'edit_type') and file.edit_type == EDIT_TYPE.DELETED: + return f"\n\n## file '{file.filename.strip()}' was deleted\n" + + patch_with_lines_str = f"\n\n## File: '{file.filename.strip()}'\n" patch_lines = patch.splitlines() RE_HUNK_HEADER = re.compile( r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? 
@@[ ]?(.*)") @@ -185,46 +278,112 @@ def convert_to_hunks_with_lines_numbers(patch: str, file) -> str: old_content_lines = [] match = None start1, size1, start2, size2 = -1, -1, -1, -1 - for line in patch_lines: - if 'no newline at end of file' in line.lower(): + prev_header_line = [] + header_line = [] + for line_i, line in enumerate(patch_lines): + if 'no newline at end of file' in line.lower().strip().strip('//'): continue if line.startswith('@@'): + header_line = line match = RE_HUNK_HEADER.match(line) - if match and new_content_lines: # found a new hunk, split the previous lines + if match and (new_content_lines or old_content_lines): # found a new hunk, split the previous lines + if prev_header_line: + patch_with_lines_str += f'\n{prev_header_line}\n' if new_content_lines: - patch_with_lines_str += '\n--new hunk--\n' - for i, line_new in enumerate(new_content_lines): - patch_with_lines_str += f"{start2 + i} {line_new}\n" + is_plus_lines = any([line.startswith('+') for line in new_content_lines]) + if is_plus_lines: + patch_with_lines_str = patch_with_lines_str.rstrip() + '\n__new hunk__\n' + for i, line_new in enumerate(new_content_lines): + patch_with_lines_str += f"{start2 + i} {line_new}\n" if old_content_lines: - patch_with_lines_str += '--old hunk--\n' - for line_old in old_content_lines: - patch_with_lines_str += f"{line_old}\n" + is_minus_lines = any([line.startswith('-') for line in old_content_lines]) + if is_minus_lines: + patch_with_lines_str = patch_with_lines_str.rstrip() + '\n__old hunk__\n' + for line_old in old_content_lines: + patch_with_lines_str += f"{line_old}\n" new_content_lines = [] old_content_lines = [] - try: - start1, size1, start2, size2 = map(int, match.groups()[:4]) - except: # '@@ -0,0 +1 @@' case - start1, size1, size2 = map(int, match.groups()[:3]) - start2 = 0 + if match: + prev_header_line = header_line + + section_header, size1, size2, start1, start2 = extract_hunk_headers(match) elif line.startswith('+'): new_content_lines.append(line) elif line.startswith('-'): old_content_lines.append(line) else: + if not line and line_i: # if this line is empty and the next line is a hunk header, skip it + if line_i + 1 < len(patch_lines) and patch_lines[line_i + 1].startswith('@@'): + continue + elif line_i + 1 == len(patch_lines): + continue new_content_lines.append(line) old_content_lines.append(line) # finishing last hunk if match and new_content_lines: + patch_with_lines_str += f'\n{header_line}\n' if new_content_lines: - patch_with_lines_str += '\n--new hunk--\n' - for i, line_new in enumerate(new_content_lines): - patch_with_lines_str += f"{start2 + i} {line_new}\n" + is_plus_lines = any([line.startswith('+') for line in new_content_lines]) + if is_plus_lines: + patch_with_lines_str = patch_with_lines_str.rstrip() + '\n__new hunk__\n' + for i, line_new in enumerate(new_content_lines): + patch_with_lines_str += f"{start2 + i} {line_new}\n" if old_content_lines: - patch_with_lines_str += '\n--old hunk--\n' - for line_old in old_content_lines: - patch_with_lines_str += f"{line_old}\n" + is_minus_lines = any([line.startswith('-') for line in old_content_lines]) + if is_minus_lines: + patch_with_lines_str = patch_with_lines_str.rstrip() + '\n__old hunk__\n' + for line_old in old_content_lines: + patch_with_lines_str += f"{line_old}\n" + + return patch_with_lines_str.rstrip() + + +def extract_hunk_lines_from_patch(patch: str, file_name, line_start, line_end, side) -> tuple[str, str]: + + patch_with_lines_str = f"\n\n## File: '{file_name.strip()}'\n\n" + 
selected_lines = "" + patch_lines = patch.splitlines() + RE_HUNK_HEADER = re.compile( + r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@[ ]?(.*)") + match = None + start1, size1, start2, size2 = -1, -1, -1, -1 + skip_hunk = False + selected_lines_num = 0 + for line in patch_lines: + if 'no newline at end of file' in line.lower(): + continue + + if line.startswith('@@'): + skip_hunk = False + selected_lines_num = 0 + header_line = line + + match = RE_HUNK_HEADER.match(line) + + section_header, size1, size2, start1, start2 = extract_hunk_headers(match) + + # check if line range is in this hunk + if side.lower() == 'left': + # check if line range is in this hunk + if not (start1 <= line_start <= start1 + size1): + skip_hunk = True + continue + elif side.lower() == 'right': + if not (start2 <= line_start <= start2 + size2): + skip_hunk = True + continue + patch_with_lines_str += f'\n{header_line}\n' + + elif not skip_hunk: + if side.lower() == 'right' and line_start <= start2 + selected_lines_num <= line_end: + selected_lines += line + '\n' + if side.lower() == 'left' and start1 <= selected_lines_num + start1 <= line_end: + selected_lines += line + '\n' + patch_with_lines_str += line + '\n' + if not line.startswith('-'): # currently we don't support /ask line for deleted lines + selected_lines_num += 1 - return patch_with_lines_str.strip() + return patch_with_lines_str.rstrip(), selected_lines.rstrip() \ No newline at end of file diff --git a/pr_agent/algo/language_handler.py b/pr_agent/algo/language_handler.py index 586a31619..7c1039986 100644 --- a/pr_agent/algo/language_handler.py +++ b/pr_agent/algo/language_handler.py @@ -3,20 +3,24 @@ from pr_agent.config_loader import get_settings -language_extension_map_org = get_settings().language_extension_map_org -language_extension_map = {k.lower(): v for k, v in language_extension_map_org.items()} -# Bad Extensions, source: https://github.com/EleutherAI/github-downloader/blob/345e7c4cbb9e0dc8a0615fd995a08bf9d73b3fe6/download_repo_text.py # noqa: E501 -bad_extensions = get_settings().bad_extensions.default -if get_settings().config.use_extra_bad_extensions: - bad_extensions += get_settings().bad_extensions.extra def filter_bad_extensions(files): - return [f for f in files if f.filename is not None and is_valid_file(f.filename)] + # Bad Extensions, source: https://github.com/EleutherAI/github-downloader/blob/345e7c4cbb9e0dc8a0615fd995a08bf9d73b3fe6/download_repo_text.py # noqa: E501 + bad_extensions = get_settings().bad_extensions.default + if get_settings().config.use_extra_bad_extensions: + bad_extensions += get_settings().bad_extensions.extra + return [f for f in files if f.filename is not None and is_valid_file(f.filename, bad_extensions)] -def is_valid_file(filename): +def is_valid_file(filename:str, bad_extensions=None) -> bool: + if not filename: + return False + if not bad_extensions: + bad_extensions = get_settings().bad_extensions.default + if get_settings().config.use_extra_bad_extensions: + bad_extensions += get_settings().bad_extensions.extra return filename.split('.')[-1] not in bad_extensions @@ -29,6 +33,8 @@ def sort_files_by_main_languages(languages: Dict, files: list): # languages_sorted = sorted(languages, key=lambda x: x[1], reverse=True) # get all extensions for the languages main_extensions = [] + language_extension_map_org = get_settings().language_extension_map_org + language_extension_map = {k.lower(): v for k, v in language_extension_map_org.items()} for language in languages_sorted_list: if language.lower() in 
language_extension_map: main_extensions.append(language_extension_map[language.lower()]) @@ -42,6 +48,11 @@ def sort_files_by_main_languages(languages: Dict, files: list): files_sorted = [] rest_files = {} + # if no languages detected, put all files in the "Other" category + if not languages: + files_sorted = [({"language": "Other", "files": list(files_filtered)})] + return files_sorted + main_extensions_flat = [] for ext in main_extensions: main_extensions_flat.extend(ext) diff --git a/pr_agent/algo/pr_processing.py b/pr_agent/algo/pr_processing.py index 3a08a86db..95d2fda72 100644 --- a/pr_agent/algo/pr_processing.py +++ b/pr_agent/algo/pr_processing.py @@ -1,102 +1,196 @@ from __future__ import annotations -import difflib -import logging -import re import traceback -from typing import Any, Callable, List, Tuple +from typing import Callable, List, Tuple from github import RateLimitExceededException -from pr_agent.algo import MAX_TOKENS from pr_agent.algo.git_patch_processing import convert_to_hunks_with_lines_numbers, extend_patch, handle_patch_deletions from pr_agent.algo.language_handler import sort_files_by_main_languages -from pr_agent.algo.token_handler import TokenHandler, get_token_encoder +from pr_agent.algo.file_filter import filter_ignored +from pr_agent.algo.token_handler import TokenHandler +from pr_agent.algo.utils import get_max_tokens, clip_tokens, ModelType from pr_agent.config_loader import get_settings -from pr_agent.git_providers.git_provider import FilePatchInfo, GitProvider +from pr_agent.git_providers.git_provider import GitProvider +from pr_agent.algo.types import EDIT_TYPE, FilePatchInfo +from pr_agent.log import get_logger DELETED_FILES_ = "Deleted files:\n" -MORE_MODIFIED_FILES_ = "More modified files:\n" +MORE_MODIFIED_FILES_ = "Additional modified files (insufficient token budget to process):\n" -OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD = 1000 -OUTPUT_BUFFER_TOKENS_HARD_THRESHOLD = 600 -PATCH_EXTRA_LINES = 3 +ADDED_FILES_ = "Additional added files (insufficient token budget to process):\n" -def get_pr_diff(git_provider: GitProvider, token_handler: TokenHandler, model: str, - add_line_numbers_to_hunks: bool = False, disable_extra_lines: bool = False) -> str: - """ - Returns a string with the diff of the pull request, applying diff minimization techniques if needed. +OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD = 1500 +OUTPUT_BUFFER_TOKENS_HARD_THRESHOLD = 1000 +MAX_EXTRA_LINES = 10 - Args: - git_provider (GitProvider): An object of the GitProvider class representing the Git provider used for the pull - request. - token_handler (TokenHandler): An object of the TokenHandler class used for handling tokens in the context of the - pull request. - model (str): The name of the model used for tokenization. - add_line_numbers_to_hunks (bool, optional): A boolean indicating whether to add line numbers to the hunks in the - diff. Defaults to False. - disable_extra_lines (bool, optional): A boolean indicating whether to disable the extension of each patch with - extra lines of context. Defaults to False. - Returns: - str: A string with the diff of the pull request, applying diff minimization techniques if needed. 
- """ +def cap_and_log_extra_lines(value, direction) -> int: + if value > MAX_EXTRA_LINES: + get_logger().warning(f"patch_extra_lines_{direction} was {value}, capping to {MAX_EXTRA_LINES}") + return MAX_EXTRA_LINES + return value + +def get_pr_diff(git_provider: GitProvider, token_handler: TokenHandler, + model: str, + add_line_numbers_to_hunks: bool = False, + disable_extra_lines: bool = False, + large_pr_handling=False, + return_remaining_files=False): if disable_extra_lines: - global PATCH_EXTRA_LINES - PATCH_EXTRA_LINES = 0 + PATCH_EXTRA_LINES_BEFORE = 0 + PATCH_EXTRA_LINES_AFTER = 0 + else: + PATCH_EXTRA_LINES_BEFORE = get_settings().config.patch_extra_lines_before + PATCH_EXTRA_LINES_AFTER = get_settings().config.patch_extra_lines_after + PATCH_EXTRA_LINES_BEFORE = cap_and_log_extra_lines(PATCH_EXTRA_LINES_BEFORE, "before") + PATCH_EXTRA_LINES_AFTER = cap_and_log_extra_lines(PATCH_EXTRA_LINES_AFTER, "after") try: - diff_files = git_provider.get_diff_files() + diff_files_original = git_provider.get_diff_files() except RateLimitExceededException as e: - logging.error(f"Rate limit exceeded for git provider API. original message {e}") + get_logger().error(f"Rate limit exceeded for git provider API. original message {e}") raise + diff_files = filter_ignored(diff_files_original) + if diff_files != diff_files_original: + try: + get_logger().info(f"Filtered out {len(diff_files_original) - len(diff_files)} files") + new_names = set([a.filename for a in diff_files]) + orig_names = set([a.filename for a in diff_files_original]) + get_logger().info(f"Filtered out files: {orig_names - new_names}") + except Exception as e: + pass + + # get pr languages pr_languages = sort_files_by_main_languages(git_provider.get_languages(), diff_files) + if pr_languages: + try: + get_logger().info(f"PR main language: {pr_languages[0]['language']}") + except Exception as e: + pass # generate a standard diff string, with patch extension - patches_extended, total_tokens = pr_generate_extended_diff(pr_languages, token_handler, - add_line_numbers_to_hunks) + patches_extended, total_tokens, patches_extended_tokens = pr_generate_extended_diff( + pr_languages, token_handler, add_line_numbers_to_hunks, + patch_extra_lines_before=PATCH_EXTRA_LINES_BEFORE, patch_extra_lines_after=PATCH_EXTRA_LINES_AFTER) # if we are under the limit, return the full diff - if total_tokens + OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD < MAX_TOKENS[model]: + if total_tokens + OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD < get_max_tokens(model): + get_logger().info(f"Tokens: {total_tokens}, total tokens under limit: {get_max_tokens(model)}, " + f"returning full diff.") return "\n".join(patches_extended) - # if we are over the limit, start pruning - patches_compressed, modified_file_names, deleted_file_names = \ - pr_generate_compressed_diff(pr_languages, token_handler, model, add_line_numbers_to_hunks) + # if we are over the limit, start pruning (If we got here, we will not extend the patches with extra lines) + get_logger().info(f"Tokens: {total_tokens}, total tokens over limit: {get_max_tokens(model)}, " + f"pruning diff.") + patches_compressed_list, total_tokens_list, deleted_files_list, remaining_files_list, file_dict, files_in_patches_list = \ + pr_generate_compressed_diff(pr_languages, token_handler, model, add_line_numbers_to_hunks, large_pr_handling) + + if large_pr_handling and len(patches_compressed_list) > 1: + get_logger().info(f"Large PR handling mode, and found {len(patches_compressed_list)} patches with original diff.") + return "" # return empty 
string, as we want to generate multiple patches with a different prompt + + # return the first patch + patches_compressed = patches_compressed_list[0] + total_tokens_new = total_tokens_list[0] + files_in_patch = files_in_patches_list[0] + # Insert additional information about added, modified, and deleted files if there is enough space + max_tokens = get_max_tokens(model) - OUTPUT_BUFFER_TOKENS_HARD_THRESHOLD + curr_token = total_tokens_new # == token_handler.count_tokens(final_diff)+token_handler.prompt_tokens final_diff = "\n".join(patches_compressed) - if modified_file_names: - modified_list_str = MORE_MODIFIED_FILES_ + "\n".join(modified_file_names) + delta_tokens = 10 + added_list_str = modified_list_str = deleted_list_str = "" + unprocessed_files = [] + # generate the added, modified, and deleted files lists + if (max_tokens - curr_token) > delta_tokens: + for filename, file_values in file_dict.items(): + if filename in files_in_patch: + continue + if file_values['edit_type'] == EDIT_TYPE.ADDED: + unprocessed_files.append(filename) + if not added_list_str: + added_list_str = ADDED_FILES_ + f"\n{filename}" + else: + added_list_str = added_list_str + f"\n{filename}" + elif file_values['edit_type'] in [EDIT_TYPE.MODIFIED, EDIT_TYPE.RENAMED]: + unprocessed_files.append(filename) + if not modified_list_str: + modified_list_str = MORE_MODIFIED_FILES_ + f"\n{filename}" + else: + modified_list_str = modified_list_str + f"\n{filename}" + elif file_values['edit_type'] == EDIT_TYPE.DELETED: + # unprocessed_files.append(filename) # not needed here, because the file was deleted, so no need to process it + if not deleted_list_str: + deleted_list_str = DELETED_FILES_ + f"\n{filename}" + else: + deleted_list_str = deleted_list_str + f"\n{filename}" + + # prune the added, modified, and deleted files lists, and add them to the final diff + added_list_str = clip_tokens(added_list_str, max_tokens - curr_token) + if added_list_str: + final_diff = final_diff + "\n\n" + added_list_str + curr_token += token_handler.count_tokens(added_list_str) + 2 + modified_list_str = clip_tokens(modified_list_str, max_tokens - curr_token) + if modified_list_str: final_diff = final_diff + "\n\n" + modified_list_str - if deleted_file_names: - deleted_list_str = DELETED_FILES_ + "\n".join(deleted_file_names) + curr_token += token_handler.count_tokens(modified_list_str) + 2 + deleted_list_str = clip_tokens(deleted_list_str, max_tokens - curr_token) + if deleted_list_str: final_diff = final_diff + "\n\n" + deleted_list_str - return final_diff + get_logger().debug(f"After pruning, added_list_str: {added_list_str}, modified_list_str: {modified_list_str}, " + f"deleted_list_str: {deleted_list_str}") + if not return_remaining_files: + return final_diff + else: + return final_diff, remaining_files_list -def pr_generate_extended_diff(pr_languages: list, token_handler: TokenHandler, - add_line_numbers_to_hunks: bool) -> \ - Tuple[list, int]: - """ - Generate a standard diff string with patch extension, while counting the number of tokens used and applying diff - minimization techniques if needed. - Args: - - pr_languages: A list of dictionaries representing the languages used in the pull request and their corresponding - files. - - token_handler: An object of the TokenHandler class used for handling tokens in the context of the pull request. - - add_line_numbers_to_hunks: A boolean indicating whether to add line numbers to the hunks in the diff. 
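# --- Illustrative aside (editor's sketch, not part of the upstream diff) ----
# get_pr_diff above first builds the extended diff and only falls back to the
# compressed path when the token count leaves less than the soft output buffer
# free. A hedged sketch of that decision with stand-in numbers (the real limit
# comes from get_max_tokens(model) and the TokenHandler count):
OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD = 1500  # same constant as in the diff

def diff_strategy(total_tokens: int, max_model_tokens: int) -> str:
    # keep a soft buffer free for the model's own output
    if total_tokens + OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD < max_model_tokens:
        return "full_extended_diff"
    return "compressed_diff"

print(diff_strategy(total_tokens=2300, max_model_tokens=8192))  # full_extended_diff
print(diff_strategy(total_tokens=7500, max_model_tokens=8192))  # compressed_diff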
+def get_pr_diff_multiple_patchs(git_provider: GitProvider, token_handler: TokenHandler, model: str, + add_line_numbers_to_hunks: bool = False, disable_extra_lines: bool = False): + try: + diff_files_original = git_provider.get_diff_files() + except RateLimitExceededException as e: + get_logger().error(f"Rate limit exceeded for git provider API. original message {e}") + raise - Returns: - - patches_extended: A list of extended patches for each file in the pull request. - - total_tokens: The total number of tokens used in the extended patches. - """ + diff_files = filter_ignored(diff_files_original) + if diff_files != diff_files_original: + try: + get_logger().info(f"Filtered out {len(diff_files_original) - len(diff_files)} files") + new_names = set([a.filename for a in diff_files]) + orig_names = set([a.filename for a in diff_files_original]) + get_logger().info(f"Filtered out files: {orig_names - new_names}") + except Exception as e: + pass + + # get pr languages + pr_languages = sort_files_by_main_languages(git_provider.get_languages(), diff_files) + if pr_languages: + try: + get_logger().info(f"PR main language: {pr_languages[0]['language']}") + except Exception as e: + pass + + patches_compressed_list, total_tokens_list, deleted_files_list, remaining_files_list, file_dict, files_in_patches_list = \ + pr_generate_compressed_diff(pr_languages, token_handler, model, add_line_numbers_to_hunks, large_pr_handling=True) + + return patches_compressed_list, total_tokens_list, deleted_files_list, remaining_files_list, file_dict, files_in_patches_list + + +def pr_generate_extended_diff(pr_languages: list, + token_handler: TokenHandler, + add_line_numbers_to_hunks: bool, + patch_extra_lines_before: int = 0, + patch_extra_lines_after: int = 0) -> Tuple[list, int, list]: total_tokens = token_handler.prompt_tokens # initial tokens patches_extended = [] + patches_extended_tokens = [] for lang in pr_languages: for file in lang['files']: original_file_content_str = file.base_file @@ -105,55 +199,41 @@ def pr_generate_extended_diff(pr_languages: list, token_handler: TokenHandler, continue # extend each patch with extra lines of context - extended_patch = extend_patch(original_file_content_str, patch, num_lines=PATCH_EXTRA_LINES) - full_extended_patch = f"## {file.filename}\n\n{extended_patch}\n" + extended_patch = extend_patch(original_file_content_str, patch, + patch_extra_lines_before, patch_extra_lines_after, file.filename) + if not extended_patch: + get_logger().warning(f"Failed to extend patch for file: {file.filename}") + continue + full_extended_patch = f"\n\n## {file.filename}\n{extended_patch.rstrip()}\n" if add_line_numbers_to_hunks: full_extended_patch = convert_to_hunks_with_lines_numbers(extended_patch, file) + # add AI-summary metadata to the patch + if file.ai_file_summary and get_settings().get("config.enable_ai_metadata", False): + full_extended_patch = add_ai_summary_top_patch(file, full_extended_patch) + patch_tokens = token_handler.count_tokens(full_extended_patch) file.tokens = patch_tokens total_tokens += patch_tokens + patches_extended_tokens.append(patch_tokens) patches_extended.append(full_extended_patch) - return patches_extended, total_tokens + return patches_extended, total_tokens, patches_extended_tokens def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler, model: str, - convert_hunks_to_line_numbers: bool) -> Tuple[list, list, list]: - """ - Generate a compressed diff string for a pull request, using diff minimization techniques to reduce the number 
of - tokens used. - Args: - top_langs (list): A list of dictionaries representing the languages used in the pull request and their - corresponding files. - token_handler (TokenHandler): An object of the TokenHandler class used for handling tokens in the context of the - pull request. - model (str): The model used for tokenization. - convert_hunks_to_line_numbers (bool): A boolean indicating whether to convert hunks to line numbers in the diff. - Returns: - Tuple[list, list, list]: A tuple containing the following lists: - - patches: A list of compressed diff patches for each file in the pull request. - - modified_files_list: A list of file names that were skipped due to large patch size. - - deleted_files_list: A list of file names that were deleted in the pull request. - - Minimization techniques to reduce the number of tokens: - 0. Start from the largest diff patch to smaller ones - 1. Don't use extend context lines around diff - 2. Minimize deleted files - 3. Minimize deleted hunks - 4. Minimize all remaining files when you reach token limit - """ - - patches = [] - modified_files_list = [] + convert_hunks_to_line_numbers: bool, + large_pr_handling: bool) -> Tuple[list, list, list, list, dict, list]: deleted_files_list = [] + # sort each one of the languages in top_langs by the number of tokens in the diff sorted_files = [] for lang in top_langs: sorted_files.extend(sorted(lang['files'], key=lambda x: x.tokens, reverse=True)) - total_tokens = token_handler.prompt_tokens + # generate patches for each file, and count tokens + file_dict = {} for file in sorted_files: original_file_content_str = file.base_file new_file_content_str = file.head_file @@ -163,151 +243,306 @@ def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler, mo # removing delete-only hunks patch = handle_patch_deletions(patch, original_file_content_str, - new_file_content_str, file.filename) + new_file_content_str, file.filename, file.edit_type) if patch is None: - if not deleted_files_list: - total_tokens += token_handler.count_tokens(DELETED_FILES_) - deleted_files_list.append(file.filename) - total_tokens += token_handler.count_tokens(file.filename) + 1 + if file.filename not in deleted_files_list: + deleted_files_list.append(file.filename) continue if convert_hunks_to_line_numbers: patch = convert_to_hunks_with_lines_numbers(patch, file) + ## add AI-summary metadata to the patch (disabled, since we are in the compressed diff) + # if file.ai_file_summary and get_settings().config.get('config.is_auto_command', False): + # patch = add_ai_summary_top_patch(file, patch) + new_patch_tokens = token_handler.count_tokens(patch) + file_dict[file.filename] = {'patch': patch, 'tokens': new_patch_tokens, 'edit_type': file.edit_type} + + max_tokens_model = get_max_tokens(model) + + # first iteration + files_in_patches_list = [] + remaining_files_list = [file.filename for file in sorted_files] + patches_list =[] + total_tokens_list = [] + total_tokens, patches, remaining_files_list, files_in_patch_list = generate_full_patch(convert_hunks_to_line_numbers, file_dict, + max_tokens_model, remaining_files_list, token_handler) + patches_list.append(patches) + total_tokens_list.append(total_tokens) + files_in_patches_list.append(files_in_patch_list) + + # additional iterations (if needed) + if large_pr_handling: + NUMBER_OF_ALLOWED_ITERATIONS = get_settings().pr_description.max_ai_calls - 1 # one more call is to summarize + for i in range(NUMBER_OF_ALLOWED_ITERATIONS-1): + if remaining_files_list: + total_tokens, 
patches, remaining_files_list, files_in_patch_list = generate_full_patch(convert_hunks_to_line_numbers, + file_dict, + max_tokens_model, + remaining_files_list, token_handler) + if patches: + patches_list.append(patches) + total_tokens_list.append(total_tokens) + files_in_patches_list.append(files_in_patch_list) + else: + break + + return patches_list, total_tokens_list, deleted_files_list, remaining_files_list, file_dict, files_in_patches_list + + +def generate_full_patch(convert_hunks_to_line_numbers, file_dict, max_tokens_model,remaining_files_list_prev, token_handler): + total_tokens = token_handler.prompt_tokens # initial tokens + patches = [] + remaining_files_list_new = [] + files_in_patch_list = [] + for filename, data in file_dict.items(): + if filename not in remaining_files_list_prev: + continue + + patch = data['patch'] + new_patch_tokens = data['tokens'] + edit_type = data['edit_type'] # Hard Stop, no more tokens - if total_tokens > MAX_TOKENS[model] - OUTPUT_BUFFER_TOKENS_HARD_THRESHOLD: - logging.warning(f"File was fully skipped, no more tokens: {file.filename}.") + if total_tokens > max_tokens_model - OUTPUT_BUFFER_TOKENS_HARD_THRESHOLD: + get_logger().warning(f"File was fully skipped, no more tokens: {filename}.") continue # If the patch is too large, just show the file name - if total_tokens + new_patch_tokens > MAX_TOKENS[model] - OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD: + if total_tokens + new_patch_tokens > max_tokens_model - OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD: # Current logic is to skip the patch if it's too large # TODO: Option for alternative logic to remove hunks from the patch to reduce the number of tokens # until we meet the requirements if get_settings().config.verbosity_level >= 2: - logging.warning(f"Patch too large, minimizing it, {file.filename}") - if not modified_files_list: - total_tokens += token_handler.count_tokens(MORE_MODIFIED_FILES_) - modified_files_list.append(file.filename) - total_tokens += token_handler.count_tokens(file.filename) + 1 + get_logger().warning(f"Patch too large, skipping it, {filename}") + remaining_files_list_new.append(filename) continue if patch: if not convert_hunks_to_line_numbers: - patch_final = f"## {file.filename}\n\n{patch}\n" + patch_final = f"\n\n## File: '{filename.strip()}\n\n{patch.strip()}\n'" else: - patch_final = patch + patch_final = "\n\n" + patch.strip() patches.append(patch_final) total_tokens += token_handler.count_tokens(patch_final) + files_in_patch_list.append(filename) if get_settings().config.verbosity_level >= 2: - logging.info(f"Tokens: {total_tokens}, last filename: {file.filename}") + get_logger().info(f"Tokens: {total_tokens}, last filename: {filename}") + return total_tokens, patches, remaining_files_list_new, files_in_patch_list - return patches, modified_files_list, deleted_files_list - -async def retry_with_fallback_models(f: Callable): - model = get_settings().config.model - fallback_models = get_settings().config.fallback_models - if not isinstance(fallback_models, list): - fallback_models = [fallback_models] - all_models = [model] + fallback_models - for i, model in enumerate(all_models): +async def retry_with_fallback_models(f: Callable, model_type: ModelType = ModelType.REGULAR): + all_models = _get_all_models(model_type) + all_deployments = _get_all_deployments(all_models) + # try each (model, deployment_id) pair until one is successful, otherwise raise exception + for i, (model, deployment_id) in enumerate(zip(all_models, all_deployments)): try: + get_logger().debug( + f"Generating 
prediction with {model}" + f"{(' from deployment ' + deployment_id) if deployment_id else ''}" + ) + get_settings().set("openai.deployment_id", deployment_id) return await f(model) - except Exception as e: - logging.warning(f"Failed to generate prediction with {model}: {traceback.format_exc()}") + except: + get_logger().warning( + f"Failed to generate prediction with {model}" + ) if i == len(all_models) - 1: # If it's the last iteration - raise # Re-raise the last exception + raise Exception(f"Failed to generate prediction with any model of {all_models}") -def find_line_number_of_relevant_line_in_file(diff_files: List[FilePatchInfo], - relevant_file: str, - relevant_line_in_file: str) -> Tuple[int, int]: +def _get_all_models(model_type: ModelType = ModelType.REGULAR) -> List[str]: + if model_type == ModelType.TURBO: + model = get_settings().config.model_turbo + else: + model = get_settings().config.model + fallback_models = get_settings().config.fallback_models + if not isinstance(fallback_models, list): + fallback_models = [m.strip() for m in fallback_models.split(",")] + all_models = [model] + fallback_models + return all_models + + +def _get_all_deployments(all_models: List[str]) -> List[str]: + deployment_id = get_settings().get("openai.deployment_id", None) + fallback_deployments = get_settings().get("openai.fallback_deployments", []) + if not isinstance(fallback_deployments, list) and fallback_deployments: + fallback_deployments = [d.strip() for d in fallback_deployments.split(",")] + if fallback_deployments: + all_deployments = [deployment_id] + fallback_deployments + if len(all_deployments) < len(all_models): + raise ValueError(f"The number of deployments ({len(all_deployments)}) " + f"is less than the number of models ({len(all_models)})") + else: + all_deployments = [deployment_id] * len(all_models) + return all_deployments + + +def get_pr_multi_diffs(git_provider: GitProvider, + token_handler: TokenHandler, + model: str, + max_calls: int = 5) -> List[str]: """ - Find the line number and absolute position of a relevant line in a file. + Retrieves the diff files from a Git provider, sorts them by main language, and generates patches for each file. + The patches are split into multiple groups based on the maximum number of tokens allowed for the given model. Args: - diff_files (List[FilePatchInfo]): A list of FilePatchInfo objects representing the patches of files. - relevant_file (str): The name of the file where the relevant line is located. - relevant_line_in_file (str): The content of the relevant line. + git_provider (GitProvider): An object that provides access to Git provider APIs. + token_handler (TokenHandler): An object that handles tokens in the context of a pull request. + model (str): The name of the model. + max_calls (int, optional): The maximum number of calls to retrieve diff files. Defaults to 5. Returns: - Tuple[int, int]: A tuple containing the line number and absolute position of the relevant line in the file. - """ - position = -1 - absolute_position = -1 - re_hunk_header = re.compile( - r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@[ ]?(.*)") + List[str]: A list of final diff strings, split into multiple groups based on the maximum number of tokens allowed for the given model. 
- for file in diff_files: - if file.filename.strip() == relevant_file: - patch = file.patch - patch_lines = patch.splitlines() - - # try to find the line in the patch using difflib, with some margin of error - matches_difflib: list[str | Any] = difflib.get_close_matches(relevant_line_in_file, - patch_lines, n=3, cutoff=0.93) - if len(matches_difflib) == 1 and matches_difflib[0].startswith('+'): - relevant_line_in_file = matches_difflib[0] - - delta = 0 - start1, size1, start2, size2 = 0, 0, 0, 0 - for i, line in enumerate(patch_lines): - if line.startswith('@@'): - delta = 0 - match = re_hunk_header.match(line) - start1, size1, start2, size2 = map(int, match.groups()[:4]) - elif not line.startswith('-'): - delta += 1 - - if relevant_line_in_file in line and line[0] != '-': - position = i - absolute_position = start2 + delta - 1 - break - - if position == -1 and relevant_line_in_file[0] == '+': - no_plus_line = relevant_line_in_file[1:].lstrip() - for i, line in enumerate(patch_lines): - if line.startswith('@@'): - delta = 0 - match = re_hunk_header.match(line) - start1, size1, start2, size2 = map(int, match.groups()[:4]) - elif not line.startswith('-'): - delta += 1 - - if no_plus_line in line and line[0] != '-': - # The model might add a '+' to the beginning of the relevant_line_in_file even if originally - # it's a context line - position = i - absolute_position = start2 + delta - 1 - break - return position, absolute_position - - -def clip_tokens(text: str, max_tokens: int) -> str: + Raises: + RateLimitExceededException: If the rate limit for the Git provider API is exceeded. """ - Clip the number of tokens in a string to a maximum number of tokens. + try: + diff_files = git_provider.get_diff_files() + except RateLimitExceededException as e: + get_logger().error(f"Rate limit exceeded for git provider API. original message {e}") + raise - Args: - text (str): The string to clip. - max_tokens (int): The maximum number of tokens allowed in the string. + diff_files = filter_ignored(diff_files) - Returns: - str: The clipped string. 
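# --- Illustrative aside (editor's sketch, not part of the upstream diff) ----
# generate_full_patch / get_pr_multi_diffs above split a large PR greedily:
# per-file patches are taken largest-first and packed into successive groups
# until each group approaches the model budget. A simplified, hedged sketch
# (pack_patches and budget are illustrative names, not the real API):
from typing import Dict, List

def pack_patches(patch_tokens: Dict[str, int], budget: int, max_groups: int) -> List[List[str]]:
    groups: List[List[str]] = []
    current: List[str] = []
    used = 0
    # largest patches first, mirroring the sort by file.tokens in the diff
    for name, tokens in sorted(patch_tokens.items(), key=lambda kv: kv[1], reverse=True):
        if tokens > budget:
            continue  # an oversized single patch is skipped ("skip" policy)
        if used + tokens > budget:
            groups.append(current)
            current, used = [], 0
            if len(groups) >= max_groups:
                break
        current.append(name)
        used += tokens
    if current and len(groups) < max_groups:
        groups.append(current)
    return groups

print(pack_patches({"a.py": 900, "b.py": 700, "c.py": 400, "d.py": 300}, budget=1200, max_groups=3))
# -> [['a.py'], ['b.py', 'c.py'], ['d.py']]  (greedy, not necessarily optimal)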
+ # Sort files by main language + pr_languages = sort_files_by_main_languages(git_provider.get_languages(), diff_files) + + # Sort files within each language group by tokens in descending order + sorted_files = [] + for lang in pr_languages: + sorted_files.extend(sorted(lang['files'], key=lambda x: x.tokens, reverse=True)) + + # Get the maximum number of extra lines before and after the patch + PATCH_EXTRA_LINES_BEFORE = get_settings().config.patch_extra_lines_before + PATCH_EXTRA_LINES_AFTER = get_settings().config.patch_extra_lines_after + PATCH_EXTRA_LINES_BEFORE = cap_and_log_extra_lines(PATCH_EXTRA_LINES_BEFORE, "before") + PATCH_EXTRA_LINES_AFTER = cap_and_log_extra_lines(PATCH_EXTRA_LINES_AFTER, "after") + + # try first a single run with standard diff string, with patch extension, and no deletions + patches_extended, total_tokens, patches_extended_tokens = pr_generate_extended_diff( + pr_languages, token_handler, add_line_numbers_to_hunks=True, + patch_extra_lines_before=PATCH_EXTRA_LINES_BEFORE, + patch_extra_lines_after=PATCH_EXTRA_LINES_AFTER) + + # if we are under the limit, return the full diff + if total_tokens + OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD < get_max_tokens(model): + return ["\n".join(patches_extended)] if patches_extended else [] + + patches = [] + final_diff_list = [] + total_tokens = token_handler.prompt_tokens + call_number = 1 + for file in sorted_files: + if call_number > max_calls: + if get_settings().config.verbosity_level >= 2: + get_logger().info(f"Reached max calls ({max_calls})") + break + + original_file_content_str = file.base_file + new_file_content_str = file.head_file + patch = file.patch + if not patch: + continue + + # Remove delete-only hunks + patch = handle_patch_deletions(patch, original_file_content_str, new_file_content_str, file.filename, file.edit_type) + if patch is None: + continue + + patch = convert_to_hunks_with_lines_numbers(patch, file) + # add AI-summary metadata to the patch + if file.ai_file_summary and get_settings().get("config.enable_ai_metadata", False): + patch = add_ai_summary_top_patch(file, patch) + new_patch_tokens = token_handler.count_tokens(patch) + + if patch and (token_handler.prompt_tokens + new_patch_tokens) > get_max_tokens( + model) - OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD: + if get_settings().config.get('large_patch_policy', 'skip') == 'skip': + get_logger().warning(f"Patch too large, skipping: {file.filename}") + continue + elif get_settings().config.get('large_patch_policy') == 'clip': + delta_tokens = get_max_tokens(model) - OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD - token_handler.prompt_tokens + patch_clipped = clip_tokens(patch, delta_tokens, delete_last_line=True, num_input_tokens=new_patch_tokens) + new_patch_tokens = token_handler.count_tokens(patch_clipped) + if patch_clipped and (token_handler.prompt_tokens + new_patch_tokens) > get_max_tokens( + model) - OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD: + get_logger().warning(f"Patch too large, skipping: {file.filename}") + continue + else: + get_logger().info(f"Clipped large patch for file: {file.filename}") + patch = patch_clipped + else: + get_logger().warning(f"Patch too large, skipping: {file.filename}") + continue + + if patch and (total_tokens + new_patch_tokens > get_max_tokens(model) - OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD): + final_diff = "\n".join(patches) + final_diff_list.append(final_diff) + patches = [] + total_tokens = token_handler.prompt_tokens + call_number += 1 + if call_number > max_calls: # avoid creating new patches + if 
get_settings().config.verbosity_level >= 2: + get_logger().info(f"Reached max calls ({max_calls})") + break + if get_settings().config.verbosity_level >= 2: + get_logger().info(f"Call number: {call_number}") + + if patch: + patches.append(patch) + total_tokens += new_patch_tokens + if get_settings().config.verbosity_level >= 2: + get_logger().info(f"Tokens: {total_tokens}, last filename: {file.filename}") + + # Add the last chunk + if patches: + final_diff = "\n".join(patches) + final_diff_list.append(final_diff) + + return final_diff_list + + +def add_ai_metadata_to_diff_files(git_provider, pr_description_files): """ - # We'll estimate the number of tokens by hueristically assuming 2.5 tokens per word + Adds AI metadata to the diff files based on the PR description files (FilePatchInfo.ai_file_summary). + """ + try: + if not pr_description_files: + get_logger().warning(f"PR description files are empty.") + return + available_files = {pr_file['full_file_name'].strip(): pr_file for pr_file in pr_description_files} + diff_files = git_provider.get_diff_files() + found_any_match = False + for file in diff_files: + filename = file.filename.strip() + if filename in available_files: + file.ai_file_summary = available_files[filename] + found_any_match = True + if not found_any_match: + get_logger().error(f"Failed to find any matching files between PR description and diff files.", + artifact={"pr_description_files": pr_description_files}) + except Exception as e: + get_logger().error(f"Failed to add AI metadata to diff files: {e}", + artifact={"traceback": traceback.format_exc()}) + + +def add_ai_summary_top_patch(file, full_extended_patch): try: - encoder = get_token_encoder() - num_input_tokens = len(encoder.encode(text)) - if num_input_tokens <= max_tokens: - return text - num_chars = len(text) - chars_per_token = num_chars / num_input_tokens - num_output_chars = int(chars_per_token * max_tokens) - clipped_text = text[:num_output_chars] - return clipped_text + # below every instance of '## File: ...' in the patch, add the ai-summary metadata + full_extended_patch_lines = full_extended_patch.split("\n") + for i, line in enumerate(full_extended_patch_lines): + if line.startswith("## File:") or line.startswith("## file:"): + full_extended_patch_lines.insert(i + 1, + f"### AI-generated changes summary:\n{file.ai_file_summary['long_summary']}") + full_extended_patch = "\n".join(full_extended_patch_lines) + return full_extended_patch + + # if no '## File: ...' 
was found + return full_extended_patch except Exception as e: - logging.warning(f"Failed to clip tokens: {e}") - return text \ No newline at end of file + get_logger().error(f"Failed to add AI summary to the top of the patch: {e}", + artifact={"traceback": traceback.format_exc()}) + return full_extended_patch diff --git a/pr_agent/algo/token_handler.py b/pr_agent/algo/token_handler.py index f018a92b0..935e6da5f 100644 --- a/pr_agent/algo/token_handler.py +++ b/pr_agent/algo/token_handler.py @@ -1,12 +1,27 @@ from jinja2 import Environment, StrictUndefined from tiktoken import encoding_for_model, get_encoding - from pr_agent.config_loader import get_settings +from threading import Lock + +from pr_agent.log import get_logger + + +class TokenEncoder: + _encoder_instance = None + _model = None + _lock = Lock() # Create a lock object + @classmethod + def get_token_encoder(cls): + model = get_settings().config.model + if cls._encoder_instance is None or model != cls._model: # Check without acquiring the lock for performance + with cls._lock: # Lock acquisition to ensure thread safety + if cls._encoder_instance is None or model != cls._model: + cls._model = model + cls._encoder_instance = encoding_for_model(cls._model) if "gpt" in cls._model else get_encoding( + "cl100k_base") + return cls._encoder_instance -def get_token_encoder(): - return encoding_for_model(get_settings().config.model) if "gpt" in get_settings().config.model else get_encoding( - "cl100k_base") class TokenHandler: """ @@ -21,7 +36,7 @@ class TokenHandler: method. """ - def __init__(self, pr, vars: dict, system, user): + def __init__(self, pr=None, vars: dict = {}, system="", user=""): """ Initializes the TokenHandler object. @@ -31,8 +46,9 @@ def __init__(self, pr, vars: dict, system, user): - system: The system string. - user: The user string. """ - self.encoder = get_token_encoder() - self.prompt_tokens = self._get_system_user_tokens(pr, self.encoder, vars, system, user) + self.encoder = TokenEncoder.get_token_encoder() + if pr is not None: + self.prompt_tokens = self._get_system_user_tokens(pr, self.encoder, vars, system, user) def _get_system_user_tokens(self, pr, encoder, vars: dict, system, user): """ @@ -48,12 +64,16 @@ def _get_system_user_tokens(self, pr, encoder, vars: dict, system, user): Returns: The sum of the number of tokens in the system and user strings. 
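# --- Illustrative aside (editor's sketch, not part of the upstream diff) ----
# The TokenEncoder class above caches one encoder per configured model and
# guards creation with double-checked locking, so the check stays cheap on the
# hot path while concurrent callers cannot build the encoder twice. A generic
# sketch of the same pattern (the build callable stands in for tiktoken):
from threading import Lock

class CachedResource:
    _instance = None
    _key = None
    _lock = Lock()

    @classmethod
    def get(cls, key, build):
        if cls._instance is None or key != cls._key:          # fast check, no lock
            with cls._lock:                                    # serialize builders
                if cls._instance is None or key != cls._key:   # re-check under the lock
                    cls._key = key
                    cls._instance = build(key)
        return cls._instance

print(CachedResource.get("cl100k_base", build=lambda name: f"<encoder:{name}>"))
# -> <encoder:cl100k_base>  (later calls with the same key reuse the cached object)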
""" - environment = Environment(undefined=StrictUndefined) - system_prompt = environment.from_string(system).render(vars) - user_prompt = environment.from_string(user).render(vars) - system_prompt_tokens = len(encoder.encode(system_prompt)) - user_prompt_tokens = len(encoder.encode(user_prompt)) - return system_prompt_tokens + user_prompt_tokens + try: + environment = Environment(undefined=StrictUndefined) + system_prompt = environment.from_string(system).render(vars) + user_prompt = environment.from_string(user).render(vars) + system_prompt_tokens = len(encoder.encode(system_prompt)) + user_prompt_tokens = len(encoder.encode(user_prompt)) + return system_prompt_tokens + user_prompt_tokens + except Exception as e: + get_logger().error(f"Error in _get_system_user_tokens: {e}") + return 0 def count_tokens(self, patch: str) -> int: """ diff --git a/pr_agent/algo/types.py b/pr_agent/algo/types.py new file mode 100644 index 000000000..bf2fc1afc --- /dev/null +++ b/pr_agent/algo/types.py @@ -0,0 +1,24 @@ +from dataclasses import dataclass +from enum import Enum + + +class EDIT_TYPE(Enum): + ADDED = 1 + DELETED = 2 + MODIFIED = 3 + RENAMED = 4 + UNKNOWN = 5 + + +@dataclass +class FilePatchInfo: + base_file: str + head_file: str + patch: str + filename: str + tokens: int = -1 + edit_type: EDIT_TYPE = EDIT_TYPE.UNKNOWN + old_filename: str = None + num_plus_lines: int = -1 + num_minus_lines: int = -1 + ai_file_summary: str = None diff --git a/pr_agent/algo/utils.py b/pr_agent/algo/utils.py index 725d75ec5..08401514a 100644 --- a/pr_agent/algo/utils.py +++ b/pr_agent/algo/utils.py @@ -1,16 +1,42 @@ from __future__ import annotations +import html2text +import html +import copy import difflib import json -import logging +import os import re import textwrap +import time from datetime import datetime -from typing import Any, List +from enum import Enum +from typing import Any, List, Tuple import yaml +from pydantic import BaseModel from starlette_context import context + +from pr_agent.algo import MAX_TOKENS +from pr_agent.algo.token_handler import TokenEncoder from pr_agent.config_loader import get_settings, global_settings +from pr_agent.algo.types import FilePatchInfo +from pr_agent.log import get_logger + +class Range(BaseModel): + line_start: int # should be 0-indexed + line_end: int + column_start: int = -1 + column_end: int = -1 + +class ModelType(str, Enum): + REGULAR = "regular" + TURBO = "turbo" + + +class PRReviewHeader(str, Enum): + REGULAR = "## PR Reviewer Guide" + INCREMENTAL = "## Incremental PR Reviewer Guide" def get_setting(key: str) -> Any: @@ -20,76 +46,326 @@ def get_setting(key: str) -> Any: except Exception: return global_settings.get(key, None) -def convert_to_markdown(output_data: dict) -> str: + +def emphasize_header(text: str, only_markdown=False, reference_link=None) -> str: + try: + # Finding the position of the first occurrence of ": " + colon_position = text.find(": ") + + # Splitting the string and wrapping the first part in tags + if colon_position != -1: + # Everything before the colon (inclusive) is wrapped in tags + if only_markdown: + if reference_link: + transformed_string = f"[**{text[:colon_position + 1]}**]({reference_link})\n" + text[colon_position + 1:] + else: + transformed_string = f"**{text[:colon_position + 1]}**\n" + text[colon_position + 1:] + else: + if reference_link: + transformed_string = f"{text[:colon_position + 1]}
" + text[colon_position + 1:] + else: + transformed_string = "" + text[:colon_position + 1] + "" +'
' + text[colon_position + 1:] + else: + # If there's no ": ", return the original string + transformed_string = text + + return transformed_string + except Exception as e: + get_logger().exception(f"Failed to emphasize header: {e}") + return text + + +def unique_strings(input_list: List[str]) -> List[str]: + if not input_list or not isinstance(input_list, list): + return input_list + seen = set() + unique_list = [] + for item in input_list: + if item not in seen: + unique_list.append(item) + seen.add(item) + return unique_list + +def convert_to_markdown_v2(output_data: dict, + gfm_supported: bool = True, + incremental_review=None, + git_provider=None) -> str: """ Convert a dictionary of data into markdown format. Args: output_data (dict): A dictionary containing data to be converted to markdown format. Returns: str: The markdown formatted text generated from the input dictionary. - """ - markdown_text = "" + """ emojis = { - "Main theme": "๐ŸŽฏ", - "Type of PR": "๐Ÿ“Œ", + "Can be split": "๐Ÿ”€", + "Possible issues": "โšก", + "Key issues to review": "โšก", "Score": "๐Ÿ…", - "Relevant tests added": "๐Ÿงช", - "Unrelated changes": "โš ๏ธ", + "Relevant tests": "๐Ÿงช", "Focused PR": "โœจ", + "Relevant ticket": "๐ŸŽซ", "Security concerns": "๐Ÿ”’", - "General PR suggestions": "๐Ÿ’ก", "Insights from user's answers": "๐Ÿ“", "Code feedback": "๐Ÿค–", + "Estimated effort to review [1-5]": "โฑ๏ธ", } + markdown_text = "" + if not incremental_review: + markdown_text += f"{PRReviewHeader.REGULAR.value} ๐Ÿ”\n\n" + else: + markdown_text += f"{PRReviewHeader.INCREMENTAL.value} ๐Ÿ”\n\n" + markdown_text += f"โฎ๏ธ Review for commits since previous PR-Agent review {incremental_review}.\n\n" + if not output_data or not output_data.get('review', {}): + return "" + + if gfm_supported: + markdown_text += "\n" + + for key, value in output_data['review'].items(): + if value is None or value == '' or value == {} or value == []: + if key.lower() not in ['can_be_split', 'key_issues_to_review']: + continue + key_nice = key.replace('_', ' ').capitalize() + emoji = emojis.get(key_nice, "") + if 'Estimated effort to review' in key_nice: + key_nice = 'Estimated effort to review' + value = str(value).strip() + if value.isnumeric(): + value_int = int(value) + else: + try: + value_int = int(value.split(',')[0]) + except ValueError: + continue + blue_bars = '๐Ÿ”ต' * value_int + white_bars = 'โšช' * (5 - value_int) + value = f"{value_int} {blue_bars}{white_bars}" + if gfm_supported: + markdown_text += f"\n" + else: + markdown_text += f"### {emoji} {key_nice}: {value}\n\n" + elif 'relevant tests' in key_nice.lower(): + value = str(value).strip().lower() + if gfm_supported: + markdown_text += f"\n" + else: + if is_value_no(value): + markdown_text += f'### {emoji} No relevant tests\n\n' + else: + markdown_text += f"### PR contains tests\n\n" + elif 'security concerns' in key_nice.lower(): + if gfm_supported: + markdown_text += f"\n" + else: + if is_value_no(value): + markdown_text += f'### {emoji} No security concerns identified\n\n' + else: + markdown_text += f"### {emoji} Security concerns\n\n" + value = emphasize_header(value.strip(), only_markdown=True) + markdown_text += f"{value}\n\n" + elif 'can be split' in key_nice.lower(): + if gfm_supported: + markdown_text += f"\n" + elif 'key issues to review' in key_nice.lower(): + # value is a list of issues + if is_value_no(value): + if gfm_supported: + markdown_text += f"\n" + else: + markdown_text += f"### {emoji} No key issues to review\n\n" + else: + # issues = 
value.split('\n- ') + issues =value + # for i, _ in enumerate(issues): + # issues[i] = issues[i].strip().strip('-').strip() + if gfm_supported: + markdown_text += f"\n" + else: + if gfm_supported: + markdown_text += f"\n" + else: + markdown_text += f"### {emoji} {key_nice}: {value}\n\n" + + if gfm_supported: + markdown_text += "
" + markdown_text += f"{emoji} {key_nice}: {value}" + markdown_text += f"
" + if is_value_no(value): + markdown_text += f"{emoji} No relevant tests" + else: + markdown_text += f"{emoji} PR contains tests" + markdown_text += f"
" + if is_value_no(value): + markdown_text += f"{emoji} No security concerns identified" + else: + markdown_text += f"{emoji} Security concerns

\n\n" + value = emphasize_header(value.strip()) + markdown_text += f"{value}" + markdown_text += f"
" + markdown_text += process_can_be_split(emoji, value) + markdown_text += f"
" + markdown_text += f"{emoji} No key issues to review" + markdown_text += f"
" + markdown_text += f"{emoji} {key_nice}

\n\n" + else: + markdown_text += f"### {emoji} Key issues to review\n\n#### \n" + for i, issue in enumerate(issues): + try: + if not issue: + continue + relevant_file = issue.get('relevant_file', '').strip() + issue_header = issue.get('issue_header', '').strip() + issue_content = issue.get('issue_content', '').strip() + start_line = int(str(issue.get('start_line', 0)).strip()) + end_line = int(str(issue.get('end_line', 0)).strip()) + reference_link = git_provider.get_line_link(relevant_file, start_line, end_line) + + if gfm_supported: + issue_str = f"{issue_header}
{issue_content}" + else: + issue_str = f"[**{issue_header}**]({reference_link})\n\n{issue_content}\n\n" + markdown_text += f"{issue_str}\n\n" + except Exception as e: + get_logger().exception(f"Failed to process key issues to review: {e}") + if gfm_supported: + markdown_text += f"
" + markdown_text += f"{emoji} {key_nice}: {value}" + markdown_text += f"
\n" + + if 'code_feedback' in output_data: + if gfm_supported: + markdown_text += f"\n\n" + markdown_text += f"
Code feedback:\n\n" + markdown_text += "
" + else: + markdown_text += f"\n\n### Code feedback:\n\n" + for i, value in enumerate(output_data['code_feedback']): + if value is None or value == '' or value == {} or value == []: + continue + markdown_text += parse_code_suggestion(value, i, gfm_supported)+"\n\n" + if markdown_text.endswith('
'): + markdown_text= markdown_text[:-4] + if gfm_supported: + markdown_text += f"
" - for key, value in output_data.items(): - if not value: - continue - if isinstance(value, dict): - markdown_text += f"## {key}\n\n" - markdown_text += convert_to_markdown(value) - elif isinstance(value, list): - if key.lower() == 'code feedback': - markdown_text += "\n" # just looks nicer with additional line breaks - emoji = emojis.get(key, "") - markdown_text += f"- {emoji} **{key}:**\n\n" - for item in value: - if isinstance(item, dict) and key.lower() == 'code feedback': - markdown_text += parse_code_suggestion(item) - elif item: - markdown_text += f" - {item}\n" - elif value != 'n/a': - emoji = emojis.get(key, "") - markdown_text += f"- {emoji} **{key}:** {value}\n" return markdown_text -def parse_code_suggestion(code_suggestions: dict) -> str: +def process_can_be_split(emoji, value): + try: + # key_nice = "Can this PR be split?" + key_nice = "Multiple PR themes" + markdown_text = "" + if not value or isinstance(value, list) and len(value) == 1: + value = "No" + # markdown_text += f" {emoji} {key_nice}\n\n{value}\n\n\n" + # markdown_text += f"### {emoji} No multiple PR themes\n\n" + markdown_text += f"{emoji} No multiple PR themes\n\n" + else: + markdown_text += f"{emoji} {key_nice}

\n\n" + for i, split in enumerate(value): + title = split.get('title', '') + relevant_files = split.get('relevant_files', []) + markdown_text += f"
\nSub-PR theme: {title}\n\n" + markdown_text += f"___\n\nRelevant files:\n\n" + for file in relevant_files: + markdown_text += f"- {file}\n" + markdown_text += f"___\n\n" + markdown_text += f"
\n\n" + + # markdown_text += f"#### Sub-PR theme: {title}\n\n" + # markdown_text += f"Relevant files:\n\n" + # for file in relevant_files: + # markdown_text += f"- {file}\n" + # markdown_text += "\n" + # number_of_splits = len(value) + # markdown_text += f" {emoji} {key_nice}\n" + # for i, split in enumerate(value): + # title = split.get('title', '') + # relevant_files = split.get('relevant_files', []) + # if i == 0: + # markdown_text += f"
\nSub-PR theme:
{title}
\n\n" + # markdown_text += f"
\n" + # markdown_text += f"Relevant files:\n" + # markdown_text += f"
    \n" + # for file in relevant_files: + # markdown_text += f"
  • {file}
  • \n" + # markdown_text += f"
\n\n
\n" + # else: + # markdown_text += f"\n
\nSub-PR theme:
{title}
\n\n" + # markdown_text += f"
\n" + # markdown_text += f"Relevant files:\n" + # markdown_text += f"
    \n" + # for file in relevant_files: + # markdown_text += f"
  • {file}
  • \n" + # markdown_text += f"
\n\n
\n" + except Exception as e: + get_logger().exception(f"Failed to process can be split: {e}") + return "" + return markdown_text + + +def parse_code_suggestion(code_suggestion: dict, i: int = 0, gfm_supported: bool = True) -> str: """ Convert a dictionary of data into markdown format. Args: - code_suggestions (dict): A dictionary containing data to be converted to markdown format. + code_suggestion (dict): A dictionary containing data to be converted to markdown format. Returns: str: A string containing the markdown formatted text generated from the input dictionary. """ markdown_text = "" - for sub_key, sub_value in code_suggestions.items(): - if isinstance(sub_value, dict): # "code example" - markdown_text += f" - **{sub_key}:**\n" - for code_key, code_value in sub_value.items(): # 'before' and 'after' code - code_str = f"```\n{code_value}\n```" - code_str_indented = textwrap.indent(code_str, ' ') - markdown_text += f" - **{code_key}:**\n{code_str_indented}\n" - else: - if "relevant file" in sub_key.lower(): - markdown_text += f"\n - **{sub_key}:** {sub_value}\n" + if gfm_supported and 'relevant_line' in code_suggestion: + markdown_text += '' + for sub_key, sub_value in code_suggestion.items(): + try: + if sub_key.lower() == 'relevant_file': + relevant_file = sub_value.strip('`').strip('"').strip("'") + markdown_text += f"" + # continue + elif sub_key.lower() == 'suggestion': + markdown_text += (f"" + f"") + elif sub_key.lower() == 'relevant_line': + markdown_text += f"" + sub_value_list = sub_value.split('](') + relevant_line = sub_value_list[0].lstrip('`').lstrip('[') + if len(sub_value_list) > 1: + link = sub_value_list[1].rstrip(')').strip('`') + markdown_text += f"" + else: + markdown_text += f"" + markdown_text += "" + except Exception as e: + get_logger().exception(f"Failed to parse code suggestion: {e}") + pass + markdown_text += '
relevant file{relevant_file}
{sub_key}      \n\n\n\n{sub_value.strip()}\n\n\n
relevant line{relevant_line}{relevant_line}
' + markdown_text += "
" + else: + for sub_key, sub_value in code_suggestion.items(): + if isinstance(sub_key, str): + sub_key = sub_key.rstrip() + if isinstance(sub_value,str): + sub_value = sub_value.rstrip() + if isinstance(sub_value, dict): # "code example" + markdown_text += f" - **{sub_key}:**\n" + for code_key, code_value in sub_value.items(): # 'before' and 'after' code + code_str = f"```\n{code_value}\n```" + code_str_indented = textwrap.indent(code_str, ' ') + markdown_text += f" - **{code_key}:**\n{code_str_indented}\n" else: - markdown_text += f" **{sub_key}:** {sub_value}\n" - - markdown_text += "\n" + if "relevant_file" in sub_key.lower(): + markdown_text += f"\n - **{sub_key}:** {sub_value} \n" + else: + markdown_text += f" **{sub_key}:** {sub_value} \n" + if "relevant_line" not in sub_key.lower(): # nicer presentation + # markdown_text = markdown_text.rstrip('\n') + "\\\n" # works for gitlab + markdown_text = markdown_text.rstrip('\n') + " \n" # works for gitlab and bitbucker + + markdown_text += "\n" return markdown_text @@ -145,7 +421,7 @@ def try_fix_json(review, max_iter=10, code_suggestions=False): iter_count += 1 if not valid_json: - logging.error("Unable to decode JSON response from AI") + get_logger().error("Unable to decode JSON response from AI") data = {} return data @@ -164,7 +440,7 @@ def fix_json_escape_char(json_message=None): Raises: None - """ + """ try: result = json.loads(json_message) except Exception as e: @@ -191,12 +467,12 @@ def convert_str_to_datetime(date_str): Example: >>> convert_str_to_datetime('Mon, 01 Jan 2022 12:00:00 UTC') datetime.datetime(2022, 1, 1, 12, 0, 0) - """ + """ datetime_format = '%a, %d %b %Y %H:%M:%S %Z' return datetime.strptime(date_str, datetime_format) -def load_large_diff(filename, new_file_content_str: str, original_file_content_str: str) -> str: +def load_large_diff(filename, new_file_content_str: str, original_file_content_str: str, show_warning: bool = True) -> str: """ Generate a patch for a modified file by comparing the original content of the file with the new content provided as input. @@ -215,8 +491,8 @@ def load_large_diff(filename, new_file_content_str: str, original_file_content_s try: diff = difflib.unified_diff(original_file_content_str.splitlines(keepends=True), new_file_content_str.splitlines(keepends=True)) - if get_settings().config.verbosity_level >= 2: - logging.warning(f"File was modified, but no patch was found. Manually creating patch: {filename}.") + if get_settings().config.verbosity_level >= 2 and show_warning: + get_logger().warning(f"File was modified, but no patch was found. 
Manually creating patch: {filename}.") patch = ''.join(diff) except Exception: pass @@ -245,39 +521,488 @@ def update_settings_from_args(args: List[str]) -> List[str]: arg = arg.strip() if arg.startswith('--'): arg = arg.strip('-').strip() - vals = arg.split('=') + vals = arg.split('=', 1) if len(vals) != 2: - logging.error(f'Invalid argument format: {arg}') + if len(vals) > 2: # --extended is a valid argument + get_logger().error(f'Invalid argument format: {arg}') other_args.append(arg) continue - key, value = vals - key = key.strip().upper() - value = value.strip() + key, value = _fix_key_value(*vals) get_settings().set(key, value) - logging.info(f'Updated setting {key} to: "{value}"') + get_logger().info(f'Updated setting {key} to: "{value}"') else: other_args.append(arg) return other_args -def load_yaml(review_text: str) -> dict: - review_text = review_text.removeprefix('```yaml').rstrip('`') +def _fix_key_value(key: str, value: str): + key = key.strip().upper() + value = value.strip() + try: + value = yaml.safe_load(value) + except Exception as e: + get_logger().debug(f"Failed to parse YAML for config override {key}={value}", exc_info=e) + return key, value + + +def load_yaml(response_text: str, keys_fix_yaml: List[str] = [], first_key="", last_key="") -> dict: + response_text = response_text.strip('\n').removeprefix('```yaml').rstrip().removesuffix('```') try: - data = yaml.load(review_text, Loader=yaml.SafeLoader) + data = yaml.safe_load(response_text) except Exception as e: - logging.error(f"Failed to parse AI prediction: {e}") - data = try_fix_yaml(review_text) + get_logger().warning(f"Initial failure to parse AI prediction: {e}") + data = try_fix_yaml(response_text, keys_fix_yaml=keys_fix_yaml, first_key=first_key, last_key=last_key) + if not data: + get_logger().error(f"Failed to parse AI prediction after fallbacks", artifact={'response_text': response_text}) + else: + get_logger().info(f"Successfully parsed AI prediction after fallbacks", + artifact={'response_text': response_text}) return data -def try_fix_yaml(review_text: str) -> dict: - review_text_lines = review_text.split('\n') + + +def try_fix_yaml(response_text: str, + keys_fix_yaml: List[str] = [], + first_key="", + last_key="",) -> dict: + response_text_lines = response_text.split('\n') + + keys_yaml = ['relevant line:', 'suggestion content:', 'relevant file:', 'existing code:', 'improved code:'] + keys_yaml = keys_yaml + keys_fix_yaml + # first fallback - try to convert 'relevant line: ...' to relevant line: |-\n ...' 
+ response_text_lines_copy = response_text_lines.copy() + for i in range(0, len(response_text_lines_copy)): + for key in keys_yaml: + if key in response_text_lines_copy[i] and not '|' in response_text_lines_copy[i]: + response_text_lines_copy[i] = response_text_lines_copy[i].replace(f'{key}', + f'{key} |\n ') + try: + data = yaml.safe_load('\n'.join(response_text_lines_copy)) + get_logger().info(f"Successfully parsed AI prediction after adding |-\n") + return data + except: + get_logger().info(f"Failed to parse AI prediction after adding |-\n") + + # second fallback - try to extract only range from first ```yaml to ```` + snippet_pattern = r'```(yaml)?[\s\S]*?```' + snippet = re.search(snippet_pattern, '\n'.join(response_text_lines_copy)) + if snippet: + snippet_text = snippet.group() + try: + data = yaml.safe_load(snippet_text.removeprefix('```yaml').rstrip('`')) + get_logger().info(f"Successfully parsed AI prediction after extracting yaml snippet") + return data + except: + pass + + + # third fallback - try to remove leading and trailing curly brackets + response_text_copy = response_text.strip().rstrip().removeprefix('{').removesuffix('}').rstrip(':\n') + try: + data = yaml.safe_load(response_text_copy) + get_logger().info(f"Successfully parsed AI prediction after removing curly brackets") + return data + except: + pass + + + # forth fallback - try to extract yaml snippet by 'first_key' and 'last_key' + # note that 'last_key' can be in practice a key that is not the last key in the yaml snippet. + # it just needs to be some inner key, so we can look for newlines after it + if first_key and last_key: + index_start = response_text.find(f"\n{first_key}:") + if index_start == -1: + index_start = response_text.find(f"{first_key}:") + index_last_code = response_text.rfind(f"{last_key}:") + index_end = response_text.find("\n\n", index_last_code) # look for newlines after last_key + if index_end == -1: + index_end = len(response_text) + response_text_copy = response_text[index_start:index_end].strip().strip('```yaml').strip('`').strip() + try: + data = yaml.safe_load(response_text_copy) + get_logger().info(f"Successfully parsed AI prediction after extracting yaml snippet") + return data + except: + pass + + + # fifth fallback - try to remove last lines data = {} - for i in range(1, len(review_text_lines)): - review_text_lines_tmp = '\n'.join(review_text_lines[:-i]) + for i in range(1, len(response_text_lines)): + response_text_lines_tmp = '\n'.join(response_text_lines[:-i]) try: - data = yaml.load(review_text_lines_tmp, Loader=yaml.SafeLoader) - logging.info(f"Successfully parsed AI prediction after removing {i} lines") - break + data = yaml.safe_load(response_text_lines_tmp) + get_logger().info(f"Successfully parsed AI prediction after removing {i} lines") + return data except: pass - return data + + +def set_custom_labels(variables, git_provider=None): + if not get_settings().config.enable_custom_labels: + return + + labels = get_settings().get('custom_labels', {}) + if not labels: + # set default labels + labels = ['Bug fix', 'Tests', 'Bug fix with tests', 'Enhancement', 'Documentation', 'Other'] + labels_list = "\n - ".join(labels) if labels else "" + labels_list = f" - {labels_list}" if labels_list else "" + variables["custom_labels"] = labels_list + return + + # Set custom labels + variables["custom_labels_class"] = "class Label(str, Enum):" + counter = 0 + labels_minimal_to_labels_dict = {} + for k, v in labels.items(): + description = "'" + v['description'].strip('\n').replace('\n', 
'\\n') + "'" + # variables["custom_labels_class"] += f"\n {k.lower().replace(' ', '_')} = '{k}' # {description}" + variables["custom_labels_class"] += f"\n {k.lower().replace(' ', '_')} = {description}" + labels_minimal_to_labels_dict[k.lower().replace(' ', '_')] = k + counter += 1 + variables["labels_minimal_to_labels_dict"] = labels_minimal_to_labels_dict + +def get_user_labels(current_labels: List[str] = None): + """ + Only keep labels that has been added by the user + """ + try: + enable_custom_labels = get_settings().config.get('enable_custom_labels', False) + custom_labels = get_settings().get('custom_labels', []) + if current_labels is None: + current_labels = [] + user_labels = [] + for label in current_labels: + if label.lower() in ['bug fix', 'tests', 'enhancement', 'documentation', 'other']: + continue + if enable_custom_labels: + if label in custom_labels: + continue + user_labels.append(label) + if user_labels: + get_logger().debug(f"Keeping user labels: {user_labels}") + except Exception as e: + get_logger().exception(f"Failed to get user labels: {e}") + return current_labels + return user_labels + + +def get_max_tokens(model): + """ + Get the maximum number of tokens allowed for a model. + logic: + (1) If the model is in './pr_agent/algo/__init__.py', use the value from there. + (2) else, the user needs to define explicitly 'config.custom_model_max_tokens' + + For both cases, we further limit the number of tokens to 'config.max_model_tokens' if it is set. + This aims to improve the algorithmic quality, as the AI model degrades in performance when the input is too long. + """ + settings = get_settings() + if model in MAX_TOKENS: + max_tokens_model = MAX_TOKENS[model] + elif settings.config.custom_model_max_tokens > 0: + max_tokens_model = settings.config.custom_model_max_tokens + else: + raise Exception(f"Ensure {model} is defined in MAX_TOKENS in ./pr_agent/algo/__init__.py or set a positive value for it in config.custom_model_max_tokens") + + if settings.config.max_model_tokens and settings.config.max_model_tokens > 0: + max_tokens_model = min(settings.config.max_model_tokens, max_tokens_model) + return max_tokens_model + + +def clip_tokens(text: str, max_tokens: int, add_three_dots=True, num_input_tokens=None, delete_last_line=False) -> str: + """ + Clip the number of tokens in a string to a maximum number of tokens. + + Args: + text (str): The string to clip. + max_tokens (int): The maximum number of tokens allowed in the string. + add_three_dots (bool, optional): A boolean indicating whether to add three dots at the end of the clipped + Returns: + str: The clipped string. 
+ """ + if not text: + return text + + try: + if num_input_tokens is None: + encoder = TokenEncoder.get_token_encoder() + num_input_tokens = len(encoder.encode(text)) + if num_input_tokens <= max_tokens: + return text + if max_tokens < 0: + return "" + + # calculate the number of characters to keep + num_chars = len(text) + chars_per_token = num_chars / num_input_tokens + factor = 0.9 # reduce by 10% to be safe + num_output_chars = int(factor * chars_per_token * max_tokens) + + # clip the text + if num_output_chars > 0: + clipped_text = text[:num_output_chars] + if delete_last_line: + clipped_text = clipped_text.rsplit('\n', 1)[0] + if add_three_dots: + clipped_text += "\n...(truncated)" + else: # if the text is empty + clipped_text = "" + + return clipped_text + except Exception as e: + get_logger().warning(f"Failed to clip tokens: {e}") + return text + +def replace_code_tags(text): + """ + Replace odd instances of ` with and even instances of ` with + """ + text = html.escape(text) + parts = text.split('`') + for i in range(1, len(parts), 2): + parts[i] = '' + parts[i] + '' + return ''.join(parts) + + +def find_line_number_of_relevant_line_in_file(diff_files: List[FilePatchInfo], + relevant_file: str, + relevant_line_in_file: str, + absolute_position: int = None) -> Tuple[int, int]: + position = -1 + if absolute_position is None: + absolute_position = -1 + re_hunk_header = re.compile( + r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@[ ]?(.*)") + + if not diff_files: + return position, absolute_position + + for file in diff_files: + if file.filename and (file.filename.strip() == relevant_file): + patch = file.patch + patch_lines = patch.splitlines() + delta = 0 + start1, size1, start2, size2 = 0, 0, 0, 0 + if absolute_position != -1: # matching absolute to relative + for i, line in enumerate(patch_lines): + # new hunk + if line.startswith('@@'): + delta = 0 + match = re_hunk_header.match(line) + start1, size1, start2, size2 = map(int, match.groups()[:4]) + elif not line.startswith('-'): + delta += 1 + + # + absolute_position_curr = start2 + delta - 1 + + if absolute_position_curr == absolute_position: + position = i + break + else: + # try to find the line in the patch using difflib, with some margin of error + matches_difflib: list[str | Any] = difflib.get_close_matches(relevant_line_in_file, + patch_lines, n=3, cutoff=0.93) + if len(matches_difflib) == 1 and matches_difflib[0].startswith('+'): + relevant_line_in_file = matches_difflib[0] + + + for i, line in enumerate(patch_lines): + if line.startswith('@@'): + delta = 0 + match = re_hunk_header.match(line) + start1, size1, start2, size2 = map(int, match.groups()[:4]) + elif not line.startswith('-'): + delta += 1 + + if relevant_line_in_file in line and line[0] != '-': + position = i + absolute_position = start2 + delta - 1 + break + + if position == -1 and relevant_line_in_file[0] == '+': + no_plus_line = relevant_line_in_file[1:].lstrip() + for i, line in enumerate(patch_lines): + if line.startswith('@@'): + delta = 0 + match = re_hunk_header.match(line) + start1, size1, start2, size2 = map(int, match.groups()[:4]) + elif not line.startswith('-'): + delta += 1 + + if no_plus_line in line and line[0] != '-': + # The model might add a '+' to the beginning of the relevant_line_in_file even if originally + # it's a context line + position = i + absolute_position = start2 + delta - 1 + break + return position, absolute_position + +def validate_and_await_rate_limit(rate_limit_status=None, git_provider=None, get_rate_limit_status_func=None): + 
if git_provider and not rate_limit_status: + rate_limit_status = {'resources': git_provider.github_client.get_rate_limit().raw_data} + + if not rate_limit_status: + rate_limit_status = get_rate_limit_status_func() + # validate that the rate limit is not exceeded + is_rate_limit = False + for key, value in rate_limit_status['resources'].items(): + if value['remaining'] == 0: + print(f"key: {key}, value: {value}") + is_rate_limit = True + sleep_time_sec = value['reset'] - datetime.now().timestamp() + sleep_time_hour = sleep_time_sec / 3600.0 + print(f"Rate limit exceeded. Sleeping for {sleep_time_hour} hours") + if sleep_time_sec > 0: + time.sleep(sleep_time_sec+1) + + if git_provider: + rate_limit_status = {'resources': git_provider.github_client.get_rate_limit().raw_data} + else: + rate_limit_status = get_rate_limit_status_func() + + return is_rate_limit + + +def get_largest_component(pr_url): + from pr_agent.tools.pr_analyzer import PRAnalyzer + publish_output = get_settings().config.publish_output + get_settings().config.publish_output = False # disable publish output + analyzer = PRAnalyzer(pr_url) + methods_dict_files = analyzer.run_sync() + get_settings().config.publish_output = publish_output + max_lines_changed = 0 + file_b = "" + component_name_b = "" + for file in methods_dict_files: + for method in methods_dict_files[file]: + try: + if methods_dict_files[file][method]['num_plus_lines'] > max_lines_changed: + max_lines_changed = methods_dict_files[file][method]['num_plus_lines'] + file_b = file + component_name_b = method + except: + pass + if component_name_b: + get_logger().info(f"Using the largest changed component: '{component_name_b}'") + return component_name_b, file_b + else: + return None, None + +def github_action_output(output_data: dict, key_name: str): + try: + if not get_settings().get('github_action_config.enable_output', False): + return + + key_data = output_data.get(key_name, {}) + with open(os.environ['GITHUB_OUTPUT'], 'a') as fh: + print(f"{key_name}={json.dumps(key_data, indent=None, ensure_ascii=False)}", file=fh) + except Exception as e: + get_logger().error(f"Failed to write to GitHub Action output: {e}") + return + + +def show_relevant_configurations(relevant_section: str) -> str: + skip_keys = ['ai_disclaimer', 'ai_disclaimer_title', 'ANALYTICS_FOLDER', 'secret_provider', "skip_keys", + 'trial_prefix_message', 'no_eligible_message', 'identity_provider', 'ALLOWED_REPOS','APP_NAME'] + extra_skip_keys = get_settings().config.get('config.skip_keys', []) + if extra_skip_keys: + skip_keys.extend(extra_skip_keys) + + markdown_text = "" + markdown_text += "\n
\n
๐Ÿ› ๏ธ Relevant configurations: \n\n" + markdown_text +="
These are the relevant [configurations](https://github.com/Codium-ai/pr-agent/blob/main/pr_agent/settings/configuration.toml) for this tool:\n\n" + markdown_text += f"**[config**]\n```yaml\n\n" + for key, value in get_settings().config.items(): + if key in skip_keys: + continue + markdown_text += f"{key}: {value}\n" + markdown_text += "\n```\n" + markdown_text += f"\n**[{relevant_section}]**\n```yaml\n\n" + for key, value in get_settings().get(relevant_section, {}).items(): + if key in skip_keys: + continue + markdown_text += f"{key}: {value}\n" + markdown_text += "\n```" + markdown_text += "\n
\n" + return markdown_text + +def is_value_no(value): + if not value: + return True + value_str = str(value).strip().lower() + if value_str == 'no' or value_str == 'none' or value_str == 'false': + return True + return False + + +def process_description(description_full: str) -> Tuple[str, List]: + if not description_full: + return "", [] + + split_str = "### **Changes walkthrough** ๐Ÿ“" + description_split = description_full.split(split_str) + base_description_str = description_split[0] + changes_walkthrough_str = "" + files = [] + if len(description_split) > 1: + changes_walkthrough_str = description_split[1] + else: + get_logger().debug("No changes walkthrough found") + + try: + if changes_walkthrough_str: + # get the end of the table + if '\n\n___' in changes_walkthrough_str: + end = changes_walkthrough_str.index("\n\n___") + elif '\n___' in changes_walkthrough_str: + end = changes_walkthrough_str.index("\n___") + else: + end = len(changes_walkthrough_str) + changes_walkthrough_str = changes_walkthrough_str[:end] + + h = html2text.HTML2Text() + h.body_width = 0 # Disable line wrapping + + # find all the files + pattern = r'\s*\s*(
<details>\s*<summary>(.*?)</summary>(.*?)</details>
)\s*' + files_found = re.findall(pattern, changes_walkthrough_str, re.DOTALL) + for file_data in files_found: + try: + if isinstance(file_data, tuple): + file_data = file_data[0] + pattern = r'
\s*(.*?)\s*
(.*?).*?
\s*
\s*(.*?)\s*
  • (.*?)
  • ' + res = re.search(pattern, file_data, re.DOTALL) + if not res or res.lastindex != 4: + pattern_back = r'
    \s*(.*?)
    (.*?).*?
    \s*
    \s*(.*?)\n\n\s*(.*?)
    ' + res = re.search(pattern_back, file_data, re.DOTALL) + if res and res.lastindex == 4: + short_filename = res.group(1).strip() + short_summary = res.group(2).strip() + long_filename = res.group(3).strip() + long_summary = res.group(4).strip() + long_summary = long_summary.replace('
    *', '\n*').replace('
    ','').replace('\n','
    ') + long_summary = h.handle(long_summary).strip() + if long_summary.startswith('\\-'): + long_summary = "* " + long_summary[2:] + elif not long_summary.startswith('*'): + long_summary = f"* {long_summary}" + + files.append({ + 'short_file_name': short_filename, + 'full_file_name': long_filename, + 'short_summary': short_summary, + 'long_summary': long_summary + }) + else: + get_logger().error(f"Failed to parse description", artifact={'description': file_data}) + except Exception as e: + get_logger().exception(f"Failed to process description: {e}", artifact={'description': file_data}) + + + except Exception as e: + get_logger().exception(f"Failed to process description: {e}") + + return base_description_str, files diff --git a/pr_agent/cli.py b/pr_agent/cli.py index 0f8710419..98b493ded 100644 --- a/pr_agent/cli.py +++ b/pr_agent/cli.py @@ -1,42 +1,91 @@ import argparse import asyncio -import logging import os from pr_agent.agent.pr_agent import PRAgent, commands from pr_agent.config_loader import get_settings +from pr_agent.log import setup_logger, get_logger +log_level = os.environ.get("LOG_LEVEL", "INFO") +setup_logger(log_level) -def run(inargs=None): + +def set_parser(): parser = argparse.ArgumentParser(description='AI based pull request analyzer', usage= -"""\ -Usage: cli.py --pr-url= []. -For example: -- cli.py --pr_url=... review -- cli.py --pr_url=... describe -- cli.py --pr_url=... improve -- cli.py --pr_url=... ask "write me a poem about this PR" -- cli.py --pr_url=... reflect - -Supported commands: -review / review_pr - Add a review that includes a summary of the PR and specific suggestions for improvement. -ask / ask_question [question] - Ask a question about the PR. -describe / describe_pr - Modify the PR title and description based on the PR's contents. -improve / improve_code - Suggest improvements to the code in the PR as pull request comments ready to commit. -reflect - Ask the PR author questions about the PR. -update_changelog - Update the changelog based on the PR's contents. - -To edit any configuration parameter from 'configuration.toml', just add -config_path=. -For example: 'python cli.py --pr_url=... review --pr_reviewer.extra_instructions="focus on the file: ..."' -""") - parser.add_argument('--pr_url', type=str, help='The URL of the PR to review', required=True) + """\ + Usage: cli.py --pr-url= []. + For example: + - cli.py --pr_url=... review + - cli.py --pr_url=... describe + - cli.py --pr_url=... improve + - cli.py --pr_url=... ask "write me a poem about this PR" + - cli.py --pr_url=... reflect + - cli.py --issue_url=... similar_issue + + Supported commands: + - review / review_pr - Add a review that includes a summary of the PR and specific suggestions for improvement. + + - ask / ask_question [question] - Ask a question about the PR. + + - describe / describe_pr - Modify the PR title and description based on the PR's contents. + + - improve / improve_code - Suggest improvements to the code in the PR as pull request comments ready to commit. + Extended mode ('improve --extended') employs several calls, and provides a more thorough feedback + + - reflect - Ask the PR author questions about the PR. + + - update_changelog - Update the changelog based on the PR's contents. + + - add_docs + + - generate_labels + + + Configuration: + To edit any configuration parameter from 'configuration.toml', just add -config_path=. + For example: 'python cli.py --pr_url=... 
review --pr_reviewer.extra_instructions="focus on the file: ..."' + """) + parser.add_argument('--pr_url', type=str, help='The URL of the PR to review', default=None) + parser.add_argument('--issue_url', type=str, help='The URL of the Issue to review', default=None) parser.add_argument('command', type=str, help='The', choices=commands, default='review') parser.add_argument('rest', nargs=argparse.REMAINDER, default=[]) - args = parser.parse_args(inargs) - logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO")) + return parser + + +def run_command(pr_url, command): + # Preparing the command + run_command_str = f"--pr_url={pr_url} {command.lstrip('/')}" + args = set_parser().parse_args(run_command_str.split()) + + # Run the command. Feedback will appear in GitHub PR comments + run(args=args) + + +def run(inargs=None, args=None): + parser = set_parser() + if not args: + args = parser.parse_args(inargs) + if not args.pr_url and not args.issue_url: + parser.print_help() + return + command = args.command.lower() get_settings().set("CONFIG.CLI_MODE", True) - result = asyncio.run(PRAgent().handle_request(args.pr_url, command + " " + " ".join(args.rest))) + + async def inner(): + if args.issue_url: + result = await asyncio.create_task(PRAgent().handle_request(args.issue_url, [command] + args.rest)) + else: + result = await asyncio.create_task(PRAgent().handle_request(args.pr_url, [command] + args.rest)) + + if get_settings().litellm.get("enable_callbacks", False): + # There may be additional events on the event queue from the run above. If there are give them time to complete. + get_logger().debug("Waiting for event queue to complete") + await asyncio.wait([task for task in asyncio.all_tasks() if task is not asyncio.current_task()]) + + return result + + result = asyncio.run(inner()) if not result: parser.print_help() diff --git a/pr_agent/cli_pip.py b/pr_agent/cli_pip.py new file mode 100644 index 000000000..caa56f0c9 --- /dev/null +++ b/pr_agent/cli_pip.py @@ -0,0 +1,23 @@ +from pr_agent import cli +from pr_agent.config_loader import get_settings + + +def main(): + # Fill in the following values + provider = "github" # GitHub provider + user_token = "..." # GitHub user token + openai_key = "..." # OpenAI key + pr_url = "..." # PR URL, for example 'https://github.com/Codium-ai/pr-agent/pull/809' + command = "/review" # Command to run (e.g. '/review', '/describe', '/ask="What is the purpose of this PR?"') + + # Setting the configurations + get_settings().set("CONFIG.git_provider", provider) + get_settings().set("openai.key", openai_key) + get_settings().set("github.user_token", user_token) + + # Run the command. 
Feedback will appear in GitHub PR comments + cli.run_command(pr_url, command) + + +if __name__ == '__main__': + main() diff --git a/pr_agent/config_loader.py b/pr_agent/config_loader.py index 3075e8dcd..b13a3ce74 100644 --- a/pr_agent/config_loader.py +++ b/pr_agent/config_loader.py @@ -14,19 +14,36 @@ settings_files=[join(current_dir, f) for f in [ "settings/.secrets.toml", "settings/configuration.toml", + "settings/ignore.toml", "settings/language_extensions.toml", "settings/pr_reviewer_prompts.toml", "settings/pr_questions_prompts.toml", + "settings/pr_line_questions_prompts.toml", "settings/pr_description_prompts.toml", "settings/pr_code_suggestions_prompts.toml", + "settings/pr_code_suggestions_reflect_prompts.toml", + "settings/pr_sort_code_suggestions_prompts.toml", "settings/pr_information_from_user_prompts.toml", "settings/pr_update_changelog_prompts.toml", - "settings_prod/.secrets.toml" + "settings/pr_custom_labels.toml", + "settings/pr_add_docs.toml", + "settings/custom_labels.toml", + "settings/pr_help_prompts.toml", + "settings_prod/.secrets.toml", ]] ) def get_settings(): + """ + Retrieves the current settings. + + This function attempts to fetch the settings from the starlette_context's context object. If it fails, + it defaults to the global settings defined outside of this function. + + Returns: + Dynaconf: The current settings object, either from the context or the global default. + """ try: return context["settings"] except Exception: @@ -34,7 +51,7 @@ def get_settings(): # Add local configuration from pyproject.toml of the project being reviewed -def _find_repository_root() -> Path: +def _find_repository_root() -> Optional[Path]: """ Identify project root directory by recursively searching for the .git directory in the parent directories. 
""" @@ -54,7 +71,7 @@ def _find_pyproject() -> Optional[Path]: """ repo_root = _find_repository_root() if repo_root: - pyproject = _find_repository_root() / "pyproject.toml" + pyproject = repo_root / "pyproject.toml" return pyproject if pyproject.is_file() else None return None diff --git a/pr_agent/git_providers/__init__.py b/pr_agent/git_providers/__init__.py index e7c2aa0f0..c7e3e6e81 100644 --- a/pr_agent/git_providers/__init__.py +++ b/pr_agent/git_providers/__init__.py @@ -1,16 +1,27 @@ from pr_agent.config_loader import get_settings from pr_agent.git_providers.bitbucket_provider import BitbucketProvider +from pr_agent.git_providers.bitbucket_server_provider import BitbucketServerProvider +from pr_agent.git_providers.codecommit_provider import CodeCommitProvider +from pr_agent.git_providers.git_provider import GitProvider from pr_agent.git_providers.github_provider import GithubProvider from pr_agent.git_providers.gitlab_provider import GitLabProvider from pr_agent.git_providers.local_git_provider import LocalGitProvider +from pr_agent.git_providers.azuredevops_provider import AzureDevopsProvider +from pr_agent.git_providers.gerrit_provider import GerritProvider +from starlette_context import context _GIT_PROVIDERS = { 'github': GithubProvider, 'gitlab': GitLabProvider, 'bitbucket': BitbucketProvider, - 'local' : LocalGitProvider + 'bitbucket_server': BitbucketServerProvider, + 'azure': AzureDevopsProvider, + 'codecommit': CodeCommitProvider, + 'local': LocalGitProvider, + 'gerrit': GerritProvider, } + def get_git_provider(): try: provider_id = get_settings().config.git_provider @@ -19,3 +30,33 @@ def get_git_provider(): if provider_id not in _GIT_PROVIDERS: raise ValueError(f"Unknown git provider: {provider_id}") return _GIT_PROVIDERS[provider_id] + + +def get_git_provider_with_context(pr_url) -> GitProvider: + """ + Get a GitProvider instance for the given PR URL. If the GitProvider instance is already in the context, return it. + """ + + is_context_env = None + try: + is_context_env = context.get("settings", None) + except Exception: + pass # we are not in a context environment (CLI) + + # check if context["git_provider"]["pr_url"] exists + if is_context_env and context.get("git_provider", {}).get("pr_url", {}): + git_provider = context["git_provider"]["pr_url"] + # possibly check if the git_provider is still valid, or if some reset is needed + # ... 
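For orientation, a hedged usage sketch of `get_git_provider_with_context` as defined above; the PR URL is a placeholder, not a real pull request:

```python
# Hypothetical usage only; the URL below is a placeholder.
from pr_agent.git_providers import get_git_provider_with_context

provider = get_git_provider_with_context("https://github.com/some-org/some-repo/pull/1")
# In a server (starlette) context the constructed provider is stored under
# context["git_provider"]; in CLI mode there is no context, so a fresh instance
# is built from settings.config.git_provider on every call.
print(provider.get_title())
```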
+ return git_provider + else: + try: + provider_id = get_settings().config.git_provider + if provider_id not in _GIT_PROVIDERS: + raise ValueError(f"Unknown git provider: {provider_id}") + git_provider = _GIT_PROVIDERS[provider_id](pr_url) + if is_context_env: + context["git_provider"] = {pr_url: git_provider} + return git_provider + except Exception as e: + raise ValueError(f"Failed to get git provider for {pr_url}") from e diff --git a/pr_agent/git_providers/azuredevops_provider.py b/pr_agent/git_providers/azuredevops_provider.py new file mode 100644 index 000000000..02390bb0d --- /dev/null +++ b/pr_agent/git_providers/azuredevops_provider.py @@ -0,0 +1,620 @@ +import os +from typing import Optional, Tuple +from urllib.parse import urlparse + +from ..algo.file_filter import filter_ignored +from ..log import get_logger +from ..algo.language_handler import is_valid_file +from ..algo.utils import clip_tokens, find_line_number_of_relevant_line_in_file, load_large_diff +from ..config_loader import get_settings +from .git_provider import GitProvider +from pr_agent.algo.types import EDIT_TYPE, FilePatchInfo + +AZURE_DEVOPS_AVAILABLE = True +ADO_APP_CLIENT_DEFAULT_ID = "499b84ac-1321-427f-aa17-267ca6975798/.default" +MAX_PR_DESCRIPTION_AZURE_LENGTH = 4000-1 + +try: + # noinspection PyUnresolvedReferences + from msrest.authentication import BasicAuthentication + # noinspection PyUnresolvedReferences + from azure.devops.connection import Connection + # noinspection PyUnresolvedReferences + from azure.identity import DefaultAzureCredential + # noinspection PyUnresolvedReferences + from azure.devops.v7_1.git.models import ( + Comment, + CommentThread, + GitVersionDescriptor, + GitPullRequest, + GitPullRequestIterationChanges, + ) +except ImportError: + AZURE_DEVOPS_AVAILABLE = False + + +class AzureDevopsProvider(GitProvider): + + def __init__( + self, pr_url: Optional[str] = None, incremental: Optional[bool] = False + ): + if not AZURE_DEVOPS_AVAILABLE: + raise ImportError( + "Azure DevOps provider is not available. Please install the required dependencies." + ) + + self.azure_devops_client = self._get_azure_devops_client() + self.diff_files = None + self.workspace_slug = None + self.repo_slug = None + self.repo = None + self.pr_num = None + self.pr = None + self.temp_comments = [] + self.incremental = incremental + if pr_url: + self.set_pr(pr_url) + + def publish_code_suggestions(self, code_suggestions: list) -> bool: + """ + Publishes code suggestions as comments on the PR. 
+ """ + post_parameters_list = [] + for suggestion in code_suggestions: + body = suggestion['body'] + relevant_file = suggestion['relevant_file'] + relevant_lines_start = suggestion['relevant_lines_start'] + relevant_lines_end = suggestion['relevant_lines_end'] + + if not relevant_lines_start or relevant_lines_start == -1: + if get_settings().config.verbosity_level >= 2: + get_logger().exception( + f"Failed to publish code suggestion, relevant_lines_start is {relevant_lines_start}") + continue + + if relevant_lines_end < relevant_lines_start: + if get_settings().config.verbosity_level >= 2: + get_logger().exception(f"Failed to publish code suggestion, " + f"relevant_lines_end is {relevant_lines_end} and " + f"relevant_lines_start is {relevant_lines_start}") + continue + + if relevant_lines_end > relevant_lines_start: + post_parameters = { + "body": body, + "path": relevant_file, + "line": relevant_lines_end, + "start_line": relevant_lines_start, + "start_side": "RIGHT", + } + else: # API is different for single line comments + post_parameters = { + "body": body, + "path": relevant_file, + "line": relevant_lines_start, + "side": "RIGHT", + } + post_parameters_list.append(post_parameters) + + try: + for post_parameters in post_parameters_list: + comment = Comment(content=post_parameters["body"], comment_type=1) + thread = CommentThread(comments=[comment], + thread_context={ + "filePath": post_parameters["path"], + "rightFileStart": { + "line": post_parameters["start_line"], + "offset": 1, + }, + "rightFileEnd": { + "line": post_parameters["line"], + "offset": 1, + }, + }) + self.azure_devops_client.create_thread( + comment_thread=thread, + project=self.workspace_slug, + repository_id=self.repo_slug, + pull_request_id=self.pr_num + ) + if get_settings().config.verbosity_level >= 2: + get_logger().info( + f"Published code suggestion on {self.pr_num} at {post_parameters['path']}" + ) + return True + except Exception as e: + if get_settings().config.verbosity_level >= 2: + get_logger().error(f"Failed to publish code suggestion, error: {e}") + return False + + def get_pr_description_full(self) -> str: + return self.pr.description + + def edit_comment(self, comment, body: str): + try: + self.azure_devops_client.update_comment( + repository_id=self.repo_slug, + pull_request_id=self.pr_num, + thread_id=comment["thread_id"], + comment_id=comment["comment_id"], + comment=Comment(content=body), + project=self.workspace_slug, + ) + except Exception as e: + get_logger().exception(f"Failed to edit comment, error: {e}") + + def remove_comment(self, comment): + try: + self.azure_devops_client.delete_comment( + repository_id=self.repo_slug, + pull_request_id=self.pr_num, + thread_id=comment["thread_id"], + comment_id=comment["comment_id"], + project=self.workspace_slug, + ) + except Exception as e: + get_logger().exception(f"Failed to remove comment, error: {e}") + + def publish_labels(self, pr_types): + try: + for pr_type in pr_types: + self.azure_devops_client.create_pull_request_label( + label={"name": pr_type}, + project=self.workspace_slug, + repository_id=self.repo_slug, + pull_request_id=self.pr_num, + ) + except Exception as e: + get_logger().warning(f"Failed to publish labels, error: {e}") + + def get_pr_labels(self, update=False): + try: + labels = self.azure_devops_client.get_pull_request_labels( + project=self.workspace_slug, + repository_id=self.repo_slug, + pull_request_id=self.pr_num, + ) + return [label.name for label in labels] + except Exception as e: + get_logger().exception(f"Failed to 
get labels, error: {e}") + return [] + + def is_supported(self, capability: str) -> bool: + if capability in [ + "get_issue_comments", + ]: + return False + return True + + def set_pr(self, pr_url: str): + self.workspace_slug, self.repo_slug, self.pr_num = self._parse_pr_url(pr_url) + self.pr = self._get_pr() + + def get_repo_settings(self): + try: + contents = self.azure_devops_client.get_item_content( + repository_id=self.repo_slug, + project=self.workspace_slug, + download=False, + include_content_metadata=False, + include_content=True, + path=".pr_agent.toml", + ) + return list(contents)[0] + except Exception as e: + if get_settings().config.verbosity_level >= 2: + get_logger().error(f"Failed to get repo settings, error: {e}") + return "" + + def get_files(self): + files = [] + for i in self.azure_devops_client.get_pull_request_commits( + project=self.workspace_slug, + repository_id=self.repo_slug, + pull_request_id=self.pr_num, + ): + changes_obj = self.azure_devops_client.get_changes( + project=self.workspace_slug, + repository_id=self.repo_slug, + commit_id=i.commit_id, + ) + + for c in changes_obj.changes: + files.append(c["item"]["path"]) + return list(set(files)) + + def get_diff_files(self) -> list[FilePatchInfo]: + try: + + if self.diff_files: + return self.diff_files + + base_sha = self.pr.last_merge_target_commit + head_sha = self.pr.last_merge_source_commit + + # Get PR iterations + iterations = self.azure_devops_client.get_pull_request_iterations( + repository_id=self.repo_slug, + pull_request_id=self.pr_num, + project=self.workspace_slug + ) + changes = None + if iterations: + iteration_id = iterations[-1].id # Get the last iteration (most recent changes) + + # Get changes for the iteration + changes = self.azure_devops_client.get_pull_request_iteration_changes( + repository_id=self.repo_slug, + pull_request_id=self.pr_num, + iteration_id=iteration_id, + project=self.workspace_slug + ) + diff_files = [] + diffs = [] + diff_types = {} + if changes: + for change in changes.change_entries: + item = change.additional_properties.get('item', {}) + path = item.get('path', None) + if path: + diffs.append(path) + diff_types[path] = change.additional_properties.get('changeType', 'Unknown') + + # wrong implementation - gets all the files that were changed in any commit in the PR + # commits = self.azure_devops_client.get_pull_request_commits( + # project=self.workspace_slug, + # repository_id=self.repo_slug, + # pull_request_id=self.pr_num, + # ) + # + # diff_files = [] + # diffs = [] + # diff_types = {} + + # for c in commits: + # changes_obj = self.azure_devops_client.get_changes( + # project=self.workspace_slug, + # repository_id=self.repo_slug, + # commit_id=c.commit_id, + # ) + # for i in changes_obj.changes: + # if i["item"]["gitObjectType"] == "tree": + # continue + # diffs.append(i["item"]["path"]) + # diff_types[i["item"]["path"]] = i["changeType"] + # + # diffs = list(set(diffs)) + + diffs_original = diffs + diffs = filter_ignored(diffs_original, 'azure') + if diffs_original != diffs: + try: + get_logger().info(f"Filtered out [ignore] files for pull request:", extra= + {"files": diffs_original, # diffs is just a list of names + "filtered_files": diffs}) + except Exception: + pass + + invalid_files_names = [] + for file in diffs: + if not is_valid_file(file): + invalid_files_names.append(file) + continue + + version = GitVersionDescriptor( + version=head_sha.commit_id, version_type="commit" + ) + try: + new_file_content_str = self.azure_devops_client.get_item( + 
repository_id=self.repo_slug, + path=file, + project=self.workspace_slug, + version_descriptor=version, + download=False, + include_content=True, + ) + + new_file_content_str = new_file_content_str.content + except Exception as error: + get_logger().error(f"Failed to retrieve new file content of {file} at version {version}", error=error) + # get_logger().error( + # "Failed to retrieve new file content of %s at version %s. Error: %s", + # file, + # version, + # str(error), + # ) + new_file_content_str = "" + + edit_type = EDIT_TYPE.MODIFIED + if diff_types[file] == "add": + edit_type = EDIT_TYPE.ADDED + elif diff_types[file] == "delete": + edit_type = EDIT_TYPE.DELETED + elif diff_types[file] == "rename": + edit_type = EDIT_TYPE.RENAMED + + version = GitVersionDescriptor( + version=base_sha.commit_id, version_type="commit" + ) + try: + original_file_content_str = self.azure_devops_client.get_item( + repository_id=self.repo_slug, + path=file, + project=self.workspace_slug, + version_descriptor=version, + download=False, + include_content=True, + ) + original_file_content_str = original_file_content_str.content + except Exception as error: + get_logger().error(f"Failed to retrieve original file content of {file} at version {version}", error=error) + original_file_content_str = "" + + patch = load_large_diff( + file, new_file_content_str, original_file_content_str, show_warning=False + ).rstrip() + + # count number of lines added and removed + patch_lines = patch.splitlines(keepends=True) + num_plus_lines = len([line for line in patch_lines if line.startswith('+')]) + num_minus_lines = len([line for line in patch_lines if line.startswith('-')]) + + diff_files.append( + FilePatchInfo( + original_file_content_str, + new_file_content_str, + patch=patch, + filename=file, + edit_type=edit_type, + num_plus_lines=num_plus_lines, + num_minus_lines=num_minus_lines, + ) + ) + get_logger().info(f"Invalid files: {invalid_files_names}") + + self.diff_files = diff_files + return diff_files + except Exception as e: + get_logger().exception(f"Failed to get diff files, error: {e}") + return [] + + def publish_comment(self, pr_comment: str, is_temporary: bool = False, thread_context=None): + comment = Comment(content=pr_comment) + thread = CommentThread(comments=[comment], thread_context=thread_context, status=5) + thread_response = self.azure_devops_client.create_thread( + comment_thread=thread, + project=self.workspace_slug, + repository_id=self.repo_slug, + pull_request_id=self.pr_num, + ) + response = {"thread_id": thread_response.id, "comment_id": thread_response.comments[0].id} + if is_temporary: + self.temp_comments.append(response) + return response + + def publish_description(self, pr_title: str, pr_body: str): + if len(pr_body) > MAX_PR_DESCRIPTION_AZURE_LENGTH: + + usage_guide_text='
<details> <summary>โœจ Describe tool usage guide:</summary>
    ' + ind = pr_body.find(usage_guide_text) + if ind != -1: + pr_body = pr_body[:ind] + + if len(pr_body) > MAX_PR_DESCRIPTION_AZURE_LENGTH: + changes_walkthrough_text = '## **Changes walkthrough**' + ind = pr_body.find(changes_walkthrough_text) + if ind != -1: + pr_body = pr_body[:ind] + + if len(pr_body) > MAX_PR_DESCRIPTION_AZURE_LENGTH: + trunction_message = " ... (description truncated due to length limit)" + pr_body = pr_body[:MAX_PR_DESCRIPTION_AZURE_LENGTH - len(trunction_message)] + trunction_message + get_logger().warning("PR description was truncated due to length limit") + try: + updated_pr = GitPullRequest() + updated_pr.title = pr_title + updated_pr.description = pr_body + self.azure_devops_client.update_pull_request( + project=self.workspace_slug, + repository_id=self.repo_slug, + pull_request_id=self.pr_num, + git_pull_request_to_update=updated_pr, + ) + except Exception as e: + get_logger().exception( + f"Could not update pull request {self.pr_num} description: {e}" + ) + + def remove_initial_comment(self): + try: + for comment in self.temp_comments: + self.remove_comment(comment) + except Exception as e: + get_logger().exception(f"Failed to remove temp comments, error: {e}") + + def publish_inline_comment(self, body: str, relevant_file: str, relevant_line_in_file: str, original_suggestion=None): + self.publish_inline_comments([self.create_inline_comment(body, relevant_file, relevant_line_in_file)]) + + + def create_inline_comment(self, body: str, relevant_file: str, relevant_line_in_file: str, + absolute_position: int = None): + position, absolute_position = find_line_number_of_relevant_line_in_file(self.get_diff_files(), + relevant_file.strip('`'), + relevant_line_in_file, + absolute_position) + if position == -1: + if get_settings().config.verbosity_level >= 2: + get_logger().info(f"Could not find position for {relevant_file} {relevant_line_in_file}") + subject_type = "FILE" + else: + subject_type = "LINE" + path = relevant_file.strip() + return dict(body=body, path=path, position=position, absolute_position=absolute_position) if subject_type == "LINE" else {} + + def publish_inline_comments(self, comments: list[dict], disable_fallback: bool = False): + overall_success = True + for comment in comments: + try: + self.publish_comment(comment["body"], + thread_context={ + "filePath": comment["path"], + "rightFileStart": { + "line": comment["absolute_position"], + "offset": comment["position"], + }, + "rightFileEnd": { + "line": comment["absolute_position"], + "offset": comment["position"], + }, + }) + if get_settings().config.verbosity_level >= 2: + get_logger().info( + f"Published code suggestion on {self.pr_num} at {comment['path']}" + ) + except Exception as e: + if get_settings().config.verbosity_level >= 2: + get_logger().error(f"Failed to publish code suggestion, error: {e}") + overall_success = False + return overall_success + + def get_title(self): + return self.pr.title + + def get_languages(self): + languages = [] + files = self.azure_devops_client.get_items( + project=self.workspace_slug, + repository_id=self.repo_slug, + recursion_level="Full", + include_content_metadata=True, + include_links=False, + download=False, + ) + for f in files: + if f.git_object_type == "blob": + file_name, file_extension = os.path.splitext(f.path) + languages.append(file_extension[1:]) + + extension_counts = {} + for ext in languages: + if ext != "": + extension_counts[ext] = extension_counts.get(ext, 0) + 1 + + total_extensions = sum(extension_counts.values()) + + 
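As an aside, the extension-percentage bookkeeping that `get_languages` performs here can be expressed compactly on its own; a self-contained sketch, assuming a plain list of extension strings rather than the Azure DevOps item objects:

```python
# Illustrative sketch, not part of the provider: percentage breakdown of file extensions,
# assuming a plain list of extension strings such as ["py", "py", "toml", ""].
from collections import Counter

def extension_percentages(extensions: list[str]) -> dict[str, float]:
    counts = Counter(ext for ext in extensions if ext)  # skip files without an extension
    total = sum(counts.values())
    if not total:
        return {}
    return {ext: (count / total) * 100 for ext, count in counts.items()}

# extension_percentages(["py", "py", "toml", ""]) -> {"py": 66.66..., "toml": 33.33...}
```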
extension_percentages = { + ext: (count / total_extensions) * 100 + for ext, count in extension_counts.items() + } + + return extension_percentages + + def get_pr_branch(self): + pr_info = self.azure_devops_client.get_pull_request_by_id( + project=self.workspace_slug, pull_request_id=self.pr_num + ) + source_branch = pr_info.source_ref_name.split("/")[-1] + return source_branch + + def get_user_id(self): + return 0 + + def get_issue_comments(self): + threads = self.azure_devops_client.get_threads(repository_id=self.repo_slug, pull_request_id=self.pr_num, project=self.workspace_slug) + threads.reverse() + comment_list = [] + for thread in threads: + for comment in thread.comments: + if comment.content and comment not in comment_list: + comment.body = comment.content + comment.thread_id = thread.id + comment_list.append(comment) + return comment_list + + def add_eyes_reaction(self, issue_comment_id: int, disable_eyes: bool = False) -> Optional[int]: + return True + + def remove_reaction(self, issue_comment_id: int, reaction_id: int) -> bool: + return True + + @staticmethod + def _parse_pr_url(pr_url: str) -> Tuple[str, str, int]: + parsed_url = urlparse(pr_url) + + path_parts = parsed_url.path.strip("/").split("/") + if "pullrequest" not in path_parts: + raise ValueError( + "The provided URL does not appear to be a Azure DevOps PR URL" + ) + if len(path_parts) == 6: # "https://dev.azure.com/organization/project/_git/repo/pullrequest/1" + workspace_slug = path_parts[1] + repo_slug = path_parts[3] + pr_number = int(path_parts[5]) + elif len(path_parts) == 5: # 'https://organization.visualstudio.com/project/_git/repo/pullrequest/1' + workspace_slug = path_parts[0] + repo_slug = path_parts[2] + pr_number = int(path_parts[4]) + else: + raise ValueError("The provided URL does not appear to be a Azure DevOps PR URL") + + return workspace_slug, repo_slug, pr_number + + @staticmethod + def _get_azure_devops_client(): + org = get_settings().azure_devops.get("org", None) + pat = get_settings().azure_devops.get("pat", None) + + if not org: + raise ValueError("Azure DevOps organization is required") + + if pat: + auth_token = pat + else: + try: + # try to use azure default credentials + # see https://learn.microsoft.com/en-us/python/api/overview/azure/identity-readme?view=azure-python + # for usage and env var configuration of user-assigned managed identity, local machine auth etc. 
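For readers unfamiliar with the azure-identity fallback that follows, this is roughly the same PAT-or-DefaultAzureCredential flow in isolation; `org_url` and `pat` are placeholders, and the scope constant is the one defined earlier in this file:

```python
# Standalone sketch of the credential fallback: prefer a PAT when configured, otherwise
# request an Azure AD token for the Azure DevOps resource scope. Inputs are placeholders.
from azure.devops.connection import Connection
from azure.identity import DefaultAzureCredential
from msrest.authentication import BasicAuthentication

ADO_APP_CLIENT_DEFAULT_ID = "499b84ac-1321-427f-aa17-267ca6975798/.default"

def build_git_client(org_url: str, pat: str = ""):
    if pat:
        auth_token = pat
    else:
        # DefaultAzureCredential covers managed identity, Azure CLI login, env vars, etc.
        auth_token = DefaultAzureCredential().get_token(ADO_APP_CLIENT_DEFAULT_ID).token
    connection = Connection(base_url=org_url, creds=BasicAuthentication("", auth_token))
    return connection.clients.get_git_client()
```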
+ get_logger().info("No PAT found in settings, trying to use Azure Default Credentials.") + credentials = DefaultAzureCredential() + accessToken = credentials.get_token(ADO_APP_CLIENT_DEFAULT_ID) + auth_token = accessToken.token + except Exception as e: + get_logger().error(f"No PAT found in settings, and Azure Default Authentication failed, error: {e}") + raise + + credentials = BasicAuthentication("", auth_token) + + credentials = BasicAuthentication("", auth_token) + azure_devops_connection = Connection(base_url=org, creds=credentials) + azure_devops_client = azure_devops_connection.clients.get_git_client() + + return azure_devops_client + + def _get_repo(self): + if self.repo is None: + self.repo = self.azure_devops_client.get_repository( + project=self.workspace_slug, repository_id=self.repo_slug + ) + return self.repo + + def _get_pr(self): + self.pr = self.azure_devops_client.get_pull_request_by_id( + pull_request_id=self.pr_num, project=self.workspace_slug + ) + return self.pr + + def get_commit_messages(self): + return "" # not implemented yet + + def get_pr_id(self): + try: + pr_id = f"{self.workspace_slug}/{self.repo_slug}/{self.pr_num}" + return pr_id + except Exception as e: + if get_settings().config.verbosity_level >= 2: + get_logger().error(f"Failed to get pr id, error: {e}") + return "" + + def publish_file_comments(self, file_comments: list) -> bool: + pass + diff --git a/pr_agent/git_providers/bitbucket_provider.py b/pr_agent/git_providers/bitbucket_provider.py index 07b922957..51ba07700 100644 --- a/pr_agent/git_providers/bitbucket_provider.py +++ b/pr_agent/git_providers/bitbucket_provider.py @@ -1,33 +1,127 @@ -import logging +import json from typing import Optional, Tuple from urllib.parse import urlparse import requests from atlassian.bitbucket import Cloud +from starlette_context import context -from ..algo.pr_processing import clip_tokens +from pr_agent.algo.types import FilePatchInfo, EDIT_TYPE +from ..algo.file_filter import filter_ignored +from ..algo.language_handler import is_valid_file +from ..algo.utils import find_line_number_of_relevant_line_in_file from ..config_loader import get_settings -from .git_provider import FilePatchInfo +from ..log import get_logger +from .git_provider import GitProvider, MAX_FILES_ALLOWED_FULL -class BitbucketProvider: - def __init__(self, pr_url: Optional[str] = None, incremental: Optional[bool] = False): +def _gef_filename(diff): + if diff.new.path: + return diff.new.path + return diff.old.path + + +class BitbucketProvider(GitProvider): + def __init__( + self, pr_url: Optional[str] = None, incremental: Optional[bool] = False + ): s = requests.Session() - s.headers['Authorization'] = f'Bearer {get_settings().get("BITBUCKET.BEARER_TOKEN", None)}' + try: + bearer = context.get("bitbucket_bearer_token", None) + s.headers["Authorization"] = f"Bearer {bearer}" + except Exception: + s.headers[ + "Authorization" + ] = f'Bearer {get_settings().get("BITBUCKET.BEARER_TOKEN", None)}' + s.headers["Content-Type"] = "application/json" + self.headers = s.headers self.bitbucket_client = Cloud(session=s) - + self.max_comment_length = 31000 self.workspace_slug = None self.repo_slug = None self.repo = None self.pr_num = None self.pr = None + self.pr_url = pr_url self.temp_comments = [] self.incremental = incremental + self.diff_files = None + self.git_files = None if pr_url: self.set_pr(pr_url) + self.bitbucket_comment_api_url = self.pr._BitbucketBase__data["links"]["comments"]["href"] + self.bitbucket_pull_request_api_url = 
self.pr._BitbucketBase__data["links"]['self']['href'] + + def get_repo_settings(self): + try: + url = (f"https://api.bitbucket.org/2.0/repositories/{self.workspace_slug}/{self.repo_slug}/src/" + f"{self.pr.destination_branch}/.pr_agent.toml") + response = requests.request("GET", url, headers=self.headers) + if response.status_code == 404: # not found + return "" + contents = response.text.encode('utf-8') + return contents + except Exception: + return "" + + def publish_code_suggestions(self, code_suggestions: list) -> bool: + """ + Publishes code suggestions as comments on the PR. + """ + post_parameters_list = [] + for suggestion in code_suggestions: + body = suggestion["body"] + relevant_file = suggestion["relevant_file"] + relevant_lines_start = suggestion["relevant_lines_start"] + relevant_lines_end = suggestion["relevant_lines_end"] + + if not relevant_lines_start or relevant_lines_start == -1: + if get_settings().config.verbosity_level >= 2: + get_logger().exception( + f"Failed to publish code suggestion, relevant_lines_start is {relevant_lines_start}" + ) + continue + + if relevant_lines_end < relevant_lines_start: + if get_settings().config.verbosity_level >= 2: + get_logger().exception( + f"Failed to publish code suggestion, " + f"relevant_lines_end is {relevant_lines_end} and " + f"relevant_lines_start is {relevant_lines_start}" + ) + continue + + if relevant_lines_end > relevant_lines_start: + post_parameters = { + "body": body, + "path": relevant_file, + "line": relevant_lines_end, + "start_line": relevant_lines_start, + "start_side": "RIGHT", + } + else: # API is different for single line comments + post_parameters = { + "body": body, + "path": relevant_file, + "line": relevant_lines_start, + "side": "RIGHT", + } + post_parameters_list.append(post_parameters) + + try: + self.publish_inline_comments(post_parameters_list) + return True + except Exception as e: + if get_settings().config.verbosity_level >= 2: + get_logger().error(f"Failed to publish code suggestion, error: {e}") + return False + + def publish_file_comments(self, file_comments: list) -> bool: + pass def is_supported(self, capability: str) -> bool: - if capability in ['get_issue_comments', 'create_inline_comment', 'publish_inline_comments', 'get_labels']: + if capability in ['get_issue_comments', 'publish_inline_comments', 'get_labels', 'gfm_markdown', + 'publish_file_comments']: return False return True @@ -36,64 +130,308 @@ def set_pr(self, pr_url: str): self.pr = self._get_pr() def get_files(self): - return [diff.new.path for diff in self.pr.diffstat()] + try: + git_files = context.get("git_files", None) + if git_files: + return git_files + self.git_files = [_gef_filename(diff) for diff in self.pr.diffstat()] + context["git_files"] = self.git_files + return self.git_files + except Exception: + if not self.git_files: + self.git_files = [_gef_filename(diff) for diff in self.pr.diffstat()] + return self.git_files def get_diff_files(self) -> list[FilePatchInfo]: - diffs = self.pr.diffstat() - diff_split = ['diff --git%s' % x for x in self.pr.diff().split('diff --git') if x.strip()] - + if self.diff_files: + return self.diff_files + + diffs_original = list(self.pr.diffstat()) + diffs = filter_ignored(diffs_original, 'bitbucket') + if diffs != diffs_original: + try: + names_original = [d.new.path for d in diffs_original] + names_kept = [d.new.path for d in diffs] + names_filtered = list(set(names_original) - set(names_kept)) + get_logger().info(f"Filtered out [ignore] files for PR", extra={ + 'original_files': 
names_original, + 'names_kept': names_kept, + 'names_filtered': names_filtered + + }) + except Exception as e: + pass + + # get the pr patches + try: + pr_patches = self.pr.diff() + except Exception as e: + # Try different encodings if UTF-8 fails + get_logger().warning(f"Failed to decode PR patch with utf-8, error: {e}") + encodings_to_try = ['iso-8859-1', 'latin-1', 'ascii', 'utf-16'] + pr_patches = None + for encoding in encodings_to_try: + try: + pr_patches = self.pr.diff(encoding=encoding) + get_logger().info(f"Successfully decoded PR patch with encoding {encoding}") + break + except UnicodeDecodeError: + continue + + if pr_patches is None: + raise ValueError(f"Failed to decode PR patch with encodings {encodings_to_try}") + + diff_split = ["diff --git" + x for x in pr_patches.split("diff --git") if x.strip()] + # filter all elements of 'diff_split' that are of indices in 'diffs_original' that are not in 'diffs' + if len(diff_split) > len(diffs) and len(diffs_original) == len(diff_split): + diff_split = [diff_split[i] for i in range(len(diff_split)) if diffs_original[i] in diffs] + if len(diff_split) != len(diffs): + get_logger().error(f"Error - failed to split the diff into {len(diffs)} parts") + return [] + # bitbucket diff has a header for each file, we need to remove it: + # "diff --git filename + # new file mode 100644 (optional) + # index caa56f0..61528d7 100644 + # --- a/pr_agent/cli_pip.py + # +++ b/pr_agent/cli_pip.py + # @@ -... @@" + for i, _ in enumerate(diff_split): + diff_split_lines = diff_split[i].splitlines() + if (len(diff_split_lines) >= 6) and \ + ((diff_split_lines[2].startswith("---") and + diff_split_lines[3].startswith("+++") and + diff_split_lines[4].startswith("@@")) or + (diff_split_lines[3].startswith("---") and # new or deleted file + diff_split_lines[4].startswith("+++") and + diff_split_lines[5].startswith("@@"))): + diff_split[i] = "\n".join(diff_split_lines[4:]) + else: + if diffs[i].data.get('lines_added', 0) == 0 and diffs[i].data.get('lines_removed', 0) == 0: + diff_split[i] = "" + elif len(diff_split_lines) <= 3: + diff_split[i] = "" + get_logger().info(f"Disregarding empty diff for file {_gef_filename(diffs[i])}") + else: + get_logger().warning(f"Bitbucket failed to get diff for file {_gef_filename(diffs[i])}") + diff_split[i] = "" + + invalid_files_names = [] diff_files = [] + counter_valid = 0 + # get full files for index, diff in enumerate(diffs): - original_file_content_str = self._get_pr_file_content(diff.old.get_data('links')) - new_file_content_str = self._get_pr_file_content(diff.new.get_data('links')) - diff_files.append(FilePatchInfo(original_file_content_str, new_file_content_str, - diff_split[index], diff.new.path)) + file_path = _gef_filename(diff) + if not is_valid_file(file_path): + invalid_files_names.append(file_path) + continue + + try: + counter_valid += 1 + if get_settings().get("bitbucket_app.avoid_full_files", False): + original_file_content_str = "" + new_file_content_str = "" + elif counter_valid < MAX_FILES_ALLOWED_FULL // 2: # factor 2 because bitbucket has limited API calls + if diff.old.get_data("links"): + original_file_content_str = self._get_pr_file_content( + diff.old.get_data("links")['self']['href']) + else: + original_file_content_str = "" + if diff.new.get_data("links"): + new_file_content_str = self._get_pr_file_content(diff.new.get_data("links")['self']['href']) + else: + new_file_content_str = "" + else: + if counter_valid == MAX_FILES_ALLOWED_FULL // 2: + get_logger().info( + f"Bitbucket too many files in PR, 
will avoid loading full content for rest of files") + original_file_content_str = "" + new_file_content_str = "" + except Exception as e: + get_logger().exception(f"Error - bitbucket failed to get file content, error: {e}") + original_file_content_str = "" + new_file_content_str = "" + + file_patch_canonic_structure = FilePatchInfo( + original_file_content_str, + new_file_content_str, + diff_split[index], + file_path, + ) + + if diff.data['status'] == 'added': + file_patch_canonic_structure.edit_type = EDIT_TYPE.ADDED + elif diff.data['status'] == 'removed': + file_patch_canonic_structure.edit_type = EDIT_TYPE.DELETED + elif diff.data['status'] == 'modified': + file_patch_canonic_structure.edit_type = EDIT_TYPE.MODIFIED + elif diff.data['status'] == 'renamed': + file_patch_canonic_structure.edit_type = EDIT_TYPE.RENAMED + diff_files.append(file_patch_canonic_structure) + + if invalid_files_names: + get_logger().info(f"Disregarding files with invalid extensions:\n{invalid_files_names}") + + self.diff_files = diff_files return diff_files + def get_latest_commit_url(self): + return self.pr.data['source']['commit']['links']['html']['href'] + + def get_comment_url(self, comment): + return comment.data['links']['html']['href'] + + def publish_persistent_comment(self, pr_comment: str, + initial_header: str, + update_header: bool = True, + name='review', + final_update_message=True): + try: + for comment in self.pr.comments(): + body = comment.raw + if initial_header in body: + latest_commit_url = self.get_latest_commit_url() + comment_url = self.get_comment_url(comment) + if update_header: + updated_header = f"{initial_header}\n\n#### ({name.capitalize()} updated until commit {latest_commit_url})\n" + pr_comment_updated = pr_comment.replace(initial_header, updated_header) + else: + pr_comment_updated = pr_comment + get_logger().info(f"Persistent mode - updating comment {comment_url} to latest {name} message") + d = {"content": {"raw": pr_comment_updated}} + response = comment._update_data(comment.put(None, data=d)) + if final_update_message: + self.publish_comment( + f"**[Persistent {name}]({comment_url})** updated to latest commit {latest_commit_url}") + return + except Exception as e: + get_logger().exception(f"Failed to update persistent review, error: {e}") + pass + self.publish_comment(pr_comment) + def publish_comment(self, pr_comment: str, is_temporary: bool = False): + pr_comment = self.limit_output_characters(pr_comment, self.max_comment_length) comment = self.pr.comment(pr_comment) if is_temporary: - self.temp_comments.append(comment['id']) + self.temp_comments.append(comment["id"]) + return comment + + def edit_comment(self, comment, body: str): + try: + body = self.limit_output_characters(body, self.max_comment_length) + comment.update(body) + except Exception as e: + get_logger().exception(f"Failed to update comment, error: {e}") def remove_initial_comment(self): try: for comment in self.temp_comments: - self.pr.delete(f'comments/{comment}') + self.remove_comment(comment) except Exception as e: - logging.exception(f"Failed to remove temp comments, error: {e}") + get_logger().exception(f"Failed to remove temp comments, error: {e}") - def publish_inline_comment(self, body: str, relevant_file: str, relevant_line_in_file: str): - pass + def remove_comment(self, comment): + try: + self.pr.delete(f"comments/{comment}") + except Exception as e: + get_logger().exception(f"Failed to remove comment, error: {e}") + + # function to create_inline_comment + def create_inline_comment(self, body: 
str, relevant_file: str, relevant_line_in_file: str, + absolute_position: int = None): + body = self.limit_output_characters(body, self.max_comment_length) + position, absolute_position = find_line_number_of_relevant_line_in_file(self.get_diff_files(), + relevant_file.strip('`'), + relevant_line_in_file, + absolute_position) + if position == -1: + if get_settings().config.verbosity_level >= 2: + get_logger().info(f"Could not find position for {relevant_file} {relevant_line_in_file}") + subject_type = "FILE" + else: + subject_type = "LINE" + path = relevant_file.strip() + return dict(body=body, path=path, position=absolute_position) if subject_type == "LINE" else {} + + def publish_inline_comment(self, comment: str, from_line: int, file: str, original_suggestion=None): + comment = self.limit_output_characters(comment, self.max_comment_length) + payload = json.dumps({ + "content": { + "raw": comment, + }, + "inline": { + "to": from_line, + "path": file + }, + }) + response = requests.request( + "POST", self.bitbucket_comment_api_url, data=payload, headers=self.headers + ) + return response - def create_inline_comment(self, body: str, relevant_file: str, relevant_line_in_file: str): - raise NotImplementedError("Bitbucket provider does not support creating inline comments yet") + def get_line_link(self, relevant_file: str, relevant_line_start: int, relevant_line_end: int = None) -> str: + if relevant_line_start == -1: + link = f"{self.pr_url}/#L{relevant_file}" + else: + link = f"{self.pr_url}/#L{relevant_file}T{relevant_line_start}" + return link + + def generate_link_to_relevant_line_number(self, suggestion) -> str: + try: + relevant_file = suggestion['relevant_file'].strip('`').strip("'").rstrip() + relevant_line_str = suggestion['relevant_line'].rstrip() + if not relevant_line_str: + return "" + + diff_files = self.get_diff_files() + position, absolute_position = find_line_number_of_relevant_line_in_file \ + (diff_files, relevant_file, relevant_line_str) + + if absolute_position != -1 and self.pr_url: + link = f"{self.pr_url}/#L{relevant_file}T{absolute_position}" + return link + except Exception as e: + if get_settings().config.verbosity_level >= 2: + get_logger().info(f"Failed adding line link, error: {e}") + + return "" def publish_inline_comments(self, comments: list[dict]): - raise NotImplementedError("Bitbucket provider does not support publishing inline comments yet") + for comment in comments: + if 'position' in comment: + self.publish_inline_comment(comment['body'], comment['position'], comment['path']) + elif 'start_line' in comment: # multi-line comment + # note that bitbucket does not seem to support range - only a comment on a single line - https://community.developer.atlassian.com/t/api-post-endpoint-for-inline-pull-request-comments/60452 + self.publish_inline_comment(comment['body'], comment['start_line'], comment['path']) + elif 'line' in comment: # single-line comment + self.publish_inline_comment(comment['body'], comment['line'], comment['path']) + else: + get_logger().error(f"Could not publish inline comment {comment}") def get_title(self): return self.pr.title def get_languages(self): - languages = {self._get_repo().get_data('language'): 0} + languages = {self._get_repo().get_data("language"): 0} return languages def get_pr_branch(self): return self.pr.source_branch - def get_pr_description(self): - max_tokens = get_settings().get("CONFIG.MAX_DESCRIPTION_TOKENS", None) - if max_tokens: - return clip_tokens(self.pr.description, max_tokens) + def get_pr_owner_id(self) -> 
str | None: + return self.workspace_slug + + def get_pr_description_full(self): return self.pr.description def get_user_id(self): return 0 def get_issue_comments(self): - raise NotImplementedError("Bitbucket provider does not support issue comments yet") + raise NotImplementedError( + "Bitbucket provider does not support issue comments yet" + ) - def add_eyes_reaction(self, issue_comment_id: int) -> Optional[int]: + def add_eyes_reaction(self, issue_comment_id: int, disable_eyes: bool = False) -> Optional[int]: return True def remove_reaction(self, issue_comment_id: int, reaction_id: int) -> bool: @@ -102,14 +440,16 @@ def remove_reaction(self, issue_comment_id: int, reaction_id: int) -> bool: @staticmethod def _parse_pr_url(pr_url: str) -> Tuple[str, int]: parsed_url = urlparse(pr_url) - - if 'bitbucket.org' not in parsed_url.netloc: - raise ValueError("The provided URL is not a valid GitHub URL") - path_parts = parsed_url.path.strip('/').split('/') - - if len(path_parts) < 4 or path_parts[2] != 'pull-requests': - raise ValueError("The provided URL does not appear to be a Bitbucket PR URL") + if "bitbucket.org" not in parsed_url.netloc: + raise ValueError("The provided URL is not a valid Bitbucket URL") + + path_parts = parsed_url.path.strip("/").split("/") + + if len(path_parts) < 4 or path_parts[2] != "pull-requests": + raise ValueError( + "The provided URL does not appear to be a Bitbucket PR URL" + ) workspace_slug = path_parts[0] repo_slug = path_parts[1] @@ -122,14 +462,81 @@ def _parse_pr_url(pr_url: str) -> Tuple[str, int]: def _get_repo(self): if self.repo is None: - self.repo = self.bitbucket_client.workspaces.get(self.workspace_slug).repositories.get(self.repo_slug) + self.repo = self.bitbucket_client.workspaces.get( + self.workspace_slug + ).repositories.get(self.repo_slug) return self.repo def _get_pr(self): return self._get_repo().pullrequests.get(self.pr_num) + def get_pr_file_content(self, file_path: str, branch: str) -> str: + try: + if branch == self.pr.source_branch: + branch = self.pr.data["source"]["commit"]["hash"] + elif branch == self.pr.destination_branch: + branch = self.pr.data["destination"]["commit"]["hash"] + url = (f"https://api.bitbucket.org/2.0/repositories/{self.workspace_slug}/{self.repo_slug}/src/" + f"{branch}/{file_path}") + response = requests.request("GET", url, headers=self.headers) + if response.status_code == 404: # not found + return "" + contents = response.text + return contents + except Exception: + return "" + + def create_or_update_pr_file(self, file_path: str, branch: str, contents="", message="") -> None: + url = (f"https://api.bitbucket.org/2.0/repositories/{self.workspace_slug}/{self.repo_slug}/src/") + if not message: + if contents: + message = f"Update {file_path}" + else: + message = f"Create {file_path}" + files = {file_path: contents} + data = { + "message": message, + "branch": branch + } + headers = {'Authorization': self.headers['Authorization']} if 'Authorization' in self.headers else {} + try: + requests.request("POST", url, headers=headers, data=data, files=files) + except Exception: + get_logger().exception(f"Failed to create empty file {file_path} in branch {branch}") + def _get_pr_file_content(self, remote_link: str): - return "" + try: + response = requests.request("GET", remote_link, headers=self.headers) + if response.status_code == 404: # not found + return "" + contents = response.text + return contents + except Exception: + return "" def get_commit_messages(self): return "" # not implemented yet + + # bitbucket does 
not support labels + def publish_description(self, pr_title: str, description: str): + payload = json.dumps({ + "description": description, + "title": pr_title + + }) + + response = requests.request("PUT", self.bitbucket_pull_request_api_url, headers=self.headers, data=payload) + try: + if response.status_code != 200: + get_logger().info(f"Failed to update description, error code: {response.status_code}") + except: + pass + return response + + # bitbucket does not support labels + def publish_labels(self, pr_types: list): + pass + + # bitbucket does not support labels + def get_pr_labels(self, update=False): + pass diff --git a/pr_agent/git_providers/bitbucket_server_provider.py b/pr_agent/git_providers/bitbucket_server_provider.py new file mode 100644 index 000000000..7588075e5 --- /dev/null +++ b/pr_agent/git_providers/bitbucket_server_provider.py @@ -0,0 +1,456 @@ +from distutils.version import LooseVersion +from requests.exceptions import HTTPError +from typing import Optional, Tuple +from urllib.parse import quote_plus, urlparse + +from atlassian.bitbucket import Bitbucket + +from .git_provider import GitProvider +from ..algo.types import EDIT_TYPE, FilePatchInfo +from ..algo.language_handler import is_valid_file +from ..algo.utils import load_large_diff, find_line_number_of_relevant_line_in_file +from ..config_loader import get_settings +from ..log import get_logger + + +class BitbucketServerProvider(GitProvider): + def __init__( + self, pr_url: Optional[str] = None, incremental: Optional[bool] = False, + bitbucket_client: Optional[Bitbucket] = None, + ): + self.bitbucket_server_url = None + self.workspace_slug = None + self.repo_slug = None + self.repo = None + self.pr_num = None + self.pr = None + self.pr_url = pr_url + self.temp_comments = [] + self.incremental = incremental + self.diff_files = None + self.bitbucket_pull_request_api_url = pr_url + + self.bitbucket_server_url = self._parse_bitbucket_server(url=pr_url) + self.bitbucket_client = bitbucket_client or Bitbucket(url=self.bitbucket_server_url, + token=get_settings().get("BITBUCKET_SERVER.BEARER_TOKEN", + None)) + try: + self.bitbucket_api_version = LooseVersion(self.bitbucket_client.get("rest/api/1.0/application-properties").get('version')) + except Exception: + self.bitbucket_api_version = None + + if pr_url: + self.set_pr(pr_url) + + def get_repo_settings(self): + try: + content = self.bitbucket_client.get_content_of_file(self.workspace_slug, self.repo_slug, ".pr_agent.toml", self.get_pr_branch()) + + return content + except Exception as e: + if isinstance(e, HTTPError): + if e.response.status_code == 404: # not found + return "" + + get_logger().error(f"Failed to load .pr_agent.toml file, error: {e}") + return "" + + def get_pr_id(self): + return self.pr_num + + def publish_code_suggestions(self, code_suggestions: list) -> bool: + """ + Publishes code suggestions as comments on the PR. 
+ """ + post_parameters_list = [] + for suggestion in code_suggestions: + body = suggestion["body"] + relevant_file = suggestion["relevant_file"] + relevant_lines_start = suggestion["relevant_lines_start"] + relevant_lines_end = suggestion["relevant_lines_end"] + + if not relevant_lines_start or relevant_lines_start == -1: + if get_settings().config.verbosity_level >= 2: + get_logger().exception( + f"Failed to publish code suggestion, relevant_lines_start is {relevant_lines_start}" + ) + continue + + if relevant_lines_end < relevant_lines_start: + if get_settings().config.verbosity_level >= 2: + get_logger().exception( + f"Failed to publish code suggestion, " + f"relevant_lines_end is {relevant_lines_end} and " + f"relevant_lines_start is {relevant_lines_start}" + ) + continue + + if relevant_lines_end > relevant_lines_start: + # Bitbucket does not support multi-line suggestions so use a code block instead - https://jira.atlassian.com/browse/BSERV-4553 + body = body.replace("```suggestion", "```") + post_parameters = { + "body": body, + "path": relevant_file, + "line": relevant_lines_end, + "start_line": relevant_lines_start, + "start_side": "RIGHT", + } + else: # API is different for single line comments + post_parameters = { + "body": body, + "path": relevant_file, + "line": relevant_lines_start, + "side": "RIGHT", + } + post_parameters_list.append(post_parameters) + + try: + self.publish_inline_comments(post_parameters_list) + return True + except Exception as e: + if get_settings().config.verbosity_level >= 2: + get_logger().error(f"Failed to publish code suggestion, error: {e}") + return False + + def publish_file_comments(self, file_comments: list) -> bool: + pass + + def is_supported(self, capability: str) -> bool: + if capability in ['get_issue_comments', 'get_labels', 'gfm_markdown', 'publish_file_comments']: + return False + return True + + def set_pr(self, pr_url: str): + self.workspace_slug, self.repo_slug, self.pr_num = self._parse_pr_url(pr_url) + self.pr = self._get_pr() + + def get_file(self, path: str, commit_id: str): + file_content = "" + try: + file_content = self.bitbucket_client.get_content_of_file(self.workspace_slug, + self.repo_slug, + path, + commit_id) + except HTTPError as e: + get_logger().debug(f"File {path} not found at commit id: {commit_id}") + return file_content + + def get_files(self): + changes = self.bitbucket_client.get_pull_requests_changes(self.workspace_slug, self.repo_slug, self.pr_num) + diffstat = [change["path"]['toString'] for change in changes] + return diffstat + + #gets the best common ancestor: https://git-scm.com/docs/git-merge-base + @staticmethod + def get_best_common_ancestor(source_commits_list, destination_commits_list, guaranteed_common_ancestor) -> str: + destination_commit_hashes = {commit['id'] for commit in destination_commits_list} | {guaranteed_common_ancestor} + + for commit in source_commits_list: + for parent_commit in commit['parents']: + if parent_commit['id'] in destination_commit_hashes: + return parent_commit['id'] + + return guaranteed_common_ancestor + + def get_diff_files(self) -> list[FilePatchInfo]: + if self.diff_files: + return self.diff_files + + head_sha = self.pr.fromRef['latestCommit'] + + # if Bitbucket api version is >= 8.16 then use the merge-base api for 2-way diff calculation + if self.bitbucket_api_version is not None and self.bitbucket_api_version >= LooseVersion("8.16"): + try: + base_sha = self.bitbucket_client.get(self._get_merge_base())['id'] + except Exception as e: + get_logger().error(f"Failed 
to get the best common ancestor for PR: {self.pr_url}, \nerror: {e}") + raise e + else: + source_commits_list = list(self.bitbucket_client.get_pull_requests_commits( + self.workspace_slug, + self.repo_slug, + self.pr_num + )) + # if Bitbucket api version is None or < 7.0 then do a simple diff with a guaranteed common ancestor + base_sha = source_commits_list[-1]['parents'][0]['id'] + # if Bitbucket api version is 7.0-8.15 then use 2-way diff functionality for the base_sha + if self.bitbucket_api_version is not None and self.bitbucket_api_version >= LooseVersion("7.0"): + try: + destination_commits = list( + self.bitbucket_client.get_commits(self.workspace_slug, self.repo_slug, base_sha, + self.pr.toRef['latestCommit'])) + base_sha = self.get_best_common_ancestor(source_commits_list, destination_commits, base_sha) + except Exception as e: + get_logger().error( + f"Failed to get the commit list for calculating best common ancestor for PR: {self.pr_url}, \nerror: {e}") + raise e + + diff_files = [] + original_file_content_str = "" + new_file_content_str = "" + + changes = self.bitbucket_client.get_pull_requests_changes(self.workspace_slug, self.repo_slug, self.pr_num) + for change in changes: + file_path = change['path']['toString'] + if not is_valid_file(file_path.split("/")[-1]): + get_logger().info(f"Skipping a non-code file: {file_path}") + continue + + match change['type']: + case 'ADD': + edit_type = EDIT_TYPE.ADDED + new_file_content_str = self.get_file(file_path, head_sha) + if isinstance(new_file_content_str, (bytes, bytearray)): + new_file_content_str = new_file_content_str.decode("utf-8") + original_file_content_str = "" + case 'DELETE': + edit_type = EDIT_TYPE.DELETED + new_file_content_str = "" + original_file_content_str = self.get_file(file_path, base_sha) + if isinstance(original_file_content_str, (bytes, bytearray)): + original_file_content_str = original_file_content_str.decode("utf-8") + case 'RENAME': + edit_type = EDIT_TYPE.RENAMED + case _: + edit_type = EDIT_TYPE.MODIFIED + original_file_content_str = self.get_file(file_path, base_sha) + if isinstance(original_file_content_str, (bytes, bytearray)): + original_file_content_str = original_file_content_str.decode("utf-8") + new_file_content_str = self.get_file(file_path, head_sha) + if isinstance(new_file_content_str, (bytes, bytearray)): + new_file_content_str = new_file_content_str.decode("utf-8") + + patch = load_large_diff(file_path, new_file_content_str, original_file_content_str) + + diff_files.append( + FilePatchInfo( + original_file_content_str, + new_file_content_str, + patch, + file_path, + edit_type=edit_type, + ) + ) + + self.diff_files = diff_files + return diff_files + + def publish_comment(self, pr_comment: str, is_temporary: bool = False): + if not is_temporary: + self.bitbucket_client.add_pull_request_comment(self.workspace_slug, self.repo_slug, self.pr_num, pr_comment) + + def remove_initial_comment(self): + try: + for comment in self.temp_comments: + self.remove_comment(comment) + except ValueError as e: + get_logger().exception(f"Failed to remove temp comments, error: {e}") + + def remove_comment(self, comment): + pass + + # function to create_inline_comment + def create_inline_comment(self, body: str, relevant_file: str, relevant_line_in_file: str, + absolute_position: int = None): + + position, absolute_position = find_line_number_of_relevant_line_in_file( + self.get_diff_files(), + relevant_file.strip('`'), + relevant_line_in_file, + absolute_position + ) + if position == -1: + if 
get_settings().config.verbosity_level >= 2: + get_logger().info(f"Could not find position for {relevant_file} {relevant_line_in_file}") + subject_type = "FILE" + else: + subject_type = "LINE" + path = relevant_file.strip() + return dict(body=body, path=path, position=absolute_position) if subject_type == "LINE" else {} + + def publish_inline_comment(self, comment: str, from_line: int, file: str, original_suggestion=None): + payload = { + "text": comment, + "severity": "NORMAL", + "anchor": { + "diffType": "EFFECTIVE", + "path": file, + "lineType": "ADDED", + "line": from_line, + "fileType": "TO" + } + } + + try: + self.bitbucket_client.post(self._get_pr_comments_path(), data=payload) + except Exception as e: + get_logger().error(f"Failed to publish inline comment to '{file}' at line {from_line}, error: {e}") + raise e + + def get_line_link(self, relevant_file: str, relevant_line_start: int, relevant_line_end: int = None) -> str: + if relevant_line_start == -1: + link = f"{self.pr_url}/diff#{quote_plus(relevant_file)}" + else: + link = f"{self.pr_url}/diff#{quote_plus(relevant_file)}?t={relevant_line_start}" + return link + + def generate_link_to_relevant_line_number(self, suggestion) -> str: + try: + relevant_file = suggestion['relevant_file'].strip('`').strip("'").rstrip() + relevant_line_str = suggestion['relevant_line'].rstrip() + if not relevant_line_str: + return "" + + diff_files = self.get_diff_files() + position, absolute_position = find_line_number_of_relevant_line_in_file \ + (diff_files, relevant_file, relevant_line_str) + + if absolute_position != -1: + if self.pr: + link = f"{self.pr_url}/diff#{quote_plus(relevant_file)}?t={absolute_position}" + return link + else: + if get_settings().config.verbosity_level >= 2: + get_logger().info(f"Failed adding line link to '{relevant_file}' since PR not set") + else: + if get_settings().config.verbosity_level >= 2: + get_logger().info(f"Failed adding line link to '{relevant_file}' since position not found") + + if absolute_position != -1 and self.pr_url: + link = f"{self.pr_url}/diff#{quote_plus(relevant_file)}?t={absolute_position}" + return link + except Exception as e: + if get_settings().config.verbosity_level >= 2: + get_logger().info(f"Failed adding line link to '{relevant_file}', error: {e}") + + return "" + + def publish_inline_comments(self, comments: list[dict]): + for comment in comments: + if 'position' in comment: + self.publish_inline_comment(comment['body'], comment['position'], comment['path']) + elif 'start_line' in comment: # multi-line comment + # note that bitbucket does not seem to support range - only a comment on a single line - https://community.developer.atlassian.com/t/api-post-endpoint-for-inline-pull-request-comments/60452 + self.publish_inline_comment(comment['body'], comment['start_line'], comment['path']) + elif 'line' in comment: # single-line comment + self.publish_inline_comment(comment['body'], comment['line'], comment['path']) + else: + get_logger().error(f"Could not publish inline comment: {comment}") + + def get_title(self): + return self.pr.title + + def get_languages(self): + return {"yaml": 0} # devops LOL + + def get_pr_branch(self): + return self.pr.fromRef['displayId'] + + def get_pr_owner_id(self) -> str | None: + return self.workspace_slug + + def get_pr_description_full(self): + if hasattr(self.pr, "description"): + return self.pr.description + else: + return None + + def get_user_id(self): + return 0 + + def get_issue_comments(self): + raise NotImplementedError( + "Bitbucket provider does 
not support issue comments yet" + ) + + def add_eyes_reaction(self, issue_comment_id: int, disable_eyes: bool = False) -> Optional[int]: + return True + + def remove_reaction(self, issue_comment_id: int, reaction_id: int) -> bool: + return True + + @staticmethod + def _parse_bitbucket_server(url: str) -> str: + # pr url format: f"{bitbucket_server}/projects/{project_name}/repos/{repository_name}/pull-requests/{pr_id}" + parsed_url = urlparse(url) + server_path = parsed_url.path.split("/projects/") + if len(server_path) > 1: + server_path = server_path[0].strip("/") + return f"{parsed_url.scheme}://{parsed_url.netloc}/{server_path}".strip("/") + return f"{parsed_url.scheme}://{parsed_url.netloc}" + + @staticmethod + def _parse_pr_url(pr_url: str) -> Tuple[str, str, int]: + # pr url format: f"{bitbucket_server}/projects/{project_name}/repos/{repository_name}/pull-requests/{pr_id}" + parsed_url = urlparse(pr_url) + + path_parts = parsed_url.path.strip("/").split("/") + + try: + projects_index = path_parts.index("projects") + except ValueError as e: + raise ValueError(f"The provided URL '{pr_url}' does not appear to be a Bitbucket PR URL") + + path_parts = path_parts[projects_index:] + + if len(path_parts) < 6 or path_parts[2] != "repos" or path_parts[4] != "pull-requests": + raise ValueError( + f"The provided URL '{pr_url}' does not appear to be a Bitbucket PR URL" + ) + + workspace_slug = path_parts[1] + repo_slug = path_parts[3] + try: + pr_number = int(path_parts[5]) + except ValueError as e: + raise ValueError(f"Unable to convert PR number '{path_parts[5]}' to integer") from e + + return workspace_slug, repo_slug, pr_number + + def _get_repo(self): + if self.repo is None: + self.repo = self.bitbucket_client.get_repo(self.workspace_slug, self.repo_slug) + return self.repo + + def _get_pr(self): + try: + pr = self.bitbucket_client.get_pull_request(self.workspace_slug, self.repo_slug, + pull_request_id=self.pr_num) + return type('new_dict', (object,), pr) + except Exception as e: + get_logger().error(f"Failed to get pull request, error: {e}") + raise e + + def _get_pr_file_content(self, remote_link: str): + return "" + + def get_commit_messages(self): + return "" + + # bitbucket does not support labels + def publish_description(self, pr_title: str, description: str): + payload = { + "version": self.pr.version, + "description": description, + "title": pr_title, + "reviewers": self.pr.reviewers # needs to be sent otherwise gets wiped + } + try: + self.bitbucket_client.update_pull_request(self.workspace_slug, self.repo_slug, str(self.pr_num), payload) + except Exception as e: + get_logger().error(f"Failed to update pull request, error: {e}") + raise e + + # bitbucket does not support labels + def publish_labels(self, pr_types: list): + pass + + # bitbucket does not support labels + def get_pr_labels(self, update=False): + pass + + def _get_pr_comments_path(self): + return f"rest/api/latest/projects/{self.workspace_slug}/repos/{self.repo_slug}/pull-requests/{self.pr_num}/comments" + + def _get_merge_base(self): + return f"rest/api/latest/projects/{self.workspace_slug}/repos/{self.repo_slug}/pull-requests/{self.pr_num}/merge-base" diff --git a/pr_agent/git_providers/codecommit_client.py b/pr_agent/git_providers/codecommit_client.py new file mode 100644 index 000000000..5f18c90da --- /dev/null +++ b/pr_agent/git_providers/codecommit_client.py @@ -0,0 +1,277 @@ +import boto3 +import botocore + + +class CodeCommitDifferencesResponse: + """ + CodeCommitDifferencesResponse is the response object 
returned from our get_differences() function. + It maps the JSON response to member variables of this class. + """ + + def __init__(self, json: dict): + before_blob = json.get("beforeBlob", {}) + after_blob = json.get("afterBlob", {}) + + self.before_blob_id = before_blob.get("blobId", "") + self.before_blob_path = before_blob.get("path", "") + self.after_blob_id = after_blob.get("blobId", "") + self.after_blob_path = after_blob.get("path", "") + self.change_type = json.get("changeType", "") + + +class CodeCommitPullRequestResponse: + """ + CodeCommitPullRequestResponse is the response object returned from our get_pr() function. + It maps the JSON response to member variables of this class. + """ + + def __init__(self, json: dict): + self.title = json.get("title", "") + self.description = json.get("description", "") + + self.targets = [] + for target in json.get("pullRequestTargets", []): + self.targets.append(CodeCommitPullRequestResponse.CodeCommitPullRequestTarget(target)) + + class CodeCommitPullRequestTarget: + """ + CodeCommitPullRequestTarget is a subclass of CodeCommitPullRequestResponse that + holds details about an individual target commit. + """ + + def __init__(self, json: dict): + self.source_commit = json.get("sourceCommit", "") + self.source_branch = json.get("sourceReference", "") + self.destination_commit = json.get("destinationCommit", "") + self.destination_branch = json.get("destinationReference", "") + + +class CodeCommitClient: + """ + CodeCommitClient is a wrapper around the AWS boto3 SDK for the CodeCommit client + """ + + def __init__(self): + self.boto_client = None + + def is_supported(self, capability: str) -> bool: + if capability in ["gfm_markdown"]: + return False + return True + + def _connect_boto_client(self): + try: + self.boto_client = boto3.client("codecommit") + except Exception as e: + raise ValueError(f"Failed to connect to AWS CodeCommit: {e}") from e + + def get_differences(self, repo_name: int, destination_commit: str, source_commit: str): + """ + Get the differences between two commits in CodeCommit. + + Args: + - repo_name: Name of the repository + - destination_commit: Commit hash you want to merge into (the "before" hash) (usually on the main or master branch) + - source_commit: Commit hash of the code you are adding (the "after" branch) + + Returns: + - List of CodeCommitDifferencesResponse objects + + Boto3 Documentation: + - aws codecommit get-differences + - https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/codecommit/client/get_differences.html + """ + if self.boto_client is None: + self._connect_boto_client() + + # The differences response from AWS is paginated, so we need to iterate through the pages to get all the differences. 
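As a small, hedged illustration of the JSON-to-attribute mapping described above, here is a made-up `get_differences` entry run through the wrapper (paths and blob ids are invented):

```python
from pr_agent.git_providers.codecommit_client import CodeCommitDifferencesResponse

# Invented sample shaped like one item from a CodeCommit get_differences() page.
sample = {
    "beforeBlob": {"blobId": "1111aaaa", "path": "src/old_name.py"},
    "afterBlob": {"blobId": "2222bbbb", "path": "src/new_name.py"},
    "changeType": "R",  # renamed
}
entry = CodeCommitDifferencesResponse(sample)
assert entry.change_type == "R"
assert entry.before_blob_path == "src/old_name.py"
assert entry.after_blob_path == "src/new_name.py"
```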
+ differences = [] + try: + paginator = self.boto_client.get_paginator("get_differences") + for page in paginator.paginate( + repositoryName=repo_name, + beforeCommitSpecifier=destination_commit, + afterCommitSpecifier=source_commit, + ): + differences.extend(page.get("differences", [])) + except botocore.exceptions.ClientError as e: + if e.response["Error"]["Code"] == 'RepositoryDoesNotExistException': + raise ValueError(f"CodeCommit cannot retrieve differences: Repository does not exist: {repo_name}") from e + raise ValueError(f"CodeCommit cannot retrieve differences for {source_commit}..{destination_commit}") from e + except Exception as e: + raise ValueError(f"CodeCommit cannot retrieve differences for {source_commit}..{destination_commit}") from e + + output = [] + for json in differences: + output.append(CodeCommitDifferencesResponse(json)) + return output + + def get_file(self, repo_name: str, file_path: str, sha_hash: str, optional: bool = False): + """ + Retrieve a file from CodeCommit. + + Args: + - repo_name: Name of the repository + - file_path: Path to the file you are retrieving + - sha_hash: Commit hash of the file you are retrieving + + Returns: + - File contents + + Boto3 Documentation: + - aws codecommit get_file + - https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/codecommit/client/get_file.html + """ + if not file_path: + return "" + + if self.boto_client is None: + self._connect_boto_client() + + try: + response = self.boto_client.get_file(repositoryName=repo_name, commitSpecifier=sha_hash, filePath=file_path) + except botocore.exceptions.ClientError as e: + if e.response["Error"]["Code"] == 'RepositoryDoesNotExistException': + raise ValueError(f"CodeCommit cannot retrieve PR: Repository does not exist: {repo_name}") from e + # if the file does not exist, but is flagged as optional, then return an empty string + if optional and e.response["Error"]["Code"] == 'FileDoesNotExistException': + return "" + raise ValueError(f"CodeCommit cannot retrieve file '{file_path}' from repository '{repo_name}'") from e + except Exception as e: + raise ValueError(f"CodeCommit cannot retrieve file '{file_path}' from repository '{repo_name}'") from e + if "fileContent" not in response: + raise ValueError(f"File content is empty for file: {file_path}") + + return response.get("fileContent", "") + + def get_pr(self, repo_name: str, pr_number: int): + """ + Get a information about a CodeCommit PR. 
+ + Args: + - repo_name: Name of the repository + - pr_number: The PR number you are requesting + + Returns: + - CodeCommitPullRequestResponse object + + Boto3 Documentation: + - aws codecommit get_pull_request + - https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/codecommit/client/get_pull_request.html + """ + if self.boto_client is None: + self._connect_boto_client() + + try: + response = self.boto_client.get_pull_request(pullRequestId=str(pr_number)) + except botocore.exceptions.ClientError as e: + if e.response["Error"]["Code"] == 'PullRequestDoesNotExistException': + raise ValueError(f"CodeCommit cannot retrieve PR: PR number does not exist: {pr_number}") from e + if e.response["Error"]["Code"] == 'RepositoryDoesNotExistException': + raise ValueError(f"CodeCommit cannot retrieve PR: Repository does not exist: {repo_name}") from e + raise ValueError(f"CodeCommit cannot retrieve PR: {pr_number}: boto client error") from e + except Exception as e: + raise ValueError(f"CodeCommit cannot retrieve PR: {pr_number}") from e + + if "pullRequest" not in response: + raise ValueError("CodeCommit PR number not found: {pr_number}") + + return CodeCommitPullRequestResponse(response.get("pullRequest", {})) + + def publish_description(self, pr_number: int, pr_title: str, pr_body: str): + """ + Set the title and description on a pull request + + Args: + - pr_number: the AWS CodeCommit pull request number + - pr_title: title of the pull request + - pr_body: body of the pull request + + Returns: + - None + + Boto3 Documentation: + - aws codecommit update_pull_request_title + - https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/codecommit/client/update_pull_request_title.html + - aws codecommit update_pull_request_description + - https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/codecommit/client/update_pull_request_description.html + """ + if self.boto_client is None: + self._connect_boto_client() + + try: + self.boto_client.update_pull_request_title(pullRequestId=str(pr_number), title=pr_title) + self.boto_client.update_pull_request_description(pullRequestId=str(pr_number), description=pr_body) + except botocore.exceptions.ClientError as e: + if e.response["Error"]["Code"] == 'PullRequestDoesNotExistException': + raise ValueError(f"PR number does not exist: {pr_number}") from e + if e.response["Error"]["Code"] == 'InvalidTitleException': + raise ValueError(f"Invalid title for PR number: {pr_number}") from e + if e.response["Error"]["Code"] == 'InvalidDescriptionException': + raise ValueError(f"Invalid description for PR number: {pr_number}") from e + if e.response["Error"]["Code"] == 'PullRequestAlreadyClosedException': + raise ValueError(f"PR is already closed: PR number: {pr_number}") from e + raise ValueError(f"Boto3 client error calling publish_description") from e + except Exception as e: + raise ValueError(f"Error calling publish_description") from e + + def publish_comment(self, repo_name: str, pr_number: int, destination_commit: str, source_commit: str, comment: str, annotation_file: str = None, annotation_line: int = None): + """ + Publish a comment to a pull request + + Args: + - repo_name: name of the repository + - pr_number: number of the pull request + - destination_commit: The commit hash you want to merge into (the "before" hash) (usually on the main or master branch) + - source_commit: The commit hash of the code you are adding (the "after" branch) + - comment: The comment you want to publish + - annotation_file: 
The file you want to annotate (optional) + - annotation_line: The line number you want to annotate (optional) + + Comment annotations for CodeCommit are different than GitHub. + CodeCommit only designates the starting line number for the comment. + It does not support the ending line number to highlight a range of lines. + + Returns: + - None + + Boto3 Documentation: + - aws codecommit post_comment_for_pull_request + - https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/codecommit/client/post_comment_for_pull_request.html + """ + if self.boto_client is None: + self._connect_boto_client() + + try: + # If the comment has code annotations, + # then set the file path and line number in the location dictionary + if annotation_file and annotation_line: + self.boto_client.post_comment_for_pull_request( + pullRequestId=str(pr_number), + repositoryName=repo_name, + beforeCommitId=destination_commit, + afterCommitId=source_commit, + content=comment, + location={ + "filePath": annotation_file, + "filePosition": annotation_line, + "relativeFileVersion": "AFTER", + }, + ) + else: + # The comment does not have code annotations + self.boto_client.post_comment_for_pull_request( + pullRequestId=str(pr_number), + repositoryName=repo_name, + beforeCommitId=destination_commit, + afterCommitId=source_commit, + content=comment, + ) + except botocore.exceptions.ClientError as e: + if e.response["Error"]["Code"] == 'RepositoryDoesNotExistException': + raise ValueError(f"Repository does not exist: {repo_name}") from e + if e.response["Error"]["Code"] == 'PullRequestDoesNotExistException': + raise ValueError(f"PR number does not exist: {pr_number}") from e + raise ValueError(f"Boto3 client error calling post_comment_for_pull_request") from e + except Exception as e: + raise ValueError(f"Error calling post_comment_for_pull_request") from e diff --git a/pr_agent/git_providers/codecommit_provider.py b/pr_agent/git_providers/codecommit_provider.py new file mode 100644 index 000000000..89a0254df --- /dev/null +++ b/pr_agent/git_providers/codecommit_provider.py @@ -0,0 +1,495 @@ +import os +import re +from collections import Counter +from typing import List, Optional, Tuple +from urllib.parse import urlparse + +from pr_agent.git_providers.codecommit_client import CodeCommitClient +from pr_agent.algo.types import EDIT_TYPE, FilePatchInfo +from ..algo.utils import load_large_diff +from .git_provider import GitProvider +from ..config_loader import get_settings +from ..log import get_logger +from pr_agent.algo.language_handler import is_valid_file + +class PullRequestCCMimic: + """ + This class mimics the PullRequest class from the PyGithub library for the CodeCommitProvider. + """ + + def __init__(self, title: str, diff_files: List[FilePatchInfo]): + self.title = title + self.diff_files = diff_files + self.description = None + self.source_commit = None + self.source_branch = None # the branch containing your new code changes + self.destination_commit = None + self.destination_branch = None # the branch you are going to merge into + + +class CodeCommitFile: + """ + This class represents a file in a pull request in CodeCommit. 
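A brief, hedged example of the class defined just below: for a deleted file there is no "after" path, so `filename` falls back to the "before" path (paths and blob ids are invented):

```python
from pr_agent.algo.types import EDIT_TYPE
from pr_agent.git_providers.codecommit_provider import CodeCommitFile

# Invented values for a deleted file: b_path is empty, so filename uses a_path.
deleted = CodeCommitFile(
    a_path="src/legacy.py",
    a_blob_id="1111aaaa",
    b_path="",
    b_blob_id="",
    edit_type=EDIT_TYPE.DELETED,
)
assert deleted.filename == "src/legacy.py"
```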
+ """ + + def __init__( + self, + a_path: str, + a_blob_id: str, + b_path: str, + b_blob_id: str, + edit_type: EDIT_TYPE, + ): + self.a_path = a_path + self.a_blob_id = a_blob_id + self.b_path = b_path + self.b_blob_id = b_blob_id + self.edit_type: EDIT_TYPE = edit_type + self.filename = b_path if b_path else a_path + + +class CodeCommitProvider(GitProvider): + """ + This class implements the GitProvider interface for AWS CodeCommit repositories. + """ + + def __init__(self, pr_url: Optional[str] = None, incremental: Optional[bool] = False): + self.codecommit_client = CodeCommitClient() + self.aws_client = None + self.repo_name = None + self.pr_num = None + self.pr = None + self.diff_files = None + self.git_files = None + self.pr_url = pr_url + if pr_url: + self.set_pr(pr_url) + + def provider_name(self): + return "CodeCommit" + + def is_supported(self, capability: str) -> bool: + if capability in [ + "get_issue_comments", + "create_inline_comment", + "publish_inline_comments", + "get_labels", + "gfm_markdown" + ]: + return False + return True + + def set_pr(self, pr_url: str): + self.repo_name, self.pr_num = self._parse_pr_url(pr_url) + self.pr = self._get_pr() + + def get_files(self) -> list[CodeCommitFile]: + # bring files from CodeCommit only once + if self.git_files: + return self.git_files + + self.git_files = [] + differences = self.codecommit_client.get_differences(self.repo_name, self.pr.destination_commit, self.pr.source_commit) + for item in differences: + self.git_files.append(CodeCommitFile(item.before_blob_path, + item.before_blob_id, + item.after_blob_path, + item.after_blob_id, + CodeCommitProvider._get_edit_type(item.change_type))) + return self.git_files + + def get_diff_files(self) -> list[FilePatchInfo]: + """ + Retrieves the list of files that have been modified, added, deleted, or renamed in a pull request in CodeCommit, + along with their content and patch information. + + Returns: + diff_files (List[FilePatchInfo]): List of FilePatchInfo objects representing the modified, added, deleted, + or renamed files in the merge request. 
+ """ + # bring files from CodeCommit only once + if self.diff_files: + return self.diff_files + + self.diff_files = [] + + files = self.get_files() + for diff_item in files: + patch_filename = "" + if diff_item.a_blob_id is not None: + patch_filename = diff_item.a_path + original_file_content_str = self.codecommit_client.get_file( + self.repo_name, diff_item.a_path, self.pr.destination_commit) + if isinstance(original_file_content_str, (bytes, bytearray)): + original_file_content_str = original_file_content_str.decode("utf-8") + else: + original_file_content_str = "" + + if diff_item.b_blob_id is not None: + patch_filename = diff_item.b_path + new_file_content_str = self.codecommit_client.get_file(self.repo_name, diff_item.b_path, self.pr.source_commit) + if isinstance(new_file_content_str, (bytes, bytearray)): + new_file_content_str = new_file_content_str.decode("utf-8") + else: + new_file_content_str = "" + + patch = load_large_diff(patch_filename, new_file_content_str, original_file_content_str) + + # Store the diffs as a list of FilePatchInfo objects + info = FilePatchInfo( + original_file_content_str, + new_file_content_str, + patch, + diff_item.b_path, + edit_type=diff_item.edit_type, + old_filename=None + if diff_item.a_path == diff_item.b_path + else diff_item.a_path, + ) + # Only add valid files to the diff list + # "bad extensions" are set in the language_extensions.toml file + # a "valid file" is one that is not in the "bad extensions" list + if is_valid_file(info.filename): + self.diff_files.append(info) + + return self.diff_files + + def publish_description(self, pr_title: str, pr_body: str): + try: + self.codecommit_client.publish_description( + pr_number=self.pr_num, + pr_title=pr_title, + pr_body=CodeCommitProvider._add_additional_newlines(pr_body), + ) + except Exception as e: + raise ValueError(f"CodeCommit Cannot publish description for PR: {self.pr_num}") from e + + def publish_comment(self, pr_comment: str, is_temporary: bool = False): + if is_temporary: + get_logger().info(pr_comment) + return + + pr_comment = CodeCommitProvider._remove_markdown_html(pr_comment) + pr_comment = CodeCommitProvider._add_additional_newlines(pr_comment) + + try: + self.codecommit_client.publish_comment( + repo_name=self.repo_name, + pr_number=self.pr_num, + destination_commit=self.pr.destination_commit, + source_commit=self.pr.source_commit, + comment=pr_comment, + ) + except Exception as e: + raise ValueError(f"CodeCommit Cannot publish comment for PR: {self.pr_num}") from e + + def publish_code_suggestions(self, code_suggestions: list) -> bool: + counter = 1 + for suggestion in code_suggestions: + # Verify that each suggestion has the required keys + if not all(key in suggestion for key in ["body", "relevant_file", "relevant_lines_start"]): + get_logger().warning(f"Skipping code suggestion #{counter}: Each suggestion must have 'body', 'relevant_file', 'relevant_lines_start' keys") + continue + + # Publish the code suggestion to CodeCommit + try: + get_logger().debug(f"Code Suggestion #{counter} in file: {suggestion['relevant_file']}: {suggestion['relevant_lines_start']}") + self.codecommit_client.publish_comment( + repo_name=self.repo_name, + pr_number=self.pr_num, + destination_commit=self.pr.destination_commit, + source_commit=self.pr.source_commit, + comment=suggestion["body"], + annotation_file=suggestion["relevant_file"], + annotation_line=suggestion["relevant_lines_start"], + ) + except Exception as e: + raise ValueError(f"CodeCommit Cannot publish code suggestions for PR: 
{self.pr_num}") from e + + counter += 1 + + # The calling function passes in a list of code suggestions, and this function publishes each suggestion one at a time. + # If we were to return False here, the calling function will attempt to publish the same list of code suggestions again, one at a time. + # Since this function publishes the suggestions one at a time anyway, we always return True here to avoid the retry. + return True + + def publish_labels(self, labels): + return [""] # not implemented yet + + def get_pr_labels(self, update=False): + return [""] # not implemented yet + + def remove_initial_comment(self): + return "" # not implemented yet + + def remove_comment(self, comment): + return "" # not implemented yet + + def publish_inline_comment(self, body: str, relevant_file: str, relevant_line_in_file: str, original_suggestion=None): + # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/codecommit/client/post_comment_for_compared_commit.html + raise NotImplementedError("CodeCommit provider does not support publishing inline comments yet") + + def publish_inline_comments(self, comments: list[dict]): + raise NotImplementedError("CodeCommit provider does not support publishing inline comments yet") + + def get_title(self): + return self.pr.title + + def get_pr_id(self): + """ + Returns the PR ID in the format: "repo_name/pr_number". + Note: This is an internal identifier for PR-Agent, + and is not the same as the CodeCommit PR identifier. + """ + try: + pr_id = f"{self.repo_name}/{self.pr_num}" + return pr_id + except: + return "" + + def get_languages(self): + """ + Returns a dictionary of languages, containing the percentage of each language used in the PR. + + Returns: + - dict: A dictionary where each key is a language name and the corresponding value is the percentage of that language in the PR. + """ + commit_files = self.get_files() + filenames = [ item.filename for item in commit_files ] + extensions = CodeCommitProvider._get_file_extensions(filenames) + + # Calculate the percentage of each file extension in the PR + percentages = CodeCommitProvider._get_language_percentages(extensions) + + # The global language_extension_map is a dictionary of languages, + # where each dictionary item is a BoxList of extensions. + # We want a dictionary of extensions, + # where each dictionary item is a language name. + # We build that language->extension dictionary here in main_extensions_flat. 
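The inversion described in the comment above can be illustrated with a toy map; the real one comes from the `language_extension_map` settings:

```python
# Toy language -> extensions map, inverted into extension -> language,
# mirroring what the code below does with the real settings object.
language_extension_map = {"python": [".py"], "typescript": [".ts", ".tsx"]}
main_extensions_flat = {}
for language, extensions in language_extension_map.items():
    for ext in extensions:
        main_extensions_flat[ext] = language
assert main_extensions_flat == {".py": "python", ".ts": "typescript", ".tsx": "typescript"}
```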
+ main_extensions_flat = {} + language_extension_map_org = get_settings().language_extension_map_org + language_extension_map = {k.lower(): v for k, v in language_extension_map_org.items()} + for language, extensions in language_extension_map.items(): + for ext in extensions: + main_extensions_flat[ext] = language + + # Map the file extension/languages to percentages + languages = {} + for ext, pct in percentages.items(): + languages[main_extensions_flat.get(ext, "")] = pct + + return languages + + def get_pr_branch(self): + return self.pr.source_branch + + def get_pr_description_full(self) -> str: + return self.pr.description + + def get_user_id(self): + return -1 # not implemented yet + + def get_issue_comments(self): + raise NotImplementedError("CodeCommit provider does not support issue comments yet") + + def get_repo_settings(self): + # a local ".pr_agent.toml" settings file is optional + settings_filename = ".pr_agent.toml" + return self.codecommit_client.get_file(self.repo_name, settings_filename, self.pr.source_commit, optional=True) + + def add_eyes_reaction(self, issue_comment_id: int, disable_eyes: bool = False) -> Optional[int]: + get_logger().info("CodeCommit provider does not support eyes reaction yet") + return True + + def remove_reaction(self, issue_comment_id: int, reaction_id: int) -> bool: + get_logger().info("CodeCommit provider does not support removing reactions yet") + return True + + @staticmethod + def _parse_pr_url(pr_url: str) -> Tuple[str, int]: + """ + Parse the CodeCommit PR URL and return the repository name and PR number. + + Args: + - pr_url: the full AWS CodeCommit pull request URL + + Returns: + - Tuple[str, int]: A tuple containing the repository name and PR number. + """ + # Example PR URL: + # https://us-east-1.console.aws.amazon.com/codesuite/codecommit/repositories/__MY_REPO__/pull-requests/123456" + parsed_url = urlparse(pr_url) + + if not CodeCommitProvider._is_valid_codecommit_hostname(parsed_url.netloc): + raise ValueError(f"The provided URL is not a valid CodeCommit URL: {pr_url}") + + path_parts = parsed_url.path.strip("/").split("/") + + if ( + len(path_parts) < 6 + or path_parts[0] != "codesuite" + or path_parts[1] != "codecommit" + or path_parts[2] != "repositories" + or path_parts[4] != "pull-requests" + ): + raise ValueError(f"The provided URL does not appear to be a CodeCommit PR URL: {pr_url}") + + repo_name = path_parts[3] + + try: + pr_number = int(path_parts[5]) + except ValueError as e: + raise ValueError(f"Unable to convert PR number to integer: '{path_parts[5]}'") from e + + return repo_name, pr_number + + @staticmethod + def _is_valid_codecommit_hostname(hostname: str) -> bool: + """ + Check if the provided hostname is a valid AWS CodeCommit hostname. + + This is not an exhaustive check of AWS region names, + but instead uses a regex to check for matching AWS region patterns. + + Args: + - hostname: the hostname to check + + Returns: + - bool: True if the hostname is valid, False otherwise. 
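A few illustrative hostnames run against the region pattern used just below; as the docstring notes, this is a pattern check rather than an exhaustive region list:

```python
import re

# Same pattern as _is_valid_codecommit_hostname(); the hostnames are illustrative.
pattern = r"^[a-z]{2}-(gov-)?[a-z]+-\d\.console\.aws\.amazon\.com$"
assert re.match(pattern, "us-east-1.console.aws.amazon.com") is not None
assert re.match(pattern, "us-gov-west-1.console.aws.amazon.com") is not None
assert re.match(pattern, "codecommit.example.com") is None
```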
+ """ + return re.match(r"^[a-z]{2}-(gov-)?[a-z]+-\d\.console\.aws\.amazon\.com$", hostname) is not None + + def _get_pr(self): + response = self.codecommit_client.get_pr(self.repo_name, self.pr_num) + + if len(response.targets) == 0: + raise ValueError(f"No files found in CodeCommit PR: {self.pr_num}") + + # TODO: implement support for multiple targets in one CodeCommit PR + # for now, we are only using the first target in the PR + if len(response.targets) > 1: + get_logger().warning( + "Multiple targets in one PR is not supported for CodeCommit yet. Continuing, using the first target only..." + ) + + # Return our object that mimics PullRequest class from the PyGithub library + # (This strategy was copied from the LocalGitProvider) + mimic = PullRequestCCMimic(response.title, self.diff_files) + mimic.description = response.description + mimic.source_commit = response.targets[0].source_commit + mimic.source_branch = response.targets[0].source_branch + mimic.destination_commit = response.targets[0].destination_commit + mimic.destination_branch = response.targets[0].destination_branch + + return mimic + + def get_commit_messages(self): + return "" # not implemented yet + + @staticmethod + def _add_additional_newlines(body: str) -> str: + """ + Replace single newlines in a PR body with double newlines. + + CodeCommit Markdown does not seem to render as well as GitHub Markdown, + so we add additional newlines to the PR body to make it more readable in CodeCommit. + + Args: + - body: the PR body + + Returns: + - str: the PR body with the double newlines added + """ + return re.sub(r'(? str: + """ + Remove the HTML tags from a PR comment. + + CodeCommit Markdown does not seem to render as well as GitHub Markdown, + so we remove the HTML tags from the PR comment to make it more readable in CodeCommit. + + Args: + - comment: the PR comment + + Returns: + - str: the PR comment with the HTML tags removed + """ + comment = comment.replace("
    ", "") + comment = comment.replace("
    ", "") + comment = comment.replace("", "") + comment = comment.replace("", "") + return comment + + @staticmethod + def _get_edit_type(codecommit_change_type: str): + """ + Convert the CodeCommit change type string to the EDIT_TYPE enum. + The CodeCommit change type string is returned from the get_differences SDK method. + + Args: + - codecommit_change_type: the CodeCommit change type string + + Returns: + - An EDIT_TYPE enum representing the modified, added, deleted, or renamed file in the PR diff. + """ + t = codecommit_change_type.upper() + edit_type = None + if t == "A": + edit_type = EDIT_TYPE.ADDED + elif t == "D": + edit_type = EDIT_TYPE.DELETED + elif t == "M": + edit_type = EDIT_TYPE.MODIFIED + elif t == "R": + edit_type = EDIT_TYPE.RENAMED + return edit_type + + @staticmethod + def _get_file_extensions(filenames): + """ + Return a list of file extensions from a list of filenames. + The returned extensions will include the dot "." prefix, + to accommodate for the dots in the existing language_extension_map settings. + Filenames with no extension will return an empty string for the extension. + + Args: + - filenames: a list of filenames + + Returns: + - list: A list of file extensions, including the dot "." prefix. + """ + extensions = [] + for filename in filenames: + filename, ext = os.path.splitext(filename) + if ext: + extensions.append(ext.lower()) + else: + extensions.append("") + return extensions + + @staticmethod + def _get_language_percentages(extensions): + """ + Return a dictionary containing the programming language name (as the key), + and the percentage that language is used (as the value), + given a list of file extensions. + + Args: + - extensions: a list of file extensions + + Returns: + - dict: A dictionary where each key is a language name and the corresponding value is the percentage of that language in the PR. 
+ """ + total_files = len(extensions) + if total_files == 0: + return {} + + # Identify language by file extension and count + lang_count = Counter(extensions) + # Convert counts to percentages + lang_percentage = { + lang: round(count / total_files * 100) for lang, count in lang_count.items() + } + return lang_percentage diff --git a/pr_agent/git_providers/gerrit_provider.py b/pr_agent/git_providers/gerrit_provider.py new file mode 100644 index 000000000..8ec1be135 --- /dev/null +++ b/pr_agent/git_providers/gerrit_provider.py @@ -0,0 +1,399 @@ +import json +import os +import pathlib +import shutil +import subprocess +import uuid +from collections import Counter, namedtuple +from pathlib import Path +from tempfile import NamedTemporaryFile, mkdtemp + +import requests +import urllib3.util +from git import Repo + +from pr_agent.config_loader import get_settings +from pr_agent.git_providers.git_provider import GitProvider +from pr_agent.algo.types import EDIT_TYPE, FilePatchInfo +from pr_agent.git_providers.local_git_provider import PullRequestMimic +from pr_agent.log import get_logger + + +def _call(*command, **kwargs) -> (int, str, str): + res = subprocess.run( + command, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + check=True, + **kwargs, + ) + return res.stdout.decode() + + +def clone(url, directory): + get_logger().info("Cloning %s to %s", url, directory) + stdout = _call('git', 'clone', "--depth", "1", url, directory) + get_logger().info(stdout) + + +def fetch(url, refspec, cwd): + get_logger().info("Fetching %s %s", url, refspec) + stdout = _call( + 'git', 'fetch', '--depth', '2', url, refspec, + cwd=cwd + ) + get_logger().info(stdout) + + +def checkout(cwd): + get_logger().info("Checking out") + stdout = _call('git', 'checkout', "FETCH_HEAD", cwd=cwd) + get_logger().info(stdout) + + +def show(*args, cwd=None): + get_logger().info("Show") + return _call('git', 'show', *args, cwd=cwd) + + +def diff(*args, cwd=None): + get_logger().info("Diff") + patch = _call('git', 'diff', *args, cwd=cwd) + if not patch: + get_logger().warning("No changes found") + return + return patch + + +def reset_local_changes(cwd): + get_logger().info("Reset local changes") + _call('git', 'checkout', "--force", cwd=cwd) + + +def add_comment(url: urllib3.util.Url, refspec, message): + *_, patchset, changenum = refspec.rsplit("/") + message = "'" + message.replace("'", "'\"'\"'") + "'" + return _call( + "ssh", + "-p", str(url.port), + f"{url.auth}@{url.host}", + "gerrit", "review", + "--message", message, + # "--code-review", score, + f"{patchset},{changenum}", + ) + + +def list_comments(url: urllib3.util.Url, refspec): + *_, patchset, _ = refspec.rsplit("/") + stdout = _call( + "ssh", + "-p", str(url.port), + f"{url.auth}@{url.host}", + "gerrit", "query", + "--comments", + "--current-patch-set", patchset, + "--format", "JSON", + ) + change_set, *_ = stdout.splitlines() + return json.loads(change_set)["currentPatchSet"]["comments"] + + +def prepare_repo(url: urllib3.util.Url, project, refspec): + repo_url = (f"{url.scheme}://{url.auth}@{url.host}:{url.port}/{project}") + + directory = pathlib.Path(mkdtemp()) + clone(repo_url, directory), + fetch(repo_url, refspec, cwd=directory) + checkout(cwd=directory) + return directory + + +def adopt_to_gerrit_message(message): + lines = message.splitlines() + buf = [] + for line in lines: + # remove markdown formatting + line = (line.replace("*", "") + .replace("``", "`") + .replace("
    ", "") + .replace("
    ", "") + .replace("", "") + .replace("", "")) + + line = line.strip() + if line.startswith('#'): + buf.append("\n" + + line.replace('#', '').removesuffix(":").strip() + + ":") + continue + elif line.startswith('-'): + buf.append(line.removeprefix('-').strip()) + continue + else: + buf.append(line) + return "\n".join(buf).strip() + + +def add_suggestion(src_filename, context: str, start, end: int): + with ( + NamedTemporaryFile("w", delete=False) as tmp, + open(src_filename, "r") as src + ): + lines = src.readlines() + tmp.writelines(lines[:start - 1]) + if context: + tmp.write(context) + tmp.writelines(lines[end:]) + + shutil.copy(tmp.name, src_filename) + os.remove(tmp.name) + + +def upload_patch(patch, path): + patch_server_endpoint = get_settings().get( + 'gerrit.patch_server_endpoint') + patch_server_token = get_settings().get( + 'gerrit.patch_server_token') + + response = requests.post( + patch_server_endpoint, + json={ + "content": patch, + "path": path, + }, + headers={ + "Content-Type": "application/json", + "Authorization": f"Bearer {patch_server_token}", + } + ) + response.raise_for_status() + patch_server_endpoint = patch_server_endpoint.rstrip("/") + return patch_server_endpoint + "/" + path + + +class GerritProvider(GitProvider): + + def __init__(self, key: str, incremental=False): + self.project, self.refspec = key.split(':') + assert self.project, "Project name is required" + assert self.refspec, "Refspec is required" + base_url = get_settings().get('gerrit.url') + assert base_url, "Gerrit URL is required" + user = get_settings().get('gerrit.user') + assert user, "Gerrit user is required" + + parsed = urllib3.util.parse_url(base_url) + self.parsed_url = urllib3.util.parse_url( + f"{parsed.scheme}://{user}@{parsed.host}:{parsed.port}" + ) + + self.repo_path = prepare_repo( + self.parsed_url, self.project, self.refspec + ) + self.repo = Repo(self.repo_path) + assert self.repo + self.pr_url = base_url + self.pr = PullRequestMimic(self.get_pr_title(), self.get_diff_files()) + + def get_pr_title(self): + """ + Substitutes the branch-name as the PR-mimic title. 
+ """ + return self.repo.branches[0].name + + def get_issue_comments(self): + comments = list_comments(self.parsed_url, self.refspec) + Comments = namedtuple('Comments', ['reversed']) + Comment = namedtuple('Comment', ['body']) + return Comments([Comment(c['message']) for c in reversed(comments)]) + + def get_pr_labels(self, update=False): + raise NotImplementedError( + 'Getting labels is not implemented for the gerrit provider') + + def add_eyes_reaction(self, issue_comment_id: int, disable_eyes: bool = False): + raise NotImplementedError( + 'Adding reactions is not implemented for the gerrit provider') + + def remove_reaction(self, issue_comment_id: int, reaction_id: int): + raise NotImplementedError( + 'Removing reactions is not implemented for the gerrit provider') + + def get_commit_messages(self): + return [self.repo.head.commit.message] + + def get_repo_settings(self): + try: + with open(self.repo_path / ".pr_agent.toml", 'rb') as f: + contents = f.read() + return contents + except OSError: + return b"" + + def get_diff_files(self) -> list[FilePatchInfo]: + diffs = self.repo.head.commit.diff( + self.repo.head.commit.parents[0], # previous commit + create_patch=True, + R=True + ) + + diff_files = [] + for diff_item in diffs: + if diff_item.a_blob is not None: + original_file_content_str = ( + diff_item.a_blob.data_stream.read().decode('utf-8') + ) + else: + original_file_content_str = "" # empty file + if diff_item.b_blob is not None: + new_file_content_str = diff_item.b_blob.data_stream.read(). \ + decode('utf-8') + else: + new_file_content_str = "" # empty file + edit_type = EDIT_TYPE.MODIFIED + if diff_item.new_file: + edit_type = EDIT_TYPE.ADDED + elif diff_item.deleted_file: + edit_type = EDIT_TYPE.DELETED + elif diff_item.renamed_file: + edit_type = EDIT_TYPE.RENAMED + diff_files.append( + FilePatchInfo( + original_file_content_str, + new_file_content_str, + diff_item.diff.decode('utf-8'), + diff_item.b_path, + edit_type=edit_type, + old_filename=None + if diff_item.a_path == diff_item.b_path + else diff_item.a_path + ) + ) + self.diff_files = diff_files + return diff_files + + def get_files(self): + diff_index = self.repo.head.commit.diff( + self.repo.head.commit.parents[0], # previous commit + R=True + ) + # Get the list of changed files + diff_files = [item.a_path for item in diff_index] + return diff_files + + def get_languages(self): + """ + Calculate percentage of languages in repository. Used for hunk + prioritisation. 
+ """ + # Get all files in repository + filepaths = [Path(item.path) for item in + self.repo.tree().traverse() if item.type == 'blob'] + # Identify language by file extension and count + lang_count = Counter( + ext.lstrip('.') for filepath in filepaths for ext in + [filepath.suffix.lower()]) + # Convert counts to percentages + total_files = len(filepaths) + lang_percentage = {lang: count / total_files * 100 for lang, count + in lang_count.items()} + return lang_percentage + + def get_pr_description_full(self): + return self.repo.head.commit.message + + def get_user_id(self): + return self.repo.head.commit.author.email + + def is_supported(self, capability: str) -> bool: + if capability in [ + # 'get_issue_comments', + 'create_inline_comment', + 'publish_inline_comments', + 'get_labels', + 'gfm_markdown' + ]: + return False + return True + + def split_suggestion(self, msg) -> tuple[str, str]: + is_code_context = False + description = [] + context = [] + for line in msg.splitlines(): + if line.startswith('```suggestion'): + is_code_context = True + continue + if line.startswith('```'): + is_code_context = False + continue + if is_code_context: + context.append(line) + else: + description.append( + line.replace('*', '') + ) + + return ( + '\n'.join(description), + '\n'.join(context) + '\n' if context else '' + ) + + def publish_code_suggestions(self, code_suggestions: list): + msg = [] + for suggestion in code_suggestions: + description, code = self.split_suggestion(suggestion['body']) + add_suggestion( + pathlib.Path(self.repo_path) / suggestion["relevant_file"], + code, + suggestion["relevant_lines_start"], + suggestion["relevant_lines_end"], + ) + patch = diff(cwd=self.repo_path) + patch_id = uuid.uuid4().hex[0:4] + path = "/".join(["codium-ai", self.refspec, patch_id]) + full_path = upload_patch(patch, path) + reset_local_changes(self.repo_path) + msg.append(f'* {description}\n{full_path}') + + if msg: + add_comment(self.parsed_url, self.refspec, "\n".join(msg)) + return True + + def publish_comment(self, pr_comment: str, is_temporary: bool = False): + if not is_temporary: + msg = adopt_to_gerrit_message(pr_comment) + add_comment(self.parsed_url, self.refspec, msg) + + def publish_description(self, pr_title: str, pr_body: str): + msg = adopt_to_gerrit_message(pr_body) + add_comment(self.parsed_url, self.refspec, pr_title + '\n' + msg) + + def publish_inline_comments(self, comments: list[dict]): + raise NotImplementedError( + 'Publishing inline comments is not implemented for the gerrit ' + 'provider') + + def publish_inline_comment(self, body: str, relevant_file: str, + relevant_line_in_file: str, original_suggestion=None): + raise NotImplementedError( + 'Publishing inline comments is not implemented for the gerrit ' + 'provider') + + + def publish_labels(self, labels): + # Not applicable to the local git provider, + # but required by the interface + pass + + def remove_initial_comment(self): + # remove repo, cloned in previous steps + # shutil.rmtree(self.repo_path) + pass + + def remove_comment(self, comment): + pass + + def get_pr_branch(self): + return self.repo.head diff --git a/pr_agent/git_providers/git_provider.py b/pr_agent/git_providers/git_provider.py index 2a891938c..265e54d9c 100644 --- a/pr_agent/git_providers/git_provider.py +++ b/pr_agent/git_providers/git_provider.py @@ -1,151 +1,350 @@ from abc import ABC, abstractmethod -from dataclasses import dataclass # enum EDIT_TYPE (ADDED, DELETED, MODIFIED, RENAMED) -from enum import Enum from typing import Optional - -class 
EDIT_TYPE(Enum): - ADDED = 1 - DELETED = 2 - MODIFIED = 3 - RENAMED = 4 - - -@dataclass -class FilePatchInfo: - base_file: str - head_file: str - patch: str - filename: str - tokens: int = -1 - edit_type: EDIT_TYPE = EDIT_TYPE.MODIFIED - old_filename: str = None - +from pr_agent.algo.utils import Range, process_description +from pr_agent.config_loader import get_settings +from pr_agent.algo.types import FilePatchInfo +from pr_agent.log import get_logger +MAX_FILES_ALLOWED_FULL = 50 class GitProvider(ABC): @abstractmethod def is_supported(self, capability: str) -> bool: pass + @abstractmethod + def get_files(self) -> list: + pass + @abstractmethod def get_diff_files(self) -> list[FilePatchInfo]: pass + def get_incremental_commits(self, is_incremental): + pass + @abstractmethod def publish_description(self, pr_title: str, pr_body: str): pass @abstractmethod - def publish_comment(self, pr_comment: str, is_temporary: bool = False): + def publish_code_suggestions(self, code_suggestions: list) -> bool: pass @abstractmethod - def publish_inline_comment(self, body: str, relevant_file: str, relevant_line_in_file: str): + def get_languages(self): pass @abstractmethod - def create_inline_comment(self, body: str, relevant_file: str, relevant_line_in_file: str): + def get_pr_branch(self): pass @abstractmethod - def publish_inline_comments(self, comments: list[dict]): + def get_user_id(self): pass @abstractmethod - def publish_code_suggestions(self, code_suggestions: list): + def get_pr_description_full(self) -> str: pass - @abstractmethod - def publish_labels(self, labels): + def edit_comment(self, comment, body: str): + pass + + def edit_comment_from_comment_id(self, comment_id: int, body: str): + pass + + def get_comment_body_from_comment_id(self, comment_id: int) -> str: + pass + + def reply_to_comment_from_comment_id(self, comment_id: int, body: str): pass + def get_pr_description(self, full: bool = True, split_changes_walkthrough=False) -> str or tuple: + from pr_agent.config_loader import get_settings + from pr_agent.algo.utils import clip_tokens + max_tokens_description = get_settings().get("CONFIG.MAX_DESCRIPTION_TOKENS", None) + description = self.get_pr_description_full() if full else self.get_user_description() + if split_changes_walkthrough: + description, files = process_description(description) + if max_tokens_description: + description = clip_tokens(description, max_tokens_description) + return description, files + else: + if max_tokens_description: + description = clip_tokens(description, max_tokens_description) + return description + + def get_user_description(self) -> str: + if hasattr(self, 'user_description') and not (self.user_description is None): + return self.user_description + + description = (self.get_pr_description_full() or "").strip() + description_lowercase = description.lower() + get_logger().debug(f"Existing description", description=description_lowercase) + + # if the existing description wasn't generated by the pr-agent, just return it as-is + if not self._is_generated_by_pr_agent(description_lowercase): + get_logger().info(f"Existing description was not generated by the pr-agent") + self.user_description = description + return description + + # if the existing description was generated by the pr-agent, but it doesn't contain a user description, + # return nothing (empty string) because it means there is no user description + user_description_header = "### **user description**" + if user_description_header not in description_lowercase: + get_logger().info(f"Existing 
description was generated by the pr-agent, but it doesn't contain a user description") + return "" + + # otherwise, extract the original user description from the existing pr-agent description and return it + # user_description_start_position = description_lowercase.find(user_description_header) + len(user_description_header) + # return description[user_description_start_position:].split("\n", 1)[-1].strip() + + # the 'user description' is in the beginning. extract and return it + possible_headers = self._possible_headers() + start_position = description_lowercase.find(user_description_header) + len(user_description_header) + end_position = len(description) + for header in possible_headers: # try to clip at the next header + if header != user_description_header and header in description_lowercase: + end_position = min(end_position, description_lowercase.find(header)) + if end_position != len(description) and end_position > start_position: + original_user_description = description[start_position:end_position].strip() + if original_user_description.endswith("___"): + original_user_description = original_user_description[:-3].strip() + else: + original_user_description = description.split("___")[0].strip() + if original_user_description.lower().startswith(user_description_header): + original_user_description = original_user_description[len(user_description_header):].strip() + + get_logger().info(f"Extracted user description from existing description", + description=original_user_description) + self.user_description = original_user_description + return original_user_description + + def _possible_headers(self): + return ("### **user description**", "### **pr type**", "### **pr description**", "### **pr labels**", "### **type**", "### **description**", + "### **labels**", "### ๐Ÿค– generated by pr agent") + + def _is_generated_by_pr_agent(self, description_lowercase: str) -> bool: + possible_headers = self._possible_headers() + return any(description_lowercase.startswith(header) for header in possible_headers) + @abstractmethod - def get_labels(self): + def get_repo_settings(self): pass + def get_workspace_name(self): + return "" + + def get_pr_id(self): + return "" + + def get_line_link(self, relevant_file: str, relevant_line_start: int, relevant_line_end: int = None) -> str: + return "" + + def get_lines_link_original_file(self, filepath:str, component_range: Range) -> str: + return "" + + #### comments operations #### @abstractmethod - def remove_initial_comment(self): + def publish_comment(self, pr_comment: str, is_temporary: bool = False): pass + def publish_persistent_comment(self, pr_comment: str, + initial_header: str, + update_header: bool = True, + name='review', + final_update_message=True): + self.publish_comment(pr_comment) + + def publish_persistent_comment_full(self, pr_comment: str, + initial_header: str, + update_header: bool = True, + name='review', + final_update_message=True): + try: + prev_comments = list(self.get_issue_comments()) + for comment in prev_comments: + if comment.body.startswith(initial_header): + latest_commit_url = self.get_latest_commit_url() + comment_url = self.get_comment_url(comment) + if update_header: + updated_header = f"{initial_header}\n\n#### ({name.capitalize()} updated until commit {latest_commit_url})\n" + pr_comment_updated = pr_comment.replace(initial_header, updated_header) + else: + pr_comment_updated = pr_comment + get_logger().info(f"Persistent mode - updating comment {comment_url} to latest {name} message") + # response = 
self.mr.notes.update(comment.id, {'body': pr_comment_updated}) + self.edit_comment(comment, pr_comment_updated) + if final_update_message: + self.publish_comment( + f"**[Persistent {name}]({comment_url})** updated to latest commit {latest_commit_url}") + return + except Exception as e: + get_logger().exception(f"Failed to update persistent review, error: {e}") + pass + self.publish_comment(pr_comment) + + @abstractmethod - def get_languages(self): + def publish_inline_comment(self, body: str, relevant_file: str, relevant_line_in_file: str, original_suggestion=None): pass + def create_inline_comment(self, body: str, relevant_file: str, relevant_line_in_file: str, + absolute_position: int = None): + raise NotImplementedError("This git provider does not support creating inline comments yet") + @abstractmethod - def get_pr_branch(self): + def publish_inline_comments(self, comments: list[dict]): pass @abstractmethod - def get_user_id(self): + def remove_initial_comment(self): pass @abstractmethod - def get_pr_description(self): + def remove_comment(self, comment): pass @abstractmethod def get_issue_comments(self): pass + def get_comment_url(self, comment) -> str: + return "" + + #### labels operations #### @abstractmethod - def add_eyes_reaction(self, issue_comment_id: int) -> Optional[int]: + def publish_labels(self, labels): + pass + + @abstractmethod + def get_pr_labels(self, update=False): + pass + + def get_repo_labels(self): + pass + + @abstractmethod + def add_eyes_reaction(self, issue_comment_id: int, disable_eyes: bool = False) -> Optional[int]: pass @abstractmethod def remove_reaction(self, issue_comment_id: int, reaction_id: int) -> bool: pass + #### commits operations #### @abstractmethod def get_commit_messages(self): pass + def get_pr_url(self) -> str: + if hasattr(self, 'pr_url'): + return self.pr_url + return "" + + def get_latest_commit_url(self) -> str: + return "" + + def auto_approve(self) -> bool: + return False + + def calc_pr_statistics(self, pull_request_data: dict): + return {} + + def get_num_of_files(self): + try: + return len(self.get_diff_files()) + except Exception as e: + return -1 + + def limit_output_characters(self, output: str, max_chars: int): + return output[:max_chars] + '...' if len(output) > max_chars else output + + def get_main_pr_language(languages, files) -> str: """ Get the main language of the commit. Return an empty string if cannot determine. """ main_language_str = "" + if not languages: + get_logger().info("No languages detected") + return main_language_str + if not files: + get_logger().info("No files in diff") + return main_language_str + try: top_language = max(languages, key=languages.get).lower() # validate that the specific commit uses the main language extension_list = [] for file in files: + if not file: + continue + if isinstance(file, str): + file = FilePatchInfo(base_file=None, head_file=None, patch=None, filename=file) extension_list.append(file.filename.rsplit('.')[-1]) # get the most common extension - most_common_extension = max(set(extension_list), key=extension_list.count) - - # look for a match. 
TBD: add more languages, do this systematically - if most_common_extension == 'py' and top_language == 'python' or \ - most_common_extension == 'js' and top_language == 'javascript' or \ - most_common_extension == 'ts' and top_language == 'typescript' or \ - most_common_extension == 'go' and top_language == 'go' or \ - most_common_extension == 'java' and top_language == 'java' or \ - most_common_extension == 'c' and top_language == 'c' or \ - most_common_extension == 'cpp' and top_language == 'c++' or \ - most_common_extension == 'cs' and top_language == 'c#' or \ - most_common_extension == 'swift' and top_language == 'swift' or \ - most_common_extension == 'php' and top_language == 'php' or \ - most_common_extension == 'rb' and top_language == 'ruby' or \ - most_common_extension == 'rs' and top_language == 'rust' or \ - most_common_extension == 'scala' and top_language == 'scala' or \ - most_common_extension == 'kt' and top_language == 'kotlin' or \ - most_common_extension == 'pl' and top_language == 'perl' or \ - most_common_extension == 'swift' and top_language == 'swift': - main_language_str = top_language - - except Exception: + most_common_extension = '.' + max(set(extension_list), key=extension_list.count) + try: + language_extension_map_org = get_settings().language_extension_map_org + language_extension_map = {k.lower(): v for k, v in language_extension_map_org.items()} + + if top_language in language_extension_map and most_common_extension in language_extension_map[top_language]: + main_language_str = top_language + else: + for language, extensions in language_extension_map.items(): + if most_common_extension in extensions: + main_language_str = language + break + except Exception as e: + get_logger().exception(f"Failed to get main language: {e}") + pass + + ## old approach: + # most_common_extension = max(set(extension_list), key=extension_list.count) + # if most_common_extension == 'py' and top_language == 'python' or \ + # most_common_extension == 'js' and top_language == 'javascript' or \ + # most_common_extension == 'ts' and top_language == 'typescript' or \ + # most_common_extension == 'tsx' and top_language == 'typescript' or \ + # most_common_extension == 'go' and top_language == 'go' or \ + # most_common_extension == 'java' and top_language == 'java' or \ + # most_common_extension == 'c' and top_language == 'c' or \ + # most_common_extension == 'cpp' and top_language == 'c++' or \ + # most_common_extension == 'cs' and top_language == 'c#' or \ + # most_common_extension == 'swift' and top_language == 'swift' or \ + # most_common_extension == 'php' and top_language == 'php' or \ + # most_common_extension == 'rb' and top_language == 'ruby' or \ + # most_common_extension == 'rs' and top_language == 'rust' or \ + # most_common_extension == 'scala' and top_language == 'scala' or \ + # most_common_extension == 'kt' and top_language == 'kotlin' or \ + # most_common_extension == 'pl' and top_language == 'perl' or \ + # most_common_extension == top_language: + # main_language_str = top_language + + except Exception as e: + get_logger().exception(e) pass return main_language_str + + class IncrementalPR: def __init__(self, is_incremental: bool = False): self.is_incremental = is_incremental self.commits_range = None - self.first_new_commit_sha = None - self.last_seen_commit_sha = None + self.first_new_commit = None + self.last_seen_commit = None + + @property + def first_new_commit_sha(self): + return None if self.first_new_commit is None else self.first_new_commit.sha + @property + 
def last_seen_commit_sha(self): + return None if self.last_seen_commit is None else self.last_seen_commit.sha diff --git a/pr_agent/git_providers/github_provider.py b/pr_agent/git_providers/github_provider.py index be0fa645d..01f8aff12 100644 --- a/pr_agent/git_providers/github_provider.py +++ b/pr_agent/git_providers/github_provider.py @@ -1,29 +1,36 @@ -import logging +import itertools +import time import hashlib - from datetime import datetime -from typing import Optional, Tuple, Any +from typing import Optional, Tuple from urllib.parse import urlparse -from github import AppAuthentication, Auth, Github, GithubException, Reaction +from github import AppAuthentication, Auth, Github, GithubException from retry import retry from starlette_context import context -from .git_provider import FilePatchInfo, GitProvider, IncrementalPR +from ..algo.file_filter import filter_ignored from ..algo.language_handler import is_valid_file -from ..algo.utils import load_large_diff -from ..algo.pr_processing import find_line_number_of_relevant_line_in_file, clip_tokens +from ..algo.utils import PRReviewHeader, load_large_diff, clip_tokens, find_line_number_of_relevant_line_in_file, Range from ..config_loader import get_settings +from ..log import get_logger from ..servers.utils import RateLimitExceeded +from .git_provider import GitProvider, IncrementalPR, MAX_FILES_ALLOWED_FULL +from pr_agent.algo.types import EDIT_TYPE, FilePatchInfo class GithubProvider(GitProvider): - def __init__(self, pr_url: Optional[str] = None, incremental=IncrementalPR(False)): + def __init__(self, pr_url: Optional[str] = None): self.repo_obj = None try: self.installation_id = context.get("installation_id", None) except Exception: self.installation_id = None + self.max_comment_chars = 65000 + self.base_url = get_settings().get("GITHUB.BASE_URL", "https://api.github.com").rstrip("/") + self.base_url_html = self.base_url.split("api/")[0].rstrip("/") if "api/" in self.base_url else "https://github.com" + self.base_domain = self.base_url.replace("https://", "").replace("http://", "") + self.base_domain_html = self.base_url_html.replace("https://", "").replace("http://", "") self.github_client = self._get_github_client() self.repo = None self.pr_num = None @@ -31,64 +38,98 @@ def __init__(self, pr_url: Optional[str] = None, incremental=IncrementalPR(False self.github_user_id = None self.diff_files = None self.git_files = None - self.incremental = incremental - if pr_url: + self.incremental = IncrementalPR(False) + if pr_url and 'pull' in pr_url: self.set_pr(pr_url) - self.last_commit_id = list(self.pr.get_commits())[-1] + self.pr_commits = list(self.pr.get_commits()) + self.last_commit_id = self.pr_commits[-1] + self.pr_url = self.get_pr_url() # pr_url for github actions can be as api.github.com, so we need to get the url from the pr object + else: + self.pr_commits = None + + def get_incremental_commits(self, incremental=IncrementalPR(False)): + self.incremental = incremental + if self.incremental.is_incremental: + self.unreviewed_files_set = dict() + self._get_incremental_commits() def is_supported(self, capability: str) -> bool: return True def get_pr_url(self) -> str: - return f"https://github.com/{self.repo}/pull/{self.pr_num}" + return self.pr.html_url def set_pr(self, pr_url: str): self.repo, self.pr_num = self._parse_pr_url(pr_url) self.pr = self._get_pr() - if self.incremental.is_incremental: - self.get_incremental_commits() - def get_incremental_commits(self): - self.commits = list(self.pr.get_commits()) + def 
_get_incremental_commits(self): + if not self.pr_commits: + self.pr_commits = list(self.pr.get_commits()) - self.get_previous_review() + self.previous_review = self.get_previous_review(full=True, incremental=True) if self.previous_review: self.incremental.commits_range = self.get_commit_range() # Get all files changed during the commit range - self.file_set = dict() + for commit in self.incremental.commits_range: if commit.commit.message.startswith(f"Merge branch '{self._get_repo().default_branch}'"): - logging.info(f"Skipping merge commit {commit.commit.message}") + get_logger().info(f"Skipping merge commit {commit.commit.message}") continue - self.file_set.update({file.filename: file for file in commit.files}) + self.unreviewed_files_set.update({file.filename: file for file in commit.files}) + else: + get_logger().info("No previous review found, will review the entire PR") + self.incremental.is_incremental = False def get_commit_range(self): last_review_time = self.previous_review.created_at - first_new_commit_index = 0 - for index in range(len(self.commits) - 1, -1, -1): - if self.commits[index].commit.author.date > last_review_time: - self.incremental.first_new_commit_sha = self.commits[index].sha + first_new_commit_index = None + for index in range(len(self.pr_commits) - 1, -1, -1): + if self.pr_commits[index].commit.author.date > last_review_time: + self.incremental.first_new_commit = self.pr_commits[index] first_new_commit_index = index else: - self.incremental.last_seen_commit_sha = self.commits[index].sha + self.incremental.last_seen_commit = self.pr_commits[index] break - return self.commits[first_new_commit_index:] - - def get_previous_review(self): - self.previous_review = None - self.comments = list(self.pr.get_issue_comments()) + return self.pr_commits[first_new_commit_index:] if first_new_commit_index is not None else [] + + def get_previous_review(self, *, full: bool, incremental: bool): + if not (full or incremental): + raise ValueError("At least one of full or incremental must be True") + if not getattr(self, "comments", None): + self.comments = list(self.pr.get_issue_comments()) + prefixes = [] + if full: + prefixes.append(PRReviewHeader.REGULAR.value) + if incremental: + prefixes.append(PRReviewHeader.INCREMENTAL.value) for index in range(len(self.comments) - 1, -1, -1): - if self.comments[index].body.startswith("## PR Analysis"): - self.previous_review = self.comments[index] - break + if any(self.comments[index].body.startswith(prefix) for prefix in prefixes): + return self.comments[index] def get_files(self): - if self.incremental.is_incremental and self.file_set: - return self.file_set.values() - if not self.git_files: - # bring files from GitHub only once - self.git_files = self.pr.get_files() - return self.git_files + if self.incremental.is_incremental and self.unreviewed_files_set: + return self.unreviewed_files_set.values() + try: + git_files = context.get("git_files", None) + if git_files: + return git_files + self.git_files = list(self.pr.get_files()) # 'list' to handle pagination + context["git_files"] = self.git_files + return self.git_files + except Exception: + if not self.git_files: + self.git_files = list(self.pr.get_files()) + return self.git_files + + def get_num_of_files(self): + if hasattr(self.git_files, "totalCount"): + return self.git_files.totalCount + else: + try: + return len(self.git_files) + except Exception as e: + return -1 @retry(exceptions=RateLimitExceeded, tries=get_settings().github.ratelimit_retries, delay=2, backoff=2, jitter=(1, 3)) 
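# --- Illustrative sketch (editorial addition, not part of the patch) ---------
# A minimal, standalone rendering of the incremental-review idea added above in
# _get_incremental_commits/get_commit_range: walk the PR commits from newest to
# oldest and keep only those authored after the previous review. CommitStub and
# the sample timestamps below are hypothetical helpers, not pr-agent APIs.
from dataclasses import dataclass
from datetime import datetime, timedelta


@dataclass
class CommitStub:
    sha: str
    authored_at: datetime


def commits_since_last_review(pr_commits: list[CommitStub],
                              last_review_time: datetime) -> list[CommitStub]:
    first_new_index = None
    for index in range(len(pr_commits) - 1, -1, -1):
        if pr_commits[index].authored_at > last_review_time:
            first_new_index = index  # keep walking back while commits are newer than the review
        else:
            break  # older commits were already covered by the previous review
    return pr_commits[first_new_index:] if first_new_index is not None else []


if __name__ == "__main__":
    now = datetime.now()
    commits = [CommitStub("a1", now - timedelta(hours=3)),
               CommitStub("b2", now - timedelta(hours=1)),
               CommitStub("c3", now - timedelta(minutes=5))]
    # Only the two commits pushed after the last review (two hours ago) are returned.
    print([c.sha for c in commits_since_last_review(commits, now - timedelta(hours=2))])  # ['b2', 'c3']
# -----------------------------------------------------------------------------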
@@ -102,44 +143,121 @@ def get_diff_files(self) -> list[FilePatchInfo]: or renamed files in the merge request. """ try: + try: + diff_files = context.get("diff_files", None) + if diff_files: + return diff_files + except Exception: + pass + if self.diff_files: return self.diff_files - files = self.get_files() - diff_files = [] + # filter files using [ignore] patterns + files_original = self.get_files() + files = filter_ignored(files_original) + if files_original != files: + try: + names_original = [file.filename for file in files_original] + names_new = [file.filename for file in files] + get_logger().info(f"Filtered out [ignore] files for pull request:", extra= + {"files": names_original, + "filtered_files": names_new}) + except Exception: + pass + diff_files = [] + invalid_files_names = [] + counter_valid = 0 for file in files: if not is_valid_file(file.filename): + invalid_files_names.append(file.filename) continue - new_file_content_str = self._get_pr_file_content(file, self.pr.head.sha) # communication with GitHub patch = file.patch - if self.incremental.is_incremental and self.file_set: + # allow only a limited number of files to be fully loaded. We can manage the rest with diffs only + counter_valid += 1 + avoid_load = False + if counter_valid >= MAX_FILES_ALLOWED_FULL and patch and not self.incremental.is_incremental: + avoid_load = True + if counter_valid == MAX_FILES_ALLOWED_FULL: + get_logger().info(f"Too many files in PR, will avoid loading full content for rest of files") + + if avoid_load: + new_file_content_str = "" + else: + new_file_content_str = self._get_pr_file_content(file, self.pr.head.sha) # communication with GitHub + + if self.incremental.is_incremental and self.unreviewed_files_set: original_file_content_str = self._get_pr_file_content(file, self.incremental.last_seen_commit_sha) patch = load_large_diff(file.filename, new_file_content_str, original_file_content_str) - self.file_set[file.filename] = patch + self.unreviewed_files_set[file.filename] = patch else: - original_file_content_str = self._get_pr_file_content(file, self.pr.base.sha) + if avoid_load: + original_file_content_str = "" + else: + original_file_content_str = self._get_pr_file_content(file, self.pr.base.sha) if not patch: patch = load_large_diff(file.filename, new_file_content_str, original_file_content_str) - diff_files.append(FilePatchInfo(original_file_content_str, new_file_content_str, patch, file.filename)) + if file.status == 'added': + edit_type = EDIT_TYPE.ADDED + elif file.status == 'removed': + edit_type = EDIT_TYPE.DELETED + elif file.status == 'renamed': + edit_type = EDIT_TYPE.RENAMED + elif file.status == 'modified': + edit_type = EDIT_TYPE.MODIFIED + else: + get_logger().error(f"Unknown edit type: {file.status}") + edit_type = EDIT_TYPE.UNKNOWN + + # count number of lines added and removed + patch_lines = patch.splitlines(keepends=True) + num_plus_lines = len([line for line in patch_lines if line.startswith('+')]) + num_minus_lines = len([line for line in patch_lines if line.startswith('-')]) + file_patch_canonical_structure = FilePatchInfo(original_file_content_str, new_file_content_str, patch, + file.filename, edit_type=edit_type, + num_plus_lines=num_plus_lines, + num_minus_lines=num_minus_lines,) + diff_files.append(file_patch_canonical_structure) + if invalid_files_names: + get_logger().info(f"Filtered out files with invalid extensions: {invalid_files_names}") self.diff_files = diff_files + try: + context["diff_files"] = diff_files + except Exception: + pass + return diff_files 
except GithubException.RateLimitExceededException as e: - logging.error(f"Rate limit exceeded for GitHub API. Original message: {e}") + get_logger().error(f"Rate limit exceeded for GitHub API. Original message: {e}") raise RateLimitExceeded("Rate limit exceeded for GitHub API.") from e def publish_description(self, pr_title: str, pr_body: str): self.pr.edit(title=pr_title, body=pr_body) + def get_latest_commit_url(self) -> str: + return self.last_commit_id.html_url + + def get_comment_url(self, comment) -> str: + return comment.html_url + + def publish_persistent_comment(self, pr_comment: str, + initial_header: str, + update_header: bool = True, + name='review', + final_update_message=True): + self.publish_persistent_comment_full(pr_comment, initial_header, update_header, name, final_update_message) + def publish_comment(self, pr_comment: str, is_temporary: bool = False): if is_temporary and not get_settings().config.publish_output_progress: - logging.debug(f"Skipping publish_comment for temporary comment: {pr_comment}") + get_logger().debug(f"Skipping publish_comment for temporary comment: {pr_comment}") return + pr_comment = self.limit_output_characters(pr_comment, self.max_comment_chars) response = self.pr.create_issue_comment(pr_comment) if hasattr(response, "user") and hasattr(response.user, "login"): self.github_user_id = response.user.login @@ -147,26 +265,139 @@ def publish_comment(self, pr_comment: str, is_temporary: bool = False): if not hasattr(self.pr, 'comments_list'): self.pr.comments_list = [] self.pr.comments_list.append(response) + return response - def publish_inline_comment(self, body: str, relevant_file: str, relevant_line_in_file: str): + def publish_inline_comment(self, body: str, relevant_file: str, relevant_line_in_file: str, original_suggestion=None): + body = self.limit_output_characters(body, self.max_comment_chars) self.publish_inline_comments([self.create_inline_comment(body, relevant_file, relevant_line_in_file)]) - def create_inline_comment(self, body: str, relevant_file: str, relevant_line_in_file: str): - position, absolute_position = find_line_number_of_relevant_line_in_file(self.diff_files, relevant_file.strip('`'), relevant_line_in_file) + def create_inline_comment(self, body: str, relevant_file: str, relevant_line_in_file: str, + absolute_position: int = None): + body = self.limit_output_characters(body, self.max_comment_chars) + position, absolute_position = find_line_number_of_relevant_line_in_file(self.diff_files, + relevant_file.strip('`'), + relevant_line_in_file, + absolute_position) if position == -1: if get_settings().config.verbosity_level >= 2: - logging.info(f"Could not find position for {relevant_file} {relevant_line_in_file}") + get_logger().info(f"Could not find position for {relevant_file} {relevant_line_in_file}") subject_type = "FILE" else: subject_type = "LINE" path = relevant_file.strip() return dict(body=body, path=path, position=position) if subject_type == "LINE" else {} - def publish_inline_comments(self, comments: list[dict]): - self.pr.create_review(commit=self.last_commit_id, comments=comments) + def publish_inline_comments(self, comments: list[dict], disable_fallback: bool = False): + try: + # publish all comments in a single message + self.pr.create_review(commit=self.last_commit_id, comments=comments) + except Exception as e: + if get_settings().config.verbosity_level >= 2: + get_logger().error(f"Failed to publish inline comments") + + if (getattr(e, "status", None) == 422 + and 
get_settings().github.publish_inline_comments_fallback_with_verification and not disable_fallback): + pass # continue to try _publish_inline_comments_fallback_with_verification + else: + raise e # will end up with publishing the comments one by one + + try: + self._publish_inline_comments_fallback_with_verification(comments) + except Exception as e: + if get_settings().config.verbosity_level >= 2: + get_logger().error(f"Failed to publish inline code comments fallback, error: {e}") + raise e + + def _publish_inline_comments_fallback_with_verification(self, comments: list[dict]): + """ + Check each inline comment separately against the GitHub API and discard of invalid comments, + then publish all the remaining valid comments in a single review. + For invalid comments, also try removing the suggestion part and posting the comment just on the first line. + """ + verified_comments, invalid_comments = self._verify_code_comments(comments) + + # publish as a group the verified comments + if verified_comments: + try: + self.pr.create_review(commit=self.last_commit_id, comments=verified_comments) + except: + pass + + # try to publish one by one the invalid comments as a one-line code comment + if invalid_comments and get_settings().github.try_fix_invalid_inline_comments: + fixed_comments_as_one_liner = self._try_fix_invalid_inline_comments( + [comment for comment, _ in invalid_comments]) + for comment in fixed_comments_as_one_liner: + try: + self.publish_inline_comments([comment], disable_fallback=True) + if get_settings().config.verbosity_level >= 2: + get_logger().info(f"Published invalid comment as a single line comment: {comment}") + except: + if get_settings().config.verbosity_level >= 2: + get_logger().error(f"Failed to publish invalid comment as a single line comment: {comment}") + + def _verify_code_comment(self, comment: dict): + is_verified = False + e = None + try: + # event ="" # By leaving this blank, you set the review action state to PENDING + input = dict(commit_id=self.last_commit_id.sha, comments=[comment]) + headers, data = self.pr._requester.requestJsonAndCheck( + "POST", f"{self.pr.url}/reviews", input=input) + pending_review_id = data["id"] + is_verified = True + except Exception as err: + is_verified = False + pending_review_id = None + e = err + if pending_review_id is not None: + try: + self.pr._requester.requestJsonAndCheck("DELETE", f"{self.pr.url}/reviews/{pending_review_id}") + except Exception: + pass + return is_verified, e + + def _verify_code_comments(self, comments: list[dict]) -> tuple[list[dict], list[tuple[dict, Exception]]]: + """Very each comment against the GitHub API and return 2 lists: 1 of verified and 1 of invalid comments""" + verified_comments = [] + invalid_comments = [] + for comment in comments: + time.sleep(1) # for avoiding secondary rate limit + is_verified, e = self._verify_code_comment(comment) + if is_verified: + verified_comments.append(comment) + else: + invalid_comments.append((comment, e)) + return verified_comments, invalid_comments - def publish_code_suggestions(self, code_suggestions: list): + def _try_fix_invalid_inline_comments(self, invalid_comments: list[dict]) -> list[dict]: + """ + Try fixing invalid comments by removing the suggestion part and setting the comment just on the first line. + Return only comments that have been modified in some way. + This is a best-effort attempt to fix invalid comments, and should be verified accordingly. 
+ """ + import copy + fixed_comments = [] + for comment in invalid_comments: + try: + fixed_comment = copy.deepcopy(comment) # avoid modifying the original comment dict for later logging + if "```suggestion" in comment["body"]: + fixed_comment["body"] = comment["body"].split("```suggestion")[0] + if "start_line" in comment: + fixed_comment["line"] = comment["start_line"] + del fixed_comment["start_line"] + if "start_side" in comment: + fixed_comment["side"] = comment["start_side"] + del fixed_comment["start_side"] + if fixed_comment != comment: + fixed_comments.append(fixed_comment) + except Exception as e: + if get_settings().config.verbosity_level >= 2: + get_logger().error(f"Failed to fix inline comment, error: {e}") + return fixed_comments + + def publish_code_suggestions(self, code_suggestions: list) -> bool: """ Publishes code suggestions as comments on the PR. """ @@ -179,13 +410,13 @@ def publish_code_suggestions(self, code_suggestions: list): if not relevant_lines_start or relevant_lines_start == -1: if get_settings().config.verbosity_level >= 2: - logging.exception( + get_logger().exception( f"Failed to publish code suggestion, relevant_lines_start is {relevant_lines_start}") continue if relevant_lines_end < relevant_lines_start: if get_settings().config.verbosity_level >= 2: - logging.exception(f"Failed to publish code suggestion, " + get_logger().exception(f"Failed to publish code suggestion, " f"relevant_lines_end is {relevant_lines_end} and " f"relevant_lines_start is {relevant_lines_start}") continue @@ -208,20 +439,97 @@ def publish_code_suggestions(self, code_suggestions: list): post_parameters_list.append(post_parameters) try: - self.pr.create_review(commit=self.last_commit_id, comments=post_parameters_list) + self.publish_inline_comments(post_parameters_list) + return True + except Exception as e: + if get_settings().config.verbosity_level >= 2: + get_logger().error(f"Failed to publish code suggestion, error: {e}") + return False + + def edit_comment(self, comment, body: str): + body = self.limit_output_characters(body, self.max_comment_chars) + comment.edit(body=body) + + def edit_comment_from_comment_id(self, comment_id: int, body: str): + try: + # self.pr.get_issue_comment(comment_id).edit(body) + body = self.limit_output_characters(body, self.max_comment_chars) + headers, data_patch = self.pr._requester.requestJsonAndCheck( + "PATCH", f"{self.base_url}/repos/{self.repo}/issues/comments/{comment_id}", + input={"body": body} + ) + except Exception as e: + get_logger().exception(f"Failed to edit comment, error: {e}") + + def reply_to_comment_from_comment_id(self, comment_id: int, body: str): + try: + # self.pr.get_issue_comment(comment_id).edit(body) + body = self.limit_output_characters(body, self.max_comment_chars) + headers, data_patch = self.pr._requester.requestJsonAndCheck( + "POST", f"{self.base_url}/repos/{self.repo}/pulls/{self.pr_num}/comments/{comment_id}/replies", + input={"body": body} + ) + except Exception as e: + get_logger().exception(f"Failed to reply comment, error: {e}") + + def get_comment_body_from_comment_id(self, comment_id: int): + try: + # self.pr.get_issue_comment(comment_id).edit(body) + headers, data_patch = self.pr._requester.requestJsonAndCheck( + "GET", f"{self.base_url}/repos/{self.repo}/issues/comments/{comment_id}" + ) + return data_patch.get("body","") + except Exception as e: + get_logger().exception(f"Failed to edit comment, error: {e}") + return None + + def publish_file_comments(self, file_comments: list) -> bool: + try: + 
headers, existing_comments = self.pr._requester.requestJsonAndCheck( + "GET", f"{self.pr.url}/comments" + ) + for comment in file_comments: + comment['commit_id'] = self.last_commit_id.sha + comment['body'] = self.limit_output_characters(comment['body'], self.max_comment_chars) + + found = False + for existing_comment in existing_comments: + comment['commit_id'] = self.last_commit_id.sha + our_app_name = get_settings().get("GITHUB.APP_NAME", "") + same_comment_creator = False + if self.deployment_type == 'app': + same_comment_creator = our_app_name.lower() in existing_comment['user']['login'].lower() + elif self.deployment_type == 'user': + same_comment_creator = self.github_user_id == existing_comment['user']['login'] + if existing_comment['subject_type'] == 'file' and comment['path'] == existing_comment['path'] and same_comment_creator: + headers, data_patch = self.pr._requester.requestJsonAndCheck( + "PATCH", f"{self.base_url}/repos/{self.repo}/pulls/comments/{existing_comment['id']}", input={"body":comment['body']} + ) + found = True + break + if not found: + headers, data_post = self.pr._requester.requestJsonAndCheck( + "POST", f"{self.pr.url}/comments", input=comment + ) return True except Exception as e: if get_settings().config.verbosity_level >= 2: - logging.error(f"Failed to publish code suggestion, error: {e}") + get_logger().error(f"Failed to publish diffview file summary, error: {e}") return False def remove_initial_comment(self): try: for comment in getattr(self.pr, 'comments_list', []): if comment.is_temporary: - comment.delete() + self.remove_comment(comment) except Exception as e: - logging.exception(f"Failed to remove initial comment, error: {e}") + get_logger().exception(f"Failed to remove initial comment, error: {e}") + + def remove_comment(self, comment): + try: + comment.delete() + except Exception as e: + get_logger().exception(f"Failed to remove comment, error: {e}") def get_title(self): return self.pr.title @@ -233,18 +541,21 @@ def get_languages(self): def get_pr_branch(self): return self.pr.head.ref - def get_pr_description(self): - max_tokens = get_settings().get("CONFIG.MAX_DESCRIPTION_TOKENS", None) - if max_tokens: - return clip_tokens(self.pr.body, max_tokens) + def get_pr_owner_id(self) -> str | None: + if not self.repo: + return None + return self.repo.split('/')[0] + + def get_pr_description_full(self): return self.pr.body def get_user_id(self): if not self.github_user_id: try: - self.github_user_id = self.github_client.get_user().login + self.github_user_id = self.github_client.get_user().raw_data['login'] except Exception as e: - logging.exception(f"Failed to get user id, error: {e}") + self.github_user_id = "" + # logging.exception(f"Failed to get user id, error: {e}") return self.github_user_id def get_notifications(self, since: datetime): @@ -261,37 +572,47 @@ def get_issue_comments(self): def get_repo_settings(self): try: - contents = self.repo_obj.get_contents(".pr_agent.toml", ref=self.pr.head.sha).decoded_content + # contents = self.repo_obj.get_contents(".pr_agent.toml", ref=self.pr.head.sha).decoded_content + + # more logical to take 'pr_agent.toml' from the default branch + contents = self.repo_obj.get_contents(".pr_agent.toml").decoded_content return contents except Exception: return "" - def add_eyes_reaction(self, issue_comment_id: int) -> Optional[int]: + def get_workspace_name(self): + return self.repo.split('/')[0] + + def add_eyes_reaction(self, issue_comment_id: int, disable_eyes: bool = False) -> Optional[int]: + if disable_eyes: + 
return None try: - reaction = self.pr.get_issue_comment(issue_comment_id).create_reaction("eyes") - return reaction.id + headers, data_patch = self.pr._requester.requestJsonAndCheck( + "POST", f"{self.base_url}/repos/{self.repo}/issues/comments/{issue_comment_id}/reactions", + input={"content": "eyes"} + ) + return data_patch.get("id", None) except Exception as e: - logging.exception(f"Failed to add eyes reaction, error: {e}") + get_logger().warning(f"Failed to add eyes reaction, error: {e}") return None - def remove_reaction(self, issue_comment_id: int, reaction_id: int) -> bool: + def remove_reaction(self, issue_comment_id: int, reaction_id: str) -> bool: try: - self.pr.get_issue_comment(issue_comment_id).delete_reaction(reaction_id) + # self.pr.get_issue_comment(issue_comment_id).delete_reaction(reaction_id) + headers, data_patch = self.pr._requester.requestJsonAndCheck( + "DELETE", + f"{self.base_url}/repos/{self.repo}/issues/comments/{issue_comment_id}/reactions/{reaction_id}" + ) return True except Exception as e: - logging.exception(f"Failed to remove eyes reaction, error: {e}") + get_logger().exception(f"Failed to remove eyes reaction, error: {e}") return False - - @staticmethod - def _parse_pr_url(pr_url: str) -> Tuple[str, int]: + def _parse_pr_url(self, pr_url: str) -> Tuple[str, int]: parsed_url = urlparse(pr_url) - if 'github.com' not in parsed_url.netloc: - raise ValueError("The provided URL is not a valid GitHub URL") - path_parts = parsed_url.path.strip('/').split('/') - if 'api.github.com' in parsed_url.netloc: + if self.base_domain in parsed_url.netloc: if len(path_parts) < 5 or path_parts[3] != 'pulls': raise ValueError("The provided URL does not appear to be a GitHub PR URL") repo_name = '/'.join(path_parts[1:3]) @@ -312,6 +633,30 @@ def _parse_pr_url(pr_url: str) -> Tuple[str, int]: return repo_name, pr_number + def _parse_issue_url(self, issue_url: str) -> Tuple[str, int]: + parsed_url = urlparse(issue_url) + path_parts = parsed_url.path.strip('/').split('/') + if self.base_domain in parsed_url.netloc: + if len(path_parts) < 5 or path_parts[3] != 'issues': + raise ValueError("The provided URL does not appear to be a GitHub ISSUE URL") + repo_name = '/'.join(path_parts[1:3]) + try: + issue_number = int(path_parts[4]) + except ValueError as e: + raise ValueError("Unable to convert issue number to integer") from e + return repo_name, issue_number + + if len(path_parts) < 4 or path_parts[2] != 'issues': + raise ValueError("The provided URL does not appear to be a GitHub PR issue") + + repo_name = '/'.join(path_parts[:2]) + try: + issue_number = int(path_parts[3]) + except ValueError as e: + raise ValueError("Unable to convert issue number to integer") from e + + return repo_name, issue_number + def _get_github_client(self): deployment_type = get_settings().get("GITHUB.DEPLOYMENT_TYPE", "user") @@ -325,7 +670,7 @@ def _get_github_client(self): raise ValueError("GitHub app installation ID is required when using GitHub app deployment") auth = AppAuthentication(app_id=app_id, private_key=private_key, installation_id=self.installation_id) - return Github(app_auth=auth) + return Github(app_auth=auth, base_url=self.base_url) if deployment_type == 'user': try: @@ -334,7 +679,7 @@ def _get_github_client(self): raise ValueError( "GitHub token is required when using user deployment. 
See: " "https://github.com/Codium-ai/pr-agent#method-2-run-from-source") from e - return Github(auth=Auth.Token(token)) + return Github(auth=Auth.Token(token), base_url=self.base_url) def _get_repo(self): if hasattr(self, 'repo_obj') and \ @@ -349,17 +694,40 @@ def _get_repo(self): def _get_pr(self): return self._get_repo().get_pull(self.pr_num) - def _get_pr_file_content(self, file: FilePatchInfo, sha: str) -> str: + def get_pr_file_content(self, file_path: str, branch: str) -> str: try: - file_content_str = str(self._get_repo().get_contents(file.filename, ref=sha).decoded_content.decode()) + file_content_str = str( + self._get_repo() + .get_contents(file_path, ref=branch) + .decoded_content.decode() + ) except Exception: file_content_str = "" return file_content_str + def create_or_update_pr_file( + self, file_path: str, branch: str, contents="", message="" + ) -> None: + try: + file_obj = self._get_repo().get_contents(file_path, ref=branch) + sha1=file_obj.sha + except Exception: + sha1="" + self.repo_obj.update_file( + path=file_path, + message=message, + content=contents, + sha=sha1, + branch=branch, + ) + + def _get_pr_file_content(self, file: FilePatchInfo, sha: str) -> str: + return self.get_pr_file_content(file.filename, sha) + def publish_labels(self, pr_types): try: label_color_map = {"Bug fix": "1d76db", "Tests": "e99695", "Bug fix with tests": "c5def5", - "Refactoring": "bfdadc", "Enhancement": "bfd4f2", "Documentation": "d4c5f9", + "Enhancement": "bfd4f2", "Documentation": "d4c5f9", "Other": "d1bcf9"} post_parameters = [] for p in pr_types: @@ -369,15 +737,26 @@ def publish_labels(self, pr_types): "PUT", f"{self.pr.issue_url}/labels", input=post_parameters ) except Exception as e: - logging.exception(f"Failed to publish labels, error: {e}") + get_logger().warning(f"Failed to publish labels, error: {e}") - def get_labels(self): + def get_pr_labels(self, update=False): try: - return [label.name for label in self.pr.labels] + if not update: + labels =self.pr.labels + return [label.name for label in labels] + else: # obtain the latest labels. Maybe they changed while the AI was running + headers, labels = self.pr._requester.requestJsonAndCheck( + "GET", f"{self.pr.issue_url}/labels") + return [label['name'] for label in labels] + except Exception as e: - logging.exception(f"Failed to get labels, error: {e}") + get_logger().exception(f"Failed to get labels, error: {e}") return [] + def get_repo_labels(self): + labels = self.repo_obj.get_labels() + return [label for label in itertools.islice(labels, 50)] + def get_commit_messages(self): """ Retrieves the commit messages of a pull request. 
@@ -398,8 +777,8 @@ def get_commit_messages(self): def generate_link_to_relevant_line_number(self, suggestion) -> str: try: - relevant_file = suggestion['relevant file'].strip('`').strip("'") - relevant_line_str = suggestion['relevant line'] + relevant_file = suggestion['relevant_file'].strip('`').strip("'").strip('\n') + relevant_line_str = suggestion['relevant_line'].strip('\n') if not relevant_line_str: return "" @@ -413,10 +792,67 @@ def generate_link_to_relevant_line_number(self, suggestion) -> str: # link to diff sha_file = hashlib.sha256(relevant_file.encode('utf-8')).hexdigest() - link = f"https://github.com/{self.repo}/pull/{self.pr_num}/files#diff-{sha_file}R{absolute_position}" + link = f"{self.base_url_html}/{self.repo}/pull/{self.pr_num}/files#diff-{sha_file}R{absolute_position}" return link except Exception as e: if get_settings().config.verbosity_level >= 2: - logging.info(f"Failed adding line link, error: {e}") + get_logger().info(f"Failed adding line link, error: {e}") return "" + + def get_line_link(self, relevant_file: str, relevant_line_start: int, relevant_line_end: int = None) -> str: + sha_file = hashlib.sha256(relevant_file.encode('utf-8')).hexdigest() + if relevant_line_start == -1: + link = f"{self.base_url_html}/{self.repo}/pull/{self.pr_num}/files#diff-{sha_file}" + elif relevant_line_end: + link = f"{self.base_url_html}/{self.repo}/pull/{self.pr_num}/files#diff-{sha_file}R{relevant_line_start}-R{relevant_line_end}" + else: + link = f"{self.base_url_html}/{self.repo}/pull/{self.pr_num}/files#diff-{sha_file}R{relevant_line_start}" + return link + + def get_lines_link_original_file(self, filepath: str, component_range: Range) -> str: + """ + Returns the link to the original file on GitHub that corresponds to the given filepath and component range. + + Args: + filepath (str): The path of the file. + component_range (Range): The range of lines that represent the component. + + Returns: + str: The link to the original file on GitHub. 
+ + Example: + >>> filepath = "path/to/file.py" + >>> component_range = Range(line_start=10, line_end=20) + >>> link = get_lines_link_original_file(filepath, component_range) + >>> print(link) + "https://github.com/{repo}/blob/{commit_sha}/{filepath}/#L11-L21" + """ + line_start = component_range.line_start + 1 + line_end = component_range.line_end + 1 + # link = (f"https://github.com/{self.repo}/blob/{self.last_commit_id.sha}/{filepath}/" + # f"#L{line_start}-L{line_end}") + link = (f"{self.base_url_html}/{self.repo}/blob/{self.last_commit_id.sha}/{filepath}/" + f"#L{line_start}-L{line_end}") + + return link + + def get_pr_id(self): + try: + pr_id = f"{self.repo}/{self.pr_num}" + return pr_id + except: + return "" + + def auto_approve(self) -> bool: + try: + res = self.pr.create_review(event="APPROVE") + if res.state == "APPROVED": + return True + return False + except Exception as e: + get_logger().exception(f"Failed to auto-approve, error: {e}") + return False + + def calc_pr_statistics(self, pull_request_data: dict): + return {} diff --git a/pr_agent/git_providers/gitlab_provider.py b/pr_agent/git_providers/gitlab_provider.py index 73a3a2f92..443f41618 100644 --- a/pr_agent/git_providers/gitlab_provider.py +++ b/pr_agent/git_providers/gitlab_provider.py @@ -1,19 +1,24 @@ -import logging +import hashlib import re from typing import Optional, Tuple from urllib.parse import urlparse import gitlab +import requests from gitlab import GitlabGetError +from ..algo.file_filter import filter_ignored from ..algo.language_handler import is_valid_file -from ..algo.pr_processing import clip_tokens -from ..algo.utils import load_large_diff +from ..algo.utils import load_large_diff, clip_tokens, find_line_number_of_relevant_line_in_file from ..config_loader import get_settings -from .git_provider import EDIT_TYPE, FilePatchInfo, GitProvider +from .git_provider import GitProvider, MAX_FILES_ALLOWED_FULL +from pr_agent.algo.types import EDIT_TYPE, FilePatchInfo +from ..log import get_logger -logger = logging.getLogger() +class DiffNotFoundError(Exception): + """Raised when the diff for a merge request cannot be found.""" + pass class GitLabProvider(GitProvider): @@ -21,6 +26,7 @@ def __init__(self, merge_request_url: Optional[str] = None, incremental: Optiona gitlab_url = get_settings().get("GITLAB.URL", None) if not gitlab_url: raise ValueError("GitLab URL is not set in the config file") + self.gitlab_url = gitlab_url gitlab_access_token = get_settings().get("GITLAB.PERSONAL_ACCESS_TOKEN", None) if not gitlab_access_token: raise ValueError("GitLab personal access token is not set in the config file") @@ -28,19 +34,22 @@ def __init__(self, merge_request_url: Optional[str] = None, incremental: Optiona url=gitlab_url, oauth_token=gitlab_access_token ) + self.max_comment_chars = 65000 self.id_project = None self.id_mr = None self.mr = None self.diff_files = None self.git_files = None self.temp_comments = [] + self.pr_url = merge_request_url self._set_merge_request(merge_request_url) self.RE_HUNK_HEADER = re.compile( r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@[ ]?(.*)") self.incremental = incremental def is_supported(self, capability: str) -> bool: - if capability in ['get_issue_comments', 'create_inline_comment', 'publish_inline_comments']: + if capability in ['get_issue_comments', 'create_inline_comment', 'publish_inline_comments', + 'publish_file_comments']: # gfm_markdown is supported in gitlab ! 
return False return True @@ -55,11 +64,11 @@ def _set_merge_request(self, merge_request_url: str): try: self.last_diff = self.mr.diffs.list(get_all=True)[-1] except IndexError as e: - logger.error(f"Could not get diff for merge request {self.id_mr}") - raise ValueError(f"Could not get diff for merge request {self.id_mr}") from e + get_logger().error(f"Could not get diff for merge request {self.id_mr}") + raise DiffNotFoundError(f"Could not get diff for merge request {self.id_mr}") from e - def _get_pr_file_content(self, file_path: str, branch: str) -> str: + def get_pr_file_content(self, file_path: str, branch: str) -> str: try: return self.gl.projects.get(self.id_project).files.get(file_path, branch).decode() except GitlabGetError: @@ -80,47 +89,81 @@ def get_diff_files(self) -> list[FilePatchInfo]: if self.diff_files: return self.diff_files - diffs = self.mr.changes()['changes'] + # filter files using [ignore] patterns + diffs_original = self.mr.changes()['changes'] + diffs = filter_ignored(diffs_original, 'gitlab') + if diffs != diffs_original: + try: + names_original = [diff['new_path'] for diff in diffs_original] + names_filtered = [diff['new_path'] for diff in diffs] + get_logger().info(f"Filtered out [ignore] files for merge request {self.id_mr}", extra={ + 'original_files': names_original, + 'filtered_files': names_filtered + }) + except Exception as e: + pass + diff_files = [] + invalid_files_names = [] + counter_valid = 0 for diff in diffs: - if is_valid_file(diff['new_path']): - # original_file_content_str = self._get_pr_file_content(diff['old_path'], self.mr.target_branch) - # new_file_content_str = self._get_pr_file_content(diff['new_path'], self.mr.source_branch) - original_file_content_str = self._get_pr_file_content(diff['old_path'], self.mr.diff_refs['base_sha']) - new_file_content_str = self._get_pr_file_content(diff['new_path'], self.mr.diff_refs['head_sha']) + if not is_valid_file(diff['new_path']): + invalid_files_names.append(diff['new_path']) + continue + + # allow only a limited number of files to be fully loaded. 
We can manage the rest with diffs only + counter_valid += 1 + if counter_valid < MAX_FILES_ALLOWED_FULL or not diff['diff']: + original_file_content_str = self.get_pr_file_content(diff['old_path'], self.mr.diff_refs['base_sha']) + new_file_content_str = self.get_pr_file_content(diff['new_path'], self.mr.diff_refs['head_sha']) + else: + if counter_valid == MAX_FILES_ALLOWED_FULL: + get_logger().info(f"Too many files in PR, will avoid loading full content for rest of files") + original_file_content_str = '' + new_file_content_str = '' + + try: + if isinstance(original_file_content_str, bytes): + original_file_content_str = bytes.decode(original_file_content_str, 'utf-8') + if isinstance(new_file_content_str, bytes): + new_file_content_str = bytes.decode(new_file_content_str, 'utf-8') + except UnicodeDecodeError: + get_logger().warning( + f"Cannot decode file {diff['old_path']} or {diff['new_path']} in merge request {self.id_mr}") + + edit_type = EDIT_TYPE.MODIFIED + if diff['new_file']: + edit_type = EDIT_TYPE.ADDED + elif diff['deleted_file']: + edit_type = EDIT_TYPE.DELETED + elif diff['renamed_file']: + edit_type = EDIT_TYPE.RENAMED + + filename = diff['new_path'] + patch = diff['diff'] + if not patch: + patch = load_large_diff(filename, new_file_content_str, original_file_content_str) + + + # count number of lines added and removed + patch_lines = patch.splitlines(keepends=True) + num_plus_lines = len([line for line in patch_lines if line.startswith('+')]) + num_minus_lines = len([line for line in patch_lines if line.startswith('-')]) + diff_files.append( + FilePatchInfo(original_file_content_str, new_file_content_str, + patch=patch, + filename=filename, + edit_type=edit_type, + old_filename=None if diff['old_path'] == diff['new_path'] else diff['old_path'], + num_plus_lines=num_plus_lines, + num_minus_lines=num_minus_lines, )) + if invalid_files_names: + get_logger().info(f"Filtered out files with invalid extensions: {invalid_files_names}") - try: - if isinstance(original_file_content_str, bytes): - original_file_content_str = bytes.decode(original_file_content_str, 'utf-8') - if isinstance(new_file_content_str, bytes): - new_file_content_str = bytes.decode(new_file_content_str, 'utf-8') - except UnicodeDecodeError: - logging.warning( - f"Cannot decode file {diff['old_path']} or {diff['new_path']} in merge request {self.id_mr}") - - edit_type = EDIT_TYPE.MODIFIED - if diff['new_file']: - edit_type = EDIT_TYPE.ADDED - elif diff['deleted_file']: - edit_type = EDIT_TYPE.DELETED - elif diff['renamed_file']: - edit_type = EDIT_TYPE.RENAMED - - filename = diff['new_path'] - patch = diff['diff'] - if not patch: - patch = load_large_diff(filename, new_file_content_str, original_file_content_str) - - diff_files.append( - FilePatchInfo(original_file_content_str, new_file_content_str, - patch=patch, - filename=filename, - edit_type=edit_type, - old_filename=None if diff['old_path'] == diff['new_path'] else diff['old_path'])) self.diff_files = diff_files return diff_files - def get_files(self): + def get_files(self) -> list: if not self.git_files: self.git_files = [change['new_path'] for change in self.mr.changes()['changes']] return self.git_files @@ -131,35 +174,76 @@ def publish_description(self, pr_title: str, pr_body: str): self.mr.description = pr_body self.mr.save() except Exception as e: - logging.exception(f"Could not update merge request {self.id_mr} description: {e}") + get_logger().exception(f"Could not update merge request {self.id_mr} description: {e}") + + def 
get_latest_commit_url(self): + return self.mr.commits().next().web_url + + def get_comment_url(self, comment): + return f"{self.mr.web_url}#note_{comment.id}" + + def publish_persistent_comment(self, pr_comment: str, + initial_header: str, + update_header: bool = True, + name='review', + final_update_message=True): + self.publish_persistent_comment_full(pr_comment, initial_header, update_header, name, final_update_message) def publish_comment(self, mr_comment: str, is_temporary: bool = False): + mr_comment = self.limit_output_characters(mr_comment, self.max_comment_chars) comment = self.mr.notes.create({'body': mr_comment}) if is_temporary: self.temp_comments.append(comment) + return comment + + def edit_comment(self, comment, body: str): + body = self.limit_output_characters(body, self.max_comment_chars) + self.mr.notes.update(comment.id,{'body': body} ) + + def edit_comment_from_comment_id(self, comment_id: int, body: str): + body = self.limit_output_characters(body, self.max_comment_chars) + comment = self.mr.notes.get(comment_id) + comment.body = body + comment.save() - def publish_inline_comment(self, body: str, relevant_file: str, relevant_line_in_file: str): + def reply_to_comment_from_comment_id(self, comment_id: int, body: str): + body = self.limit_output_characters(body, self.max_comment_chars) + discussion = self.mr.discussions.get(comment_id) + discussion.notes.create({'body': body}) + + def publish_inline_comment(self, body: str, relevant_file: str, relevant_line_in_file: str, original_suggestion=None): + body = self.limit_output_characters(body, self.max_comment_chars) edit_type, found, source_line_no, target_file, target_line_no = self.search_line(relevant_file, relevant_line_in_file) self.send_inline_comment(body, edit_type, found, relevant_file, relevant_line_in_file, source_line_no, - target_file, target_line_no) + target_file, target_line_no, original_suggestion) - def create_inline_comment(self, body: str, relevant_file: str, relevant_line_in_file: str): + def create_inline_comment(self, body: str, relevant_file: str, relevant_line_in_file: str, absolute_position: int = None): raise NotImplementedError("Gitlab provider does not support creating inline comments yet") def create_inline_comments(self, comments: list[dict]): raise NotImplementedError("Gitlab provider does not support publishing inline comments yet") - def send_inline_comment(self, body, edit_type, found, relevant_file, relevant_line_in_file, source_line_no, - target_file, target_line_no): + def get_comment_body_from_comment_id(self, comment_id: int): + comment = self.mr.notes.get(comment_id).body + return comment + + def send_inline_comment(self, body: str, edit_type: str, found: bool, relevant_file: str, + relevant_line_in_file: str, + source_line_no: int, target_file: str, target_line_no: int, + original_suggestion=None) -> None: if not found: - logging.info(f"Could not find position for {relevant_file} {relevant_line_in_file}") + get_logger().info(f"Could not find position for {relevant_file} {relevant_line_in_file}") else: - d = self.last_diff + # in order to have exact sha's we have to find correct diff for this change + diff = self.get_relevant_diff(relevant_file, relevant_line_in_file) + if diff is None: + get_logger().error(f"Could not get diff for merge request {self.id_mr}") + raise DiffNotFoundError(f"Could not get diff for merge request {self.id_mr}") pos_obj = {'position_type': 'text', 'new_path': target_file.filename, 'old_path': target_file.old_filename if target_file.old_filename else 
target_file.filename, - 'base_sha': d.base_commit_sha, 'start_sha': d.start_commit_sha, 'head_sha': d.head_commit_sha} + 'base_sha': diff.base_commit_sha, 'start_sha': diff.start_commit_sha, 'head_sha': diff.head_commit_sha} if edit_type == 'deletion': pos_obj['old_line'] = source_line_no - 1 elif edit_type == 'addition': @@ -167,13 +251,86 @@ def send_inline_comment(self, body, edit_type, found, relevant_file, relevant_li else: pos_obj['new_line'] = target_line_no - 1 pos_obj['old_line'] = source_line_no - 1 - logging.debug(f"Creating comment in {self.id_mr} with body {body} and position {pos_obj}") - self.mr.discussions.create({'body': body, - 'position': pos_obj}) - - def publish_code_suggestions(self, code_suggestions: list): + get_logger().debug(f"Creating comment in MR {self.id_mr} with body {body} and position {pos_obj}") + try: + self.mr.discussions.create({'body': body, 'position': pos_obj}) + except Exception as e: + try: + # fallback - create a general note on the file in the MR + if 'suggestion_orig_location' in original_suggestion: + line_start = original_suggestion['suggestion_orig_location']['start_line'] + line_end = original_suggestion['suggestion_orig_location']['end_line'] + old_code_snippet = original_suggestion['prev_code_snippet'] + new_code_snippet = original_suggestion['new_code_snippet'] + content = original_suggestion['suggestion_summary'] + label = original_suggestion['category'] + if 'score' in original_suggestion: + score = original_suggestion['score'] + else: + score = 7 + else: + line_start = original_suggestion['relevant_lines_start'] + line_end = original_suggestion['relevant_lines_end'] + old_code_snippet = original_suggestion['existing_code'] + new_code_snippet = original_suggestion['improved_code'] + content = original_suggestion['suggestion_content'] + label = original_suggestion['label'] + if 'score' in original_suggestion: + score = original_suggestion['score'] + else: + score = 7 + + if hasattr(self, 'main_language'): + language = self.main_language + else: + language = '' + link = self.get_line_link(relevant_file, line_start, line_end) + body_fallback =f"**Suggestion:** {content} [{label}, importance: {score}]\n___\n" + body_fallback +=f"\n\nReplace lines ([{line_start}-{line_end}]({link}))\n\n```{language}\n{old_code_snippet}\n````\n\n" + body_fallback +=f"with\n\n```{language}\n{new_code_snippet}\n````" + body_fallback += f"\n\n___\n\n`(Cannot implement this suggestion directly, as gitlab API does not enable committing to a non -+ line in a PR)`" + + # Create a general note on the file in the MR + self.mr.notes.create({ + 'body': body_fallback, + 'position': { + 'base_sha': diff.base_commit_sha, + 'start_sha': diff.start_commit_sha, + 'head_sha': diff.head_commit_sha, + 'position_type': 'text', + 'file_path': f'{target_file.filename}', + } + }) + + # get_logger().debug( + # f"Failed to create comment in MR {self.id_mr} with position {pos_obj} (probably not a '+' line)") + except Exception as e: + get_logger().exception(f"Failed to create comment in MR {self.id_mr}") + + def get_relevant_diff(self, relevant_file: str, relevant_line_in_file: str) -> Optional[dict]: + changes = self.mr.changes() # Retrieve the changes for the merge request once + if not changes: + get_logger().error('No changes found for the merge request.') + return None + all_diffs = self.mr.diffs.list(get_all=True) + if not all_diffs: + get_logger().error('No diffs found for the merge request.') + return None + for diff in all_diffs: + for change in changes['changes']: + if 
change['new_path'] == relevant_file and relevant_line_in_file in change['diff']: + return diff + get_logger().debug( + f'No relevant diff found for {relevant_file} {relevant_line_in_file}. Falling back to last diff.') + return self.last_diff # fallback to last_diff if no relevant diff is found + + def publish_code_suggestions(self, code_suggestions: list) -> bool: for suggestion in code_suggestions: try: + if suggestion and 'original_suggestion' in suggestion: + original_suggestion = suggestion['original_suggestion'] + else: + original_suggestion = suggestion body = suggestion['body'] relevant_file = suggestion['relevant_file'] relevant_lines_start = suggestion['relevant_lines_start'] @@ -194,15 +351,21 @@ def publish_code_suggestions(self, code_suggestions: list): # edit_type, found, source_line_no, target_file, target_line_no = self.find_in_file(target_file, # relevant_line_in_file) # for code suggestions, we want to edit the new code - source_line_no = None + source_line_no = -1 target_line_no = relevant_lines_start + 1 found = True edit_type = 'addition' self.send_inline_comment(body, edit_type, found, relevant_file, relevant_line_in_file, source_line_no, - target_file, target_line_no) + target_file, target_line_no, original_suggestion) except Exception as e: - logging.exception(f"Could not publish code suggestion:\nsuggestion: {suggestion}\nerror: {e}") + get_logger().exception(f"Could not publish code suggestion:\nsuggestion: {suggestion}\nerror: {e}") + + # note that we publish suggestions one-by-one. so, if one fails, the rest will still be published + return True + + def publish_file_comments(self, file_comments: list) -> bool: + pass def search_line(self, relevant_file, relevant_line_in_file): target_file = None @@ -261,9 +424,15 @@ def get_edit_type(self, relevant_line_in_file): def remove_initial_comment(self): try: for comment in self.temp_comments: - comment.delete() + self.remove_comment(comment) + except Exception as e: + get_logger().exception(f"Failed to remove temp comments, error: {e}") + + def remove_comment(self, comment): + try: + comment.delete() except Exception as e: - logging.exception(f"Failed to remove temp comments, error: {e}") + get_logger().exception(f"Failed to remove comment, error: {e}") def get_title(self): return self.mr.title @@ -275,23 +444,32 @@ def get_languages(self): def get_pr_branch(self): return self.mr.source_branch - def get_pr_description(self): - max_tokens = get_settings().get("CONFIG.MAX_DESCRIPTION_TOKENS", None) - if max_tokens: - return clip_tokens(self.mr.description, max_tokens) + def get_pr_owner_id(self) -> str | None: + if not self.gitlab_url or 'gitlab.com' in self.gitlab_url: + if not self.id_project: + return None + return self.id_project.split('/')[0] + # extract host name + host = urlparse(self.gitlab_url).hostname + return host + + def get_pr_description_full(self): return self.mr.description def get_issue_comments(self): - raise NotImplementedError("GitLab provider does not support issue comments yet") + return self.mr.notes.list(get_all=True)[::-1] def get_repo_settings(self): try: - contents = self.gl.projects.get(self.id_project).files.get(file_path='.pr_agent.toml', ref=self.mr.source_branch) + contents = self.gl.projects.get(self.id_project).files.get(file_path='.pr_agent.toml', ref=self.mr.target_branch).decode() return contents except Exception: return "" - def add_eyes_reaction(self, issue_comment_id: int) -> Optional[int]: + def get_workspace_name(self): + return self.id_project.split('/')[0] + + def 
add_eyes_reaction(self, issue_comment_id: int, disable_eyes: bool = False) -> Optional[int]: return True def remove_reaction(self, issue_comment_id: int, reaction_id: int) -> bool: @@ -334,14 +512,17 @@ def publish_labels(self, pr_types): self.mr.labels = list(set(pr_types)) self.mr.save() except Exception as e: - logging.exception(f"Failed to publish labels, error: {e}") + get_logger().warning(f"Failed to publish labels, error: {e}") def publish_inline_comments(self, comments: list[dict]): pass - def get_labels(self): + def get_pr_labels(self, update=False): return self.mr.labels + def get_repo_labels(self): + return self.gl.projects.get(self.id_project).labels.list() + def get_commit_messages(self): """ Retrieves the commit messages of a pull request. @@ -357,4 +538,45 @@ def get_commit_messages(self): commit_messages_str = "" if max_tokens: commit_messages_str = clip_tokens(commit_messages_str, max_tokens) - return commit_messages_str \ No newline at end of file + return commit_messages_str + + def get_pr_id(self): + try: + pr_id = self.mr.web_url + return pr_id + except: + return "" + + def get_line_link(self, relevant_file: str, relevant_line_start: int, relevant_line_end: int = None) -> str: + if relevant_line_start == -1: + link = f"{self.gl.url}/{self.id_project}/-/blob/{self.mr.source_branch}/{relevant_file}?ref_type=heads" + elif relevant_line_end: + link = f"{self.gl.url}/{self.id_project}/-/blob/{self.mr.source_branch}/{relevant_file}?ref_type=heads#L{relevant_line_start}-{relevant_line_end}" + else: + link = f"{self.gl.url}/{self.id_project}/-/blob/{self.mr.source_branch}/{relevant_file}?ref_type=heads#L{relevant_line_start}" + return link + + + def generate_link_to_relevant_line_number(self, suggestion) -> str: + try: + relevant_file = suggestion['relevant_file'].strip('`').strip("'").rstrip() + relevant_line_str = suggestion['relevant_line'].rstrip() + if not relevant_line_str: + return "" + + position, absolute_position = find_line_number_of_relevant_line_in_file \ + (self.diff_files, relevant_file, relevant_line_str) + + if absolute_position != -1: + # link to right file only + link = f"{self.gl.url}/{self.id_project}/-/blob/{self.mr.source_branch}/{relevant_file}?ref_type=heads#L{absolute_position}" + + # # link to diff + # sha_file = hashlib.sha1(relevant_file.encode('utf-8')).hexdigest() + # link = f"{self.pr.web_url}/diffs#{sha_file}_{absolute_position}_{absolute_position}" + return link + except Exception as e: + if get_settings().config.verbosity_level >= 2: + get_logger().info(f"Failed adding line link, error: {e}") + + return "" diff --git a/pr_agent/git_providers/local_git_provider.py b/pr_agent/git_providers/local_git_provider.py index a4f21969a..c104224ac 100644 --- a/pr_agent/git_providers/local_git_provider.py +++ b/pr_agent/git_providers/local_git_provider.py @@ -1,4 +1,3 @@ -import logging from collections import Counter from pathlib import Path from typing import List @@ -6,7 +5,9 @@ from git import Repo from pr_agent.config_loader import _find_repository_root, get_settings -from pr_agent.git_providers.git_provider import EDIT_TYPE, FilePatchInfo, GitProvider +from pr_agent.git_providers.git_provider import GitProvider +from pr_agent.algo.types import EDIT_TYPE, FilePatchInfo +from pr_agent.log import get_logger class PullRequestMimic: @@ -49,14 +50,15 @@ def _prepare_repo(self): """ Prepare the repository for PR-mimic generation. 
""" - logging.debug('Preparing repository for PR-mimic generation...') + get_logger().debug('Preparing repository for PR-mimic generation...') if self.repo.is_dirty(): raise ValueError('The repository is not in a clean state. Please commit or stash pending changes.') if self.target_branch_name not in self.repo.heads: raise KeyError(f'Branch: {self.target_branch_name} does not exist') def is_supported(self, capability: str) -> bool: - if capability in ['get_issue_comments', 'create_inline_comment', 'publish_inline_comments', 'get_labels']: + if capability in ['get_issue_comments', 'create_inline_comment', 'publish_inline_comments', 'get_labels', + 'gfm_markdown']: return False return True @@ -117,12 +119,9 @@ def publish_comment(self, pr_comment: str, is_temporary: bool = False): # Write the string to the file file.write(pr_comment) - def publish_inline_comment(self, body: str, relevant_file: str, relevant_line_in_file: str): + def publish_inline_comment(self, body: str, relevant_file: str, relevant_line_in_file: str, original_suggestion=None): raise NotImplementedError('Publishing inline comments is not implemented for the local git provider') - def create_inline_comment(self, body: str, relevant_file: str, relevant_line_in_file: str): - raise NotImplementedError('Creating inline comments is not implemented for the local git provider') - def publish_inline_comments(self, comments: list[dict]): raise NotImplementedError('Publishing inline comments is not implemented for the local git provider') @@ -130,7 +129,7 @@ def publish_code_suggestion(self, body: str, relevant_file: str, relevant_lines_start: int, relevant_lines_end: int): raise NotImplementedError('Publishing code suggestions is not implemented for the local git provider') - def publish_code_suggestions(self, code_suggestions: list): + def publish_code_suggestions(self, code_suggestions: list) -> bool: raise NotImplementedError('Publishing code suggestions is not implemented for the local git provider') def publish_labels(self, labels): @@ -139,6 +138,9 @@ def publish_labels(self, labels): def remove_initial_comment(self): pass # Not applicable to the local git provider, but required by the interface + def remove_comment(self, comment): + pass # Not applicable to the local git provider, but required by the interface + def get_languages(self): """ Calculate percentage of languages in repository. Used for hunk prioritisation. 
@@ -158,7 +160,7 @@ def get_pr_branch(self): def get_user_id(self): return -1 # Not used anywhere for the local provider, but required by the interface - def get_pr_description(self): + def get_pr_description_full(self): commits_diff = list(self.repo.iter_commits(self.target_branch_name + '..HEAD')) # Get the commit messages and concatenate commit_messages = " ".join([commit.message for commit in commits_diff]) @@ -174,5 +176,5 @@ def get_pr_title(self): def get_issue_comments(self): raise NotImplementedError('Getting issue comments is not implemented for the local git provider') - def get_labels(self): + def get_pr_labels(self, update=False): raise NotImplementedError('Getting labels is not implemented for the local git provider') diff --git a/pr_agent/git_providers/utils.py b/pr_agent/git_providers/utils.py new file mode 100644 index 000000000..8a9579cff --- /dev/null +++ b/pr_agent/git_providers/utils.py @@ -0,0 +1,63 @@ +import copy +import os +import tempfile + +from dynaconf import Dynaconf + +from pr_agent.config_loader import get_settings +from pr_agent.git_providers import get_git_provider, get_git_provider_with_context +from pr_agent.log import get_logger +from starlette_context import context + + +def apply_repo_settings(pr_url): + git_provider = get_git_provider_with_context(pr_url) + if get_settings().config.use_repo_settings_file: + repo_settings_file = None + try: + try: + repo_settings = context.get("repo_settings", None) + except Exception: + repo_settings = None + pass + if repo_settings is None: # None is different from "", which is a valid value + repo_settings = git_provider.get_repo_settings() + try: + context["repo_settings"] = repo_settings + except Exception: + pass + + if repo_settings: + repo_settings_file = None + fd, repo_settings_file = tempfile.mkstemp(suffix='.toml') + os.write(fd, repo_settings) + new_settings = Dynaconf(settings_files=[repo_settings_file]) + for section, contents in new_settings.as_dict().items(): + section_dict = copy.deepcopy(get_settings().as_dict().get(section, {})) + for key, value in contents.items(): + section_dict[key] = value + get_settings().unset(section) + get_settings().set(section, section_dict, merge=False) + get_logger().info(f"Applying repo settings:\n{new_settings.as_dict()}") + except Exception as e: + get_logger().exception("Failed to apply repo settings", e) + finally: + if repo_settings_file: + try: + os.remove(repo_settings_file) + except Exception as e: + get_logger().error(f"Failed to remove temporary settings file {repo_settings_file}", e) + + # enable switching models with a short definition + if get_settings().config.model.lower()=='claude-3-5-sonnet': + set_claude_model() + + +def set_claude_model(): + """ + set the claude-sonnet-3.5 model easily (even by users), just by stating: --config.model='claude-3-5-sonnet' + """ + model_claude = "bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0" + get_settings().set('config.model', model_claude) + get_settings().set('config.model_turbo', model_claude) + get_settings().set('config.fallback_models', [model_claude]) diff --git a/pr_agent/identity_providers/__init__.py b/pr_agent/identity_providers/__init__.py new file mode 100644 index 000000000..6df37ecb6 --- /dev/null +++ b/pr_agent/identity_providers/__init__.py @@ -0,0 +1,13 @@ +from pr_agent.config_loader import get_settings +from pr_agent.identity_providers.default_identity_provider import DefaultIdentityProvider + +_IDENTITY_PROVIDERS = { + 'default': DefaultIdentityProvider +} + + +def get_identity_provider(): + 
identity_provider_id = get_settings().get("CONFIG.IDENTITY_PROVIDER", "default") + if identity_provider_id not in _IDENTITY_PROVIDERS: + raise ValueError(f"Unknown identity provider: {identity_provider_id}") + return _IDENTITY_PROVIDERS[identity_provider_id]() \ No newline at end of file diff --git a/pr_agent/identity_providers/default_identity_provider.py b/pr_agent/identity_providers/default_identity_provider.py new file mode 100644 index 000000000..c542e1c28 --- /dev/null +++ b/pr_agent/identity_providers/default_identity_provider.py @@ -0,0 +1,9 @@ +from pr_agent.identity_providers.identity_provider import Eligibility, IdentityProvider + + +class DefaultIdentityProvider(IdentityProvider): + def verify_eligibility(self, git_provider, git_provider_id, pr_url): + return Eligibility.ELIGIBLE + + def inc_invocation_count(self, git_provider, git_provider_id): + pass diff --git a/pr_agent/identity_providers/identity_provider.py b/pr_agent/identity_providers/identity_provider.py new file mode 100644 index 000000000..58e5f6c63 --- /dev/null +++ b/pr_agent/identity_providers/identity_provider.py @@ -0,0 +1,18 @@ +from abc import ABC, abstractmethod +from enum import Enum + + +class Eligibility(Enum): + NOT_ELIGIBLE = 0 + ELIGIBLE = 1 + TRIAL = 2 + + +class IdentityProvider(ABC): + @abstractmethod + def verify_eligibility(self, git_provider, git_provier_id, pr_url): + pass + + @abstractmethod + def inc_invocation_count(self, git_provider, git_provider_id): + pass diff --git a/pr_agent/log/__init__.py b/pr_agent/log/__init__.py new file mode 100644 index 000000000..53430ea1a --- /dev/null +++ b/pr_agent/log/__init__.py @@ -0,0 +1,65 @@ +import json +import logging +import os +import sys +from enum import Enum + +from loguru import logger + +from pr_agent.config_loader import get_settings + + +class LoggingFormat(str, Enum): + CONSOLE = "CONSOLE" + JSON = "JSON" + + +def json_format(record: dict) -> str: + return record["message"] + + +def analytics_filter(record: dict) -> bool: + return record.get("extra", {}).get("analytics", False) + + +def inv_analytics_filter(record: dict) -> bool: + return not record.get("extra", {}).get("analytics", False) + + +def setup_logger(level: str = "INFO", fmt: LoggingFormat = LoggingFormat.CONSOLE): + level: int = logging.getLevelName(level.upper()) + if type(level) is not int: + level = logging.INFO + + if fmt == LoggingFormat.JSON and os.getenv("LOG_SANE", "0").lower() == "0": # better debugging github_app + logger.remove(None) + logger.add( + sys.stdout, + filter=inv_analytics_filter, + level=level, + format="{message}", + colorize=False, + serialize=True, + ) + elif fmt == LoggingFormat.CONSOLE: # does not print the 'extra' fields + logger.remove(None) + logger.add(sys.stdout, level=level, colorize=True, filter=inv_analytics_filter) + + log_folder = get_settings().get("CONFIG.ANALYTICS_FOLDER", "") + if log_folder: + pid = os.getpid() + log_file = os.path.join(log_folder, f"pr-agent.{pid}.log") + logger.add( + log_file, + filter=analytics_filter, + level=level, + format="{message}", + colorize=False, + serialize=True, + ) + + return logger + + +def get_logger(*args, **kwargs): + return logger diff --git a/pr_agent/secret_providers/__init__.py b/pr_agent/secret_providers/__init__.py new file mode 100644 index 000000000..020ed16c4 --- /dev/null +++ b/pr_agent/secret_providers/__init__.py @@ -0,0 +1,19 @@ +from pr_agent.config_loader import get_settings + + +def get_secret_provider(): + if not get_settings().get("CONFIG.SECRET_PROVIDER"): + return None + + 
provider_id = get_settings().config.secret_provider + if provider_id == 'google_cloud_storage': + try: + from pr_agent.secret_providers.google_cloud_storage_secret_provider import GoogleCloudStorageSecretProvider + return GoogleCloudStorageSecretProvider() + except Exception as e: + raise ValueError(f"Failed to initialize google_cloud_storage secret provider {provider_id}") from e + else: + raise ValueError("Unknown SECRET_PROVIDER") + + + diff --git a/pr_agent/secret_providers/google_cloud_storage_secret_provider.py b/pr_agent/secret_providers/google_cloud_storage_secret_provider.py new file mode 100644 index 000000000..8cbaebe31 --- /dev/null +++ b/pr_agent/secret_providers/google_cloud_storage_secret_provider.py @@ -0,0 +1,34 @@ +import ujson +from google.cloud import storage + +from pr_agent.config_loader import get_settings +from pr_agent.log import get_logger +from pr_agent.secret_providers.secret_provider import SecretProvider + + +class GoogleCloudStorageSecretProvider(SecretProvider): + def __init__(self): + try: + self.client = storage.Client.from_service_account_info(ujson.loads(get_settings().google_cloud_storage. + service_account)) + self.bucket_name = get_settings().google_cloud_storage.bucket_name + self.bucket = self.client.bucket(self.bucket_name) + except Exception as e: + get_logger().error(f"Failed to initialize Google Cloud Storage Secret Provider: {e}") + raise e + + def get_secret(self, secret_name: str) -> str: + try: + blob = self.bucket.blob(secret_name) + return blob.download_as_string() + except Exception as e: + get_logger().warning(f"Failed to get secret {secret_name} from Google Cloud Storage: {e}") + return "" + + def store_secret(self, secret_name: str, secret_value: str): + try: + blob = self.bucket.blob(secret_name) + blob.upload_from_string(secret_value) + except Exception as e: + get_logger().error(f"Failed to store secret {secret_name} in Google Cloud Storage: {e}") + raise e diff --git a/pr_agent/secret_providers/secret_provider.py b/pr_agent/secret_providers/secret_provider.py new file mode 100644 index 000000000..df1e77806 --- /dev/null +++ b/pr_agent/secret_providers/secret_provider.py @@ -0,0 +1,12 @@ +from abc import ABC, abstractmethod + + +class SecretProvider(ABC): + + @abstractmethod + def get_secret(self, secret_name: str) -> str: + pass + + @abstractmethod + def store_secret(self, secret_name: str, secret_value: str): + pass diff --git a/pr_agent/servers/__init__.py b/pr_agent/servers/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/pr_agent/servers/atlassian-connect.json b/pr_agent/servers/atlassian-connect.json new file mode 100644 index 000000000..1ff508652 --- /dev/null +++ b/pr_agent/servers/atlassian-connect.json @@ -0,0 +1,33 @@ +{ + "name": "CodiumAI PR-Agent", + "description": "CodiumAI PR-Agent", + "key": "app_key", + "vendor": { + "name": "CodiumAI", + "url": "https://codium.ai" + }, + "authentication": { + "type": "jwt" + }, + "baseUrl": "base_url", + "lifecycle": { + "installed": "/installed", + "uninstalled": "/uninstalled" + }, + "scopes": [ + "account", + "repository", + "pullrequest" + ], + "contexts": [ + "account" + ], + "modules": { + "webhooks": [ + { + "event": "*", + "url": "/webhook" + } + ] + } +} \ No newline at end of file diff --git a/pr_agent/servers/azuredevops_server_webhook.py b/pr_agent/servers/azuredevops_server_webhook.py new file mode 100644 index 000000000..37446659a --- /dev/null +++ b/pr_agent/servers/azuredevops_server_webhook.py @@ -0,0 +1,148 @@ +# This file contains the 
code for the Azure DevOps Server webhook server. +# The server listens for incoming webhooks from Azure DevOps Server and forwards them to the PR Agent. +# ADO webhook documentation: https://learn.microsoft.com/en-us/azure/devops/service-hooks/services/webhooks?view=azure-devops + +import json +import os +import re +import secrets +from urllib.parse import unquote + +import uvicorn +from fastapi import APIRouter, Depends, FastAPI, HTTPException +from fastapi.security import HTTPBasic, HTTPBasicCredentials +from fastapi.encoders import jsonable_encoder +from starlette import status +from starlette.background import BackgroundTasks +from starlette.middleware import Middleware +from starlette.requests import Request +from starlette.responses import JSONResponse +from starlette_context.middleware import RawContextMiddleware + +from pr_agent.agent.pr_agent import PRAgent, command2class +from pr_agent.algo.utils import update_settings_from_args +from pr_agent.config_loader import get_settings +from pr_agent.git_providers.utils import apply_repo_settings +from pr_agent.log import get_logger +from fastapi import Request, Depends +from fastapi.security import HTTPBasic, HTTPBasicCredentials +from pr_agent.log import LoggingFormat, get_logger, setup_logger + +setup_logger(fmt=LoggingFormat.JSON, level="DEBUG") +security = HTTPBasic() +router = APIRouter() +available_commands_rgx = re.compile(r"^\/(" + "|".join(command2class.keys()) + r")\s*") +azure_devops_server = get_settings().get("azure_devops_server") +WEBHOOK_USERNAME = azure_devops_server.get("webhook_username") +WEBHOOK_PASSWORD = azure_devops_server.get("webhook_password") + +def handle_request( + background_tasks: BackgroundTasks, url: str, body: str, log_context: dict +): + log_context["action"] = body + log_context["api_url"] = url + + async def inner(): + try: + with get_logger().contextualize(**log_context): + await PRAgent().handle_request(url, body) + except Exception as e: + get_logger().error(f"Failed to handle webhook: {e}") + + background_tasks.add_task(inner) + + +# currently only basic auth is supported with azure webhooks +# for this reason, https must be enabled to ensure the credentials are not sent in clear text +def authorize(credentials: HTTPBasicCredentials = Depends(security)): + is_user_ok = secrets.compare_digest(credentials.username, WEBHOOK_USERNAME) + is_pass_ok = secrets.compare_digest(credentials.password, WEBHOOK_PASSWORD) + if not (is_user_ok and is_pass_ok): + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail='Incorrect username or password.', + headers={'WWW-Authenticate': 'Basic'}, + ) + + +async def _perform_commands_azure(commands_conf: str, agent: PRAgent, api_url: str, log_context: dict): + apply_repo_settings(api_url) + commands = get_settings().get(f"azure_devops_server.{commands_conf}") + get_settings().set("config.is_auto_command", True) + for command in commands: + try: + split_command = command.split(" ") + command = split_command[0] + args = split_command[1:] + other_args = update_settings_from_args(args) + new_command = ' '.join([command] + other_args) + get_logger().info(f"Performing command: {new_command}") + with get_logger().contextualize(**log_context): + await agent.handle_request(api_url, new_command) + except Exception as e: + get_logger().error(f"Failed to perform command {command}: {e}") + + +@router.post("/", dependencies=[Depends(authorize)]) +async def handle_webhook(background_tasks: BackgroundTasks, request: Request): + log_context = {"server_type": 
"azure_devops_server"} + data = await request.json() + get_logger().info(json.dumps(data)) + + actions = [] + if data["eventType"] == "git.pullrequest.created": + # API V1 (latest) + pr_url = unquote(data["resource"]["_links"]["web"]["href"].replace("_apis/git/repositories", "_git")) + log_context["event"] = data["eventType"] + log_context["api_url"] = pr_url + await _perform_commands_azure("pr_commands", PRAgent(), pr_url, log_context) + return + elif data["eventType"] == "ms.vss-code.git-pullrequest-comment-event" and "content" in data["resource"]["comment"]: + if available_commands_rgx.match(data["resource"]["comment"]["content"]): + if(data["resourceVersion"] == "2.0"): + repo = data["resource"]["pullRequest"]["repository"]["webUrl"] + pr_url = unquote(f'{repo}/pullrequest/{data["resource"]["pullRequest"]["pullRequestId"]}') + actions = [data["resource"]["comment"]["content"]] + else: + # API V1 not supported as it does not contain the PR URL + return JSONResponse( + status_code=status.HTTP_400_BAD_REQUEST, + content=json.dumps({"message": "version 1.0 webhook for Azure Devops PR comment is not supported. please upgrade to version 2.0"})), + else: + return JSONResponse( + status_code=status.HTTP_400_BAD_REQUEST, + content=json.dumps({"message": "Unsupported command"}), + ) + else: + return JSONResponse( + status_code=status.HTTP_204_NO_CONTENT, + content=json.dumps({"message": "Unsupported event"}), + ) + + log_context["event"] = data["eventType"] + log_context["api_url"] = pr_url + + for action in actions: + try: + handle_request(background_tasks, pr_url, action, log_context) + except Exception as e: + get_logger().error("Azure DevOps Trigger failed. Error:" + str(e)) + return JSONResponse( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + content=json.dumps({"message": "Internal server error"}), + ) + return JSONResponse( + status_code=status.HTTP_202_ACCEPTED, content=jsonable_encoder({"message": "webhook triggered successfully"}) + ) + +@router.get("/") +async def root(): + return {"status": "ok"} + +def start(): + app = FastAPI(middleware=[Middleware(RawContextMiddleware)]) + app.include_router(router) + uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", "3000"))) + +if __name__ == "__main__": + start() diff --git a/pr_agent/servers/bitbucket_app.py b/pr_agent/servers/bitbucket_app.py new file mode 100644 index 000000000..a0384da14 --- /dev/null +++ b/pr_agent/servers/bitbucket_app.py @@ -0,0 +1,254 @@ +import base64 +import copy +import hashlib +import json +import os +import re +import time + +import jwt +import requests +import uvicorn +from fastapi import APIRouter, FastAPI, Request, Response +from starlette.background import BackgroundTasks +from starlette.middleware import Middleware +from starlette.responses import JSONResponse +from starlette_context import context +from starlette_context.middleware import RawContextMiddleware + +from pr_agent.agent.pr_agent import PRAgent +from pr_agent.algo.utils import update_settings_from_args +from pr_agent.config_loader import get_settings, global_settings +from pr_agent.git_providers.utils import apply_repo_settings +from pr_agent.identity_providers import get_identity_provider +from pr_agent.identity_providers.identity_provider import Eligibility +from pr_agent.log import LoggingFormat, get_logger, setup_logger +from pr_agent.secret_providers import get_secret_provider +from pr_agent.servers.github_action_runner import get_setting_or_env, is_true +from pr_agent.tools.pr_code_suggestions import PRCodeSuggestions 
+from pr_agent.tools.pr_description import PRDescription +from pr_agent.tools.pr_reviewer import PRReviewer + +setup_logger(fmt=LoggingFormat.JSON, level="DEBUG") +router = APIRouter() +secret_provider = get_secret_provider() if get_settings().get("CONFIG.SECRET_PROVIDER") else None + + +async def get_bearer_token(shared_secret: str, client_key: str): + try: + now = int(time.time()) + url = "https://bitbucket.org/site/oauth2/access_token" + canonical_url = "GET&/site/oauth2/access_token&" + qsh = hashlib.sha256(canonical_url.encode("utf-8")).hexdigest() + app_key = get_settings().bitbucket.app_key + + payload = { + "iss": app_key, + "iat": now, + "exp": now + 240, + "qsh": qsh, + "sub": client_key, + } + token = jwt.encode(payload, shared_secret, algorithm="HS256") + payload = 'grant_type=urn%3Abitbucket%3Aoauth2%3Ajwt' + headers = { + 'Authorization': f'JWT {token}', + 'Content-Type': 'application/x-www-form-urlencoded' + } + response = requests.request("POST", url, headers=headers, data=payload) + bearer_token = response.json()["access_token"] + return bearer_token + except Exception as e: + get_logger().error(f"Failed to get bearer token: {e}") + raise e + +@router.get("/") +async def handle_manifest(request: Request, response: Response): + cur_dir = os.path.dirname(os.path.abspath(__file__)) + manifest = open(os.path.join(cur_dir, "atlassian-connect.json"), "rt").read() + try: + manifest = manifest.replace("app_key", get_settings().bitbucket.app_key) + manifest = manifest.replace("base_url", get_settings().bitbucket.base_url) + except: + get_logger().error("Failed to replace api_key in Bitbucket manifest, trying to continue") + manifest_obj = json.loads(manifest) + return JSONResponse(manifest_obj) + + +async def _perform_commands_bitbucket(commands_conf: str, agent: PRAgent, api_url: str, log_context: dict): + apply_repo_settings(api_url) + commands = get_settings().get(f"bitbucket_app.{commands_conf}", {}) + get_settings().set("config.is_auto_command", True) + for command in commands: + try: + split_command = command.split(" ") + command = split_command[0] + args = split_command[1:] + other_args = update_settings_from_args(args) + new_command = ' '.join([command] + other_args) + get_logger().info(f"Performing command: {new_command}") + with get_logger().contextualize(**log_context): + await agent.handle_request(api_url, new_command) + except Exception as e: + get_logger().error(f"Failed to perform command {command}: {e}") + + +def is_bot_user(data) -> bool: + try: + if data["data"]["actor"]["type"] != "user": + get_logger().info(f"BitBucket actor type is not 'user': {data['data']['actor']['type']}") + return True + except Exception as e: + get_logger().error("Failed 'is_bot_user' logic: {e}") + return False + + +def should_process_pr_logic(data) -> bool: + try: + pr_data = data.get("data", {}).get("pullrequest", {}) + title = pr_data.get("title", "") + source_branch = pr_data.get("source", {}).get("branch", {}).get("name", "") + target_branch = pr_data.get("destination", {}).get("branch", {}).get("name", "") + + # logic to ignore PRs with specific titles + if title: + ignore_pr_title_re = get_settings().get("CONFIG.IGNORE_PR_TITLE", []) + if not isinstance(ignore_pr_title_re, list): + ignore_pr_title_re = [ignore_pr_title_re] + if ignore_pr_title_re and any(re.search(regex, title) for regex in ignore_pr_title_re): + get_logger().info(f"Ignoring PR with title '{title}' due to config.ignore_pr_title setting") + return False + + ignore_pr_source_branches = 
get_settings().get("CONFIG.IGNORE_PR_SOURCE_BRANCHES", []) + ignore_pr_target_branches = get_settings().get("CONFIG.IGNORE_PR_TARGET_BRANCHES", []) + if (ignore_pr_source_branches or ignore_pr_target_branches): + if any(re.search(regex, source_branch) for regex in ignore_pr_source_branches): + get_logger().info( + f"Ignoring PR with source branch '{source_branch}' due to config.ignore_pr_source_branches settings") + return False + if any(re.search(regex, target_branch) for regex in ignore_pr_target_branches): + get_logger().info( + f"Ignoring PR with target branch '{target_branch}' due to config.ignore_pr_target_branches settings") + return False + except Exception as e: + get_logger().error(f"Failed 'should_process_pr_logic': {e}") + return True + + +@router.post("/webhook") +async def handle_github_webhooks(background_tasks: BackgroundTasks, request: Request): + app_name = get_settings().get("CONFIG.APP_NAME", "Unknown") + log_context = {"server_type": "bitbucket_app", "app_name": app_name} + get_logger().debug(request.headers) + jwt_header = request.headers.get("authorization", None) + if jwt_header: + input_jwt = jwt_header.split(" ")[1] + data = await request.json() + get_logger().debug(data) + + async def inner(): + try: + # ignore bot users + if is_bot_user(data): + return "OK" + + # Check if the PR should be processed + if data.get("event", "") == "pullrequest:created": + if not should_process_pr_logic(data): + return "OK" + + # Get the username of the sender + try: + username = data["data"]["actor"]["username"] + except KeyError: + try: + username = data["data"]["actor"]["display_name"] + except KeyError: + username = data["data"]["actor"]["nickname"] + log_context["sender"] = username + + sender_id = data["data"]["actor"]["account_id"] + log_context["sender_id"] = sender_id + jwt_parts = input_jwt.split(".") + claim_part = jwt_parts[1] + claim_part += "=" * (-len(claim_part) % 4) + decoded_claims = base64.urlsafe_b64decode(claim_part) + claims = json.loads(decoded_claims) + client_key = claims["iss"] + secrets = json.loads(secret_provider.get_secret(client_key)) + shared_secret = secrets["shared_secret"] + jwt.decode(input_jwt, shared_secret, audience=client_key, algorithms=["HS256"]) + bearer_token = await get_bearer_token(shared_secret, client_key) + context['bitbucket_bearer_token'] = bearer_token + context["settings"] = copy.deepcopy(global_settings) + event = data["event"] + agent = PRAgent() + if event == "pullrequest:created": + pr_url = data["data"]["pullrequest"]["links"]["html"]["href"] + log_context["api_url"] = pr_url + log_context["event"] = "pull_request" + if pr_url: + with get_logger().contextualize(**log_context): + apply_repo_settings(pr_url) + if get_identity_provider().verify_eligibility("bitbucket", + sender_id, pr_url) is not Eligibility.NOT_ELIGIBLE: + if get_settings().get("bitbucket_app.pr_commands"): + await _perform_commands_bitbucket("pr_commands", PRAgent(), pr_url, log_context) + elif event == "pullrequest:comment_created": + pr_url = data["data"]["pullrequest"]["links"]["html"]["href"] + log_context["api_url"] = pr_url + log_context["event"] = "comment" + comment_body = data["data"]["comment"]["content"]["raw"] + with get_logger().contextualize(**log_context): + if get_identity_provider().verify_eligibility("bitbucket", + sender_id, pr_url) is not Eligibility.NOT_ELIGIBLE: + await agent.handle_request(pr_url, comment_body) + except Exception as e: + get_logger().error(f"Failed to handle webhook: {e}") + background_tasks.add_task(inner) + return "OK" 
+ +@router.get("/webhook") +async def handle_github_webhooks(request: Request, response: Response): + return "Webhook server online!" + +@router.post("/installed") +async def handle_installed_webhooks(request: Request, response: Response): + try: + get_logger().info("handle_installed_webhooks") + get_logger().info(request.headers) + data = await request.json() + get_logger().info(data) + shared_secret = data["sharedSecret"] + client_key = data["clientKey"] + username = data["principal"]["username"] + secrets = { + "shared_secret": shared_secret, + "client_key": client_key + } + secret_provider.store_secret(username, json.dumps(secrets)) + except Exception as e: + get_logger().error(f"Failed to register user: {e}") + return JSONResponse({"error": "Unable to register user"}, status_code=500) + +@router.post("/uninstalled") +async def handle_uninstalled_webhooks(request: Request, response: Response): + get_logger().info("handle_uninstalled_webhooks") + + data = await request.json() + get_logger().info(data) + + +def start(): + get_settings().set("CONFIG.PUBLISH_OUTPUT_PROGRESS", False) + get_settings().set("CONFIG.GIT_PROVIDER", "bitbucket") + get_settings().set("PR_DESCRIPTION.PUBLISH_DESCRIPTION_AS_COMMENT", True) + middleware = [Middleware(RawContextMiddleware)] + app = FastAPI(middleware=middleware) + app.include_router(router) + + uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", "3000"))) + + +if __name__ == '__main__': + start() diff --git a/pr_agent/servers/bitbucket_server_webhook.py b/pr_agent/servers/bitbucket_server_webhook.py new file mode 100644 index 000000000..c9bfa5d9d --- /dev/null +++ b/pr_agent/servers/bitbucket_server_webhook.py @@ -0,0 +1,159 @@ +import ast +import json +import os +from typing import List + +import uvicorn +from fastapi import APIRouter, FastAPI +from fastapi.encoders import jsonable_encoder +from starlette import status +from starlette.background import BackgroundTasks +from starlette.middleware import Middleware +from starlette.requests import Request +from starlette.responses import JSONResponse +from starlette_context.middleware import RawContextMiddleware +from pr_agent.agent.pr_agent import PRAgent +from pr_agent.algo.utils import update_settings_from_args +from pr_agent.config_loader import get_settings +from pr_agent.git_providers.utils import apply_repo_settings +from pr_agent.log import LoggingFormat, get_logger, setup_logger +from pr_agent.servers.utils import verify_signature +from fastapi.responses import RedirectResponse + + +setup_logger(fmt=LoggingFormat.JSON, level="DEBUG") +router = APIRouter() + + +def handle_request( + background_tasks: BackgroundTasks, url: str, body: str, log_context: dict +): + log_context["action"] = body + log_context["api_url"] = url + + async def inner(): + try: + with get_logger().contextualize(**log_context): + await PRAgent().handle_request(url, body) + except Exception as e: + get_logger().error(f"Failed to handle webhook: {e}") + + background_tasks.add_task(inner) + +@router.post("/") +async def redirect_to_webhook(): + return RedirectResponse(url="/webhook") + +@router.post("/webhook") +async def handle_webhook(background_tasks: BackgroundTasks, request: Request): + log_context = {"server_type": "bitbucket_server"} + data = await request.json() + get_logger().info(json.dumps(data)) + + webhook_secret = get_settings().get("BITBUCKET_SERVER.WEBHOOK_SECRET", None) + if webhook_secret: + body_bytes = await request.body() + if body_bytes.decode('utf-8') == '{"test": true}': + return JSONResponse( + 
status_code=status.HTTP_200_OK, content=jsonable_encoder({"message": "connection test successful"}) + ) + signature_header = request.headers.get("x-hub-signature", None) + verify_signature(body_bytes, webhook_secret, signature_header) + + pr_id = data["pullRequest"]["id"] + repository_name = data["pullRequest"]["toRef"]["repository"]["slug"] + project_name = data["pullRequest"]["toRef"]["repository"]["project"]["key"] + bitbucket_server = get_settings().get("BITBUCKET_SERVER.URL") + pr_url = f"{bitbucket_server}/projects/{project_name}/repos/{repository_name}/pull-requests/{pr_id}" + + log_context["api_url"] = pr_url + log_context["event"] = "pull_request" + + commands_to_run = [] + + if data["eventKey"] == "pr:opened": + commands_to_run.extend(_get_commands_list_from_settings('BITBUCKET_SERVER.PR_COMMANDS')) + elif data["eventKey"] == "pr:comment:added": + commands_to_run.append(data["comment"]["text"]) + else: + return JSONResponse( + status_code=status.HTTP_400_BAD_REQUEST, + content=json.dumps({"message": "Unsupported event"}), + ) + + async def inner(): + try: + await _run_commands_sequentially(commands_to_run, pr_url, log_context) + except Exception as e: + get_logger().error(f"Failed to handle webhook: {e}") + + background_tasks.add_task(inner) + + return JSONResponse( + status_code=status.HTTP_200_OK, content=jsonable_encoder({"message": "success"}) + ) + + +async def _run_commands_sequentially(commands: List[str], url: str, log_context: dict): + get_logger().info(f"Running commands sequentially: {commands}") + if commands is None: + return + + for command in commands: + try: + body = _process_command(command, url) + + log_context["action"] = body + log_context["api_url"] = url + + with get_logger().contextualize(**log_context): + await PRAgent().handle_request(url, body) + except Exception as e: + get_logger().error(f"Failed to handle command: {command} , error: {e}") + +def _process_command(command: str, url) -> str: + # don't think we need this + apply_repo_settings(url) + # Process the command string + split_command = command.split(" ") + command = split_command[0] + args = split_command[1:] + # do I need this? if yes, shouldn't this be done in PRAgent? 
+ other_args = update_settings_from_args(args) + new_command = ' '.join([command] + other_args) + return new_command + + +def _to_list(command_string: str) -> list: + try: + # Use ast.literal_eval to safely parse the string into a list + commands = ast.literal_eval(command_string) + # Check if the parsed object is a list of strings + if isinstance(commands, list) and all(isinstance(cmd, str) for cmd in commands): + return commands + else: + raise ValueError("Parsed data is not a list of strings.") + except (SyntaxError, ValueError, TypeError) as e: + raise ValueError(f"Invalid command string: {e}") + + +def _get_commands_list_from_settings(setting_key:str ) -> list: + try: + return get_settings().get(setting_key, []) + except ValueError as e: + get_logger().error(f"Failed to get commands list from settings {setting_key}: {e}") + + +@router.get("/") +async def root(): + return {"status": "ok"} + + +def start(): + app = FastAPI(middleware=[Middleware(RawContextMiddleware)]) + app.include_router(router) + uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", "3000"))) + + +if __name__ == "__main__": + start() diff --git a/pr_agent/servers/gerrit_server.py b/pr_agent/servers/gerrit_server.py new file mode 100644 index 000000000..1783f6b99 --- /dev/null +++ b/pr_agent/servers/gerrit_server.py @@ -0,0 +1,77 @@ +import copy +from enum import Enum +from json import JSONDecodeError + +import uvicorn +from fastapi import APIRouter, FastAPI, HTTPException +from pydantic import BaseModel +from starlette.middleware import Middleware +from starlette_context import context +from starlette_context.middleware import RawContextMiddleware + +from pr_agent.agent.pr_agent import PRAgent +from pr_agent.config_loader import get_settings, global_settings +from pr_agent.log import get_logger, setup_logger + +setup_logger() +router = APIRouter() + + +class Action(str, Enum): + review = "review" + describe = "describe" + ask = "ask" + improve = "improve" + reflect = "reflect" + answer = "answer" + + +class Item(BaseModel): + refspec: str + project: str + msg: str + + +@router.post("/api/v1/gerrit/{action}") +async def handle_gerrit_request(action: Action, item: Item): + get_logger().debug("Received a Gerrit request") + context["settings"] = copy.deepcopy(global_settings) + + if action == Action.ask: + if not item.msg: + return HTTPException( + status_code=400, + detail="msg is required for ask command" + ) + await PRAgent().handle_request( + f"{item.project}:{item.refspec}", + f"/{item.msg.strip()}" + ) + + +async def get_body(request): + try: + body = await request.json() + except JSONDecodeError as e: + get_logger().error("Error parsing request body", e) + return {} + return body + + +@router.get("/") +async def root(): + return {"status": "ok"} + + +def start(): + # to prevent adding help messages with the output + get_settings().set("CONFIG.CLI_MODE", True) + middleware = [Middleware(RawContextMiddleware)] + app = FastAPI(middleware=middleware) + app.include_router(router) + + uvicorn.run(app, host="0.0.0.0", port=3000) + + +if __name__ == '__main__': + start() diff --git a/pr_agent/servers/github_action_runner.py b/pr_agent/servers/github_action_runner.py index fbf4f89c9..c9d23456f 100644 --- a/pr_agent/servers/github_action_runner.py +++ b/pr_agent/servers/github_action_runner.py @@ -1,22 +1,43 @@ import asyncio import json import os +from typing import Union from pr_agent.agent.pr_agent import PRAgent from pr_agent.config_loader import get_settings from pr_agent.git_providers import 
get_git_provider +from pr_agent.git_providers.utils import apply_repo_settings +from pr_agent.log import get_logger +from pr_agent.servers.github_app import handle_line_comments +from pr_agent.tools.pr_code_suggestions import PRCodeSuggestions +from pr_agent.tools.pr_description import PRDescription from pr_agent.tools.pr_reviewer import PRReviewer +def is_true(value: Union[str, bool]) -> bool: + if isinstance(value, bool): + return value + if isinstance(value, str): + return value.lower() == 'true' + return False + + +def get_setting_or_env(key: str, default: Union[str, bool] = None) -> Union[str, bool]: + try: + value = get_settings().get(key, default) + except AttributeError: # TBD still need to debug why this happens on GitHub Actions + value = os.getenv(key, None) or os.getenv(key.upper(), None) or os.getenv(key.lower(), None) or default + return value + + async def run_action(): # Get environment variables GITHUB_EVENT_NAME = os.environ.get('GITHUB_EVENT_NAME') GITHUB_EVENT_PATH = os.environ.get('GITHUB_EVENT_PATH') - OPENAI_KEY = os.environ.get('OPENAI_KEY') - OPENAI_ORG = os.environ.get('OPENAI_ORG') + OPENAI_KEY = os.environ.get('OPENAI_KEY') or os.environ.get('OPENAI.KEY') + OPENAI_ORG = os.environ.get('OPENAI_ORG') or os.environ.get('OPENAI.ORG') GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN') - get_settings().set("CONFIG.PUBLISH_OUTPUT_PROGRESS", False) - + # get_settings().set("CONFIG.PUBLISH_OUTPUT_PROGRESS", False) # Check if required environment variables are set if not GITHUB_EVENT_NAME: @@ -25,19 +46,22 @@ async def run_action(): if not GITHUB_EVENT_PATH: print("GITHUB_EVENT_PATH not set") return - if not OPENAI_KEY: - print("OPENAI_KEY not set") - return if not GITHUB_TOKEN: print("GITHUB_TOKEN not set") return # Set the environment variables in the settings - get_settings().set("OPENAI.KEY", OPENAI_KEY) + if OPENAI_KEY: + get_settings().set("OPENAI.KEY", OPENAI_KEY) + else: + # Might not be set if the user is using models not from OpenAI + print("OPENAI_KEY not set") if OPENAI_ORG: get_settings().set("OPENAI.ORG", OPENAI_ORG) get_settings().set("GITHUB.USER_TOKEN", GITHUB_TOKEN) get_settings().set("GITHUB.DEPLOYMENT_TYPE", "user") + enable_output = get_setting_or_env("GITHUB_ACTION_CONFIG.ENABLE_OUTPUT", True) + get_settings().set("GITHUB_ACTION_CONFIG.ENABLE_OUTPUT", enable_output) # Load the event payload try: @@ -47,27 +71,90 @@ async def run_action(): print(f"Failed to parse JSON: {e}") return - # Handle pull request event + try: + get_logger().info("Applying repo settings") + pr_url = event_payload.get("pull_request", {}).get("html_url") + if pr_url: + apply_repo_settings(pr_url) + get_logger().info(f"enable_custom_labels: {get_settings().config.enable_custom_labels}") + except Exception as e: + get_logger().info(f"github action: failed to apply repo settings: {e}") + + # Handle pull request opened event if GITHUB_EVENT_NAME == "pull_request": action = event_payload.get("action") - if action in ["opened", "reopened"]: + + # Retrieve the list of actions from the configuration + pr_actions = get_settings().get("GITHUB_ACTION_CONFIG.PR_ACTIONS", ["opened", "reopened", "ready_for_review", "review_requested"]) + + if action in pr_actions: pr_url = event_payload.get("pull_request", {}).get("url") if pr_url: - await PRReviewer(pr_url).run() + # legacy - supporting both GITHUB_ACTION and GITHUB_ACTION_CONFIG + auto_review = get_setting_or_env("GITHUB_ACTION.AUTO_REVIEW", None) + if auto_review is None: + auto_review = get_setting_or_env("GITHUB_ACTION_CONFIG.AUTO_REVIEW", 
None) + auto_describe = get_setting_or_env("GITHUB_ACTION.AUTO_DESCRIBE", None) + if auto_describe is None: + auto_describe = get_setting_or_env("GITHUB_ACTION_CONFIG.AUTO_DESCRIBE", None) + auto_improve = get_setting_or_env("GITHUB_ACTION.AUTO_IMPROVE", None) + if auto_improve is None: + auto_improve = get_setting_or_env("GITHUB_ACTION_CONFIG.AUTO_IMPROVE", None) + + # Set the configuration for auto actions + get_settings().config.is_auto_command = True # Set the flag to indicate that the command is auto + get_settings().pr_description.final_update_message = False # No final update message when auto_describe is enabled + get_logger().info(f"Running auto actions: auto_describe={auto_describe}, auto_review={auto_review}, auto_improve={auto_improve}") + + # invoke by default all three tools + if auto_describe is None or is_true(auto_describe): + await PRDescription(pr_url).run() + if auto_review is None or is_true(auto_review): + await PRReviewer(pr_url).run() + if auto_improve is None or is_true(auto_improve): + await PRCodeSuggestions(pr_url).run() + else: + get_logger().info(f"Skipping action: {action}") # Handle issue comment event - elif GITHUB_EVENT_NAME == "issue_comment": + elif GITHUB_EVENT_NAME == "issue_comment" or GITHUB_EVENT_NAME == "pull_request_review_comment": action = event_payload.get("action") if action in ["created", "edited"]: comment_body = event_payload.get("comment", {}).get("body") + try: + if GITHUB_EVENT_NAME == "pull_request_review_comment": + if '/ask' in comment_body: + comment_body = handle_line_comments(event_payload, comment_body) + except Exception as e: + get_logger().error(f"Failed to handle line comments: {e}") + return if comment_body: - pr_url = event_payload.get("issue", {}).get("pull_request", {}).get("url") - if pr_url: + is_pr = False + disable_eyes = False + # check if issue is pull request + if event_payload.get("issue", {}).get("pull_request"): + url = event_payload.get("issue", {}).get("pull_request", {}).get("url") + is_pr = True + elif event_payload.get("comment", {}).get("pull_request_url"): # for 'pull_request_review_comment + url = event_payload.get("comment", {}).get("pull_request_url") + is_pr = True + disable_eyes = True + else: + url = event_payload.get("issue", {}).get("url") + + if url: body = comment_body.strip().lower() comment_id = event_payload.get("comment", {}).get("id") - provider = get_git_provider()(pr_url=pr_url) - await PRAgent().handle_request(pr_url, body, notify=lambda: provider.add_eyes_reaction(comment_id)) + provider = get_git_provider()(pr_url=url) + if is_pr: + await PRAgent().handle_request( + url, body, notify=lambda: provider.add_eyes_reaction( + comment_id, disable_eyes=disable_eyes + ) + ) + else: + await PRAgent().handle_request(url, body) if __name__ == '__main__': - asyncio.run(run_action()) \ No newline at end of file + asyncio.run(run_action()) diff --git a/pr_agent/servers/github_app.py b/pr_agent/servers/github_app.py index 18943ae85..00da88e35 100644 --- a/pr_agent/servers/github_app.py +++ b/pr_agent/servers/github_app.py @@ -1,101 +1,391 @@ +import asyncio.locks import copy -import logging -import sys -from typing import Any, Dict +import os +import re +import uuid +from typing import Any, Dict, Tuple import uvicorn from fastapi import APIRouter, FastAPI, HTTPException, Request, Response +from starlette.background import BackgroundTasks from starlette.middleware import Middleware from starlette_context import context from starlette_context.middleware import RawContextMiddleware from 
pr_agent.agent.pr_agent import PRAgent +from pr_agent.algo.utils import update_settings_from_args from pr_agent.config_loader import get_settings, global_settings -from pr_agent.git_providers import get_git_provider -from pr_agent.servers.utils import verify_signature +from pr_agent.git_providers import get_git_provider, get_git_provider_with_context +from pr_agent.git_providers.git_provider import IncrementalPR +from pr_agent.git_providers.utils import apply_repo_settings +from pr_agent.identity_providers import get_identity_provider +from pr_agent.identity_providers.identity_provider import Eligibility +from pr_agent.log import LoggingFormat, get_logger, setup_logger +from pr_agent.servers.utils import DefaultDictWithTimeout, verify_signature -logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) +setup_logger(fmt=LoggingFormat.JSON, level="DEBUG") +base_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) +build_number_path = os.path.join(base_path, "build_number.txt") +if os.path.exists(build_number_path): + with open(build_number_path) as f: + build_number = f.read().strip() +else: + build_number = "unknown" router = APIRouter() @router.post("/api/v1/github_webhooks") -async def handle_github_webhooks(request: Request, response: Response): +async def handle_github_webhooks(background_tasks: BackgroundTasks, request: Request, response: Response): """ Receives and processes incoming GitHub webhook requests. Verifies the request signature, parses the request body, and passes it to the handle_request function for further processing. """ - logging.debug("Received a GitHub webhook") + get_logger().debug("Received a GitHub webhook") body = await get_body(request) - logging.debug(f'Request body:\n{body}') installation_id = body.get("installation", {}).get("id") context["installation_id"] = installation_id context["settings"] = copy.deepcopy(global_settings) - - return await handle_request(body) + context["git_provider"] = {} + background_tasks.add_task(handle_request, body, event=request.headers.get("X-GitHub-Event", None)) + return {} @router.post("/api/v1/marketplace_webhooks") async def handle_marketplace_webhooks(request: Request, response: Response): body = await get_body(request) - logging.info(f'Request body:\n{body}') + get_logger().info(f'Request body:\n{body}') + async def get_body(request): try: body = await request.json() except Exception as e: - logging.error("Error parsing request body", e) + get_logger().error("Error parsing request body", e) raise HTTPException(status_code=400, detail="Error parsing request body") from e - body_bytes = await request.body() - signature_header = request.headers.get('x-hub-signature-256', None) webhook_secret = getattr(get_settings().github, 'webhook_secret', None) if webhook_secret: + body_bytes = await request.body() + signature_header = request.headers.get('x-hub-signature-256', None) verify_signature(body_bytes, webhook_secret, signature_header) return body +_duplicate_push_triggers = DefaultDictWithTimeout(ttl=get_settings().github_app.push_trigger_pending_tasks_ttl) +_pending_task_duplicate_push_conditions = DefaultDictWithTimeout(asyncio.locks.Condition, ttl=get_settings().github_app.push_trigger_pending_tasks_ttl) + +async def handle_comments_on_pr(body: Dict[str, Any], + event: str, + sender: str, + sender_id: str, + action: str, + log_context: Dict[str, Any], + agent: PRAgent): + if "comment" not in body: + return {} + comment_body = body.get("comment", {}).get("body") + if comment_body and isinstance(comment_body, 
str) and not comment_body.lstrip().startswith("/"): + if '/ask' in comment_body and comment_body.strip().startswith('> ![image]'): + comment_body_split = comment_body.split('/ask') + comment_body = '/ask' + comment_body_split[1] +' \n' +comment_body_split[0].strip().lstrip('>') + get_logger().info(f"Reformatting comment_body so command is at the beginning: {comment_body}") + else: + get_logger().info("Ignoring comment not starting with /") + return {} + disable_eyes = False + if "issue" in body and "pull_request" in body["issue"] and "url" in body["issue"]["pull_request"]: + api_url = body["issue"]["pull_request"]["url"] + elif "comment" in body and "pull_request_url" in body["comment"]: + api_url = body["comment"]["pull_request_url"] + try: + if ('/ask' in comment_body and + 'subject_type' in body["comment"] and body["comment"]["subject_type"] == "line"): + # comment on a code line in the "files changed" tab + comment_body = handle_line_comments(body, comment_body) + disable_eyes = True + except Exception as e: + get_logger().error(f"Failed to handle line comments: {e}") + else: + return {} + log_context["api_url"] = api_url + comment_id = body.get("comment", {}).get("id") + provider = get_git_provider_with_context(pr_url=api_url) + with get_logger().contextualize(**log_context): + if get_identity_provider().verify_eligibility("github", sender_id, api_url) is not Eligibility.NOT_ELIGIBLE: + get_logger().info(f"Processing comment on PR {api_url=}, comment_body={comment_body}") + await agent.handle_request(api_url, comment_body, + notify=lambda: provider.add_eyes_reaction(comment_id, disable_eyes=disable_eyes)) + else: + get_logger().info(f"User {sender=} is not eligible to process comment on PR {api_url=}") + +async def handle_new_pr_opened(body: Dict[str, Any], + event: str, + sender: str, + sender_id: str, + action: str, + log_context: Dict[str, Any], + agent: PRAgent): + title = body.get("pull_request", {}).get("title", "") + + pull_request, api_url = _check_pull_request_event(action, body, log_context) + if not (pull_request and api_url): + get_logger().info(f"Invalid PR event: {action=} {api_url=}") + return {} + if action in get_settings().github_app.handle_pr_actions: # ['opened', 'reopened', 'ready_for_review'] + # logic to ignore PRs with specific titles (e.g. "[Auto] ...") + apply_repo_settings(api_url) + if get_identity_provider().verify_eligibility("github", sender_id, api_url) is not Eligibility.NOT_ELIGIBLE: + await _perform_auto_commands_github("pr_commands", agent, body, api_url, log_context) + else: + get_logger().info(f"User {sender=} is not eligible to process PR {api_url=}") + +async def handle_push_trigger_for_new_commits(body: Dict[str, Any], + event: str, + sender: str, + sender_id: str, + action: str, + log_context: Dict[str, Any], + agent: PRAgent): + pull_request, api_url = _check_pull_request_event(action, body, log_context) + if not (pull_request and api_url): + return {} + + apply_repo_settings(api_url) # we need to apply the repo settings to get the correct settings for the PR. This is quite expensive - a call to the git provider is made for each PR event. + if not get_settings().github_app.handle_push_trigger: + return {} + + # TODO: do we still want to get the list of commits to filter bot/merge commits? 
+ before_sha = body.get("before") + after_sha = body.get("after") + merge_commit_sha = pull_request.get("merge_commit_sha") + if before_sha == after_sha: + return {} + if get_settings().github_app.push_trigger_ignore_merge_commits and after_sha == merge_commit_sha: + return {} + + # Prevent triggering multiple times for subsequent push triggers when one is enough: + # The first push will trigger the processing, and if there's a second push in the meanwhile it will wait. + # Any more events will be discarded, because they will all trigger the exact same processing on the PR. + # We let the second event wait instead of discarding it because while the first event was being processed, + # more commits may have been pushed that led to the subsequent events, + # so we keep just one waiting as a delegate to trigger the processing for the new commits when done waiting. + current_active_tasks = _duplicate_push_triggers.setdefault(api_url, 0) + max_active_tasks = 2 if get_settings().github_app.push_trigger_pending_tasks_backlog else 1 + if current_active_tasks < max_active_tasks: + # first task can enter, and second tasks too if backlog is enabled + get_logger().info( + f"Continue processing push trigger for {api_url=} because there are {current_active_tasks} active tasks" + ) + _duplicate_push_triggers[api_url] += 1 + else: + get_logger().info( + f"Skipping push trigger for {api_url=} because another event already triggered the same processing" + ) + return {} + async with _pending_task_duplicate_push_conditions[api_url]: + if current_active_tasks == 1: + # second task waits + get_logger().info( + f"Waiting to process push trigger for {api_url=} because the first task is still in progress" + ) + await _pending_task_duplicate_push_conditions[api_url].wait() + get_logger().info(f"Finished waiting to process push trigger for {api_url=} - continue with flow") + + try: + if get_identity_provider().verify_eligibility("github", sender_id, api_url) is not Eligibility.NOT_ELIGIBLE: + get_logger().info(f"Performing incremental review for {api_url=} because of {event=} and {action=}") + await _perform_auto_commands_github("push_commands", agent, body, api_url, log_context) + + finally: + # release the waiting task block + async with _pending_task_duplicate_push_conditions[api_url]: + _pending_task_duplicate_push_conditions[api_url].notify(1) + _duplicate_push_triggers[api_url] -= 1 + + +def handle_closed_pr(body, event, action, log_context): + pull_request = body.get("pull_request", {}) + is_merged = pull_request.get("merged", False) + if not is_merged: + return + api_url = pull_request.get("url", "") + pr_statistics = get_git_provider()(pr_url=api_url).calc_pr_statistics(pull_request) + log_context["api_url"] = api_url + get_logger().info("PR-Agent statistics for closed PR", analytics=True, pr_statistics=pr_statistics, **log_context) + +def get_log_context(body, event, action, build_number): + sender = "" + sender_id = "" + sender_type = "" + try: + sender = body.get("sender", {}).get("login") + sender_id = body.get("sender", {}).get("id") + sender_type = body.get("sender", {}).get("type") + repo = body.get("repository", {}).get("full_name", "") + git_org = body.get("organization", {}).get("login", "") + installation_id = body.get("installation", {}).get("id", "") + app_name = get_settings().get("CONFIG.APP_NAME", "Unknown") + log_context = {"action": action, "event": event, "sender": sender, "server_type": "github_app", + "request_id": uuid.uuid4().hex, "build_number": build_number, "app_name": app_name, + 
"repo": repo, "git_org": git_org, "installation_id": installation_id} + except Exception as e: + get_logger().error("Failed to get log context", e) + log_context = {} + return log_context, sender, sender_id, sender_type -async def handle_request(body: Dict[str, Any]): + +def is_bot_user(sender, sender_type): + try: + # logic to ignore PRs opened by bot + if get_settings().get("GITHUB_APP.IGNORE_BOT_PR", False) and sender_type == "Bot": + if 'pr-agent' not in sender: + get_logger().info(f"Ignoring PR from '{sender=}' because it is a bot") + return True + except Exception as e: + get_logger().error(f"Failed 'is_bot_user' logic: {e}") + return False + + +def should_process_pr_logic(sender_type, sender, body) -> bool: + try: + pull_request = body.get("pull_request", {}) + title = pull_request.get("title", "") + pr_labels = pull_request.get("labels", []) + source_branch = pull_request.get("head", {}).get("ref", "") + target_branch = pull_request.get("base", {}).get("ref", "") + + # logic to ignore PRs with specific titles + if title: + ignore_pr_title_re = get_settings().get("CONFIG.IGNORE_PR_TITLE", []) + if not isinstance(ignore_pr_title_re, list): + ignore_pr_title_re = [ignore_pr_title_re] + if ignore_pr_title_re and any(re.search(regex, title) for regex in ignore_pr_title_re): + get_logger().info(f"Ignoring PR with title '{title}' due to config.ignore_pr_title setting") + return False + + # logic to ignore PRs with specific labels or source branches or target branches. + ignore_pr_labels = get_settings().get("CONFIG.IGNORE_PR_LABELS", []) + if pr_labels and ignore_pr_labels: + labels = [label['name'] for label in pr_labels] + if any(label in ignore_pr_labels for label in labels): + labels_str = ", ".join(labels) + get_logger().info(f"Ignoring PR with labels '{labels_str}' due to config.ignore_pr_labels settings") + return False + + ignore_pr_source_branches = get_settings().get("CONFIG.IGNORE_PR_SOURCE_BRANCHES", []) + ignore_pr_target_branches = get_settings().get("CONFIG.IGNORE_PR_TARGET_BRANCHES", []) + if pull_request and (ignore_pr_source_branches or ignore_pr_target_branches): + if any(re.search(regex, source_branch) for regex in ignore_pr_source_branches): + get_logger().info( + f"Ignoring PR with source branch '{source_branch}' due to config.ignore_pr_source_branches settings") + return False + if any(re.search(regex, target_branch) for regex in ignore_pr_target_branches): + get_logger().info( + f"Ignoring PR with target branch '{target_branch}' due to config.ignore_pr_target_branches settings") + return False + except Exception as e: + get_logger().error(f"Failed 'should_process_pr_logic': {e}") + return True + + +async def handle_request(body: Dict[str, Any], event: str): """ Handle incoming GitHub webhook requests. Args: body: The request body. + event: The GitHub event type (e.g. "pull_request", "issue_comment", etc.). 
""" - action = body.get("action") + action = body.get("action") # "created", "opened", "reopened", "ready_for_review", "review_requested", "synchronize" if not action: return {} agent = PRAgent() + log_context, sender, sender_id, sender_type = get_log_context(body, event, action, build_number) - if action == 'created': - if "comment" not in body: - return {} - comment_body = body.get("comment", {}).get("body") - sender = body.get("sender", {}).get("login") - if sender and 'bot' in sender: - return {} - if "issue" not in body or "pull_request" not in body["issue"]: + # logic to ignore PRs opened by bot, PRs with specific titles, labels, source branches, or target branches + if is_bot_user(sender, sender_type): + return {} + if action != 'created' and 'check_run' not in body: + if not should_process_pr_logic(sender_type, sender, body): return {} - pull_request = body["issue"]["pull_request"] - api_url = pull_request.get("url") - comment_id = body.get("comment", {}).get("id") - provider = get_git_provider()(pr_url=api_url) - await agent.handle_request(api_url, comment_body, notify=lambda: provider.add_eyes_reaction(comment_id)) + if 'check_run' in body: # handle failed checks + # get_logger().debug(f'Request body', artifact=body, event=event) # added inside handle_checks + pass + # handle comments on PRs + elif action == 'created': + get_logger().debug(f'Request body', artifact=body, event=event) + await handle_comments_on_pr(body, event, sender, sender_id, action, log_context, agent) + # handle new PRs + elif event == 'pull_request' and action != 'synchronize' and action != 'closed': + get_logger().debug(f'Request body', artifact=body, event=event) + await handle_new_pr_opened(body, event, sender, sender_id, action, log_context, agent) + elif event == "issue_comment" and 'edited' in action: + pass # handle_checkbox_clicked + # handle pull_request event with synchronize action - "push trigger" for new commits + elif event == 'pull_request' and action == 'synchronize': + await handle_push_trigger_for_new_commits(body, event, sender,sender_id, action, log_context, agent) + elif event == 'pull_request' and action == 'closed': + if get_settings().get("CONFIG.ANALYTICS_FOLDER", ""): + handle_closed_pr(body, event, action, log_context) + else: + get_logger().info(f"event {event=} action {action=} does not require any handling") + return {} - elif action == "opened" or 'reopened' in action: - pull_request = body.get("pull_request") - if not pull_request: - return {} - api_url = pull_request.get("url") - if not api_url: - return {} - await agent.handle_request(api_url, "/review") - return {} +def handle_line_comments(body: Dict, comment_body: [str, Any]) -> str: + if not comment_body: + return "" + start_line = body["comment"]["start_line"] + end_line = body["comment"]["line"] + start_line = end_line if not start_line else start_line + question = comment_body.replace('/ask', '').strip() + diff_hunk = body["comment"]["diff_hunk"] + get_settings().set("ask_diff_hunk", diff_hunk) + path = body["comment"]["path"] + side = body["comment"]["side"] + comment_id = body["comment"]["id"] + if '/ask' in comment_body: + comment_body = f"/ask_line --line_start={start_line} --line_end={end_line} --side={side} --file_name={path} --comment_id={comment_id} {question}" + return comment_body + + +def _check_pull_request_event(action: str, body: dict, log_context: dict) -> Tuple[Dict[str, Any], str]: + invalid_result = {}, "" + pull_request = body.get("pull_request") + if not pull_request: + return invalid_result + 
api_url = pull_request.get("url") + if not api_url: + return invalid_result + log_context["api_url"] = api_url + if pull_request.get("draft", True) or pull_request.get("state") != "open": + return invalid_result + if action in ("review_requested", "synchronize") and pull_request.get("created_at") == pull_request.get("updated_at"): + # avoid double reviews when opening a PR for the first time + return invalid_result + return pull_request, api_url + + +async def _perform_auto_commands_github(commands_conf: str, agent: PRAgent, body: dict, api_url: str, + log_context: dict): + apply_repo_settings(api_url) + commands = get_settings().get(f"github_app.{commands_conf}") + if not commands: + get_logger().info(f"New PR, but no auto commands configured") + return + get_settings().set("config.is_auto_command", True) + for command in commands: + split_command = command.split(" ") + command = split_command[0] + args = split_command[1:] + other_args = update_settings_from_args(args) + new_command = ' '.join([command] + other_args) + get_logger().info(f"{commands_conf}. Performing auto command '{new_command}', for {api_url=}") + await agent.handle_request(api_url, new_command) @router.get("/") @@ -103,16 +393,18 @@ async def root(): return {"status": "ok"} -def start(): +if get_settings().github_app.override_deployment_type: # Override the deployment type to app get_settings().set("GITHUB.DEPLOYMENT_TYPE", "app") - get_settings().set("CONFIG.PUBLISH_OUTPUT_PROGRESS", False) - middleware = [Middleware(RawContextMiddleware)] - app = FastAPI(middleware=middleware) - app.include_router(router) +# get_settings().set("CONFIG.PUBLISH_OUTPUT_PROGRESS", False) +middleware = [Middleware(RawContextMiddleware)] +app = FastAPI(middleware=middleware) +app.include_router(router) + - uvicorn.run(app, host="0.0.0.0", port=3000) +def start(): + uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", "3000"))) if __name__ == '__main__': - start() \ No newline at end of file + start() diff --git a/pr_agent/servers/github_polling.py b/pr_agent/servers/github_polling.py index fdd6642dc..de1a9f529 100644 --- a/pr_agent/servers/github_polling.py +++ b/pr_agent/servers/github_polling.py @@ -1,23 +1,34 @@ import asyncio -import logging -import sys +import multiprocessing +from collections import deque +import traceback from datetime import datetime, timezone - +import time +import requests import aiohttp from pr_agent.agent.pr_agent import PRAgent from pr_agent.config_loader import get_settings from pr_agent.git_providers import get_git_provider -from pr_agent.servers.help import bot_help_text +from pr_agent.log import LoggingFormat, get_logger, setup_logger -logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) +setup_logger(fmt=LoggingFormat.JSON, level="DEBUG") NOTIFICATION_URL = "https://api.github.com/notifications" +async def mark_notification_as_read(headers, notification, session): + async with session.patch( + f"https://api.github.com/notifications/threads/{notification['id']}", + headers=headers) as mark_read_response: + if mark_read_response.status != 205: + get_logger().error( + f"Failed to mark notification as read. Status code: {mark_read_response.status}") + + def now() -> str: """ Get the current UTC time in ISO 8601 format. - + Returns: str: The current UTC time in ISO 8601 format. 
""" @@ -25,6 +36,108 @@ def now() -> str: now_utc = now_utc.replace("+00:00", "Z") return now_utc +async def async_handle_request(pr_url, rest_of_comment, comment_id, git_provider): + agent = PRAgent() + success = await agent.handle_request( + pr_url, + rest_of_comment, + notify=lambda: git_provider.add_eyes_reaction(comment_id) + ) + return success + +def run_handle_request(pr_url, rest_of_comment, comment_id, git_provider): + return asyncio.run(async_handle_request(pr_url, rest_of_comment, comment_id, git_provider)) + + +def process_comment_sync(pr_url, rest_of_comment, comment_id): + try: + # Run the async handle_request in a separate function + git_provider = get_git_provider()(pr_url=pr_url) + success = run_handle_request(pr_url, rest_of_comment, comment_id, git_provider) + except Exception as e: + get_logger().error(f"Error processing comment: {e}", artifact={"traceback": traceback.format_exc()}) + + +async def process_comment(pr_url, rest_of_comment, comment_id): + try: + git_provider = get_git_provider()(pr_url=pr_url) + git_provider.set_pr(pr_url) + agent = PRAgent() + success = await agent.handle_request( + pr_url, + rest_of_comment, + notify=lambda: git_provider.add_eyes_reaction(comment_id) + ) + get_logger().info(f"Finished processing comment for PR: {pr_url}") + except Exception as e: + get_logger().error(f"Error processing comment: {e}", artifact={"traceback": traceback.format_exc()}) + +async def is_valid_notification(notification, headers, handled_ids, session, user_id): + try: + if 'reason' in notification and notification['reason'] == 'mention': + if 'subject' in notification and notification['subject']['type'] == 'PullRequest': + pr_url = notification['subject']['url'] + latest_comment = notification['subject']['latest_comment_url'] + if not latest_comment or not isinstance(latest_comment, str): + get_logger().debug(f"no latest_comment") + return False, handled_ids + async with session.get(latest_comment, headers=headers) as comment_response: + check_prev_comments = False + if comment_response.status == 200: + comment = await comment_response.json() + if 'id' in comment: + if comment['id'] in handled_ids: + get_logger().debug(f"comment['id'] in handled_ids") + return False, handled_ids + else: + handled_ids.add(comment['id']) + if 'user' in comment and 'login' in comment['user']: + if comment['user']['login'] == user_id: + get_logger().debug(f"comment['user']['login'] == user_id") + check_prev_comments = True + comment_body = comment.get('body', '') + if not comment_body: + get_logger().debug(f"no comment_body") + check_prev_comments = True + else: + user_tag = "@" + user_id + if user_tag not in comment_body: + get_logger().debug(f"user_tag not in comment_body") + check_prev_comments = True + else: + get_logger().info(f"Polling, pr_url: {pr_url}", + artifact={"comment": comment_body}) + + if not check_prev_comments: + return True, handled_ids, comment, comment_body, pr_url, user_tag + else: # we could not find the user tag in the latest comment. 
Check previous comments + # get all comments in the PR + requests_url = f"{pr_url}/comments".replace("pulls", "issues") + comments_response = requests.get(requests_url, headers=headers) + comments = comments_response.json()[::-1] + max_comment_to_scan = 4 + for comment in comments[:max_comment_to_scan]: + if 'user' in comment and 'login' in comment['user']: + if comment['user']['login'] == user_id: + continue + comment_body = comment.get('body', '') + if not comment_body: + continue + if user_tag in comment_body: + get_logger().info("found user tag in previous comments") + get_logger().info(f"Polling, pr_url: {pr_url}", + artifact={"comment": comment_body}) + return True, handled_ids, comment, comment_body, pr_url, user_tag + + get_logger().error(f"Failed to fetch comments for PR: {pr_url}") + return False, handled_ids + + return False, handled_ids + except Exception as e: + get_logger().error(f"Error processing notification: {e}", artifact={"traceback": traceback.format_exc()}) + return False, handled_ids + + async def polling_loop(): """ @@ -35,8 +148,8 @@ async def polling_loop(): last_modified = [None] git_provider = get_git_provider()() user_id = git_provider.get_user_id() - agent = PRAgent() get_settings().set("CONFIG.PUBLISH_OUTPUT_PROGRESS", False) + get_settings().set("pr_description.publish_description_as_comment", True) try: deployment_type = get_settings().github.deployment_type @@ -74,46 +187,53 @@ async def polling_loop(): notifications = await response.json() if not notifications: continue + get_logger().info(f"Received {len(notifications)} notifications") + task_queue = deque() for notification in notifications: + if not notification: + continue + # mark notification as read + await mark_notification_as_read(headers, notification, session) + handled_ids.add(notification['id']) - if 'reason' in notification and notification['reason'] == 'mention': - if 'subject' in notification and notification['subject']['type'] == 'PullRequest': - pr_url = notification['subject']['url'] - latest_comment = notification['subject']['latest_comment_url'] - async with session.get(latest_comment, headers=headers) as comment_response: - if comment_response.status == 200: - comment = await comment_response.json() - if 'id' in comment: - if comment['id'] in handled_ids: - continue - else: - handled_ids.add(comment['id']) - if 'user' in comment and 'login' in comment['user']: - if comment['user']['login'] == user_id: - continue - comment_body = comment['body'] if 'body' in comment else '' - commenter_github_user = comment['user']['login'] \ - if 'user' in comment else '' - logging.info(f"Commenter: {commenter_github_user}\nComment: {comment_body}") - user_tag = "@" + user_id - if user_tag not in comment_body: - continue - rest_of_comment = comment_body.split(user_tag)[1].strip() - comment_id = comment['id'] - git_provider.set_pr(pr_url) - success = await agent.handle_request(pr_url, rest_of_comment, - notify=lambda: git_provider.add_eyes_reaction(comment_id)) # noqa E501 - if not success: - git_provider.set_pr(pr_url) - git_provider.publish_comment("### How to use PR-Agent\n" + - bot_help_text(user_id)) + output = await is_valid_notification(notification, headers, handled_ids, session, user_id) + if output[0]: + _, handled_ids, comment, comment_body, pr_url, user_tag = output + rest_of_comment = comment_body.split(user_tag)[1].strip() + comment_id = comment['id'] + + # Add to the task queue + get_logger().info( + f"Adding comment processing to task queue for PR, {pr_url}, comment_body: 
{comment_body}") + task_queue.append((process_comment_sync, (pr_url, rest_of_comment, comment_id))) + get_logger().info(f"Queued comment processing for PR: {pr_url}") + else: + get_logger().debug(f"Skipping comment processing for PR") + + max_allowed_parallel_tasks = 10 + if task_queue: + processes = [] + for i, (func, args) in enumerate(task_queue): # Create parallel tasks + p = multiprocessing.Process(target=func, args=args) + processes.append(p) + p.start() + if i > max_allowed_parallel_tasks: + get_logger().error( + f"Dropping {len(task_queue) - max_allowed_parallel_tasks} tasks from polling session") + break + task_queue.clear() + + # Dont wait for all processes to complete. Move on to the next iteration + # for p in processes: + # p.join() elif response.status != 304: print(f"Failed to fetch notifications. Status code: {response.status}") except Exception as e: - logging.error(f"Exception during processing of a notification: {e}") + get_logger().error(f"Polling exception during processing of a notification: {e}", + artifact={"traceback": traceback.format_exc()}) if __name__ == '__main__': - asyncio.run(polling_loop()) \ No newline at end of file + asyncio.run(polling_loop()) diff --git a/pr_agent/servers/gitlab_webhook.py b/pr_agent/servers/gitlab_webhook.py index c9b623f71..15d4fa418 100644 --- a/pr_agent/servers/gitlab_webhook.py +++ b/pr_agent/servers/gitlab_webhook.py @@ -1,45 +1,269 @@ -import logging +import copy +import re +import json +from datetime import datetime import uvicorn from fastapi import APIRouter, FastAPI, Request, status from fastapi.encoders import jsonable_encoder from fastapi.responses import JSONResponse from starlette.background import BackgroundTasks +from starlette.middleware import Middleware +from starlette_context import context +from starlette_context.middleware import RawContextMiddleware from pr_agent.agent.pr_agent import PRAgent -from pr_agent.config_loader import get_settings +from pr_agent.algo.utils import update_settings_from_args +from pr_agent.config_loader import get_settings, global_settings +from pr_agent.git_providers.utils import apply_repo_settings +from pr_agent.log import LoggingFormat, get_logger, setup_logger +from pr_agent.secret_providers import get_secret_provider -app = FastAPI() +setup_logger(fmt=LoggingFormat.JSON, level="DEBUG") router = APIRouter() +secret_provider = get_secret_provider() if get_settings().get("CONFIG.SECRET_PROVIDER") else None + + +async def get_mr_url_from_commit_sha(commit_sha, gitlab_token, project_id): + try: + import requests + headers = { + 'Private-Token': f'{gitlab_token}' + } + # API endpoint to find MRs containing the commit + gitlab_url = get_settings().get("GITLAB.URL", 'https://gitlab.com') + response = requests.get( + f'{gitlab_url}/api/v4/projects/{project_id}/repository/commits/{commit_sha}/merge_requests', + headers=headers + ) + merge_requests = response.json() + if merge_requests and response.status_code == 200: + pr_url = merge_requests[0]['web_url'] + return pr_url + else: + get_logger().info(f"No merge requests found for commit: {commit_sha}") + return None + except Exception as e: + get_logger().error(f"Failed to get MR url from commit sha: {e}") + return None + +async def handle_request(api_url: str, body: str, log_context: dict, sender_id: str): + log_context["action"] = body + log_context["event"] = "pull_request" if body == "/review" else "comment" + log_context["api_url"] = api_url + log_context["app_name"] = get_settings().get("CONFIG.APP_NAME", "Unknown") + + with 
get_logger().contextualize(**log_context): + await PRAgent().handle_request(api_url, body) + + +async def _perform_commands_gitlab(commands_conf: str, agent: PRAgent, api_url: str, + log_context: dict): + apply_repo_settings(api_url) + commands = get_settings().get(f"gitlab.{commands_conf}", {}) + get_settings().set("config.is_auto_command", True) + for command in commands: + try: + split_command = command.split(" ") + command = split_command[0] + args = split_command[1:] + other_args = update_settings_from_args(args) + new_command = ' '.join([command] + other_args) + get_logger().info(f"Performing command: {new_command}") + with get_logger().contextualize(**log_context): + await agent.handle_request(api_url, new_command) + except Exception as e: + get_logger().error(f"Failed to perform command {command}: {e}") + + +def is_bot_user(data) -> bool: + try: + # logic to ignore bot users (unlike Github, no direct flag for bot users in gitlab) + sender_name = data.get("user", {}).get("name", "unknown").lower() + bot_indicators = ['codium', 'bot_', 'bot-', '_bot', '-bot'] + if any(indicator in sender_name for indicator in bot_indicators): + get_logger().info(f"Skipping GitLab bot user: {sender_name}") + return True + except Exception as e: + get_logger().error(f"Failed 'is_bot_user' logic: {e}") + return False + + +def should_process_pr_logic(data, title) -> bool: + try: + # logic to ignore MRs for titles, labels and source, target branches. + ignore_mr_title = get_settings().get("CONFIG.IGNORE_PR_TITLE", []) + ignore_mr_labels = get_settings().get("CONFIG.IGNORE_PR_LABELS", []) + ignore_mr_source_branches = get_settings().get("CONFIG.IGNORE_PR_SOURCE_BRANCHES", []) + ignore_mr_target_branches = get_settings().get("CONFIG.IGNORE_PR_TARGET_BRANCHES", []) + + # + if ignore_mr_source_branches: + source_branch = data['object_attributes'].get('source_branch') + if any(re.search(regex, source_branch) for regex in ignore_mr_source_branches): + get_logger().info( + f"Ignoring MR with source branch '{source_branch}' due to gitlab.ignore_mr_source_branches settings") + return False + + if ignore_mr_target_branches: + target_branch = data['object_attributes'].get('target_branch') + if any(re.search(regex, target_branch) for regex in ignore_mr_target_branches): + get_logger().info( + f"Ignoring MR with target branch '{target_branch}' due to gitlab.ignore_mr_target_branches settings") + return False + + if ignore_mr_labels: + labels = [label['title'] for label in data['object_attributes'].get('labels', [])] + if any(label in ignore_mr_labels for label in labels): + labels_str = ", ".join(labels) + get_logger().info(f"Ignoring MR with labels '{labels_str}' due to gitlab.ignore_mr_labels settings") + return False + + if ignore_mr_title: + if any(re.search(regex, title) for regex in ignore_mr_title): + get_logger().info(f"Ignoring MR with title '{title}' due to gitlab.ignore_mr_title settings") + return False + except Exception as e: + get_logger().error(f"Failed 'should_process_pr_logic': {e}") + return True + @router.post("/webhook") async def gitlab_webhook(background_tasks: BackgroundTasks, request: Request): - data = await request.json() - if data.get('object_kind') == 'merge_request' and data['object_attributes'].get('action') in ['open', 'reopen']: - logging.info(f"A merge request has been opened: {data['object_attributes'].get('title')}") - url = data['object_attributes'].get('url') - background_tasks.add_task(PRAgent().handle_request, url, "/review") - elif data.get('object_kind') == 'note' and 
data['event_type'] == 'note': - if 'merge_request' in data: - mr = data['merge_request'] - url = mr.get('url') - body = data.get('object_attributes', {}).get('note') - background_tasks.add_task(PRAgent().handle_request, url, body) + start_time = datetime.now() + request_json = await request.json() + + async def inner(data: dict): + log_context = {"server_type": "gitlab_app"} + get_logger().debug("Received a GitLab webhook") + if request.headers.get("X-Gitlab-Token") and secret_provider: + request_token = request.headers.get("X-Gitlab-Token") + secret = secret_provider.get_secret(request_token) + if not secret: + get_logger().warning(f"Empty secret retrieved, request_token: {request_token}") + return JSONResponse(status_code=status.HTTP_401_UNAUTHORIZED, + content=jsonable_encoder({"message": "unauthorized"})) + try: + secret_dict = json.loads(secret) + gitlab_token = secret_dict["gitlab_token"] + log_context["token_id"] = secret_dict.get("token_name", secret_dict.get("id", "unknown")) + context["settings"] = copy.deepcopy(global_settings) + context["settings"].gitlab.personal_access_token = gitlab_token + except Exception as e: + get_logger().error(f"Failed to validate secret {request_token}: {e}") + return JSONResponse(status_code=status.HTTP_401_UNAUTHORIZED, content=jsonable_encoder({"message": "unauthorized"})) + elif get_settings().get("GITLAB.SHARED_SECRET"): + secret = get_settings().get("GITLAB.SHARED_SECRET") + if not request.headers.get("X-Gitlab-Token") == secret: + get_logger().error("Failed to validate secret") + return JSONResponse(status_code=status.HTTP_401_UNAUTHORIZED, content=jsonable_encoder({"message": "unauthorized"})) + else: + get_logger().error("Failed to validate secret") + return JSONResponse(status_code=status.HTTP_401_UNAUTHORIZED, content=jsonable_encoder({"message": "unauthorized"})) + gitlab_token = get_settings().get("GITLAB.PERSONAL_ACCESS_TOKEN", None) + if not gitlab_token: + get_logger().error("No gitlab token found") + return JSONResponse(status_code=status.HTTP_401_UNAUTHORIZED, content=jsonable_encoder({"message": "unauthorized"})) + + get_logger().info("GitLab data", artifact=data) + sender = data.get("user", {}).get("username", "unknown") + sender_id = data.get("user", {}).get("id", "unknown") + + # ignore bot users + if is_bot_user(data): + return JSONResponse(status_code=status.HTTP_200_OK, content=jsonable_encoder({"message": "success"})) + if data.get('event_type') != 'note' and data.get('object_attributes', {}): # not a comment + # ignore MRs based on title, labels, source and target branches + if not should_process_pr_logic(data, data['object_attributes'].get('title')): + return JSONResponse(status_code=status.HTTP_200_OK, content=jsonable_encoder({"message": "success"})) + + log_context["sender"] = sender + if data.get('object_kind') == 'merge_request' and data['object_attributes'].get('action') in ['open', 'reopen']: + title = data['object_attributes'].get('title') + url = data['object_attributes'].get('url') + draft = data['object_attributes'].get('draft') + get_logger().info(f"New merge request: {url}") + if draft: + get_logger().info(f"Skipping draft MR: {url}") + return JSONResponse(status_code=status.HTTP_200_OK, content=jsonable_encoder({"message": "success"})) + + await _perform_commands_gitlab("pr_commands", PRAgent(), url, log_context) + elif data.get('object_kind') == 'note' and data.get('event_type') == 'note': # comment on MR + if 'merge_request' in data: + mr = data['merge_request'] + url = mr.get('url') + + 
get_logger().info(f"A comment has been added to a merge request: {url}") + body = data.get('object_attributes', {}).get('note') + if data.get('object_attributes', {}).get('type') == 'DiffNote' and '/ask' in body: # /ask_line + body = handle_ask_line(body, data) + + await handle_request(url, body, log_context, sender_id) + elif data.get('object_kind') == 'push' and data.get('event_name') == 'push': + try: + project_id = data['project_id'] + commit_sha = data['checkout_sha'] + url = await get_mr_url_from_commit_sha(commit_sha, gitlab_token, project_id) + if not url: + get_logger().info(f"No MR found for commit: {commit_sha}") + return JSONResponse(status_code=status.HTTP_200_OK, + content=jsonable_encoder({"message": "success"})) + + # we need first to apply_repo_settings + apply_repo_settings(url) + commands_on_push = get_settings().get(f"gitlab.push_commands", {}) + handle_push_trigger = get_settings().get(f"gitlab.handle_push_trigger", False) + if not commands_on_push or not handle_push_trigger: + get_logger().info("Push event, but no push commands found or push trigger is disabled") + return JSONResponse(status_code=status.HTTP_200_OK, + content=jsonable_encoder({"message": "success"})) + + get_logger().debug(f'A push event has been received: {url}') + await _perform_commands_gitlab("push_commands", PRAgent(), url, log_context) + except Exception as e: + get_logger().error(f"Failed to handle push event: {e}") + + background_tasks.add_task(inner, request_json) + end_time = datetime.now() + get_logger().info(f"Processing time: {end_time - start_time}", request=request_json) return JSONResponse(status_code=status.HTTP_200_OK, content=jsonable_encoder({"message": "success"})) -def start(): - gitlab_url = get_settings().get("GITLAB.URL", None) - if not gitlab_url: - raise ValueError("GITLAB.URL is not set") - gitlab_token = get_settings().get("GITLAB.PERSONAL_ACCESS_TOKEN", None) - if not gitlab_token: - raise ValueError("GITLAB.PERSONAL_ACCESS_TOKEN is not set") - get_settings().config.git_provider = "gitlab" - app = FastAPI() - app.include_router(router) +def handle_ask_line(body, data): + try: + line_range_ = data['object_attributes']['position']['line_range'] + # if line_range_['start']['type'] == 'new': + start_line = line_range_['start']['new_line'] + end_line = line_range_['end']['new_line'] + # else: + # start_line = line_range_['start']['old_line'] + # end_line = line_range_['end']['old_line'] + question = body.replace('/ask', '').strip() + path = data['object_attributes']['position']['new_path'] + side = 'RIGHT' # if line_range_['start']['type'] == 'new' else 'LEFT' + comment_id = data['object_attributes']["discussion_id"] + get_logger().info("Handling line comment") + body = f"/ask_line --line_start={start_line} --line_end={end_line} --side={side} --file_name={path} --comment_id={comment_id} {question}" + except Exception as e: + get_logger().error(f"Failed to handle ask line comment: {e}") + return body + + +@router.get("/") +async def root(): + return {"status": "ok"} +gitlab_url = get_settings().get("GITLAB.URL", None) +if not gitlab_url: + raise ValueError("GITLAB.URL is not set") +get_settings().config.git_provider = "gitlab" +middleware = [Middleware(RawContextMiddleware)] +app = FastAPI(middleware=middleware) +app.include_router(router) + + +def start(): uvicorn.run(app, host="0.0.0.0", port=3000) diff --git a/pr_agent/servers/gunicorn_config.py b/pr_agent/servers/gunicorn_config.py new file mode 100644 index 000000000..1b4034bf1 --- /dev/null +++ 
b/pr_agent/servers/gunicorn_config.py @@ -0,0 +1,191 @@ +import multiprocessing +import os + +# from prometheus_client import multiprocess + +# Sample Gunicorn configuration file. + +# +# Server socket +# +# bind - The socket to bind. +# +# A string of the form: 'HOST', 'HOST:PORT', 'unix:PATH'. +# An IP is a valid HOST. +# +# backlog - The number of pending connections. This refers +# to the number of clients that can be waiting to be +# served. Exceeding this number results in the client +# getting an error when attempting to connect. It should +# only affect servers under significant load. +# +# Must be a positive integer. Generally set in the 64-2048 +# range. +# + +# bind = '0.0.0.0:5000' +bind = '0.0.0.0:3000' +backlog = 2048 + +# +# Worker processes +# +# workers - The number of worker processes that this server +# should keep alive for handling requests. +# +# A positive integer generally in the 2-4 x $(NUM_CORES) +# range. You'll want to vary this a bit to find the best +# for your particular application's work load. +# +# worker_class - The type of workers to use. The default +# sync class should handle most 'normal' types of work +# loads. You'll want to read +# http://docs.gunicorn.org/en/latest/design.html#choosing-a-worker-type +# for information on when you might want to choose one +# of the other worker classes. +# +# A string referring to a Python path to a subclass of +# gunicorn.workers.base.Worker. The default provided values +# can be seen at +# http://docs.gunicorn.org/en/latest/settings.html#worker-class +# +# worker_connections - For the eventlet and gevent worker classes +# this limits the maximum number of simultaneous clients that +# a single process can handle. +# +# A positive integer generally set to around 1000. +# +# timeout - If a worker does not notify the master process in this +# number of seconds it is killed and a new worker is spawned +# to replace it. +# +# Generally set to thirty seconds. Only set this noticeably +# higher if you're sure of the repercussions for sync workers. +# For the non sync workers it just means that the worker +# process is still communicating and is not tied to the length +# of time required to handle a single request. +# +# keepalive - The number of seconds to wait for the next request +# on a Keep-Alive HTTP connection. +# +# A positive integer. Generally set in the 1-5 seconds range. +# + +if os.getenv('GUNICORN_WORKERS', None): + workers = int(os.getenv('GUNICORN_WORKERS')) +else: + cores = multiprocessing.cpu_count() + workers = cores * 2 + 1 +worker_connections = 1000 +timeout = 240 +keepalive = 2 + +# +# spew - Install a trace function that spews every line of Python +# that is executed when running the server. This is the +# nuclear option. +# +# True or False +# + +spew = False + +# +# Server mechanics +# +# daemon - Detach the main Gunicorn process from the controlling +# terminal with a standard fork/fork sequence. +# +# True or False +# +# raw_env - Pass environment variables to the execution environment. +# +# pidfile - The path to a pid file to write +# +# A path string or None to not write a pid file. +# +# user - Switch worker processes to run as this user. +# +# A valid user id (as an integer) or the name of a user that +# can be retrieved with a call to pwd.getpwnam(value) or None +# to not change the worker process user. +# +# group - Switch worker process to run as this group. 
+# +# A valid group id (as an integer) or the name of a user that +# can be retrieved with a call to pwd.getgrnam(value) or None +# to change the worker processes group. +# +# umask - A mask for file permissions written by Gunicorn. Note that +# this affects unix socket permissions. +# +# A valid value for the os.umask(mode) call or a string +# compatible with int(value, 0) (0 means Python guesses +# the base, so values like "0", "0xFF", "0022" are valid +# for decimal, hex, and octal representations) +# +# tmp_upload_dir - A directory to store temporary request data when +# requests are read. This will most likely be disappearing soon. +# +# A path to a directory where the process owner can write. Or +# None to signal that Python should choose one on its own. +# + +daemon = False +raw_env = [] +pidfile = None +umask = 0 +user = None +group = None +tmp_upload_dir = None + +# +# Logging +# +# logfile - The path to a log file to write to. +# +# A path string. "-" means log to stdout. +# +# loglevel - The granularity of log output +# +# A string of "debug", "info", "warning", "error", "critical" +# + +errorlog = '-' +loglevel = 'info' +accesslog = None +access_log_format = '%(h)s %(l)s %(u)s %(t)s "%(r)s" %(s)s %(b)s "%(f)s" "%(a)s"' + +# +# Process naming +# +# proc_name - A base to use with setproctitle to change the way +# that Gunicorn processes are reported in the system process +# table. This affects things like 'ps' and 'top'. If you're +# going to be running more than one instance of Gunicorn you'll +# probably want to set a name to tell them apart. This requires +# that you install the setproctitle module. +# +# A string or None to choose a default of something like 'gunicorn'. +# + +proc_name = None + + +# +# Server hooks +# +# post_fork - Called just after a worker has been forked. +# +# A callable that takes a server and worker instance +# as arguments. +# +# pre_fork - Called just prior to forking the worker subprocess. +# +# A callable that accepts the same arguments as after_fork +# +# pre_exec - Called just prior to forking off a secondary +# master process during things like config reloading. +# +# A callable that takes a server instance as the sole argument. +# diff --git a/pr_agent/servers/help.py b/pr_agent/servers/help.py index 1ee7fc4d5..5578fb0f5 100644 --- a/pr_agent/servers/help.py +++ b/pr_agent/servers/help.py @@ -1,17 +1,203 @@ -commands_text = "> **/review [-i]**: Request a review of your Pull Request. For an incremental review, which only " \ - "considers changes since the last review, include the '-i' option.\n" \ - "> **/describe**: Modify the PR title and description based on the contents of the PR.\n" \ - "> **/improve**: Suggest improvements to the code in the PR. \n" \ - "> **/ask \\**: Pose a question about the PR.\n" \ - "> **/update_changelog**: Update the changelog based on the PR's contents.\n\n" \ - ">To edit any configuration parameter from **configuration.toml**, add --config_path=new_value\n" \ - ">For example: /review --pr_reviewer.extra_instructions=\"focus on the file: ...\" \n" \ - ">To list the possible configuration parameters, use the **/config** command.\n" \ +class HelpMessage: + @staticmethod + def get_general_commands_text(): + commands_text = "> - **/review**: Request a review of your Pull Request. \n" \ + "> - **/describe**: Update the PR title and description based on the contents of the PR. \n" \ + "> - **/improve [--extended]**: Suggest code improvements. Extended mode provides a higher quality feedback. 
\n" \ + "> - **/ask \\**: Ask a question about the PR. \n" \ + "> - **/update_changelog**: Update the changelog based on the PR's contents. \n" \ + "> - **/add_docs** ๐Ÿ’Ž: Generate docstring for new components introduced in the PR. \n" \ + "> - **/generate_labels** ๐Ÿ’Ž: Generate labels for the PR based on the PR's contents. \n" \ + "> - **/analyze** ๐Ÿ’Ž: Automatically analyzes the PR, and presents changes walkthrough for each component. \n\n" \ + ">See the [tools guide](https://pr-agent-docs.codium.ai/tools/) for more details.\n" \ + ">To list the possible configuration parameters, add a **/config** comment. \n" + return commands_text -def bot_help_text(user: str): - return f"> Tag me in a comment '@{user}' and add one of the following commands:\n" + commands_text + @staticmethod + def get_general_bot_help_text(): + output = f"> To invoke the PR-Agent, add a comment using one of the following commands: \n{HelpMessage.get_general_commands_text()} \n" + return output + @staticmethod + def get_review_usage_guide(): + output ="**Overview:**\n" + output +=("The `review` tool scans the PR code changes, and generates a PR review which includes several types of feedbacks, such as possible PR issues, security threats and relevant test in the PR. More feedbacks can be [added](https://pr-agent-docs.codium.ai/tools/review/#general-configurations) by configuring the tool.\n\n" + "The tool can be triggered [automatically](https://pr-agent-docs.codium.ai/usage-guide/automations_and_usage/#github-app-automatic-tools-when-a-new-pr-is-opened) every time a new PR is opened, or can be invoked manually by commenting on any PR.\n") + output +="""\ +- When commenting, to edit [configurations](https://github.com/Codium-ai/pr-agent/blob/main/pr_agent/settings/configuration.toml#L23) related to the review tool (`pr_reviewer` section), use the following template: +``` +/review --pr_reviewer.some_config1=... --pr_reviewer.some_config2=... +``` +- With a [configuration file](https://pr-agent-docs.codium.ai/usage-guide/configuration_options/), use the following template: +``` +[pr_reviewer] +some_config1=... +some_config2=... +``` + """ -actions_help_text = "> To invoke the PR-Agent, add a comment using one of the following commands:\n" + \ - commands_text + output += f"\n\nSee the review [usage page](https://pr-agent-docs.codium.ai/tools/review/) for a comprehensive guide on using this tool.\n\n" + + return output + + + + @staticmethod + def get_describe_usage_guide(): + output = "**Overview:**\n" + output += "The `describe` tool scans the PR code changes, and generates a description for the PR - title, type, summary, walkthrough and labels. " + output += "The tool can be triggered [automatically](https://pr-agent-docs.codium.ai/usage-guide/automations_and_usage/#github-app-automatic-tools-when-a-new-pr-is-opened) every time a new PR is opened, or can be invoked manually by commenting on a PR.\n" + output += """\ + +When commenting, to edit [configurations](https://github.com/Codium-ai/pr-agent/blob/main/pr_agent/settings/configuration.toml#L46) related to the describe tool (`pr_description` section), use the following template: +``` +/describe --pr_description.some_config1=... --pr_description.some_config2=... +``` +With a [configuration file](https://pr-agent-docs.codium.ai/usage-guide/configuration_options/), use the following template: +``` +[pr_description] +some_config1=... +some_config2=... 
+``` +""" + output += "\n\n<table>" + + # automation + output += "<tr><td><details> <summary><strong> Enabling\\disabling automation</strong></summary><hr>
    \n\n" + output += """\ +- When you first install the app, the [default mode](https://pr-agent-docs.codium.ai/usage-guide/automations_and_usage/#github-app-automatic-tools-when-a-new-pr-is-opened) for the describe tool is: +``` +pr_commands = ["/describe", ...] +``` +meaning the `describe` tool will run automatically on every PR. + +- Markers are an alternative way to control the generated description, to give maximal control to the user. If you set: +``` +pr_commands = ["/describe --pr_description.use_description_markers=true", ...] +``` +the tool will replace every marker of the form `pr_agent:marker_name` in the PR description with the relevant content, where `marker_name` is one of the following: + - `type`: the PR type. + - `summary`: the PR summary. + - `walkthrough`: the PR walkthrough. + +Note that when markers are enabled, if the original PR description does not contain any markers, the tool will not alter the description at all. + +""" + output += "\n\n
    Custom labels
    \n\n" + output += """\ +The default labels of the `describe` tool are quite generic: [`Bug fix`, `Tests`, `Enhancement`, `Documentation`, `Other`]. + +If you specify [custom labels](https://pr-agent-docs.codium.ai/tools/describe/#handle-custom-labels-from-the-repos-labels-page) in the repo's labels page or via configuration file, you can get tailored labels for your use cases. +Examples for custom labels: +- `Main topic:performance` - pr_agent:The main topic of this PR is performance +- `New endpoint` - pr_agent:A new endpoint was added in this PR +- `SQL query` - pr_agent:A new SQL query was added in this PR +- `Dockerfile changes` - pr_agent:The PR contains changes in the Dockerfile +- ... + +The list above is eclectic, and aims to give an idea of different possibilities. Define custom labels that are relevant for your repo and use cases. +Note that Labels are not mutually exclusive, so you can add multiple label categories. +Make sure to provide proper title, and a detailed and well-phrased description for each label, so the tool will know when to suggest it. +""" + output += "\n\n
    Inline File Walkthrough ๐Ÿ’Ž
    \n\n" + output += """\ +For enhanced user experience, the `describe` tool can add file summaries directly to the "Files changed" tab in the PR page. +This will enable you to quickly understand the changes in each file, while reviewing the code changes (diffs). + +To enable inline file summary, set `pr_description.inline_file_summary` in the configuration file, possible values are: +- `'table'`: File changes walkthrough table will be displayed on the top of the "Files changed" tab, in addition to the "Conversation" tab. +- `true`: A collapsable file comment with changes title and a changes summary for each file in the PR. +- `false` (default): File changes walkthrough will be added only to the "Conversation" tab. +""" + + # extra instructions + output += "
    Utilizing extra instructions
    \n\n" + output += '''\ +The `describe` tool can be configured with extra instructions, to guide the model to a feedback tailored to the needs of your project. + +Be specific, clear, and concise in the instructions. With extra instructions, you are the prompter. Notice that the general structure of the description is fixed, and cannot be changed. Extra instructions can change the content or style of each sub-section of the PR description. + +Examples for extra instructions: +``` +[pr_description] +extra_instructions="""\ +- The PR title should be in the format: ': ' +- The title should be short and concise (up to 10 words) +- ... +""" +``` +Use triple quotes to write multi-line instructions. Use bullet points to make the instructions more readable. +''' + output += "\n\n</details></td></tr>\n\n" + + + # general + output += "\n\n<tr><td><details> <summary><strong> More PR-Agent commands</strong></summary><hr> \n\n" + output += HelpMessage.get_general_bot_help_text() + output += "\n\n</details></td></tr>\n\n" + + output += "</table>" + + output += f"\n\nSee the [describe usage](https://pr-agent-docs.codium.ai/tools/describe/) page for a comprehensive guide on using this tool.\n\n" + + return output + + @staticmethod + def get_ask_usage_guide(): + output = "**Overview:**\n" + output += """\ +The `ask` tool answers questions about the PR, based on the PR code changes. +It can be invoked manually by commenting on any PR: +``` +/ask "..." +``` + +Note that the tool does not have "memory" of previous questions, and answers each question independently. +You can ask questions about the entire PR, about specific code lines, or about an image related to the PR code changes. + """ + # output += "\n\n<table>" + # + # # # general + # # output += "\n\n<tr><td><details> <summary><strong> More PR-Agent commands</strong></summary><hr> \n\n" + # # output += HelpMessage.get_general_bot_help_text() + # # output += "\n\n</details></td></tr>\n\n" + # + # output += "</table>" + + output += f"\n\nSee the [ask usage](https://pr-agent-docs.codium.ai/tools/ask/) page for a comprehensive guide on using this tool.\n\n" + + return output + + + @staticmethod + def get_improve_usage_guide(): + output = "**Overview:**\n" + output += "The code suggestions tool, named `improve`, scans the PR code changes, and automatically generates code suggestions for improving the PR." + output += "The tool can be triggered [automatically](https://pr-agent-docs.codium.ai/usage-guide/automations_and_usage/#github-app-automatic-tools-when-a-new-pr-is-opened) every time a new PR is opened, or can be invoked manually by commenting on a PR.\n" + output += """\ +- When commenting, to edit [configurations](https://github.com/Codium-ai/pr-agent/blob/main/pr_agent/settings/configuration.toml#L78) related to the improve tool (`pr_code_suggestions` section), use the following template: + +``` +/improve --pr_code_suggestions.some_config1=... --pr_code_suggestions.some_config2=... +``` + +- With a [configuration file](https://pr-agent-docs.codium.ai/usage-guide/configuration_options/), use the following template: + +``` +[pr_code_suggestions] +some_config1=... +some_config2=... 
+``` + +""" + + output += f"\n\nSee the improve [usage page](https://pr-agent-docs.codium.ai/tools/improve/) for a comprehensive guide on using this tool.\n\n" + + return output \ No newline at end of file diff --git a/pr_agent/servers/serverless.py b/pr_agent/servers/serverless.py index 421784313..c0bce6067 100644 --- a/pr_agent/servers/serverless.py +++ b/pr_agent/servers/serverless.py @@ -1,14 +1,13 @@ -import logging - from fastapi import FastAPI from mangum import Mangum +from starlette.middleware import Middleware +from starlette_context.middleware import RawContextMiddleware from pr_agent.servers.github_app import router -logger = logging.getLogger() -logger.setLevel(logging.DEBUG) -app = FastAPI() +middleware = [Middleware(RawContextMiddleware)] +app = FastAPI(middleware=middleware) app.include_router(router) handler = Mangum(app, lifespan="off") diff --git a/pr_agent/servers/utils.py b/pr_agent/servers/utils.py index c24b880ca..12dd85ae8 100644 --- a/pr_agent/servers/utils.py +++ b/pr_agent/servers/utils.py @@ -1,5 +1,8 @@ import hashlib import hmac +import time +from collections import defaultdict +from typing import Callable, Any from fastapi import HTTPException @@ -25,3 +28,59 @@ def verify_signature(payload_body, secret_token, signature_header): class RateLimitExceeded(Exception): """Raised when the git provider API rate limit has been exceeded.""" pass + + +class DefaultDictWithTimeout(defaultdict): + """A defaultdict with a time-to-live (TTL).""" + + def __init__( + self, + default_factory: Callable[[], Any] = None, + ttl: int = None, + refresh_interval: int = 60, + update_key_time_on_get: bool = True, + *args, + **kwargs, + ): + """ + Args: + default_factory: The default factory to use for keys that are not in the dictionary. + ttl: The time-to-live (TTL) in seconds. + refresh_interval: How often to refresh the dict and delete items older than the TTL. + update_key_time_on_get: Whether to update the access time of a key also on get (or only when set). + """ + super().__init__(default_factory, *args, **kwargs) + self.__key_times = dict() + self.__ttl = ttl + self.__refresh_interval = refresh_interval + self.__update_key_time_on_get = update_key_time_on_get + self.__last_refresh = self.__time() - self.__refresh_interval + + @staticmethod + def __time(): + return time.monotonic() + + def __refresh(self): + if self.__ttl is None: + return + request_time = self.__time() + if request_time - self.__last_refresh > self.__refresh_interval: + return + to_delete = [key for key, key_time in self.__key_times.items() if request_time - key_time > self.__ttl] + for key in to_delete: + del self[key] + self.__last_refresh = request_time + + def __getitem__(self, __key): + if self.__update_key_time_on_get: + self.__key_times[__key] = self.__time() + self.__refresh() + return super().__getitem__(__key) + + def __setitem__(self, __key, __value): + self.__key_times[__key] = self.__time() + return super().__setitem__(__key, __value) + + def __delitem__(self, __key): + del self.__key_times[__key] + return super().__delitem__(__key) diff --git a/pr_agent/settings/.secrets_template.toml b/pr_agent/settings/.secrets_template.toml index 36b529a6e..674a3221c 100644 --- a/pr_agent/settings/.secrets_template.toml +++ b/pr_agent/settings/.secrets_template.toml @@ -14,6 +14,11 @@ key = "" # Acquire through https://platform.openai.com #api_version = '2023-05-15' # Check Azure documentation for the current API version #api_base = "" # The base URL for your Azure OpenAI resource. e.g. 
"https://<your resource name>.openai.azure.com" #deployment_id = "" # The deployment name you chose when you deployed the engine +#fallback_deployments = [] # For each fallback model specified in configuration.toml in the [config] section, specify the appropriate deployment_id + +[pinecone] +api_key = "..." +environment = "gcp-starter" [anthropic] key = "" # Optional, uncomment if you want to use Anthropic. Acquire through https://www.anthropic.com/ @@ -23,10 +28,25 @@ key = "" # Optional, uncomment if you want to use Cohere. Acquire through https: [replicate] key = "" # Optional, uncomment if you want to use Replicate. Acquire through https://replicate.com/ + +[groq] +key = "" # Acquire through https://console.groq.com/keys + +[huggingface] +key = "" # Optional, uncomment if you want to use Huggingface Inference API. Acquire through https://huggingface.co/docs/api-inference/quicktour +api_base = "" # the base url for your huggingface inference endpoint + +[ollama] +api_base = "" # the base url for your local Llama 2, Code Llama, and other models inference endpoint. Acquire through https://ollama.ai/ + +[vertexai] +vertex_project = "" # the google cloud platform project name for your vertexai deployment +vertex_location = "" # the google cloud platform location for your vertexai deployment + [github] # ---- Set the following only for deployment type == "user" user_token = "" # A GitHub personal access token with 'repo' scope. -deployment_type = "user" #set to user by default +deployment_type = "user" #set to user by default # ---- Set the following only for deployment type == "app", see README for details. private_key = """\ @@ -42,5 +62,28 @@ webhook_secret = "<WEBHOOK SECRET>" # Optional, may be commented out. personal_access_token = "" [bitbucket] -# Bitbucket personal bearer token +# For Bitbucket personal/repository bearer token bearer_token = "" + +[bitbucket_server] +# For Bitbucket Server bearer token +bearer_token = "" +webhook_secret = "" + +# For Bitbucket app +app_key = "" +base_url = "" + +[litellm] +LITELLM_TOKEN = "" # see https://docs.litellm.ai/docs/debugging/hosted_debugging for details and instructions on how to get a token + +[azure_devops] +# For Azure devops personal access token +org = "" +pat = "" + +[azure_devops_server] +# For Azure devops Server basic auth - configured in the webhook creation +# Optional, uncomment if you want to use Azure devops webhooks. Value assinged when you create the webhook +# webhook_username = "<basic auth user>" +# webhook_password = "<basic auth password>" diff --git a/pr_agent/settings/configuration.toml b/pr_agent/settings/configuration.toml index 0c502df9f..761d6cd5c 100644 --- a/pr_agent/settings/configuration.toml +++ b/pr_agent/settings/configuration.toml @@ -1,61 +1,309 @@ [config] -model="gpt-4" -fallback_models=["gpt-3.5-turbo-16k"] +# models +model="gpt-4-turbo-2024-04-09" +model_turbo="gpt-4o-2024-08-06" +fallback_models=["gpt-4o-2024-05-13"] +# CLI git_provider="github" publish_output=true publish_output_progress=true verbosity_level=0 # 0,1,2 use_extra_bad_extensions=false +# Configurations +use_wiki_settings_file=true use_repo_settings_file=true -ai_timeout=180 +use_global_settings_file=true +ai_timeout=120 # 2minutes +skip_keys = [] +# token limits max_description_tokens = 500 max_commits_tokens = 500 +max_model_tokens = 32000 # Limits the maximum number of tokens that can be used by any model, regardless of the model's default capabilities. 
+custom_model_max_tokens=-1 # for models not in the default list +# patch extension logic +patch_extension_skip_types =[".md",".txt"] +allow_dynamic_context=true +max_extra_lines_before_dynamic_context = 8 # will try to include up to 8 extra lines before the hunk in the patch, until we reach an enclosing function or class +patch_extra_lines_before = 3 # Number of extra lines (+3 default ones) to include before each hunk in the patch +patch_extra_lines_after = 1 # Number of extra lines (+3 default ones) to include after each hunk in the patch +secret_provider="" +cli_mode=false +ai_disclaimer_title="" # Pro feature, title for a collapsible disclaimer to AI outputs +ai_disclaimer="" # Pro feature, full text for the AI disclaimer +output_relevant_configurations=false +large_patch_policy = "clip" # "clip", "skip" +# seed +seed=-1 # set positive value to fix the seed (and ensure temperature=0) +temperature=0.2 +# ignore logic +ignore_pr_title = ["^\\[Auto\\]", "^Auto"] # a list of regular expressions to match against the PR title to ignore the PR agent +ignore_pr_target_branches = [] # a list of regular expressions of target branches to ignore from PR agent when a PR is created +ignore_pr_source_branches = [] # a list of regular expressions of source branches to ignore from PR agent when a PR is created +ignore_pr_labels = [] # labels to ignore from PR agent when a PR is created +# +is_auto_command = false # will be auto-set to true if the command is triggered by an automation +enable_ai_metadata = false # will enable adding ai metadata [pr_reviewer] # /review # -require_focused_review=true +# enable/disable features require_score_review=false require_tests_review=true +require_estimate_effort_to_review=true +require_can_be_split_review=false require_security_review=true -num_code_suggestions=3 +# soc2 +require_soc2_ticket=false +soc2_ticket_prompt="Does the PR description include a link to ticket in a project management system (e.g., Jira, Asana, Trello, etc.) ?" +# general options +num_code_suggestions=0 inline_code_comments = false ask_and_reflect=false +#automatic_review=true +persistent_comment=true extra_instructions = "" +final_update_message = true +# review labels +enable_review_labels_security=true +enable_review_labels_effort=true +# specific configurations for incremental review (/review -i) +require_all_thresholds_for_incremental_review=false +minimal_commits_for_incremental_review=0 +minimal_minutes_for_incremental_review=0 +enable_help_text=false # Determines whether to include help text in the PR review. Disabled by default. 
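The `ignore_pr_title`, `ignore_pr_source_branches` and `ignore_pr_target_branches` settings above are consumed by the `should_process_pr_logic` helpers added earlier in this diff, which simply run `re.search` over each configured pattern. A minimal standalone sketch of that check (the branch pattern and the sample titles are illustrative values, not defaults from this patch):

```python
import re

# Illustrative values: ignore_pr_title mirrors the default above,
# the branch pattern is a hypothetical example.
ignore_pr_title = [r"^\[Auto\]", r"^Auto"]
ignore_pr_source_branches = [r"^releases?/"]

def should_process(title: str, source_branch: str) -> bool:
    # Same shape as should_process_pr_logic(): any match means "skip this PR".
    if any(re.search(regex, title) for regex in ignore_pr_title):
        return False
    if any(re.search(regex, source_branch) for regex in ignore_pr_source_branches):
        return False
    return True

print(should_process("[Auto] bump version", "feature/login"))  # False (title matches)
print(should_process("Add login endpoint", "release/1.2"))     # False (branch matches)
print(should_process("Add login endpoint", "feature/login"))   # True
```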
+# auto approval +enable_auto_approval=false +maximal_review_effort=5 + [pr_description] # /describe # -publish_description_as_comment=false +publish_labels=true +add_original_user_description=true +generate_ai_title=false +use_bullet_points=true extra_instructions = "" +enable_pr_type=true +final_update_message = true +enable_help_text=false +enable_help_comment=true +# describe as comment +publish_description_as_comment=false +publish_description_as_comment_persistent=true +## changes walkthrough section +enable_semantic_files_types=true +collapsible_file_list='adaptive' # true, false, 'adaptive' +inline_file_summary=false # false, true, 'table' +# markers +use_description_markers=false +include_generated_by_header=true +# large pr mode ๐Ÿ’Ž +enable_large_pr_handling=true +max_ai_calls=4 +async_ai_calls=true +mention_extra_files=true +#custom_labels = ['Bug fix', 'Tests', 'Bug fix with tests', 'Enhancement', 'Documentation', 'Other'] [pr_questions] # /ask # +enable_help_text=false + [pr_code_suggestions] # /improve # -num_code_suggestions=4 +max_context_tokens=14000 +commitable_code_suggestions = false +extra_instructions = "" +rank_suggestions = false +enable_help_text=false +enable_chat_text=true +enable_intro_text=true +persistent_comment=true +max_history_len=4 +# enable to apply suggestion ๐Ÿ’Ž +apply_suggestions_checkbox=true +# suggestions scoring +self_reflect_on_suggestions=true +suggestions_score_threshold=0 # [0-10]. highly recommend not to set this value above 8, since above it may clip highly relevant suggestions +# params for '/improve --extended' mode +auto_extended_mode=true +num_code_suggestions_per_chunk=4 +max_number_of_calls = 3 +parallel_calls = true +rank_extended_suggestions = false +final_clip_factor = 0.8 +# self-review checkbox +demand_code_suggestions_self_review=false # add a checkbox for the author to self-review the code suggestions +code_suggestions_self_review_text= "**Author self-review**: I have reviewed the PR code suggestions, and addressed the relevant ones." +approve_pr_on_self_review=false # Pro feature. if true, the PR will be auto-approved after the author clicks on the self-review checkbox +# Suggestion impact +publish_post_process_suggestion_impact=true + +[pr_custom_prompt] # /custom_prompt # +prompt = """\ +The code suggestions should focus only on the following: +- ... +- ... +... +""" +suggestions_score_threshold=0 +num_code_suggestions_per_chunk=4 +self_reflect_on_custom_suggestions=true +enable_help_text=false + + +[pr_add_docs] # /add_docs # extra_instructions = "" +docs_style = "Sphinx" # "Google Style with Args, Returns, Attributes...etc", "Numpy Style", "Sphinx Style", "PEP257", "reStructuredText" +file = "" # in case there are several components with the same name, you can specify the relevant file +class_name = "" # in case there are several methods with the same name in the same file, you can specify the relevant class name [pr_update_changelog] # /update_changelog # push_changelog_changes=false extra_instructions = "" +[pr_analyze] # /analyze # +enable_help_text=true + +[pr_test] # /test # +extra_instructions = "" +testing_framework = "" # specify the testing framework you want to use +num_tests=3 # number of tests to generate. max 5. 
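Each of the tool sections above can also be overridden inline when a command runs; `_perform_auto_commands_github` and `_perform_commands_gitlab` earlier in this diff split every configured command string on spaces, pass the flags through `update_settings_from_args`, and re-join the remainder. A minimal sketch of the splitting step only (the command string is an example; the real argument handling inside `update_settings_from_args` is not shown in this diff):

```python
# Example entry of the kind configured in the pr_commands / push_commands lists.
command = "/improve --pr_code_suggestions.suggestions_score_threshold=7"

split_command = command.split(" ")
tool = split_command[0]   # "/improve"
args = split_command[1:]  # ["--pr_code_suggestions.suggestions_score_threshold=7"]

# The servers above run the args through update_settings_from_args() and then
# re-join whatever is left before calling PRAgent.handle_request().
new_command = " ".join([tool] + args)
print(tool, args)
print(new_command)
```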
+avoid_mocks=true # if true, the generated tests will prefer to use real objects instead of mocks +file = "" # in case there are several components with the same name, you can specify the relevant file +class_name = "" # in case there are several methods with the same name in the same file, you can specify the relevant class name +enable_help_text=false + +[pr_improve_component] # /improve_component # +num_code_suggestions=4 +extra_instructions = "" +file = "" # in case there are several components with the same name, you can specify the relevant file +class_name = "" # in case there are several methods with the same name in the same file, you can specify the relevant class name + +[checks] # /checks (pro feature) # +enable_auto_checks_feedback=true +excluded_checks_list=["lint"] # list of checks to exclude, for example: ["check1", "check2"] +persistent_comment=true +enable_help_text=true +final_update_message = false + +[pr_help] # /help # +force_local_db=false +num_retrieved_snippets=5 + [pr_config] # /config # [github] # The type of deployment to create. Valid values are 'app' or 'user'. deployment_type = "user" ratelimit_retries = 5 +base_url = "https://api.github.com" +publish_inline_comments_fallback_with_verification = true +try_fix_invalid_inline_comments = true +app_name = "pr-agent" +ignore_bot_pr = true -[gitlab] -# URL to the gitlab service -url = "https://gitlab.com" +[github_action_config] +# auto_review = true # set as env var in .github/workflows/pr-agent.yaml +# auto_describe = true # set as env var in .github/workflows/pr-agent.yaml +# auto_improve = true # set as env var in .github/workflows/pr-agent.yaml +# pr_actions = ['opened', 'reopened', 'ready_for_review', 'review_requested'] -# Polling (either project id or namespace/project_name) syntax can be used -projects_to_monitor = ['org_name/repo_name'] +[github_app] +# these toggles allows running the github app from custom deployments +bot_user = "github-actions[bot]" +override_deployment_type = true +# settings for "pull_request" event +handle_pr_actions = ['opened', 'reopened', 'ready_for_review'] +pr_commands = [ + "/describe --pr_description.final_update_message=false", + "/review --pr_reviewer.num_code_suggestions=0", + "/improve", +] +# settings for "pull_request" event with "synchronize" action - used to detect and handle push triggers for new commits +handle_push_trigger = false +push_trigger_ignore_bot_commits = true +push_trigger_ignore_merge_commits = true +push_trigger_wait_for_initial_review = true +push_trigger_pending_tasks_backlog = true +push_trigger_pending_tasks_ttl = 300 +push_commands = [ + "/describe", + "/review --pr_reviewer.num_code_suggestions=0", +] -# Polling trigger -magic_word = "AutoReview" +[gitlab] +url = "https://gitlab.com" +pr_commands = [ + "/describe --pr_description.final_update_message=false", + "/review --pr_reviewer.num_code_suggestions=0", + "/improve", +] +handle_push_trigger = false +push_commands = [ + "/describe", + "/review --pr_reviewer.num_code_suggestions=0", +] -# Polling interval -polling_interval_seconds = 30 +[bitbucket_app] +pr_commands = [ + "/describe --pr_description.final_update_message=false", + "/review --pr_reviewer.num_code_suggestions=0", + "/improve --pr_code_suggestions.commitable_code_suggestions=true --pr_code_suggestions.suggestions_score_threshold=7", +] +avoid_full_files = false [local] # LocalGitProvider settings - uncomment to use paths other than default # description_path= "path/to/description.md" -# review_path= "path/to/review.md" \ No newline 
at end of file +# review_path= "path/to/review.md" + +[gerrit] +# endpoint to the gerrit service +# url = "ssh://gerrit.example.com:29418" +# user for gerrit authentication +# user = "ai-reviewer" +# patch server where patches will be saved +# patch_server_endpoint = "http://127.0.0.1:5000/patch" +# token to authenticate in the patch server +# patch_server_token = "" + +[bitbucket_server] +# URL to the BitBucket Server instance +# url = "https://git.bitbucket.com" +url = "" +pr_commands = [ + "/describe --pr_description.final_update_message=false", + "/review --pr_reviewer.num_code_suggestions=0", + "/improve --pr_code_suggestions.commitable_code_suggestions=true --pr_code_suggestions.suggestions_score_threshold=7", +] + +[litellm] +# use_client = false +# drop_params = false +enable_callbacks = false +success_callback = [] +failure_callback = [] +service_callback = [] + +[pr_similar_issue] +skip_comments = false +force_update_dataset = false +max_issues_to_scan = 500 +vectordb = "pinecone" + +[pr_find_similar_component] +class_name = "" +file = "" +search_from_org = false +allow_fallback_less_words = true +number_of_keywords = 5 +number_of_results = 5 + +[pinecone] +# fill and place in .secrets.toml +#api_key = ... +# environment = "gcp-starter" + +[lancedb] +uri = "./lancedb" +[best_practices] +content = "" +max_lines_allowed = 800 +enable_global_best_practices = false \ No newline at end of file diff --git a/pr_agent/settings/custom_labels.toml b/pr_agent/settings/custom_labels.toml new file mode 100644 index 000000000..ee45fb198 --- /dev/null +++ b/pr_agent/settings/custom_labels.toml @@ -0,0 +1,16 @@ +[config] +enable_custom_labels=false + +## template for custom labels +#[custom_labels."Bug fix"] +#description = """Fixes a bug in the code""" +#[custom_labels."Tests"] +#description = """Adds or modifies tests""" +#[custom_labels."Bug fix with tests"] +#description = """Fixes a bug in the code and adds or modifies tests""" +#[custom_labels."Enhancement"] +#description = """Adds new features or modifies existing ones""" +#[custom_labels."Documentation"] +#description = """Adds or modifies documentation""" +#[custom_labels."Other"] +#description = """Other changes that do not fit in any of the above categories""" \ No newline at end of file diff --git a/pr_agent/settings/ignore.toml b/pr_agent/settings/ignore.toml new file mode 100644 index 000000000..bc847cfcc --- /dev/null +++ b/pr_agent/settings/ignore.toml @@ -0,0 +1,12 @@ +[ignore] + +glob = [ + # Ignore files and directories matching these glob patterns. + # See https://docs.python.org/3/library/glob.html + 'vendor/**', +] +regex = [ + # Ignore files and directories matching these regex patterns. 
+ # See https://learnbyexample.github.io/python-regex-cheatsheet/ + # for example: regex = ['.*\.toml$'] +] diff --git a/pr_agent/settings/language_extensions.toml b/pr_agent/settings/language_extensions.toml index bff5d53f5..2f3e5542b 100644 --- a/pr_agent/settings/language_extensions.toml +++ b/pr_agent/settings/language_extensions.toml @@ -44,6 +44,7 @@ default = [ 'ss', 'svg', 'tar', + 'tgz', 'tsv', 'ttf', 'war', @@ -53,7 +54,8 @@ default = [ 'xz', 'zip', 'zst', - 'snap' + 'snap', + 'lockb' ] extra = [ 'md', @@ -61,6 +63,7 @@ extra = [ ] [language_extension_map_org] +"1C Enterprise" = ["*.bsl", ] ABAP = [".abap", ] "AGS Script" = [".ash", ] AMPL = [".ampl", ] @@ -432,3 +435,6 @@ reStructuredText = [".rst", ".rest", ".rest.txt", ".rst.txt", ] wisp = [".wisp", ] xBase = [".prg", ".prw", ] +[docs_blacklist_extensions] +# Disable docs for these extensions of text files and scripts that are not programming languages of function, classes and methods +docs_blacklist = ['sql', 'txt', 'yaml', 'json', 'xml', 'md', 'rst', 'rest', 'rest.txt', 'rst.txt', 'mdpolicy', 'mdown', 'markdown', 'mdwn', 'mkd', 'mkdn', 'mkdown', 'sh'] \ No newline at end of file diff --git a/pr_agent/settings/pr_add_docs.toml b/pr_agent/settings/pr_add_docs.toml new file mode 100644 index 000000000..c3f732ee1 --- /dev/null +++ b/pr_agent/settings/pr_add_docs.toml @@ -0,0 +1,126 @@ +[pr_add_docs_prompt] +system="""You are PR-Doc, a language model that specializes in generating documentation for code components in a Pull Request (PR). +Your task is to generate {{ docs_for_language }} for code components in the PR Diff. + + +Example for the PR Diff format: +====== +## File: 'src/file1.py' + +@@ -12,3 +12,4 @@ def func1(): +__new hunk__ +12 code line1 that remained unchanged in the PR +14 +new code line1 added in the PR +15 +new code line2 added in the PR +16 code line2 that remained unchanged in the PR +__old hunk__ + code line1 that remained unchanged in the PR +-code line that was removed in the PR + code line2 that remained unchanged in the PR + +@@ ... @@ def func2(): +__new hunk__ +... +__old hunk__ +... + + +## File: 'src/file2.py' +... +====== + + +Specific instructions: +- Try to identify edited/added code components (classes/functions/methods...) that are undocumented, and generate {{ docs_for_language }} for each one. +- If there are documented (any type of {{ language }} documentation) code components in the PR, Don't generate {{ docs_for_language }} for them. +- Ignore code components that don't appear fully in the '__new hunk__' section. For example, you must see the component header and body. +- Make sure the {{ docs_for_language }} starts and ends with standard {{ language }} {{ docs_for_language }} signs. +- The {{ docs_for_language }} should be in standard format. +- Provide the exact line number (inclusive) where the {{ docs_for_language }} should be added. + + +{%- if extra_instructions %} + +Extra instructions from the user: +====== +{{ extra_instructions }} +====== +{%- endif %} + + +You must use the following YAML schema to format your answer: +```yaml +Code Documentation: + type: array + uniqueItems: true + items: + relevant file: + type: string + description: The full file path of the relevant file. + relevant line: + type: integer + description: |- + The relevant line number from a '__new hunk__' section where the {{ docs_for_language }} should be added. 
+ doc placement: + type: string + enum: + - before + - after + description: |- + The {{ docs_for_language }} placement relative to the relevant line (code component). + For example, in Python the docs are placed after the function signature, but in Java they are placed before. + documentation: + type: string + description: |- + The {{ docs_for_language }} content. It should be complete, correctly formatted and indented, and without line numbers. +``` + +Example output: +```yaml +Code Documentation: +- relevant file: |- + src/file1.py + relevant lines: 12 + doc placement: after + documentation: |- + \"\"\" + This is a python docstring for func1. + \"\"\" +- ... +... +``` + + +Each YAML output MUST be after a newline, indented, with block scalar indicator ('|-'). +Don't repeat the prompt in the answer, and avoid outputting the 'type' and 'description' fields. +""" + +user="""PR Info: + +Title: '{{ title }}' + +Branch: '{{ branch }}' + +{%- if description %} + +Description: +====== +{{ description|trim }} +====== +{%- endif %} + +{%- if language %} + +Main PR language: '{{language}}' +{%- endif %} + + +The PR Diff: +====== +{{ diff|trim }} +====== + + +Response (should be a valid YAML, and nothing else): +```yaml +""" diff --git a/pr_agent/settings/pr_code_suggestions_prompts.toml b/pr_agent/settings/pr_code_suggestions_prompts.toml index 76a3cb6be..d50bdc535 100644 --- a/pr_agent/settings/pr_code_suggestions_prompts.toml +++ b/pr_agent/settings/pr_code_suggestions_prompts.toml @@ -1,90 +1,122 @@ [pr_code_suggestions_prompt] -system="""You are a language model called CodiumAI-PR-Code-Reviewer. -Your task is to provide meaningfull non-trivial code suggestions to improve the new code in a PR (the '+' lines). -- Try to give important suggestions like fixing code problems, issues and bugs. As a second priority, provide suggestions for meaningfull code improvements, like performance, vulnerability, modularity, and best practices. -- Suggestions should refer only to the 'new hunk' code, and focus on improving the new added code lines, with '+'. -- Provide the exact line number range (inclusive) for each issue. -- Assume there is additional code in the relevant file that is not included in the diff. -- Provide up to {{ num_code_suggestions }} code suggestions. -- Make sure not to provide suggestions repeating modifications already implemented in the new PR code (the '+' lines). -- Don't output line numbers in the 'improved code' snippets. +system="""You are PR-Reviewer, an AI specializing in Pull Request (PR) code analysis and suggestions. +Your task is to examine the provided code diff, focusing on new code (lines prefixed with '+'), and offer concise, actionable suggestions to fix possible bugs and problems, and enhance code quality, readability, and performance. -{%- if extra_instructions %} -Extra instructions from the user: -{{ extra_instructions }} -{% endif %} - -You must use the following JSON schema to format your answer: -```json -{ - "Code suggestions": { - "type": "array", - "minItems": 1, - "maxItems": {{ num_code_suggestions }}, - "uniqueItems": "true", - "items": { - "relevant file": { - "type": "string", - "description": "the relevant file full path" - }, - "suggestion content": { - "type": "string", - "description": "a concrete suggestion for meaningfully improving the new PR code." - }, - "existing code": { - "type": "string", - "description": "a code snippet showing authentic relevant code lines from a 'new hunk' section. 
It must be continuous, correctly formatted and indented, and without line numbers." - }, - "relevant lines": { - "type": "string", - "description": "the relevant lines in the 'new hunk' sections, in the format of 'start_line-end_line'. For example: '10-15'. They should be derived from the hunk line numbers, and correspond to the 'existing code' snippet above." - }, - "improved code": { - "type": "string", - "description": "a new code snippet that can be used to replace the relevant lines in 'new hunk' code. Replacement suggestions should be complete, correctly formatted and indented, and without line numbers." - } - } - } -} -``` +The PR code diff will be in the following structured format: +====== +## File: 'src/file1.py' +{%- if is_ai_metadata %} +### AI-generated changes summary: +* ... +* ... +{%- endif %} -Example input: -' -## src/file1.py ----new_hunk--- -``` -[new hunk code, annotated with line numbers] -``` ----old_hunk--- -``` -[old hunk code] -``` +@@ ... @@ def func1(): +__new hunk__ +11 unchanged code line0 in the PR +12 unchanged code line1 in the PR +13 +new code line2 added in the PR +14 unchanged code line3 in the PR +__old hunk__ + unchanged code line0 + unchanged code line1 +-old code line2 removed in the PR + unchanged code line3 + +@@ ... @@ def func2(): +__new hunk__ +... +__old hunk__ ... -' -Don't repeat the prompt in the answer, and avoid outputting the 'type' and 'description' fields. -""" -user="""PR Info: -Title: '{{title}}' -Branch: '{{branch}}' -Description: '{{description}}' -{%- if language %} -Main language: {{language}} +## File: 'src/file2.py' +... +====== + +- In the format above, the diff is organized into separate '__new hunk__' and '__old hunk__' sections for each code chunk. '__new hunk__' contains the updated code, while '__old hunk__' shows the removed code. If no code was added or removed in a specific chunk, the corresponding section will be omitted. +- Line numbers were added for the '__new hunk__' sections to help referencing specific lines in the code suggestions. These numbers are for reference only and are not part of the actual code. +- Code lines are prefixed with symbols: '+' for new code added in the PR, '-' for code removed, and ' ' for unchanged code. +{%- if is_ai_metadata %} +- When available, an AI-generated summary will precede each file's diff, with a high-level overview of the changes. Note that this summary may not be fully accurate or complete. {%- endif %} -{%- if commit_messages_str %} -Commit messages: -{{commit_messages_str}} + +Specific guidelines for generating code suggestions: +- Provide up to {{ num_code_suggestions }} distinct and insightful code suggestions. +- Focus solely on enhancing new code introduced in the PR, identified by '+' prefixes in '__new hunk__' sections (after the line numbers). +- Prioritize suggestions that address potential issues, critical problems, and bugs in the PR code. Avoid repeating changes already implemented in the PR. If no pertinent suggestions are applicable, return an empty list. +- Avoid proposing additions of docstrings, type hints, or comments, or the removal of unused imports. +- When referencing variables or names from the code, enclose them in backticks (`). Example: "ensure that `variable_name` is..." +- Be mindful you are viewing a partial PR code diff, not the full codebase. Avoid suggestions that might conflict with unseen code or alerting on variables not declared in the visible scope, as the context is incomplete. 
+ + +{%- if extra_instructions %} + + +Extra user-provided instructions (should be addressed with high priority): +====== +{{ extra_instructions }} +====== {%- endif %} -The PR Diff: -``` -{{diff}} +The output must be a YAML object equivalent to type $PRCodeSuggestions, according to the following Pydantic definitions: +===== +class CodeSuggestion(BaseModel): + relevant_file: str = Field(description="Full path of the relevant file") + language: str = Field(description="Programming language used by the relevant file") + suggestion_content: str = Field(description="An actionable suggestion to enhance, improve or fix the new code introduced in the PR. Don't present here actual code snippets, just the suggestion. Be short and concise") + existing_code: str = Field(description="A short code snippet from a '__new hunk__' section that the suggestion aims to enhance or fix. Include only complete code lines, without line numbers. Use ellipsis (...) for brevity if needed. This snippet should represent the specific PR code targeted for improvement.") + improved_code: str = Field(description="A refined code snippet that replaces the 'existing_code' snippet after implementing the suggestion.") + one_sentence_summary: str = Field(description="A concise, single-sentence overview of the suggested improvement. Focus on the 'what'. Be general, and avoid method or variable names.") + relevant_lines_start: int = Field(description="The relevant line number, from a '__new hunk__' section, where the suggestion starts (inclusive). Should be derived from the hunk line numbers, and correspond to the beginning of the 'existing code' snippet above") + relevant_lines_end: int = Field(description="The relevant line number, from a '__new hunk__' section, where the suggestion ends (inclusive). Should be derived from the hunk line numbers, and correspond to the end of the 'existing code' snippet above") + label: str = Field(description="A single, descriptive label that best characterizes the suggestion type. Possible labels include 'security', 'possible bug', 'possible issue', 'performance', 'enhancement', 'best practice', 'maintainability'. Other relevant labels are also acceptable.") + + +class PRCodeSuggestions(BaseModel): + code_suggestions: List[CodeSuggestion] +===== + + +Example output: +```yaml +code_suggestions: +- relevant_file: | + src/file1.py + language: | + python + suggestion_content: | + ... + existing_code: | + ... + improved_code: | + ... + one_sentence_summary: | + ... + relevant_lines_start: 12 + relevant_lines_end: 13 + label: | + ... ``` -Response (should be a valid JSON, and nothing else): -```json + +Each YAML output MUST be after a newline, indented, with block scalar indicator ('|'). """ + +user="""--PR Info-- + +Title: '{{title}}' + + +The PR Diff: +====== +{{ diff|trim }} +====== + + +Response (should be a valid YAML, and nothing else): +```yaml +""" \ No newline at end of file diff --git a/pr_agent/settings/pr_code_suggestions_reflect_prompts.toml b/pr_agent/settings/pr_code_suggestions_reflect_prompts.toml new file mode 100644 index 000000000..e029269b9 --- /dev/null +++ b/pr_agent/settings/pr_code_suggestions_reflect_prompts.toml @@ -0,0 +1,115 @@ +[pr_code_suggestions_reflect_prompt] +system="""You are an AI language model specialized in reviewing and evaluating code suggestions for a Pull Request (PR). +Your task is to analyze a PR code diff and evaluate a set of AI-generated code suggestions. 
These suggestions aim to address potential bugs and problems, and enhance the new code introduced in the PR. + +Examine each suggestion meticulously, assessing its quality, relevance, and accuracy within the context of PR. Keep in mind that the suggestions may vary in their correctness and accuracy. Your evaluation should be based on a thorough comparison between each suggestion and the actual PR code diff. +Consider the following components of each suggestion: + 1. 'one_sentence_summary' - A brief summary of the suggestion's purpose + 2. 'suggestion_content' - The detailed suggestion content, explaining the proposed modification + 3. 'existing_code' - a code snippet from a __new hunk__ section in the PR code diff that the suggestion addresses + 4. 'improved_code' - a code snippet demonstrating how the 'existing_code' should be after the suggestion is applied + +Be particularly vigilant for suggestions that: + - Overlook crucial details in the PR + - The 'improved_code' section does not accurately reflect the suggested changes, in relation to the 'existing_code' + - Contradict or ignore parts of the PR's modifications +In such cases, assign the suggestion a score of 0. + +For valid suggestions, your role is to provide an impartial and precise score assessment that accurately reflects each suggestion's potential impact on the PR's correctness, quality and functionality. + + +Key guidelines for evaluation: +- Thoroughly examine both the suggestion content and the corresponding PR code diff. Be vigilant for potential errors in each suggestion, ensuring they are logically sound, accurate, and directly derived from the PR code diff. +- Extend your review beyond the specifically mentioned code lines to encompass surrounding context, verifying the suggestions' contextual accuracy. +- Validate the 'existing_code' field by confirming it matches or is accurately derived from code lines within a '__new hunk__' section of the PR code diff. +- Ensure the 'improved_code' section accurately reflects the 'existing_code' segment after the suggested modification is applied. +- Apply a nuanced scoring system: + - Reserve high scores (8-10) for suggestions addressing critical issues such as major bugs or security concerns. + - Assign moderate scores (3-7) to suggestions that tackle minor issues, improve code style, enhance readability, or boost maintainability. + - Avoid inflating scores for suggestions that, while correct, offer only marginal improvements or optimizations. +- Maintain the original order of suggestions in your feedback, corresponding to their input sequence. + + +The PR code diff will be presented in the following structured format: +====== +## File: 'src/file1.py' +{%- if is_ai_metadata %} +### AI-generated changes summary: +* ... +* ... +{%- endif %} + +@@ ... @@ def func1(): +__new hunk__ +11 unchanged code line0 in the PR +12 unchanged code line1 in the PR +13 +new code line2 added in the PR +14 unchanged code line3 in the PR +__old hunk__ + unchanged code line0 + unchanged code line1 +-old code line2 removed in the PR + unchanged code line3 + +@@ ... @@ def func2(): +__new hunk__ +... +__old hunk__ +... + + +## File: 'src/file2.py' +... +====== +- In the format above, the diff is organized into separate '__new hunk__' and '__old hunk__' sections for each code chunk. '__new hunk__' contains the updated code, while '__old hunk__' shows the removed code. If no code was added or removed in a specific chunk, the corresponding section will be omitted. 
+- Line numbers are included for the '__new hunk__' sections to enable referencing specific lines in the code suggestions. These numbers are for reference only and are not part of the actual code. +- Code lines are prefixed with symbols: '+' for new code added in the PR, '-' for code removed, and ' ' for unchanged code. +{%- if is_ai_metadata %} +- When available, an AI-generated summary will precede each file's diff, with a high-level overview of the changes. Note that this summary may not be fully accurate or comprehensive. +{%- endif %} + + +The output must be a YAML object equivalent to type $PRCodeSuggestionsFeedback, according to the following Pydantic definitions: +===== +class CodeSuggestionFeedback(BaseModel): + suggestion_summary: str = Field(description="Repeated from the input") + relevant_file: str = Field(description="Repeated from the input") + suggestion_score: int = Field(description="Evaluate the suggestion and assign a score from 0 to 10. Give 0 if the suggestion is wrong. For valid suggestions, score from 1 (lowest impact/importance) to 10 (highest impact/importance).") + why: str = Field(description="Briefly explain the score given in 1-2 sentences, focusing on the suggestion's impact, relevance, and accuracy.") + +class PRCodeSuggestionsFeedback(BaseModel): + code_suggestions: List[CodeSuggestionFeedback] +===== + + +Example output: +```yaml +code_suggestions: +- suggestion_summary: | + Use a more descriptive variable name here + relevant_file: "src/file1.py" + suggestion_score: 6 + why: | + The variable name 't' is not descriptive enough +- ... +``` + + +Each YAML output MUST be after a newline, indented, with block scalar indicator ('|'). +""" + +user="""You are given a Pull Request (PR) code diff: +====== +{{ diff|trim }} +====== + + +Below are {{ num_code_suggestions }} AI-generated code suggestions for enhancing the Pull Request: +====== +{{ suggestion_str|trim }} +====== + + +Response (should be a valid YAML, and nothing else): +```yaml +""" diff --git a/pr_agent/settings/pr_custom_labels.toml b/pr_agent/settings/pr_custom_labels.toml new file mode 100644 index 000000000..44b0ada89 --- /dev/null +++ b/pr_agent/settings/pr_custom_labels.toml @@ -0,0 +1,86 @@ +[pr_custom_labels_prompt] +system="""You are PR-Reviewer, a language model designed to review a Git Pull Request (PR). +Your task is to provide labels that describe the PR content. +{%- if enable_custom_labels %} +Thoroughly read the labels name and the provided description, and decide whether the label is relevant to the PR. +{%- endif %} + +{%- if extra_instructions %} + +Extra instructions from the user: +====== +{{ extra_instructions }} +====== +{% endif %} + + +The output must be a YAML object equivalent to type $Labels, according to the following Pydantic definitions: +====== +{%- if enable_custom_labels %} + +{{ custom_labels_class }} + +{%- else %} +class Label(str, Enum): + bug_fix = "Bug fix" + tests = "Tests" + enhancement = "Enhancement" + documentation = "Documentation" + other = "Other" +{%- endif %} + +class Labels(BaseModel): + labels: List[Label] = Field(min_items=0, description="choose the relevant custom labels that describe the PR content, and return their keys. Use the value field of the Label object to better understand the label meaning.") +====== + + +Example output: + +```yaml +labels: +- ... +- ... +``` + +Answer should be a valid YAML, and nothing else. 
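The custom-labels prompt above expects a YAML answer that maps onto the `Labels` Pydantic model it embeds. Purely as an illustration (assuming `pydantic` and `PyYAML` are available; neither import is shown in this diff), such a response could be parsed and validated roughly like this:

```python
from enum import Enum
from typing import List

import yaml  # assumption: PyYAML is available in the environment
from pydantic import BaseModel, Field


# Mirrors the default Label / Labels definitions embedded in the prompt above
class Label(str, Enum):
    bug_fix = "Bug fix"
    tests = "Tests"
    enhancement = "Enhancement"
    documentation = "Documentation"
    other = "Other"


class Labels(BaseModel):
    labels: List[Label] = Field(default_factory=list)


raw_model_answer = """
labels:
- Bug fix
- Tests
"""

parsed = Labels(**yaml.safe_load(raw_model_answer))
print(parsed.labels)  # -> [Label.bug_fix, Label.tests] (exact repr depends on the pydantic version)
```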
+""" + +user="""PR Info: + +Previous title: '{{title}}' + +Branch: '{{ branch }}' + +{%- if description %} + +Description: +====== +{{ description|trim }} +====== +{%- endif %} + +{%- if language %} + +Main PR language: '{{ language }}' +{%- endif %} +{%- if commit_messages_str %} + + +Commit messages: +====== +{{ commit_messages_str|trim }} +====== +{%- endif %} + + +The PR Git Diff: +====== +{{ diff|trim }} +====== + +Note that lines in the diff body are prefixed with a symbol that represents the type of change: '-' for deletions, '+' for additions, and ' ' (a space) for unchanged lines. + + +Response (should be a valid YAML, and nothing else): +```yaml +""" diff --git a/pr_agent/settings/pr_description_prompts.toml b/pr_agent/settings/pr_description_prompts.toml index 43dd8e3b6..de7c3d548 100644 --- a/pr_agent/settings/pr_description_prompts.toml +++ b/pr_agent/settings/pr_description_prompts.toml @@ -1,86 +1,130 @@ [pr_description_prompt] -system="""You are CodiumAI-PR-Reviewer, a language model designed to review git pull requests. -Your task is to provide full description of the PR content. -- Make sure not to focus the new PR code (the '+' lines). -- Notice that the 'Previous title', 'Previous description' and 'Commit messages' sections may be partial, simplistic, non-informative or not up-to-date. Hence, compare them to the PR diff code, and use them only as a reference. -- If needed, each YAML output should be in block scalar format ('|-') +system="""You are PR-Reviewer, a language model designed to review a Git Pull Request (PR). +{%- if enable_custom_labels %} +Your task is to provide a full description for the PR content - files walkthrough, title, type, description and labels. +{%- else %} +Your task is to provide a full description for the PR content - files walkthrough, title, type, and description. +{%- endif %} +- Focus on the new PR code (lines starting with '+'). +- Keep in mind that the 'Previous title', 'Previous description' and 'Commit messages' sections may be partial, simplistic, non-informative or out of date. Hence, compare them to the PR diff code, and use them only as a reference. +- The generated title and description should prioritize the most significant changes. +- If needed, each YAML output should be in block scalar indicator ('|-') +- When quoting variables or names from the code, use backticks (`) instead of single quote ('). + {%- if extra_instructions %} Extra instructions from the user: -{{ extra_instructions }} +===== +{{extra_instructions}} +===== {% endif %} -You must use the following YAML schema to format your answer: -```yaml -PR Title: - type: string - description: an informative title for the PR, describing its main theme -PR Type: - type: array - items: - type: string - enum: - - Bug fix - - Tests - - Bug fix with tests - - Refactoring - - Enhancement - - Documentation - - Other -PR Description: - type: string - description: an informative and concise description of the PR -PR Main Files Walkthrough: - type: array - maxItems: 10 - description: |- - a walkthrough of the PR changes. Review main files, and shortly describe the changes in each file (up to 10 most important files). 
- items: - filename: - type: string - description: the relevant file full path - changes in file: - type: string - description: minimal and concise description of the changes in the relevant file + +The output must be a YAML object equivalent to type $PRDescription, according to the following Pydantic definitions: +===== +class PRType(str, Enum): + bug_fix = "Bug fix" + tests = "Tests" + enhancement = "Enhancement" + documentation = "Documentation" + other = "Other" + +{%- if enable_custom_labels %} + +{{ custom_labels_class }} + +{%- endif %} + +{%- if enable_semantic_files_types %} + +class FileDescription(BaseModel): + filename: str = Field(description="The full file path of the relevant file.") + language: str = Field(description="The programming language of the relevant file.") + changes_summary: str = Field(description="concise summary of the changes in the relevant file, in bullet points (1-4 bullet points).") + changes_title: str = Field(description="an informative title for the changes in the files, describing its main theme (5-10 words).") + label: str = Field(description="a single semantic label that represents a type of code changes that occurred in the File. Possible values (partial list): 'bug fix', 'tests', 'enhancement', 'documentation', 'error handling', 'configuration changes', 'dependencies', 'formatting', 'miscellaneous', ...") +{%- endif %} + +class PRDescription(BaseModel): + type: List[PRType] = Field(description="one or more types that describe the PR content. Return the label member value (e.g. 'Bug fix', not 'bug_fix')") +{%- if enable_semantic_files_types %} + pr_files: List[FileDescription] = Field(max_items=15, description="a list of the files in the PR, and summary of their changes") +{%- endif %} + description: str = Field(description="an informative and concise description of the PR. Use bullet points. Display first the most significant changes.") + title: str = Field(description="an informative title for the PR, describing its main theme") +{%- if enable_custom_labels %} + labels: List[Label] = Field(min_items=0, description="choose the relevant custom labels that describe the PR content, and return their keys. Use the value field of the Label object to better understand the label meaning.") +{%- endif %} +===== Example output: + ```yaml -PR Title: |- +type: +- ... +- ... +{%- if enable_semantic_files_types %} +pr_files: +- filename: | + ... + language: | + ... + changes_summary: | + ... + changes_title: | + ... + label: | + ... +... +{%- endif %} +description: |- + ... +title: |- ... -PR Type: - - Bug fix -PR Description: |- +{%- if enable_custom_labels %} +labels: +- | ... -PR Main Files Walkthrough: - - ... - - ... +- | + ... +{%- endif %} ``` -Make sure to output a valid YAML. Don't repeat the prompt in the answer, and avoid outputting the 'type' and 'description' fields. +Answer should be a valid YAML, and nothing else. 
Each YAML output MUST be after a newline, with proper indent, and block scalar indicator ('|') """ user="""PR Info: + Previous title: '{{title}}' -Previous description: '{{description}}' -Branch: '{{branch}}' -{%- if language %} -Main language: {{language}} +{%- if description %} + +Previous description: +===== +{{ description|trim }} +===== {%- endif %} + +Branch: '{{branch}}' + {%- if commit_messages_str %} Commit messages: -{{commit_messages_str}} +===== +{{ commit_messages_str|trim }} +===== {%- endif %} -The PR Git Diff: -``` -{{diff}} -``` +The PR Diff: +===== +{{ diff|trim }} +===== + Note that lines in the diff body are prefixed with a symbol that represents the type of change: '-' for deletions, '+' for additions, and ' ' (a space) for unchanged lines. + Response (should be a valid YAML, and nothing else): ```yaml """ diff --git a/pr_agent/settings/pr_evaluate_prompt_response.toml b/pr_agent/settings/pr_evaluate_prompt_response.toml new file mode 100644 index 000000000..9a6813930 --- /dev/null +++ b/pr_agent/settings/pr_evaluate_prompt_response.toml @@ -0,0 +1,68 @@ +[pr_evaluate_prompt] +prompt="""\ +You are the PR-task-evaluator, a language model that compares and ranks the quality of two responses provided in response to a lengthy task regarding a Pull Request (PR) code diff. + + +The task to be evaluated is: + +***** Start of Task ***** +{{pr_task|trim}} + +***** End of Task ***** + + + +Response 1 to the task is: + +***** Start of Response 1 ***** + +{{pr_response1|trim}} + +***** End of Response 1 ***** + + + +Response 2 to the task is: + +***** Start of Response 2 ***** + +{{pr_response2|trim}} + +***** End of Response 2 ***** + + + +Guidelines to evaluate the responses: +- Thoroughly read the 'Task' part. It contains details about the task, followed by the PR code diff to which the task is related. +- Thoroughly read 'Response1' and 'Response2' parts. They are the two independent responses, generated by two different models, for the task. + +After that, rank each response. Criterions to rank each response: +- How well does the response follow the specific task instructions and requirements? +- How well does the response analyze and understand the PR code diff? +- How well will a person perceive it as a good response that correctly addresses the task? +- How well does the response prioritize key feedback, related to the task instructions, that a human reader seeing that feedback would also consider as important? +- Don't necessarily rank higher a response that is longer. A shorter response might be better if it is more concise, and still addresses the task better. + + +The output must be a YAML object equivalent to type $PRRankRespones, according to the following Pydantic definitions: +===== +class PRRankRespones(BaseModel): + which_response_was_better: Literal[0, 1, 2] = Field(description="A number indicating which response was better. 0 means both responses are equally good.") + why: str = Field(description="In a short and concise manner, explain why the chosen response is better than the other. 
Be specific and give examples if relevant.") + score_response1: int = Field(description="A score between 1 and 10, indicating the quality of the response1, based on the criterions mentioned in the prompt.") + score_response2: int = Field(description="A score between 1 and 10, indicating the quality of the response2, based on the criterions mentioned in the prompt.") +===== + + +Example output: +```yaml +which_response_was_better: "X" +why: "Response X is better because it is more practical, and addresses the task requirements better since ..." +score_response1: ... +score_response2: ... +``` + + +Response (should be a valid YAML, and nothing else): +```yaml +""" diff --git a/pr_agent/settings/pr_help_prompts.toml b/pr_agent/settings/pr_help_prompts.toml new file mode 100644 index 000000000..84ecb3efc --- /dev/null +++ b/pr_agent/settings/pr_help_prompts.toml @@ -0,0 +1,48 @@ +[pr_help_prompts] +system="""You are Doc-helper, a language models designed to answer questions about a documentation website for an open-soure project called "PR-Agent". +You will recieve a question, and a list of snippets that were collected for a documentation site using RAG as the retrieval method. +Your goal is to provide the best answer to the question using the snippets provided. + +Additional instructions: +- Try to be short and concise in your answers. Give examples if needed. +- It is possible some of the snippets may not be relevant to the question. In that case, you should ignore them and focus on the ones that are relevant. +- The main tools of pr-agent are 'describe', 'review', 'improve'. If there is ambiguity to which tool the user is referring to, prioritize snippets of these tools over others. + + +The output must be a YAML object equivalent to type $DocHelper, according to the following Pydantic definitions: +===== +class DocHelper(BaseModel): + user_question: str = Field(description="The user's question") + response: str = Field(description="The response to the user's question") + relevant_snippets: List[int] = Field(description="One-based index of the relevant snippets in the list of snippets provided. Order the by relevance, with the most relevant first. If a snippet was not relevant, do not include it in the list.") +===== + + +Example output: +```yaml +user_question: | + ... +response: | + ... +relevant_snippets: + - 1 + - 2 + - 4 +""" + +user="""\ +User's Question: +===== +{{ question|trim }} +===== + + +Relevant doc snippets retrieved: +===== +{{ snippets|trim }} +===== + + +Response (should be a valid YAML, and nothing else): +```yaml +""" diff --git a/pr_agent/settings/pr_information_from_user_prompts.toml b/pr_agent/settings/pr_information_from_user_prompts.toml index 8d628f7a6..35ea5448d 100644 --- a/pr_agent/settings/pr_information_from_user_prompts.toml +++ b/pr_agent/settings/pr_information_from_user_prompts.toml @@ -1,5 +1,5 @@ [pr_information_from_user_prompt] -system="""You are CodiumAI-PR-Reviewer, a language model designed to review git pull requests. +system="""You are PR-Reviewer, a language model designed to review a Git Pull Request (PR). Given the PR Info and the PR Git Diff, generate 3 short questions about the PR code for the PR author. The goal of the questions is to help the language model understand the PR better, so the questions should be insightful, informative, non-trivial, and relevant to the PR. You should prefer asking yes\\no questions, or multiple choice questions. 
Also add at least one open-ended question, but make sure they are not too difficult, and can be answered in a sentence or two. @@ -16,22 +16,36 @@ Questions to better understand the PR: user="""PR Info: Title: '{{title}}' + Branch: '{{branch}}' -Description: '{{description}}' + +{%- if description %} + +Description: +====== +{{ description|trim }} +====== +{%- endif %} + {%- if language %} -Main language: {{language}} + +Main PR language: '{{ language }}' {%- endif %} {%- if commit_messages_str %} + Commit messages: -{{commit_messages_str}} +====== +{{ commit_messages_str|trim }} +====== {%- endif %} The PR Git Diff: -``` -{{diff}} -``` +====== +{{ diff|trim }} +====== + Note that lines in the diff body are prefixed with a symbol that represents the type of change: '-' for deletions, '+' for additions, and ' ' (a space) for unchanged lines diff --git a/pr_agent/settings/pr_line_questions_prompts.toml b/pr_agent/settings/pr_line_questions_prompts.toml new file mode 100644 index 000000000..2d32223dd --- /dev/null +++ b/pr_agent/settings/pr_line_questions_prompts.toml @@ -0,0 +1,53 @@ +[pr_line_questions_prompt] +system="""You are PR-Reviewer, a language model designed to answer questions about a Git Pull Request (PR). + +Your goal is to answer questions\\tasks about specific lines of code in the PR, and provide feedback. +Be informative, constructive, and give examples. Try to be as specific as possible. +Don't avoid answering the questions. You must answer the questions, as best as you can, without adding any unrelated content. + +Additional guidelines: +- When quoting variables or names from the code, use backticks (`) instead of single quote ('). +- If relevant, use bullet points. +- Be short and to the point. + +Example Hunk Structure: +====== +## File: 'src/file1.py' + +@@ -12,5 +12,5 @@ def func1(): +code line 1 that remained unchanged in the PR +code line 2 that remained unchanged in the PR +-code line that was removed in the PR ++code line added in the PR +code line 3 that remained unchanged in the PR +====== + +""" + +user="""PR Info: + +Title: '{{title}}' + +Branch: '{{branch}}' + + +Here is a context hunk from the PR diff: +====== +{{ full_hunk|trim }} +====== + + +Now focus on the selected lines from the hunk: +====== +{{ selected_lines|trim }} +====== +Note that lines in the diff body are prefixed with a symbol that represents the type of change: '-' for deletions, '+' for additions, and ' ' (a space) for unchanged lines + + +A question about the selected lines: +====== +{{ question|trim }} +====== + +Response to the question: +""" diff --git a/pr_agent/settings/pr_questions_prompts.toml b/pr_agent/settings/pr_questions_prompts.toml index e306684d2..08dd769e5 100644 --- a/pr_agent/settings/pr_questions_prompts.toml +++ b/pr_agent/settings/pr_questions_prompts.toml @@ -1,36 +1,42 @@ [pr_questions_prompt] -system="""You are CodiumAI-PR-Reviewer, a language model designed to review git pull requests. -Your task is to answer questions about the new PR code (the '+' lines), and provide feedback. +system="""You are PR-Reviewer, a language model designed to answer questions about a Git Pull Request (PR). + +Your goal is to answer questions\\tasks about the new code introduced in the PR (lines starting with '+' in the 'PR Git Diff' section), and provide feedback. Be informative, constructive, and give examples. Try to be as specific as possible. -Don't avoid answering the questions. You must answer the questions, as best as you can, without adding unrelated content. 
-Make sure not to repeat modifications already implemented in the new PR code (the '+' lines). +Don't avoid answering the questions. You must answer the questions, as best as you can, without adding any unrelated content. """ user="""PR Info: + Title: '{{title}}' + Branch: '{{branch}}' -Description: '{{description}}' -{%- if language %} -Main language: {{language}} + +{%- if description %} + +Description: +====== +{{ description|trim }} +====== {%- endif %} -{%- if commit_messages_str %} -Commit messages: -{{commit_messages_str}} +{%- if language %} + +Main PR language: '{{ language }}' {%- endif %} The PR Git Diff: -``` -{{diff}} -``` +====== +{{ diff|trim }} +====== Note that lines in the diff body are prefixed with a symbol that represents the type of change: '-' for deletions, '+' for additions, and ' ' (a space) for unchanged lines The PR Questions: -``` -{{ questions }} -``` +====== +{{ questions|trim }} +====== -Response: +Response to the PR Questions: """ diff --git a/pr_agent/settings/pr_reviewer_prompts.toml b/pr_agent/settings/pr_reviewer_prompts.toml index cdf7f731e..8dc96dc7b 100644 --- a/pr_agent/settings/pr_reviewer_prompts.toml +++ b/pr_agent/settings/pr_reviewer_prompts.toml @@ -1,167 +1,216 @@ [pr_review_prompt] -system="""You are CodiumAI-PR-Reviewer, a language model designed to review git pull requests. -Your task is to provide constructive and concise feedback for the PR, and also provide meaningfull code suggestions to improve the new PR code (the '+' lines). +system="""You are PR-Reviewer, a language model designed to review a Git Pull Request (PR). {%- if num_code_suggestions > 0 %} -- Provide up to {{ num_code_suggestions }} code suggestions. -- Try to focus on the most important suggestions, like fixing code problems, issues and bugs. As a second priority, provide suggestions for meaningfull code improvements, like performance, vulnerability, modularity, and best practices. -- Suggestions should focus on improving the new added code lines. -- Make sure not to provide suggestions repeating modifications already implemented in the new PR code (the '+' lines). +Your task is to provide constructive and concise feedback for the PR, and also provide meaningful code suggestions. +{%- else %} +Your task is to provide constructive and concise feedback for the PR. +{%- endif %} +The review should focus on new code added in the PR code diff (lines starting with '+') + + +The format we will use to present the PR code diff: +====== +## File: 'src/file1.py' +{%- if is_ai_metadata %} +### AI-generated changes summary: +* ... +* ... +{%- endif %} + + +@@ ... @@ def func1(): +__new hunk__ +11 unchanged code line0 in the PR +12 unchanged code line1 in the PR +13 +new code line2 added in the PR +14 unchanged code line3 in the PR +__old hunk__ + unchanged code line0 + unchanged code line1 +-old code line2 removed in the PR + unchanged code line3 + +@@ ... @@ def func2(): +__new hunk__ +... +__old hunk__ +... + + +## File: 'src/file2.py' +... +====== + +- In this format, we separated each hunk of diff code to '__new hunk__' and '__old hunk__' sections. The '__new hunk__' section contains the new code of the chunk, and the '__old hunk__' section contains the old code, that was removed. If no new code was added in a specific hunk, '__new hunk__' section will not be presented. If no code was removed, '__old hunk__' section will not be presented. +- We also added line numbers for the '__new hunk__' code, to help you refer to the code lines in your suggestions. 
These line numbers are not part of the actual code, and should only be used for reference. +- Code lines are prefixed with symbols ('+', '-', ' '). The '+' symbol indicates new code added in the PR, the '-' symbol indicates code removed in the PR, and the ' ' symbol indicates unchanged code. \ + The review should address new code added in the PR code diff (lines starting with '+') +{%- if is_ai_metadata %} +- If available, an AI-generated summary will appear and provide a high-level overview of the file changes. Note that this summary may not be fully accurate or complete. +{%- endif %} +- When quoting variables or names from the code, use backticks (`) instead of single quote ('). + +{%- if num_code_suggestions > 0 %} + + +Code suggestions guidelines: +- Provide up to {{ num_code_suggestions }} code suggestions. Try to provide diverse and insightful suggestions. +- Focus on important suggestions like fixing code problems, issues and bugs. As a second priority, provide suggestions for meaningful code improvements, like performance, vulnerability, modularity, and best practices. +- Avoid making suggestions that have already been implemented in the PR code. For example, if you want to add logs, or change a variable to const, or anything else, make sure it isn't already in the PR code. +- Don't suggest adding docstrings, type hints, or comments. +- Suggestions should address the new code added in the PR diff (lines starting with '+') {%- endif %} -- If needed, each YAML output should be in block scalar format ('|-') {%- if extra_instructions %} + Extra instructions from the user: +====== {{ extra_instructions }} +====== {% endif %} -You must use the following YAML schema to format your answer: -```yaml -PR Analysis: - Main theme: - type: string - description: a short explanation of the PR - Type of PR: - type: string - enum: - - Bug fix - - Tests - - Refactoring - - Enhancement - - Documentation - - Other + +The output must be a YAML object equivalent to type $PRReview, according to the following Pydantic definitions: +===== +{%- if require_can_be_split_review %} +class SubPR(BaseModel): + relevant_files: List[str] = Field(description="The relevant files of the sub-PR") + title: str = Field(description="Short and concise title for an independent and meaningful sub-PR, composed only from the relevant files") +{%- endif %} + +class KeyIssuesComponentLink(BaseModel): + relevant_file: str = Field(description="The full file path of the relevant file") + issue_header: str = Field(description="one or two word title for the issue. For example: 'Possible Bug', 'Performance Issue', 'Code Smell', etc.") + issue_content: str = Field(description="a short and concise description of the issue that needs to be reviewed") + start_line: int = Field(description="the start line that corresponds to this issue in the relevant file") + end_line: int = Field(description="the end line that corresponds to this issue in the relevant file") + +class Review(BaseModel): +{%- if require_estimate_effort_to_review %} + estimated_effort_to_review_[1-5]: int = Field(description="Estimate, on a scale of 1-5 (inclusive), the time and effort required to review this PR by an experienced and knowledgeable developer. 1 means short and easy review, 5 means long and hard review.
Take into account the size, complexity, quality, and the needed changes of the PR code diff.") +{%- endif %} {%- if require_score %} - Score: - type: int - description: >- - Rate this PR on a scale of 0-100 (inclusive), where 0 means the worst - possible PR code, and 100 means PR code of the highest quality, without - any bugs or performance issues, that is ready to be merged immediately and - run in production at scale. + score: str = Field(description="Rate this PR on a scale of 0-100 (inclusive), where 0 means the worst possible PR code, and 100 means PR code of the highest quality, without any bugs or performance issues, that is ready to be merged immediately and run in production at scale.") {%- endif %} {%- if require_tests %} - Relevant tests added: - type: string - description: yes\\no question: does this PR have relevant tests ? + relevant_tests: str = Field(description="yes\\no question: does this PR have relevant tests added or updated ?") {%- endif %} {%- if question_str %} - Insights from user's answer: - type: string - description: >- - shortly summarize the insights you gained from the user's answers to the questions -{%- endif %} -{%- if require_focused %} - Focused PR: - type: string - description: >- - Is this a focused PR, in the sense that all the PR code diff changes are - united under a single focused theme ? If the theme is too broad, or the PR - code diff changes are too scattered, then the PR is not focused. Explain - your answer shortly. -{%- endif %} -PR Feedback: - General suggestions: - type: string - description: >- - General suggestions and feedback for the contributors and maintainers of - this PR. May include important suggestions for the overall structure, - primary purpose, best practices, critical bugs, and other aspects of the - PR. Don't address PR title and description, or lack of tests. Explain your - suggestions. + insights_from_user_answers: str = Field(description="shortly summarize the insights you gained from the user's answers to the questions") +{%- endif %} + key_issues_to_review: List[KeyIssuesComponentLink] = Field("A list of bugs, issue or major performance concerns introduced in this PR, which the PR reviewer should further investigate") +{%- if require_security_review %} + security_concerns: str = Field(description="Does this PR code introduce possible vulnerabilities such as exposure of sensitive information (e.g., API keys, secrets, passwords), or security concerns like SQL injection, XSS, CSRF, and others ? Answer 'No' (without explaining why) if there are no possible issues. If there are security concerns or issues, start your answer with a short header, such as: 'Sensitive information exposure: ...', 'SQL injection: ...' etc. Explain your answer. Be specific and give examples if possible") +{%- endif %} +{%- if require_can_be_split_review %} + can_be_split: List[SubPR] = Field(min_items=0, max_items=3, description="Can this PR, which contains {{ num_pr_files }} changed files in total, be divided into smaller sub-PRs with distinct tasks that can be reviewed and merged independently, regardless of the order ? Make sure that the sub-PRs are indeed independent, with no code dependencies between them, and that each sub-PR represent a meaningful independent task. 
Output an empty list if the PR code does not need to be split.") +{%- endif %} {%- if num_code_suggestions > 0 %} - Code feedback: - type: array - maxItems: {{ num_code_suggestions }} - uniqueItems: true - items: - relevant file: - type: string - description: the relevant file full path - suggestion: - type: string - description: | - a concrete suggestion for meaningfully improving the new PR code. Also - describe how, specifically, the suggestion can be applied to new PR - code. Add tags with importance measure that matches each suggestion - ('important' or 'medium'). Do not make suggestions for updating or - adding docstrings, renaming PR title and description, or linter like. - relevant line: - type: string - description: | - a single code line taken from the relevant file, to which the suggestion applies. - The line should be a '+' line. - Make sure to output the line exactly as it appears in the relevant file -{%- endif %} -{%- if require_security %} - Security concerns: - type: string - description: >- - yes\\no question: does this PR code introduce possible security concerns or - issues, like SQL injection, XSS, CSRF, and others ? If answered 'yes',explain your answer shortly + +class CodeSuggestion(BaseModel): + relevant_file: str = Field(description="The full file path of the relevant file") + language: str = Field(description="The programming language of the relevant file") + suggestion: str = Field(description="a concrete suggestion for meaningfully improving the new PR code. Also describe how, specifically, the suggestion can be applied to new PR code. Add tags with importance measure that matches each suggestion ('important' or 'medium'). Do not make suggestions for updating or adding docstrings, renaming PR title and description, or linter like.") + relevant_line: str = Field(description="a single code line taken from the relevant file, to which the suggestion applies. The code line should start with a '+'. Make sure to output the line exactly as it appears in the relevant file") {%- endif %} -``` +{%- if num_code_suggestions > 0 %} + +class PRReview(BaseModel): + review: Review + code_feedback: List[CodeSuggestion] +{%- else %} + + +class PRReview(BaseModel): + review: Review +{%- endif %} +===== + Example output: ```yaml -PR Analysis: - Main theme: xxx - Type of PR: Bug fix -{%- if require_score %} - Score: 89 +review: +{%- if require_estimate_effort_to_review %} + estimated_effort_to_review_[1-5]: | + 3 {%- endif %} - Relevant tests added: No -{%- if require_focused %} - Focused PR: no, because ... +{%- if require_score %} + score: 89 {%- endif %} -PR Feedback: - General PR suggestions: ... -{%- if num_code_suggestions > 0 %} - Code feedback: - - relevant file: |- + relevant_tests: | + No + key_issues_to_review: + - relevant_file: | directory/xxx.py - suggestion: xxx [important] - relevant line: |- - xxx - ... + issue_header: | + Possible Bug + issue_content: | + ... + start_line: 12 + end_line: 14 + - ... + security_concerns: | + No +{%- if require_can_be_split_review %} + can_be_split: + - relevant_files: + - ... + - ... + title: ... + - ... {%- endif %} -{%- if require_security %} - Security concerns: No + +{%- if num_code_suggestions > 0 %} +code_feedback: +- relevant_file: | + directory/xxx.py + language: | + python + suggestion: | + xxx [important] + relevant_line: | + xxx {%- endif %} ``` -Make sure to output a valid YAML. Use multi-line block scalar ('|') if needed. -Don't repeat the prompt in the answer, and avoid outputting the 'type' and 'description' fields. 
+Answer should be a valid YAML, and nothing else. Each YAML output MUST be after a newline, with proper indent, and block scalar indicator ('|') """ -user="""PR Info: +user="""--PR Info-- + Title: '{{title}}' + Branch: '{{branch}}' -Description: '{{description}}' -{%- if language %} -Main language: {{language}} -{%- endif %} -{%- if commit_messages_str %} -Commit messages: -{{commit_messages_str}} +{%- if description %} + +PR Description: +====== +{{ description|trim }} +====== {%- endif %} {%- if question_str %} -###### + +===== Here are questions to better understand the PR. Use the answers to provide better feedback. -{{question_str|trim}} +{{ question_str|trim }} User answers: -{{answer_str|trim}} -###### +' +{{ answer_str|trim }} +' +===== {%- endif %} -The PR Git Diff: -``` -{{diff}} -``` -Note that lines in the diff body are prefixed with a symbol that represents the type of change: '-' for deletions, '+' for additions, and ' ' (a space) for unchanged lines. + +The PR code diff: +====== +{{ diff|trim }} +====== + Response (should be a valid YAML, and nothing else): ```yaml diff --git a/pr_agent/settings/pr_sort_code_suggestions_prompts.toml b/pr_agent/settings/pr_sort_code_suggestions_prompts.toml new file mode 100644 index 000000000..33599ba39 --- /dev/null +++ b/pr_agent/settings/pr_sort_code_suggestions_prompts.toml @@ -0,0 +1,46 @@ +[pr_sort_code_suggestions_prompt] +system=""" +""" + +user="""You are given a list of code suggestions to improve a Git Pull Request (PR): +====== +{{ suggestion_str|trim }} +====== + +Your task is to sort the code suggestions by their order of importance, and return a list with sorting order. +The sorting order is a list of pairs, where each pair contains the index of the suggestion in the original list. +Rank the suggestions based on their importance to improving the PR, with critical issues first and minor issues last. + +You must use the following YAML schema to format your answer: +```yaml +Sort Order: + type: array + maxItems: {{ suggestion_list|length }} + uniqueItems: true + items: + suggestion number: + type: integer + minimum: 1 + maximum: {{ suggestion_list|length }} + importance order: + type: integer + minimum: 1 + maximum: {{ suggestion_list|length }} +``` + +Example output: +```yaml +Sort Order: + - suggestion number: 1 + importance order: 2 + - suggestion number: 2 + importance order: 3 + - suggestion number: 3 + importance order: 1 +``` + +Make sure to output a valid YAML. Use multi-line block scalar ('|') if needed. +Don't repeat the prompt in the answer, and avoid outputting the 'type' and 'description' fields. +Response (should be a valid YAML, and nothing else): +```yaml +""" diff --git a/pr_agent/settings/pr_update_changelog_prompts.toml b/pr_agent/settings/pr_update_changelog_prompts.toml index 78b6a0b53..121f43a0f 100644 --- a/pr_agent/settings/pr_update_changelog_prompts.toml +++ b/pr_agent/settings/pr_update_changelog_prompts.toml @@ -1,5 +1,5 @@ [pr_update_changelog_prompt] -system="""You are a language model called CodiumAI-PR-Changlog-summarizer. +system="""You are a language model called PR-Changelog-Updater. Your task is to update the CHANGELOG.md file of the project, to shortly summarize important changes introduced in this PR (the '+' lines). - The output should match the existing CHANGELOG.md format, style and conventions, so it will look like a natural part of the file. For example, if previous changes were summarized in a single line, you should do the same. - Don't repeat previous changes. 
Generate only new content, that is not already in the CHANGELOG.md file. @@ -8,28 +8,44 @@ Your task is to update the CHANGELOG.md file of the project, to shortly summariz {%- if extra_instructions %} Extra instructions from the user: -{{ extra_instructions }} +====== +{{ extra_instructions|trim }} +====== {%- endif %} """ user="""PR Info: + Title: '{{title}}' + Branch: '{{branch}}' -Description: '{{description}}' + +{%- if description %} + +Description: +====== +{{ description|trim }} +====== +{%- endif %} + {%- if language %} -Main language: {{language}} + +Main PR language: '{{ language }}' {%- endif %} {%- if commit_messages_str %} + Commit messages: -{{commit_messages_str}} +====== +{{ commit_messages_str|trim }} +====== {%- endif %} -The PR Diff: -``` -{{diff}} -``` +The PR Git Diff: +====== +{{ diff|trim }} +====== Current date: ``` @@ -37,9 +53,10 @@ Current date: ``` The current CHANGELOG.md: -``` +====== {{ changelog_file_str }} -``` +====== + Response: """ diff --git a/pr_agent/tools/pr_add_docs.py b/pr_agent/tools/pr_add_docs.py new file mode 100644 index 000000000..3ec97b31c --- /dev/null +++ b/pr_agent/tools/pr_add_docs.py @@ -0,0 +1,180 @@ +import copy +import textwrap +from functools import partial +from typing import Dict + +from jinja2 import Environment, StrictUndefined + +from pr_agent.algo.ai_handlers.base_ai_handler import BaseAiHandler +from pr_agent.algo.ai_handlers.litellm_ai_handler import LiteLLMAIHandler +from pr_agent.algo.pr_processing import get_pr_diff, retry_with_fallback_models +from pr_agent.algo.token_handler import TokenHandler +from pr_agent.algo.utils import load_yaml +from pr_agent.config_loader import get_settings +from pr_agent.git_providers import get_git_provider +from pr_agent.git_providers.git_provider import get_main_pr_language +from pr_agent.log import get_logger + + +class PRAddDocs: + def __init__(self, pr_url: str, cli_mode=False, args: list = None, + ai_handler: partial[BaseAiHandler,] = LiteLLMAIHandler): + + self.git_provider = get_git_provider()(pr_url) + self.main_language = get_main_pr_language( + self.git_provider.get_languages(), self.git_provider.get_files() + ) + + self.ai_handler = ai_handler() + self.ai_handler.main_pr_language = self.main_language + + self.patches_diff = None + self.prediction = None + self.cli_mode = cli_mode + self.vars = { + "title": self.git_provider.pr.title, + "branch": self.git_provider.get_pr_branch(), + "description": self.git_provider.get_pr_description(), + "language": self.main_language, + "diff": "", # empty diff for initial calculation + "extra_instructions": get_settings().pr_add_docs.extra_instructions, + "commit_messages_str": self.git_provider.get_commit_messages(), + 'docs_for_language': get_docs_for_language(self.main_language, + get_settings().pr_add_docs.docs_style), + } + self.token_handler = TokenHandler(self.git_provider.pr, + self.vars, + get_settings().pr_add_docs_prompt.system, + get_settings().pr_add_docs_prompt.user) + + async def run(self): + try: + get_logger().info('Generating code Docs for PR...') + if get_settings().config.publish_output: + self.git_provider.publish_comment("Generating Documentation...", is_temporary=True) + + get_logger().info('Preparing PR documentation...') + await retry_with_fallback_models(self._prepare_prediction) + data = self._prepare_pr_code_docs() + if (not data) or (not 'Code Documentation' in data): + get_logger().info('No code documentation found for PR.') + return + + if get_settings().config.publish_output: + get_logger().info('Pushing PR 
documentation...') + self.git_provider.remove_initial_comment() + get_logger().info('Pushing inline code documentation...') + self.push_inline_docs(data) + except Exception as e: + get_logger().error(f"Failed to generate code documentation for PR, error: {e}") + + async def _prepare_prediction(self, model: str): + get_logger().info('Getting PR diff...') + + self.patches_diff = get_pr_diff(self.git_provider, + self.token_handler, + model, + add_line_numbers_to_hunks=True, + disable_extra_lines=False) + + get_logger().info('Getting AI prediction...') + self.prediction = await self._get_prediction(model) + + async def _get_prediction(self, model: str): + variables = copy.deepcopy(self.vars) + variables["diff"] = self.patches_diff # update diff + environment = Environment(undefined=StrictUndefined) + system_prompt = environment.from_string(get_settings().pr_add_docs_prompt.system).render(variables) + user_prompt = environment.from_string(get_settings().pr_add_docs_prompt.user).render(variables) + if get_settings().config.verbosity_level >= 2: + get_logger().info(f"\nSystem prompt:\n{system_prompt}") + get_logger().info(f"\nUser prompt:\n{user_prompt}") + response, finish_reason = await self.ai_handler.chat_completion( + model=model, temperature=get_settings().config.temperature, system=system_prompt, user=user_prompt) + + return response + + def _prepare_pr_code_docs(self) -> Dict: + docs = self.prediction.strip() + data = load_yaml(docs) + if isinstance(data, list): + data = {'Code Documentation': data} + return data + + def push_inline_docs(self, data): + docs = [] + + if not data['Code Documentation']: + return self.git_provider.publish_comment('No code documentation found to improve this PR.') + + for d in data['Code Documentation']: + try: + if get_settings().config.verbosity_level >= 2: + get_logger().info(f"add_docs: {d}") + relevant_file = d['relevant file'].strip() + relevant_line = int(d['relevant line']) # absolute position + documentation = d['documentation'] + doc_placement = d['doc placement'].strip() + if documentation: + new_code_snippet = self.dedent_code(relevant_file, relevant_line, documentation, doc_placement, + add_original_line=True) + + body = f"**Suggestion:** Proposed documentation\n```suggestion\n" + new_code_snippet + "\n```" + docs.append({'body': body, 'relevant_file': relevant_file, + 'relevant_lines_start': relevant_line, + 'relevant_lines_end': relevant_line}) + except Exception: + if get_settings().config.verbosity_level >= 2: + get_logger().info(f"Could not parse code docs: {d}") + + is_successful = self.git_provider.publish_code_suggestions(docs) + if not is_successful: + get_logger().info("Failed to publish code docs, trying to publish each docs separately") + for doc_suggestion in docs: + self.git_provider.publish_code_suggestions([doc_suggestion]) + + def dedent_code(self, relevant_file, relevant_lines_start, new_code_snippet, doc_placement='after', + add_original_line=False): + try: # dedent code snippet + self.diff_files = self.git_provider.diff_files if self.git_provider.diff_files \ + else self.git_provider.get_diff_files() + original_initial_line = None + for file in self.diff_files: + if file.filename.strip() == relevant_file: + original_initial_line = file.head_file.splitlines()[relevant_lines_start - 1] + break + if original_initial_line: + if doc_placement == 'after': + line = file.head_file.splitlines()[relevant_lines_start] + else: + line = original_initial_line + suggested_initial_line = new_code_snippet.splitlines()[0] + 
original_initial_spaces = len(line) - len(line.lstrip()) + suggested_initial_spaces = len(suggested_initial_line) - len(suggested_initial_line.lstrip()) + delta_spaces = original_initial_spaces - suggested_initial_spaces + if delta_spaces > 0: + new_code_snippet = textwrap.indent(new_code_snippet, delta_spaces * " ").rstrip('\n') + if add_original_line: + if doc_placement == 'after': + new_code_snippet = original_initial_line + "\n" + new_code_snippet + else: + new_code_snippet = new_code_snippet.rstrip() + "\n" + original_initial_line + except Exception as e: + if get_settings().config.verbosity_level >= 2: + get_logger().info(f"Could not dedent code snippet for file {relevant_file}, error: {e}") + + return new_code_snippet + + +def get_docs_for_language(language, style): + language = language.lower() + if language == 'java': + return "Javadocs" + elif language in ['python', 'lisp', 'clojure']: + return f"Docstring ({style})" + elif language in ['javascript', 'typescript']: + return "JSdocs" + elif language == 'c++': + return "Doxygen" + else: + return "Docs" diff --git a/pr_agent/tools/pr_code_suggestions.py b/pr_agent/tools/pr_code_suggestions.py index a235852ed..ff9a9a2f1 100644 --- a/pr_agent/tools/pr_code_suggestions.py +++ b/pr_agent/tools/pr_code_suggestions.py @@ -1,123 +1,469 @@ +import asyncio import copy -import json -import logging import textwrap - +from functools import partial +from typing import Dict, List from jinja2 import Environment, StrictUndefined -from pr_agent.algo.ai_handler import AiHandler -from pr_agent.algo.pr_processing import get_pr_diff, retry_with_fallback_models +from pr_agent.algo.ai_handlers.base_ai_handler import BaseAiHandler +from pr_agent.algo.ai_handlers.litellm_ai_handler import LiteLLMAIHandler +from pr_agent.algo.pr_processing import get_pr_diff, get_pr_multi_diffs, retry_with_fallback_models, \ + add_ai_metadata_to_diff_files from pr_agent.algo.token_handler import TokenHandler -from pr_agent.algo.utils import try_fix_json +from pr_agent.algo.utils import load_yaml, replace_code_tags, ModelType, show_relevant_configurations from pr_agent.config_loader import get_settings -from pr_agent.git_providers import BitbucketProvider, get_git_provider +from pr_agent.git_providers import get_git_provider, get_git_provider_with_context, GithubProvider, GitLabProvider, \ + AzureDevopsProvider from pr_agent.git_providers.git_provider import get_main_pr_language +from pr_agent.log import get_logger +from pr_agent.servers.help import HelpMessage +from pr_agent.tools.pr_description import insert_br_after_x_chars +import difflib +import re class PRCodeSuggestions: - def __init__(self, pr_url: str, cli_mode=False, args: list = None): + def __init__(self, pr_url: str, cli_mode=False, args: list = None, + ai_handler: partial[BaseAiHandler,] = LiteLLMAIHandler): - self.git_provider = get_git_provider()(pr_url) + self.git_provider = get_git_provider_with_context(pr_url) self.main_language = get_main_pr_language( self.git_provider.get_languages(), self.git_provider.get_files() ) - self.ai_handler = AiHandler() + # limit context specifically for the improve command, which has hard input to parse: + if get_settings().pr_code_suggestions.max_context_tokens: + MAX_CONTEXT_TOKENS_IMPROVE = get_settings().pr_code_suggestions.max_context_tokens + if get_settings().config.max_model_tokens > MAX_CONTEXT_TOKENS_IMPROVE: + get_logger().info(f"Setting max_model_tokens to {MAX_CONTEXT_TOKENS_IMPROVE} for PR improve") + get_settings().config.max_model_tokens_original = 
get_settings().config.max_model_tokens + get_settings().config.max_model_tokens = MAX_CONTEXT_TOKENS_IMPROVE + + # extended mode + try: + self.is_extended = self._get_is_extended(args or []) + except: + self.is_extended = False + num_code_suggestions = get_settings().pr_code_suggestions.num_code_suggestions_per_chunk + + + self.ai_handler = ai_handler() + self.ai_handler.main_pr_language = self.main_language self.patches_diff = None self.prediction = None + self.pr_url = pr_url self.cli_mode = cli_mode + self.pr_description, self.pr_description_files = ( + self.git_provider.get_pr_description(split_changes_walkthrough=True)) + if (self.pr_description_files and get_settings().get("config.is_auto_command", False) and + get_settings().get("config.enable_ai_metadata", False)): + add_ai_metadata_to_diff_files(self.git_provider, self.pr_description_files) + get_logger().debug(f"AI metadata added to the this command") + else: + get_settings().set("config.enable_ai_metadata", False) + get_logger().debug(f"AI metadata is disabled for this command") + self.vars = { "title": self.git_provider.pr.title, "branch": self.git_provider.get_pr_branch(), - "description": self.git_provider.get_pr_description(), + "description": self.pr_description, "language": self.main_language, "diff": "", # empty diff for initial calculation - "num_code_suggestions": get_settings().pr_code_suggestions.num_code_suggestions, + "num_code_suggestions": num_code_suggestions, "extra_instructions": get_settings().pr_code_suggestions.extra_instructions, "commit_messages_str": self.git_provider.get_commit_messages(), + "relevant_best_practices": "", + "is_ai_metadata": get_settings().get("config.enable_ai_metadata", False), } + self.pr_code_suggestions_prompt_system = get_settings().pr_code_suggestions_prompt.system + self.token_handler = TokenHandler(self.git_provider.pr, self.vars, - get_settings().pr_code_suggestions_prompt.system, + self.pr_code_suggestions_prompt_system, get_settings().pr_code_suggestions_prompt.user) + self.progress = f"## Generating PR code suggestions\n\n" + self.progress += f"""\nWork in progress ...<br>\n<img src="https://codium.ai/images/pr_agent/dual_ball_loading-crop.gif" width=48>""" + self.progress_response = None + async def run(self): - assert type(self.git_provider) != BitbucketProvider, "Bitbucket is not supported for now" - - logging.info('Generating code suggestions for PR...') - if get_settings().config.publish_output: - self.git_provider.publish_comment("Preparing review...", is_temporary=True) - await retry_with_fallback_models(self._prepare_prediction) - logging.info('Preparing PR review...') - data = self._prepare_pr_code_suggestions() - if get_settings().config.publish_output: - logging.info('Pushing PR review...') - self.git_provider.remove_initial_comment() - logging.info('Pushing inline code comments...') - self.push_inline_code_suggestions(data) - - async def _prepare_prediction(self, model: str): - logging.info('Getting PR diff...') + try: + if not self.git_provider.get_files(): + get_logger().info(f"PR has no files: {self.pr_url}, skipping code suggestions") + return None + + get_logger().info('Generating code suggestions for PR...') + relevant_configs = {'pr_code_suggestions': dict(get_settings().pr_code_suggestions), + 'config': dict(get_settings().config)} + get_logger().debug("Relevant configs", artifacts=relevant_configs) + if (get_settings().config.publish_output and get_settings().config.publish_output_progress and + not get_settings().config.get('is_auto_command', False)): + 
if self.git_provider.is_supported("gfm_markdown"): + self.progress_response = self.git_provider.publish_comment(self.progress) + else: + self.git_provider.publish_comment("Preparing suggestions...", is_temporary=True) + + if not self.is_extended: + data = await retry_with_fallback_models(self._prepare_prediction) + else: + data = await retry_with_fallback_models(self._prepare_prediction_extended) + if not data: + data = {"code_suggestions": []} + + if (data is None or 'code_suggestions' not in data or not data['code_suggestions'] + and get_settings().config.publish_output): + get_logger().warning('No code suggestions found for the PR.') + pr_body = "## PR Code Suggestions โœจ\n\nNo code suggestions found for the PR." + get_logger().debug(f"PR output", artifact=pr_body) + if self.progress_response: + self.git_provider.edit_comment(self.progress_response, body=pr_body) + else: + self.git_provider.publish_comment(pr_body) + return + + if (not self.is_extended and get_settings().pr_code_suggestions.rank_suggestions) or \ + (self.is_extended and get_settings().pr_code_suggestions.rank_extended_suggestions): + get_logger().info('Ranking Suggestions...') + data['code_suggestions'] = await self.rank_suggestions(data['code_suggestions']) + + if get_settings().config.publish_output: + self.git_provider.remove_initial_comment() + if ((not get_settings().pr_code_suggestions.commitable_code_suggestions) and + self.git_provider.is_supported("gfm_markdown")): + + # generate summarized suggestions + pr_body = self.generate_summarized_suggestions(data) + get_logger().debug(f"PR output", artifact=pr_body) + + # require self-review + if get_settings().pr_code_suggestions.demand_code_suggestions_self_review: + text = get_settings().pr_code_suggestions.code_suggestions_self_review_text + pr_body += f"\n\n- [ ] {text}" + if get_settings().pr_code_suggestions.approve_pr_on_self_review: + pr_body += ' <!-- approve pr self-review -->' + + # add usage guide + if (get_settings().pr_code_suggestions.enable_chat_text and get_settings().config.is_auto_command + and isinstance(self.git_provider, GithubProvider)): + pr_body += "\n\n>๐Ÿ’ก Need additional feedback ? 
start a [PR chat](https://chromewebstore.google.com/detail/ephlnjeghhogofkifjloamocljapahnl) \n\n" + if get_settings().pr_code_suggestions.enable_help_text: + pr_body += "<hr>\n\n<details> <summary><strong>๐Ÿ’ก Tool usage guide:</strong></summary><hr> \n\n" + pr_body += HelpMessage.get_improve_usage_guide() + pr_body += "\n</details>\n" + + + # Output the relevant configurations if enabled + if get_settings().get('config', {}).get('output_relevant_configurations', False): + pr_body += show_relevant_configurations(relevant_section='pr_code_suggestions') + + if get_settings().pr_code_suggestions.persistent_comment: + final_update_message = False + self.publish_persistent_comment_with_history(pr_body, + initial_header="## PR Code Suggestions โœจ", + update_header=True, + name="suggestions", + final_update_message=final_update_message, + max_previous_comments=get_settings().pr_code_suggestions.max_history_len, + progress_response=self.progress_response) + else: + if self.progress_response: + self.git_provider.edit_comment(self.progress_response, body=pr_body) + else: + self.git_provider.publish_comment(pr_body) + + else: + self.push_inline_code_suggestions(data) + if self.progress_response: + self.git_provider.remove_comment(self.progress_response) + else: + get_logger().info('Code suggestions generated for PR, but not published since publish_output is False.') + except Exception as e: + get_logger().error(f"Failed to generate code suggestions for PR, error: {e}") + if get_settings().config.publish_output: + if self.progress_response: + self.progress_response.delete() + else: + try: + self.git_provider.remove_initial_comment() + self.git_provider.publish_comment(f"Failed to generate code suggestions for PR") + except Exception as e: + pass + + def publish_persistent_comment_with_history(self, pr_comment: str, + initial_header: str, + update_header: bool = True, + name='review', + final_update_message=True, + max_previous_comments=4, + progress_response=None): + + if isinstance(self.git_provider, AzureDevopsProvider): # get_latest_commit_url is not supported yet + if progress_response: + self.git_provider.edit_comment(progress_response, pr_comment) + else: + self.git_provider.publish_comment(pr_comment) + return + + history_header = f"#### Previous suggestions\n" + last_commit_num = self.git_provider.get_latest_commit_url().split('/')[-1][:7] + latest_suggestion_header = f"Latest suggestions up to {last_commit_num}" + latest_commit_html_comment = f"<!-- {last_commit_num} -->" + found_comment = None + + if max_previous_comments > 0: + try: + prev_comments = list(self.git_provider.get_issue_comments()) + for comment in prev_comments: + if comment.body.startswith(initial_header): + prev_suggestions = comment.body + found_comment = comment + comment_url = self.git_provider.get_comment_url(comment) + + if history_header.strip() not in comment.body: + # no history section + # extract everything between <table> and </table> in comment.body including <table> and </table> + table_index = comment.body.find("<table>") + if table_index == -1: + self.git_provider.edit_comment(comment, pr_comment) + continue + # find http link from comment.body[:table_index] + up_to_commit_txt = self.extract_link(comment.body[:table_index]) + prev_suggestion_table = comment.body[ + table_index:comment.body.rfind("</table>") + len("</table>")] + + tick = "โœ… " if "โœ…" in prev_suggestion_table else "" + # surround with details tag + prev_suggestion_table = 
f"<details><summary>{tick}{name.capitalize()}{up_to_commit_txt}</summary>\n<br>{prev_suggestion_table}\n\n</details>" + + new_suggestion_table = pr_comment.replace(initial_header, "").strip() + + pr_comment_updated = f"{initial_header}\n{latest_commit_html_comment}\n\n" + pr_comment_updated += f"{latest_suggestion_header}\n{new_suggestion_table}\n\n___\n\n" + pr_comment_updated += f"{history_header}{prev_suggestion_table}\n" + else: + # get the text of the previous suggestions until the latest commit + sections = prev_suggestions.split(history_header.strip()) + latest_table = sections[0].strip() + prev_suggestion_table = sections[1].replace(history_header, "").strip() + + # get text after the latest_suggestion_header in comment.body + table_ind = latest_table.find("<table>") + up_to_commit_txt = self.extract_link(latest_table[:table_ind]) + + latest_table = latest_table[table_ind:latest_table.rfind("</table>") + len("</table>")] + # enforce max_previous_comments + count = prev_suggestions.count(f"\n<details><summary>{name.capitalize()}") + count += prev_suggestions.count(f"\n<details><summary>โœ… {name.capitalize()}") + if count >= max_previous_comments: + # remove the oldest suggestion + prev_suggestion_table = prev_suggestion_table[:prev_suggestion_table.rfind( + f"<details><summary>{name.capitalize()} up to commit")] + + tick = "โœ… " if "โœ…" in latest_table else "" + # Add to the prev_suggestions section + last_prev_table = f"\n<details><summary>{tick}{name.capitalize()}{up_to_commit_txt}</summary>\n<br>{latest_table}\n\n</details>" + prev_suggestion_table = last_prev_table + "\n" + prev_suggestion_table + + new_suggestion_table = pr_comment.replace(initial_header, "").strip() + + pr_comment_updated = f"{initial_header}\n" + pr_comment_updated += f"{latest_commit_html_comment}\n\n" + pr_comment_updated += f"{latest_suggestion_header}\n\n{new_suggestion_table}\n\n" + pr_comment_updated += "___\n\n" + pr_comment_updated += f"{history_header}\n" + pr_comment_updated += f"{prev_suggestion_table}\n" + + get_logger().info(f"Persistent mode - updating comment {comment_url} to latest {name} message") + if progress_response: # publish to 'progress_response' comment, because it refreshes immediately + self.git_provider.edit_comment(progress_response, pr_comment_updated) + self.git_provider.remove_comment(comment) + else: + self.git_provider.edit_comment(comment, pr_comment_updated) + return + except Exception as e: + get_logger().exception(f"Failed to update persistent review, error: {e}") + pass + + # if we are here, we did not find a previous comment to update + body = pr_comment.replace(initial_header, "").strip() + pr_comment = f"{initial_header}\n\n{latest_commit_html_comment}\n\n{body}\n\n" + if progress_response: + self.git_provider.edit_comment(progress_response, pr_comment) + else: + self.git_provider.publish_comment(pr_comment) + + def extract_link(self, s): + r = re.compile(r"<!--.*?-->") + match = r.search(s) + + up_to_commit_txt = "" + if match: + up_to_commit_txt = f" up to commit {match.group(0)[4:-3].strip()}" + return up_to_commit_txt + + async def _prepare_prediction(self, model: str) -> dict: self.patches_diff = get_pr_diff(self.git_provider, self.token_handler, model, add_line_numbers_to_hunks=True, - disable_extra_lines=True) + disable_extra_lines=False) - logging.info('Getting AI prediction...') - self.prediction = await self._get_prediction(model) + if self.patches_diff: + get_logger().debug(f"PR diff", artifact=self.patches_diff) + self.prediction = await 
self._get_prediction(model, self.patches_diff) + else: + get_logger().warning(f"Empty PR diff") + self.prediction = None - async def _get_prediction(self, model: str): + data = self.prediction + return data + + async def _get_prediction(self, model: str, patches_diff: str) -> dict: variables = copy.deepcopy(self.vars) - variables["diff"] = self.patches_diff # update diff + variables["diff"] = patches_diff # update diff environment = Environment(undefined=StrictUndefined) - system_prompt = environment.from_string(get_settings().pr_code_suggestions_prompt.system).render(variables) + system_prompt = environment.from_string(self.pr_code_suggestions_prompt_system).render(variables) user_prompt = environment.from_string(get_settings().pr_code_suggestions_prompt.user).render(variables) - if get_settings().config.verbosity_level >= 2: - logging.info(f"\nSystem prompt:\n{system_prompt}") - logging.info(f"\nUser prompt:\n{user_prompt}") - response, finish_reason = await self.ai_handler.chat_completion(model=model, temperature=0.2, - system=system_prompt, user=user_prompt) + response, finish_reason = await self.ai_handler.chat_completion( + model=model, temperature=get_settings().config.temperature, system=system_prompt, user=user_prompt) - return response + # load suggestions from the AI response + data = self._prepare_pr_code_suggestions(response) + + # self-reflect on suggestions + if get_settings().pr_code_suggestions.self_reflect_on_suggestions: + model_turbo = get_settings().config.model_turbo # use turbo model for self-reflection, since it is an easier task + response_reflect = await self.self_reflect_on_suggestions(data["code_suggestions"], + patches_diff, model=model_turbo) + if response_reflect: + response_reflect_yaml = load_yaml(response_reflect) + code_suggestions_feedback = response_reflect_yaml["code_suggestions"] + if len(code_suggestions_feedback) == len(data["code_suggestions"]): + for i, suggestion in enumerate(data["code_suggestions"]): + try: + suggestion["score"] = code_suggestions_feedback[i]["suggestion_score"] + suggestion["score_why"] = code_suggestions_feedback[i]["why"] + except Exception as e: # + get_logger().error(f"Error processing suggestion score {i}", + artifact={"suggestion": suggestion, + "code_suggestions_feedback": code_suggestions_feedback[i]}) + suggestion["score"] = 7 + suggestion["score_why"] = "" + else: + # get_logger().error(f"Could not self-reflect on suggestions. 
using default score 7") + for i, suggestion in enumerate(data["code_suggestions"]): + suggestion["score"] = 7 + suggestion["score_why"] = "" + + return data + + @staticmethod + def _truncate_if_needed(suggestion): + max_code_suggestion_length = get_settings().get("PR_CODE_SUGGESTIONS.MAX_CODE_SUGGESTION_LENGTH", 0) + suggestion_truncation_message = get_settings().get("PR_CODE_SUGGESTIONS.SUGGESTION_TRUNCATION_MESSAGE", "") + if max_code_suggestion_length > 0: + if len(suggestion['improved_code']) > max_code_suggestion_length: + suggestion['improved_code'] = suggestion['improved_code'][:max_code_suggestion_length] + suggestion['improved_code'] += f"\n{suggestion_truncation_message}" + get_logger().info(f"Truncated suggestion from {len(suggestion['improved_code'])} " + f"characters to {max_code_suggestion_length} characters") + return suggestion + + def _prepare_pr_code_suggestions(self, predictions: str) -> Dict: + data = load_yaml(predictions.strip(), + keys_fix_yaml=["relevant_file", "suggestion_content", "existing_code", "improved_code"], + first_key="code_suggestions", last_key="label") + if isinstance(data, list): + data = {'code_suggestions': data} + + # remove or edit invalid suggestions + suggestion_list = [] + one_sentence_summary_list = [] + for i, suggestion in enumerate(data['code_suggestions']): + try: + needed_keys = ['one_sentence_summary', 'label', 'relevant_file', 'relevant_lines_start', + 'relevant_lines_end'] + is_valid_keys = True + for key in needed_keys: + if key not in suggestion: + is_valid_keys = False + get_logger().debug( + f"Skipping suggestion {i + 1}, because it does not contain '{key}':\n'{suggestion}") + break + if not is_valid_keys: + continue + + if suggestion['one_sentence_summary'] in one_sentence_summary_list: + get_logger().debug(f"Skipping suggestion {i + 1}, because it is a duplicate: {suggestion}") + continue + + if 'const' in suggestion['suggestion_content'] and 'instead' in suggestion[ + 'suggestion_content'] and 'let' in suggestion['suggestion_content']: + get_logger().debug( + f"Skipping suggestion {i + 1}, because it uses 'const instead let': {suggestion}") + continue + + if ('existing_code' in suggestion) and ('improved_code' in suggestion): + if suggestion['existing_code'] == suggestion['improved_code']: + get_logger().debug( + f"edited improved suggestion {i + 1}, because equal to existing code: {suggestion['existing_code']}") + if get_settings().pr_code_suggestions.commitable_code_suggestions: + suggestion['improved_code'] = "" # we need 'existing_code' to locate the code in the PR + else: + suggestion['existing_code'] = "" + suggestion = self._truncate_if_needed(suggestion) + one_sentence_summary_list.append(suggestion['one_sentence_summary']) + suggestion_list.append(suggestion) + else: + get_logger().info( + f"Skipping suggestion {i + 1}, because it does not contain 'existing_code' or 'improved_code': {suggestion}") + except Exception as e: + get_logger().error(f"Error processing suggestion {i + 1}: {suggestion}, error: {e}") + data['code_suggestions'] = suggestion_list - def _prepare_pr_code_suggestions(self) -> str: - review = self.prediction.strip() - try: - data = json.loads(review) - except json.decoder.JSONDecodeError: - if get_settings().config.verbosity_level >= 2: - logging.info(f"Could not parse json response: {review}") - data = try_fix_json(review, code_suggestions=True) return data def push_inline_code_suggestions(self, data): code_suggestions = [] - for d in data['Code suggestions']: + + if not data['code_suggestions']: + 
get_logger().info('No suggestions found to improve this PR.') + if self.progress_response: + return self.git_provider.edit_comment(self.progress_response, + body='No suggestions found to improve this PR.') + else: + return self.git_provider.publish_comment('No suggestions found to improve this PR.') + + for d in data['code_suggestions']: try: if get_settings().config.verbosity_level >= 2: - logging.info(f"suggestion: {d}") - relevant_file = d['relevant file'].strip() - relevant_lines_str = d['relevant lines'].strip() - if ',' in relevant_lines_str: # handling 'relevant lines': '181, 190' or '178-184, 188-194' - relevant_lines_str = relevant_lines_str.split(',')[0] - relevant_lines_start = int(relevant_lines_str.split('-')[0]) # absolute position - relevant_lines_end = int(relevant_lines_str.split('-')[-1]) - content = d['suggestion content'] - new_code_snippet = d['improved code'] + get_logger().info(f"suggestion: {d}") + relevant_file = d['relevant_file'].strip() + relevant_lines_start = int(d['relevant_lines_start']) # absolute position + relevant_lines_end = int(d['relevant_lines_end']) + content = d['suggestion_content'].rstrip() + new_code_snippet = d['improved_code'].rstrip() + label = d['label'].strip() if new_code_snippet: new_code_snippet = self.dedent_code(relevant_file, relevant_lines_start, new_code_snippet) - body = f"**Suggestion:** {content}\n```suggestion\n" + new_code_snippet + "\n```" + if d.get('score'): + body = f"**Suggestion:** {content} [{label}, importance: {d.get('score')}]\n```suggestion\n" + new_code_snippet + "\n```" + else: + body = f"**Suggestion:** {content} [{label}]\n```suggestion\n" + new_code_snippet + "\n```" code_suggestions.append({'body': body, 'relevant_file': relevant_file, 'relevant_lines_start': relevant_lines_start, - 'relevant_lines_end': relevant_lines_end}) + 'relevant_lines_end': relevant_lines_end, + 'original_suggestion': d}) except Exception: - if get_settings().config.verbosity_level >= 2: - logging.info(f"Could not parse suggestion: {d}") + get_logger().info(f"Could not parse suggestion: {d}") - self.git_provider.publish_code_suggestions(code_suggestions) + is_successful = self.git_provider.publish_code_suggestions(code_suggestions) + if not is_successful: + get_logger().info("Failed to publish code suggestions, trying to publish each suggestion separately") + for code_suggestion in code_suggestions: + self.git_provider.publish_code_suggestions([code_suggestion]) def dedent_code(self, relevant_file, relevant_lines_start, new_code_snippet): try: # dedent code snippet @@ -126,7 +472,24 @@ def dedent_code(self, relevant_file, relevant_lines_start, new_code_snippet): original_initial_line = None for file in self.diff_files: if file.filename.strip() == relevant_file: - original_initial_line = file.head_file.splitlines()[relevant_lines_start - 1] + if file.head_file: + file_lines = file.head_file.splitlines() + if relevant_lines_start > len(file_lines): + get_logger().warning( + "Could not dedent code snippet, because relevant_lines_start is out of range", + artifact={'filename': file.filename, + 'file_content': file.head_file, + 'relevant_lines_start': relevant_lines_start, + 'new_code_snippet': new_code_snippet}) + return new_code_snippet + else: + original_initial_line = file_lines[relevant_lines_start - 1] + else: + get_logger().warning("Could not dedent code snippet, because head_file is missing", + artifact={'filename': file.filename, + 'relevant_lines_start': relevant_lines_start, + 'new_code_snippet': new_code_snippet}) + return 
new_code_snippet break if original_initial_line: suggested_initial_line = new_code_snippet.splitlines()[0] @@ -136,8 +499,265 @@ def dedent_code(self, relevant_file, relevant_lines_start, new_code_snippet): if delta_spaces > 0: new_code_snippet = textwrap.indent(new_code_snippet, delta_spaces * " ").rstrip('\n') except Exception as e: - if get_settings().config.verbosity_level >= 2: - logging.info(f"Could not dedent code snippet for file {relevant_file}, error: {e}") + get_logger().error(f"Error when dedenting code snippet for file {relevant_file}, error: {e}") return new_code_snippet + def _get_is_extended(self, args: list[str]) -> bool: + """Check if extended mode should be enabled by the `--extended` flag or automatically according to the configuration""" + if any(["extended" in arg for arg in args]): + get_logger().info("Extended mode is enabled by the `--extended` flag") + return True + if get_settings().pr_code_suggestions.auto_extended_mode: + # get_logger().info("Extended mode is enabled automatically based on the configuration toggle") + return True + return False + + async def _prepare_prediction_extended(self, model: str) -> dict: + self.patches_diff_list = get_pr_multi_diffs(self.git_provider, self.token_handler, model, + max_calls=get_settings().pr_code_suggestions.max_number_of_calls) + if self.patches_diff_list: + get_logger().info(f"Number of PR chunk calls: {len(self.patches_diff_list)}") + get_logger().debug(f"PR diff:", artifact=self.patches_diff_list) + + # parallelize calls to AI: + if get_settings().pr_code_suggestions.parallel_calls: + prediction_list = await asyncio.gather( + *[self._get_prediction(model, patches_diff) for patches_diff in self.patches_diff_list]) + self.prediction_list = prediction_list + else: + prediction_list = [] + for i, patches_diff in enumerate(self.patches_diff_list): + prediction = await self._get_prediction(model, patches_diff) + prediction_list.append(prediction) + + data = {"code_suggestions": []} + for j, predictions in enumerate(prediction_list): # each call adds an element to the list + if "code_suggestions" in predictions: + score_threshold = max(1, int(get_settings().pr_code_suggestions.suggestions_score_threshold)) + for i, prediction in enumerate(predictions["code_suggestions"]): + try: + if get_settings().pr_code_suggestions.self_reflect_on_suggestions: + score = int(prediction.get("score", 1)) + if score >= score_threshold: + data["code_suggestions"].append(prediction) + else: + get_logger().info( + f"Removing suggestions {i} from call {j}, because score is {score}, and score_threshold is {score_threshold}", + artifact=prediction) + else: + data["code_suggestions"].append(prediction) + except Exception as e: + get_logger().error(f"Error getting PR diff for suggestion {i} in call {j}, error: {e}") + self.data = data + else: + get_logger().warning(f"Empty PR diff list") + self.data = data = None + return data + + async def rank_suggestions(self, data: List) -> List: + """ + Call a model to rank (sort) code suggestions based on their importance order. + + Args: + data (List): A list of code suggestions to be ranked. + + Returns: + List: The ranked list of code suggestions. 
+ """ + + suggestion_list = [] + if not data: + return suggestion_list + for suggestion in data: + suggestion_list.append(suggestion) + data_sorted = [[]] * len(suggestion_list) + + if len(suggestion_list) == 1: + return suggestion_list + + try: + suggestion_str = "" + for i, suggestion in enumerate(suggestion_list): + suggestion_str += f"suggestion {i + 1}: " + str(suggestion) + '\n\n' + + variables = {'suggestion_list': suggestion_list, 'suggestion_str': suggestion_str} + model = get_settings().config.model + environment = Environment(undefined=StrictUndefined) + system_prompt = environment.from_string(get_settings().pr_sort_code_suggestions_prompt.system).render( + variables) + user_prompt = environment.from_string(get_settings().pr_sort_code_suggestions_prompt.user).render(variables) + response, finish_reason = await self.ai_handler.chat_completion(model=model, system=system_prompt, + user=user_prompt) + + sort_order = load_yaml(response) + for s in sort_order['Sort Order']: + suggestion_number = s['suggestion number'] + importance_order = s['importance order'] + data_sorted[importance_order - 1] = suggestion_list[suggestion_number - 1] + + if get_settings().pr_code_suggestions.final_clip_factor != 1: + max_len = max( + len(data_sorted), + get_settings().pr_code_suggestions.num_code_suggestions_per_chunk, + ) + new_len = int(0.5 + max_len * get_settings().pr_code_suggestions.final_clip_factor) + if new_len < len(data_sorted): + data_sorted = data_sorted[:new_len] + except Exception as e: + if get_settings().config.verbosity_level >= 1: + get_logger().info(f"Could not sort suggestions, error: {e}") + data_sorted = suggestion_list + + return data_sorted + + def generate_summarized_suggestions(self, data: Dict) -> str: + try: + pr_body = "## PR Code Suggestions โœจ\n\n" + + if len(data.get('code_suggestions', [])) == 0: + pr_body += "No suggestions found to improve this PR." 
+ return pr_body + + if get_settings().pr_code_suggestions.enable_intro_text and get_settings().config.is_auto_command: + pr_body += "Explore these optional code suggestions:\n\n" + + language_extension_map_org = get_settings().language_extension_map_org + extension_to_language = {} + for language, extensions in language_extension_map_org.items(): + for ext in extensions: + extension_to_language[ext] = language + + pr_body += "<table>" + header = f"Suggestion" + delta = 66 + header += "  " * delta + if get_settings().pr_code_suggestions.self_reflect_on_suggestions: + pr_body += f"""<thead><tr><td>Category</td><td align=left>{header}</td><td align=center>Score</td></tr>""" + else: + pr_body += f"""<thead><tr><td>Category</td><td align=left>{header}</td></tr>""" + pr_body += """<tbody>""" + suggestions_labels = dict() + # add all suggestions related to each label + for suggestion in data['code_suggestions']: + label = suggestion['label'].strip().strip("'").strip('"') + if label not in suggestions_labels: + suggestions_labels[label] = [] + suggestions_labels[label].append(suggestion) + + # sort suggestions_labels by the suggestion with the highest score + if get_settings().pr_code_suggestions.self_reflect_on_suggestions: + suggestions_labels = dict( + sorted(suggestions_labels.items(), key=lambda x: max([s['score'] for s in x[1]]), reverse=True)) + # sort the suggestions inside each label group by score + for label, suggestions in suggestions_labels.items(): + suggestions_labels[label] = sorted(suggestions, key=lambda x: x['score'], reverse=True) + + counter_suggestions = 0 + for label, suggestions in suggestions_labels.items(): + num_suggestions = len(suggestions) + pr_body += f"""<tr><td rowspan={num_suggestions}><strong>{label.capitalize()}</strong></td>\n""" + for i, suggestion in enumerate(suggestions): + + relevant_file = suggestion['relevant_file'].strip() + relevant_lines_start = int(suggestion['relevant_lines_start']) + relevant_lines_end = int(suggestion['relevant_lines_end']) + range_str = "" + if relevant_lines_start == relevant_lines_end: + range_str = f"[{relevant_lines_start}]" + else: + range_str = f"[{relevant_lines_start}-{relevant_lines_end}]" + + try: + code_snippet_link = self.git_provider.get_line_link(relevant_file, relevant_lines_start, + relevant_lines_end) + except: + code_snippet_link = "" + # add html table for each suggestion + + suggestion_content = suggestion['suggestion_content'].rstrip() + CHAR_LIMIT_PER_LINE = 84 + suggestion_content = insert_br_after_x_chars(suggestion_content, CHAR_LIMIT_PER_LINE) + # pr_body += f"<tr><td><details><summary>{suggestion_content}</summary>" + existing_code = suggestion['existing_code'].rstrip() + "\n" + improved_code = suggestion['improved_code'].rstrip() + "\n" + + diff = difflib.unified_diff(existing_code.split('\n'), + improved_code.split('\n'), n=999) + patch_orig = "\n".join(diff) + patch = "\n".join(patch_orig.splitlines()[5:]).strip('\n') + + example_code = "" + example_code += f"```diff\n{patch.rstrip()}\n```\n" + if i == 0: + pr_body += f"""<td>\n\n""" + else: + pr_body += f"""<tr><td>\n\n""" + suggestion_summary = suggestion['one_sentence_summary'].strip().rstrip('.') + if "'<" in suggestion_summary and ">'" in suggestion_summary: + # escape the '<' and '>' characters, otherwise they are interpreted as html tags + get_logger().info(f"Escaped suggestion summary: {suggestion_summary}") + suggestion_summary = suggestion_summary.replace("'<", "`<") + suggestion_summary = suggestion_summary.replace(">'", ">`") + if '`' in 
suggestion_summary: + suggestion_summary = replace_code_tags(suggestion_summary) + + pr_body += f"""\n\n<details><summary>{suggestion_summary}</summary>\n\n___\n\n""" + pr_body += f""" +**{suggestion_content}** + +[{relevant_file} {range_str}]({code_snippet_link}) + +{example_code.rstrip()} +""" + if get_settings().pr_code_suggestions.self_reflect_on_suggestions: + pr_body += f"<details><summary>Suggestion importance[1-10]: {suggestion['score']}</summary>\n\n" + pr_body += f"Why: {suggestion['score_why']}\n\n" + pr_body += f"</details>" + + pr_body += f"</details>" + + # # add another column for 'score' + if get_settings().pr_code_suggestions.self_reflect_on_suggestions: + pr_body += f"</td><td align=center>{suggestion['score']}\n\n" + + pr_body += f"</td></tr>" + counter_suggestions += 1 + + # pr_body += "</details>" + # pr_body += """</td></tr>""" + pr_body += """</tr></tbody></table>""" + return pr_body + except Exception as e: + get_logger().info(f"Failed to publish summarized code suggestions, error: {e}") + return "" + + async def self_reflect_on_suggestions(self, suggestion_list: List, patches_diff: str, model: str) -> str: + if not suggestion_list: + return "" + + try: + suggestion_str = "" + for i, suggestion in enumerate(suggestion_list): + suggestion_str += f"suggestion {i + 1}: " + str(suggestion) + '\n\n' + + variables = {'suggestion_list': suggestion_list, + 'suggestion_str': suggestion_str, + "diff": patches_diff, + 'num_code_suggestions': len(suggestion_list), + "is_ai_metadata": get_settings().get("config.enable_ai_metadata", False)} + environment = Environment(undefined=StrictUndefined) + system_prompt_reflect = environment.from_string( + get_settings().pr_code_suggestions_reflect_prompt.system).render( + variables) + user_prompt_reflect = environment.from_string( + get_settings().pr_code_suggestions_reflect_prompt.user).render(variables) + with get_logger().contextualize(command="self_reflect_on_suggestions"): + response_reflect, finish_reason_reflect = await self.ai_handler.chat_completion(model=model, + system=system_prompt_reflect, + user=user_prompt_reflect) + except Exception as e: + get_logger().info(f"Could not reflect on suggestions, error: {e}") + return "" + return response_reflect + diff --git a/pr_agent/tools/pr_config.py b/pr_agent/tools/pr_config.py index 0dc359188..05750f2d1 100644 --- a/pr_agent/tools/pr_config.py +++ b/pr_agent/tools/pr_config.py @@ -1,14 +1,15 @@ -import logging +from dynaconf import Dynaconf from pr_agent.config_loader import get_settings from pr_agent.git_providers import get_git_provider +from pr_agent.log import get_logger class PRConfig: """ The PRConfig class is responsible for listing all configuration options available for the user. """ - def __init__(self, pr_url: str, args=None): + def __init__(self, pr_url: str, args=None, ai_handler=None): """ Initialize the PRConfig object with the necessary attributes and objects to comment on a pull request. 
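A small self-contained sketch of the diff-rendering step used in `generate_summarized_suggestions` above (illustrative input snippets; the real code additionally wraps the result in a fenced diff block for the HTML table):

```python
# Illustrative only: compare an existing/improved snippet pair and keep just
# the hunk body. splitlines()[5:] drops the '---'/'+++' file headers, the
# '@@' hunk header, and the two blank lines the join introduces around them.
import difflib

existing_code = "def add(a, b):\n    return a+b\n"
improved_code = "def add(a: int, b: int) -> int:\n    return a + b\n"

diff = difflib.unified_diff(existing_code.split('\n'),
                            improved_code.split('\n'), n=999)
patch_orig = "\n".join(diff)
patch = "\n".join(patch_orig.splitlines()[5:]).strip('\n')
print(patch)
```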
@@ -19,30 +20,43 @@ def __init__(self, pr_url: str, args=None): self.git_provider = get_git_provider()(pr_url) async def run(self): - logging.info('Getting configuration settings...') - logging.info('Preparing configs...') + get_logger().info('Getting configuration settings...') + get_logger().info('Preparing configs...') pr_comment = self._prepare_pr_configs() if get_settings().config.publish_output: - logging.info('Pushing configs...') + get_logger().info('Pushing configs...') self.git_provider.publish_comment(pr_comment) self.git_provider.remove_initial_comment() return "" def _prepare_pr_configs(self) -> str: - import tomli - with open(get_settings().find_file("configuration.toml"), "rb") as conf_file: - configuration_headers = [header.lower() for header in tomli.load(conf_file).keys()] + conf_file = get_settings().find_file("configuration.toml") + conf_settings = Dynaconf(settings_files=[conf_file]) + configuration_headers = [header.lower() for header in conf_settings.keys()] relevant_configs = { header: configs for header, configs in get_settings().to_dict().items() - if header.lower().startswith("pr_") and header.lower() in configuration_headers + if (header.lower().startswith("pr_") or header.lower().startswith("config")) and header.lower() in configuration_headers } - comment_str = "Possible Configurations:" + + skip_keys = ['ai_disclaimer', 'ai_disclaimer_title', 'ANALYTICS_FOLDER', 'secret_provider', "skip_keys", + 'trial_prefix_message', 'no_eligible_message', 'identity_provider', 'ALLOWED_REPOS', + 'APP_NAME'] + extra_skip_keys = get_settings().config.get('config.skip_keys', []) + if extra_skip_keys: + skip_keys.extend(extra_skip_keys) + + markdown_text = "<details> <summary><strong>๐Ÿ› ๏ธ PR-Agent Configurations:</strong></summary> \n\n" + markdown_text += f"\n\n```yaml\n\n" for header, configs in relevant_configs.items(): if configs: - comment_str += "\n" + markdown_text += "\n\n" + markdown_text += f"==================== {header} ====================" for key, value in configs.items(): - comment_str += f"\n{header.lower()}.{key.lower()} = {repr(value) if isinstance(value, str) else value}" - comment_str += " " - if get_settings().config.verbosity_level >= 2: - logging.info(f"comment_str:\n{comment_str}") - return comment_str + if key in skip_keys: + continue + markdown_text += f"\n{header.lower()}.{key.lower()} = {repr(value) if isinstance(value, str) else value}" + markdown_text += " " + markdown_text += "\n```" + markdown_text += "\n</details>\n" + get_logger().info(f"Possible Configurations outputted to PR comment", artifact=markdown_text) + return markdown_text diff --git a/pr_agent/tools/pr_description.py b/pr_agent/tools/pr_description.py index d55dd55a3..9f7d79d31 100644 --- a/pr_agent/tools/pr_description.py +++ b/pr_agent/tools/pr_description.py @@ -1,21 +1,30 @@ +import asyncio import copy -import json -import logging +import re +from functools import partial from typing import List, Tuple +import yaml from jinja2 import Environment, StrictUndefined -from pr_agent.algo.ai_handler import AiHandler -from pr_agent.algo.pr_processing import get_pr_diff, retry_with_fallback_models +from pr_agent.algo.ai_handlers.base_ai_handler import BaseAiHandler +from pr_agent.algo.ai_handlers.litellm_ai_handler import LiteLLMAIHandler +from pr_agent.algo.pr_processing import get_pr_diff, retry_with_fallback_models, get_pr_diff_multiple_patchs, \ + OUTPUT_BUFFER_TOKENS_HARD_THRESHOLD from pr_agent.algo.token_handler import TokenHandler -from pr_agent.algo.utils import load_yaml 
+from pr_agent.algo.utils import set_custom_labels +from pr_agent.algo.utils import load_yaml, get_user_labels, ModelType, show_relevant_configurations, get_max_tokens, \ + clip_tokens from pr_agent.config_loader import get_settings -from pr_agent.git_providers import get_git_provider +from pr_agent.git_providers import get_git_provider, GithubProvider, get_git_provider_with_context from pr_agent.git_providers.git_provider import get_main_pr_language +from pr_agent.log import get_logger +from pr_agent.servers.help import HelpMessage class PRDescription: - def __init__(self, pr_url: str, args: list = None): + def __init__(self, pr_url: str, args: list = None, + ai_handler: partial[BaseAiHandler,] = LiteLLMAIHandler): """ Initialize the PRDescription object with the necessary attributes and objects for generating a PR description using an AI model. @@ -24,25 +33,37 @@ def __init__(self, pr_url: str, args: list = None): args (list, optional): List of arguments passed to the PRDescription class. Defaults to None. """ # Initialize the git provider and main PR language - self.git_provider = get_git_provider()(pr_url) + self.git_provider = get_git_provider_with_context(pr_url) self.main_pr_language = get_main_pr_language( self.git_provider.get_languages(), self.git_provider.get_files() ) + self.pr_id = self.git_provider.get_pr_id() + + if get_settings().pr_description.enable_semantic_files_types and not self.git_provider.is_supported( + "gfm_markdown"): + get_logger().debug(f"Disabling semantic files types for {self.pr_id}, gfm_markdown not supported.") + get_settings().pr_description.enable_semantic_files_types = False # Initialize the AI handler - self.ai_handler = AiHandler() - + self.ai_handler = ai_handler() + self.ai_handler.main_pr_language = self.main_pr_language + # Initialize the variables dictionary self.vars = { "title": self.git_provider.pr.title, "branch": self.git_provider.get_pr_branch(), - "description": self.git_provider.get_pr_description(), + "description": self.git_provider.get_pr_description(full=False), "language": self.main_pr_language, "diff": "", # empty diff for initial calculation "extra_instructions": get_settings().pr_description.extra_instructions, - "commit_messages_str": self.git_provider.get_commit_messages() + "commit_messages_str": self.git_provider.get_commit_messages(), + "enable_custom_labels": get_settings().config.enable_custom_labels, + "custom_labels_class": "", # will be filled if necessary in 'set_custom_labels' function + "enable_semantic_files_types": get_settings().pr_description.enable_semantic_files_types, } - + + self.user_description = self.git_provider.get_user_description() + # Initialize the token handler self.token_handler = TokenHandler( self.git_provider.pr, @@ -50,138 +71,623 @@ def __init__(self, pr_url: str, args: list = None): get_settings().pr_description_prompt.system, get_settings().pr_description_prompt.user, ) - + # Initialize patches_diff and prediction attributes self.patches_diff = None self.prediction = None + self.file_label_dict = None + self.COLLAPSIBLE_FILE_LIST_THRESHOLD = 8 async def run(self): - """ - Generates a PR description using an AI model and publishes it to the PR. 
- """ - logging.info('Generating a PR description...') - if get_settings().config.publish_output: - self.git_provider.publish_comment("Preparing pr description...", is_temporary=True) - - await retry_with_fallback_models(self._prepare_prediction) - - logging.info('Preparing answer...') - pr_title, pr_body, pr_types, markdown_text = self._prepare_pr_answer() - - if get_settings().config.publish_output: - logging.info('Pushing answer...') - if get_settings().pr_description.publish_description_as_comment: - self.git_provider.publish_comment(markdown_text) + try: + get_logger().info(f"Generating a PR description for pr_id: {self.pr_id}") + relevant_configs = {'pr_description': dict(get_settings().pr_description), + 'config': dict(get_settings().config)} + get_logger().debug("Relevant configs", artifacts=relevant_configs) + if get_settings().config.publish_output and not get_settings().config.get('is_auto_command', False): + self.git_provider.publish_comment("Preparing PR description...", is_temporary=True) + + await retry_with_fallback_models(self._prepare_prediction, ModelType.TURBO) + + if self.prediction: + self._prepare_data() + else: + get_logger().warning(f"Empty prediction, PR: {self.pr_id}") + self.git_provider.remove_initial_comment() + return None + + if get_settings().pr_description.enable_semantic_files_types: + self.file_label_dict = self._prepare_file_labels() + + pr_labels, pr_file_changes = [], [] + if get_settings().pr_description.publish_labels: + pr_labels = self._prepare_labels() + + if get_settings().pr_description.use_description_markers: + pr_title, pr_body, changes_walkthrough, pr_file_changes = self._prepare_pr_answer_with_markers() else: - self.git_provider.publish_description(pr_title, pr_body) - if self.git_provider.is_supported("get_labels"): - current_labels = self.git_provider.get_labels() - if current_labels is None: - current_labels = [] - self.git_provider.publish_labels(pr_types + current_labels) - self.git_provider.remove_initial_comment() - + pr_title, pr_body, changes_walkthrough, pr_file_changes = self._prepare_pr_answer() + if not self.git_provider.is_supported( + "publish_file_comments") or not get_settings().pr_description.inline_file_summary: + pr_body += "\n\n" + changes_walkthrough + get_logger().debug("PR output", artifact={"title": pr_title, "body": pr_body}) + + # Add help text if gfm_markdown is supported + if self.git_provider.is_supported("gfm_markdown") and get_settings().pr_description.enable_help_text: + pr_body += "<hr>\n\n<details> <summary><strong>โœจ Describe tool usage guide:</strong></summary><hr> \n\n" + pr_body += HelpMessage.get_describe_usage_guide() + pr_body += "\n</details>\n" + elif get_settings().pr_description.enable_help_comment: + pr_body += '\n\n___\n\n> ๐Ÿ’ก **PR-Agent usage**: Comment `/help "your question"` on any pull request to receive relevant information' + + # Output the relevant configurations if enabled + if get_settings().get('config', {}).get('output_relevant_configurations', False): + pr_body += show_relevant_configurations(relevant_section='pr_description') + + if get_settings().config.publish_output: + # publish labels + if get_settings().pr_description.publish_labels and self.git_provider.is_supported("get_labels"): + original_labels = self.git_provider.get_pr_labels(update=True) + get_logger().debug(f"original labels", artifact=original_labels) + user_labels = get_user_labels(original_labels) + new_labels = pr_labels + user_labels + get_logger().debug(f"published labels", artifact=new_labels) + if 
sorted(new_labels) != sorted(original_labels): + self.git_provider.publish_labels(new_labels) + else: + get_logger().debug(f"Labels are the same, not updating") + + # publish description + if get_settings().pr_description.publish_description_as_comment: + full_markdown_description = f"## Title\n\n{pr_title}\n\n___\n{pr_body}" + if get_settings().pr_description.publish_description_as_comment_persistent: + self.git_provider.publish_persistent_comment(full_markdown_description, + initial_header="## Title", + update_header=True, + name="describe", + final_update_message=False, ) + else: + self.git_provider.publish_comment(full_markdown_description) + else: + self.git_provider.publish_description(pr_title, pr_body) + + # publish final update message + if (get_settings().pr_description.final_update_message): + latest_commit_url = self.git_provider.get_latest_commit_url() + if latest_commit_url: + pr_url = self.git_provider.get_pr_url() + update_comment = f"**[PR Description]({pr_url})** updated to latest commit ({latest_commit_url})" + self.git_provider.publish_comment(update_comment) + self.git_provider.remove_initial_comment() + except Exception as e: + get_logger().error(f"Error generating PR description {self.pr_id}: {e}") + return "" async def _prepare_prediction(self, model: str) -> None: - """ - Prepare the AI prediction for the PR description based on the provided model. + if get_settings().pr_description.use_description_markers and 'pr_agent:' not in self.user_description: + get_logger().info( + "Markers were enabled, but user description does not contain markers. skipping AI prediction") + return None - Args: - model (str): The name of the model to be used for generating the prediction. + large_pr_handling = get_settings().pr_description.enable_large_pr_handling and "pr_description_only_files_prompts" in get_settings() + output = get_pr_diff(self.git_provider, self.token_handler, model, large_pr_handling=large_pr_handling, + return_remaining_files=True) + if isinstance(output, tuple): + patches_diff, remaining_files_list = output + else: + patches_diff = output + remaining_files_list = [] + if not large_pr_handling or patches_diff: + self.patches_diff = patches_diff + if patches_diff: + get_logger().debug(f"PR diff", artifact=self.patches_diff) + self.prediction = await self._get_prediction(model, patches_diff, prompt="pr_description_prompt") + if (remaining_files_list and 'pr_files' in self.prediction and 'label:' in self.prediction and + get_settings().pr_description.mention_extra_files): + get_logger().debug(f"Extending additional files, {len(remaining_files_list)} files") + self.prediction = await self.extend_additional_files(remaining_files_list) + else: + get_logger().error(f"Error getting PR diff {self.pr_id}") + self.prediction = None + else: + # get the diff in multiple patches, with the token handler only for the files prompt + get_logger().debug('large_pr_handling for describe') + token_handler_only_files_prompt = TokenHandler( + self.git_provider.pr, + self.vars, + get_settings().pr_description_only_files_prompts.system, + get_settings().pr_description_only_files_prompts.user, + ) + (patches_compressed_list, total_tokens_list, deleted_files_list, remaining_files_list, file_dict, + files_in_patches_list) = get_pr_diff_multiple_patchs( + self.git_provider, token_handler_only_files_prompt, model) - Returns: - None + # get the files prediction for each patch + if not get_settings().pr_description.async_ai_calls: + results = [] + for i, patches in 
enumerate(patches_compressed_list): # sync calls + patches_diff = "\n".join(patches) + get_logger().debug(f"PR diff number {i + 1} for describe files") + prediction_files = await self._get_prediction(model, patches_diff, + prompt="pr_description_only_files_prompts") + results.append(prediction_files) + else: # async calls + tasks = [] + for i, patches in enumerate(patches_compressed_list): + if patches: + patches_diff = "\n".join(patches) + get_logger().debug(f"PR diff number {i + 1} for describe files") + task = asyncio.create_task( + self._get_prediction(model, patches_diff, prompt="pr_description_only_files_prompts")) + tasks.append(task) + # Wait for all tasks to complete + results = await asyncio.gather(*tasks) + file_description_str_list = [] + for i, result in enumerate(results): + prediction_files = result.strip().removeprefix('```yaml').strip('`').strip() + if load_yaml(prediction_files) and prediction_files.startswith('pr_files'): + prediction_files = prediction_files.removeprefix('pr_files:').strip() + file_description_str_list.append(prediction_files) + else: + get_logger().debug(f"failed to generate predictions in iteration {i + 1} for describe files") - Raises: - Any exceptions raised by the 'get_pr_diff' and '_get_prediction' functions. + # generate files_walkthrough string, with proper token handling + token_handler_only_description_prompt = TokenHandler( + self.git_provider.pr, + self.vars, + get_settings().pr_description_only_description_prompts.system, + get_settings().pr_description_only_description_prompts.user) + files_walkthrough = "\n".join(file_description_str_list) + files_walkthrough_prompt = copy.deepcopy(files_walkthrough) + MAX_EXTRA_FILES_TO_PROMPT = 50 + if remaining_files_list: + files_walkthrough_prompt += "\n\nNo more token budget. Additional unprocessed files:" + for i, file in enumerate(remaining_files_list): + files_walkthrough_prompt += f"\n- {file}" + if i >= MAX_EXTRA_FILES_TO_PROMPT: + get_logger().debug(f"Too many remaining files, clipping to {MAX_EXTRA_FILES_TO_PROMPT}") + files_walkthrough_prompt += f"\n... and {len(remaining_files_list) - MAX_EXTRA_FILES_TO_PROMPT} more" + break + if deleted_files_list: + files_walkthrough_prompt += "\n\nAdditional deleted files:" + for i, file in enumerate(deleted_files_list): + files_walkthrough_prompt += f"\n- {file}" + if i >= MAX_EXTRA_FILES_TO_PROMPT: + get_logger().debug(f"Too many deleted files, clipping to {MAX_EXTRA_FILES_TO_PROMPT}") + files_walkthrough_prompt += f"\n... 
and {len(deleted_files_list) - MAX_EXTRA_FILES_TO_PROMPT} more" + break + tokens_files_walkthrough = len( + token_handler_only_description_prompt.encoder.encode(files_walkthrough_prompt)) + total_tokens = token_handler_only_description_prompt.prompt_tokens + tokens_files_walkthrough + max_tokens_model = get_max_tokens(model) + if total_tokens > max_tokens_model - OUTPUT_BUFFER_TOKENS_HARD_THRESHOLD: + # clip files_walkthrough to git the tokens within the limit + files_walkthrough_prompt = clip_tokens(files_walkthrough_prompt, + max_tokens_model - OUTPUT_BUFFER_TOKENS_HARD_THRESHOLD - token_handler_only_description_prompt.prompt_tokens, + num_input_tokens=tokens_files_walkthrough) - """ - logging.info('Getting PR diff...') - self.patches_diff = get_pr_diff(self.git_provider, self.token_handler, model) - logging.info('Getting AI prediction...') - self.prediction = await self._get_prediction(model) + # PR header inference + get_logger().debug(f"PR diff only description", artifact=files_walkthrough_prompt) + prediction_headers = await self._get_prediction(model, patches_diff=files_walkthrough_prompt, + prompt="pr_description_only_description_prompts") + prediction_headers = prediction_headers.strip().removeprefix('```yaml').strip('`').strip() - async def _get_prediction(self, model: str) -> str: - """ - Generate an AI prediction for the PR description based on the provided model. + # manually add extra files to final prediction + MAX_EXTRA_FILES_TO_OUTPUT = 100 + if get_settings().pr_description.mention_extra_files: + for i, file in enumerate(remaining_files_list): + extra_file_yaml = f"""\ +- filename: | + {file} + changes_summary: | + ... + changes_title: | + ... + label: | + additional files (token-limit) +""" + files_walkthrough = files_walkthrough.strip() + "\n" + extra_file_yaml.strip() + if i >= MAX_EXTRA_FILES_TO_OUTPUT: + files_walkthrough += f"""\ +extra_file_yaml = +- filename: | + Additional {len(remaining_files_list) - MAX_EXTRA_FILES_TO_OUTPUT} files not shown + changes_summary: | + ... + changes_title: | + ... + label: | + additional files (token-limit) +""" + break - Args: - model (str): The name of the model to be used for generating the prediction. + # final processing + self.prediction = prediction_headers + "\n" + "pr_files:\n" + files_walkthrough + if not load_yaml(self.prediction): + get_logger().error(f"Error getting valid YAML in large PR handling for describe {self.pr_id}") + if load_yaml(prediction_headers): + get_logger().debug(f"Using only headers for describe {self.pr_id}") + self.prediction = prediction_headers - Returns: - str: The generated AI prediction. - """ + async def extend_additional_files(self, remaining_files_list) -> str: + prediction = self.prediction + try: + original_prediction_dict = load_yaml(self.prediction) + prediction_extra = "pr_files:" + for file in remaining_files_list: + extra_file_yaml = f"""\ +- filename: | + {file} + changes_summary: | + ... + changes_title: | + ... 
+ label: | + additional files (token-limit) +""" + prediction_extra = prediction_extra + "\n" + extra_file_yaml.strip() + prediction_extra_dict = load_yaml(prediction_extra) + # merge the two dictionaries + if isinstance(original_prediction_dict, dict) and isinstance(prediction_extra_dict, dict): + original_prediction_dict["pr_files"].extend(prediction_extra_dict["pr_files"]) + new_yaml = yaml.dump(original_prediction_dict) + if load_yaml(new_yaml): + prediction = new_yaml + return prediction + except Exception as e: + get_logger().error(f"Error extending additional files {self.pr_id}: {e}") + return self.prediction + + async def _get_prediction(self, model: str, patches_diff: str, prompt="pr_description_prompt") -> str: variables = copy.deepcopy(self.vars) - variables["diff"] = self.patches_diff # update diff + variables["diff"] = patches_diff # update diff environment = Environment(undefined=StrictUndefined) - system_prompt = environment.from_string(get_settings().pr_description_prompt.system).render(variables) - user_prompt = environment.from_string(get_settings().pr_description_prompt.user).render(variables) + set_custom_labels(variables, self.git_provider) + self.variables = variables - if get_settings().config.verbosity_level >= 2: - logging.info(f"\nSystem prompt:\n{system_prompt}") - logging.info(f"\nUser prompt:\n{user_prompt}") + system_prompt = environment.from_string(get_settings().get(prompt, {}).get("system", "")).render(self.variables) + user_prompt = environment.from_string(get_settings().get(prompt, {}).get("user", "")).render(self.variables) response, finish_reason = await self.ai_handler.chat_completion( model=model, - temperature=0.2, + temperature=get_settings().config.temperature, system=system_prompt, user=user_prompt ) return response - def _prepare_pr_answer(self) -> Tuple[str, str, List[str], str]: + def _prepare_data(self): + # Load the AI prediction data into a dictionary + self.data = load_yaml(self.prediction.strip()) + + if get_settings().pr_description.add_original_user_description and self.user_description: + self.data["User Description"] = self.user_description + + # re-order keys + if 'User Description' in self.data: + self.data['User Description'] = self.data.pop('User Description') + if 'title' in self.data: + self.data['title'] = self.data.pop('title') + if 'type' in self.data: + self.data['type'] = self.data.pop('type') + if 'labels' in self.data: + self.data['labels'] = self.data.pop('labels') + if 'description' in self.data: + self.data['description'] = self.data.pop('description') + if 'pr_files' in self.data: + self.data['pr_files'] = self.data.pop('pr_files') + + def _prepare_labels(self) -> List[str]: + pr_types = [] + + # If the 'PR Type' key is present in the dictionary, split its value by comma and assign it to 'pr_types' + if 'labels' in self.data: + if type(self.data['labels']) == list: + pr_types = self.data['labels'] + elif type(self.data['labels']) == str: + pr_types = self.data['labels'].split(',') + elif 'type' in self.data: + if type(self.data['type']) == list: + pr_types = self.data['type'] + elif type(self.data['type']) == str: + pr_types = self.data['type'].split(',') + pr_types = [label.strip() for label in pr_types] + + # convert lowercase labels to original case + try: + if "labels_minimal_to_labels_dict" in self.variables: + d: dict = self.variables["labels_minimal_to_labels_dict"] + for i, label_i in enumerate(pr_types): + if label_i in d: + pr_types[i] = d[label_i] + except Exception as e: + get_logger().error(f"Error 
converting labels to original case {self.pr_id}: {e}") + return pr_types + + def _prepare_pr_answer_with_markers(self) -> Tuple[str, str, str, List[dict]]: + get_logger().info(f"Using description marker replacements {self.pr_id}") + title = self.vars["title"] + body = self.user_description + if get_settings().pr_description.include_generated_by_header: + ai_header = f"### ๐Ÿค– Generated by PR Agent at {self.git_provider.last_commit_id.sha}\n\n" + else: + ai_header = "" + + ai_type = self.data.get('type') + if ai_type and not re.search(r'<!--\s*pr_agent:type\s*-->', body): + pr_type = f"{ai_header}{ai_type}" + body = body.replace('pr_agent:type', pr_type) + + ai_summary = self.data.get('description') + if ai_summary and not re.search(r'<!--\s*pr_agent:summary\s*-->', body): + summary = f"{ai_header}{ai_summary}" + body = body.replace('pr_agent:summary', summary) + + ai_walkthrough = self.data.get('pr_files') + walkthrough_gfm = "" + pr_file_changes = [] + if ai_walkthrough and not re.search(r'<!--\s*pr_agent:walkthrough\s*-->', body): + try: + walkthrough_gfm, pr_file_changes = self.process_pr_files_prediction(walkthrough_gfm, + self.file_label_dict) + body = body.replace('pr_agent:walkthrough', walkthrough_gfm) + except Exception as e: + get_logger().error(f"Failing to process walkthrough {self.pr_id}: {e}") + body = body.replace('pr_agent:walkthrough', "") + + return title, body, walkthrough_gfm, pr_file_changes + + def _prepare_pr_answer(self) -> Tuple[str, str, str, List[dict]]: """ Prepare the PR description based on the AI prediction data. Returns: - title: a string containing the PR title. - - pr_body: a string containing the PR body in a markdown format. - - pr_types: a list of strings containing the PR types. - - markdown_text: a string containing the AI prediction data in a markdown format. used for publishing a comment + - pr_body: a string containing the PR description body in a markdown format. 
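+        - changes_walkthrough: a string containing the changes walkthrough section in a markdown format.
+        - pr_file_changes: a list of the file change entries collected for the changes walkthrough.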
""" - # Load the AI prediction data into a dictionary - data = load_yaml(self.prediction.strip()) - - # Initialization - pr_types = [] # Iterate over the dictionary items and append the key and value to 'markdown_text' in a markdown format markdown_text = "" - for key, value in data.items(): - markdown_text += f"## {key}\n\n" + # Don't display 'PR Labels' + if 'labels' in self.data and self.git_provider.is_supported("get_labels"): + self.data.pop('labels') + if not get_settings().pr_description.enable_pr_type: + self.data.pop('type') + for key, value in self.data.items(): + markdown_text += f"## **{key}**\n\n" markdown_text += f"{value}\n\n" - # If the 'PR Type' key is present in the dictionary, split its value by comma and assign it to 'pr_types' - if 'PR Type' in data: - if type(data['PR Type']) == list: - pr_types = data['PR Type'] - elif type(data['PR Type']) == str: - pr_types = data['PR Type'].split(',') - - # Assign the value of the 'PR Title' key to 'title' variable and remove it from the dictionary - title = data.pop('PR Title') + # Remove the 'PR Title' key from the dictionary + ai_title = self.data.pop('title', self.vars["title"]) + if (not get_settings().pr_description.generate_ai_title): + # Assign the original PR title to the 'title' variable + title = self.vars["title"] + else: + # Assign the value of the 'PR Title' key to 'title' variable + title = ai_title # Iterate over the remaining dictionary items and append the key and value to 'pr_body' in a markdown format, # except for the items containing the word 'walkthrough' - pr_body = "" - for key, value in data.items(): - pr_body += f"## {key}:\n" + pr_body, changes_walkthrough = "", "" + pr_file_changes = [] + for idx, (key, value) in enumerate(self.data.items()): + if key == 'pr_files': + value = self.file_label_dict + else: + key_publish = key.rstrip(':').replace("_", " ").capitalize() + if key_publish == "Type": + key_publish = "PR Type" + # elif key_publish == "Description": + # key_publish = "PR Description" + pr_body += f"### **{key_publish}**\n" if 'walkthrough' in key.lower(): - # for filename, description in value.items(): + if self.git_provider.is_supported("gfm_markdown"): + pr_body += "<details> <summary>files:</summary>\n\n" for file in value: filename = file['filename'].replace("'", "`") - description = file['changes in file'] - pr_body += f'`{filename}`: {description}\n' + description = file['changes_in_file'] + pr_body += f'- `{filename}`: {description}\n' + if self.git_provider.is_supported("gfm_markdown"): + pr_body += "</details>\n" + elif 'pr_files' in key.lower() and get_settings().pr_description.enable_semantic_files_types: + changes_walkthrough, pr_file_changes = self.process_pr_files_prediction(changes_walkthrough, value) + changes_walkthrough = f"### **Changes walkthrough** ๐Ÿ“\n{changes_walkthrough}" else: # if the value is a list, join its items by comma - if type(value) == list: - value = ', '.join(v for v in value) - pr_body += f"{value}\n\n___\n" + if isinstance(value, list): + value = ', '.join(v.rstrip() for v in value) + pr_body += f"{value}\n" + if idx < len(self.data) - 1: + pr_body += "\n\n___\n\n" + + return title, pr_body, changes_walkthrough, pr_file_changes, + + def _prepare_file_labels(self): + file_label_dict = {} + if (not self.data or not isinstance(self.data, dict) or + 'pr_files' not in self.data or not self.data['pr_files']): + return file_label_dict + for file in self.data['pr_files']: + try: + required_fields = ['changes_summary', 'changes_title', 'filename', 'label'] + if 
not all(field in file for field in required_fields): + # can happen for example if a YAML generation was interrupted in the middle (no more tokens) + get_logger().warning(f"Missing required fields in file label dict {self.pr_id}, skipping file", + artifact={"file": file}) + continue + filename = file['filename'].replace("'", "`").replace('"', '`') + changes_summary = file['changes_summary'] + changes_title = file['changes_title'].strip() + label = file.get('label').strip().lower() + if label not in file_label_dict: + file_label_dict[label] = [] + file_label_dict[label].append((filename, changes_title, changes_summary)) + except Exception as e: + get_logger().error(f"Error preparing file label dict {self.pr_id}: {e}") + pass + return file_label_dict + + def process_pr_files_prediction(self, pr_body, value): + pr_comments = [] + # logic for using collapsible file list + use_collapsible_file_list = get_settings().pr_description.collapsible_file_list + num_files = 0 + if value: + for semantic_label in value.keys(): + num_files += len(value[semantic_label]) + if use_collapsible_file_list == "adaptive": + use_collapsible_file_list = num_files > self.COLLAPSIBLE_FILE_LIST_THRESHOLD + + if not self.git_provider.is_supported("gfm_markdown"): + return pr_body, pr_comments + try: + pr_body += "<table>" + header = f"Relevant files" + delta = 75 + # header += "  " * delta + pr_body += f"""<thead><tr><th></th><th align="left">{header}</th></tr></thead>""" + pr_body += """<tbody>""" + for semantic_label in value.keys(): + s_label = semantic_label.strip("'").strip('"') + pr_body += f"""<tr><td><strong>{s_label.capitalize()}</strong></td>""" + list_tuples = value[semantic_label] + + if use_collapsible_file_list: + pr_body += f"""<td><details><summary>{len(list_tuples)} files</summary><table>""" + else: + pr_body += f"""<td><table>""" + for filename, file_changes_title, file_change_description in list_tuples: + filename = filename.replace("'", "`").rstrip() + filename_publish = filename.split("/")[-1] + + file_changes_title_code = f"<code>{file_changes_title}</code>" + file_changes_title_code_br = insert_br_after_x_chars(file_changes_title_code, x=(delta - 5)).strip() + if len(file_changes_title_code_br) < (delta - 5): + file_changes_title_code_br += "  " * ((delta - 5) - len(file_changes_title_code_br)) + filename_publish = f"<strong>{filename_publish}</strong><dd>{file_changes_title_code_br}</dd>" + diff_plus_minus = "" + delta_nbsp = "" + diff_files = self.git_provider.get_diff_files() + for f in diff_files: + if f.filename.lower().strip('/') == filename.lower().strip('/'): + num_plus_lines = f.num_plus_lines + num_minus_lines = f.num_minus_lines + diff_plus_minus += f"+{num_plus_lines}/-{num_minus_lines}" + delta_nbsp = "  " * max(0, (8 - len(diff_plus_minus))) + break + + # try to add line numbers link to code suggestions + link = "" + if hasattr(self.git_provider, 'get_line_link'): + filename = filename.strip() + link = self.git_provider.get_line_link(filename, relevant_line_start=-1) + + file_change_description_br = insert_br_after_x_chars(file_change_description, x=(delta - 5)) + pr_body += f""" +<tr> + <td> + <details> + <summary>{filename_publish}</summary> +<hr> + +{filename} + +{file_change_description_br} + + +</details> + + + </td> + <td><a href="{link}">{diff_plus_minus}</a>{delta_nbsp}</td> + +</tr> +""" + if use_collapsible_file_list: + pr_body += """</table></details></td></tr>""" + else: + pr_body += """</table></td></tr>""" + pr_body += """</tr></tbody></table>""" + + except Exception as 
e: + get_logger().error(f"Error processing pr files to markdown {self.pr_id}: {e}") + pass + return pr_body, pr_comments + + +def count_chars_without_html(string): + if '<' not in string: + return len(string) + no_html_string = re.sub('<[^>]+>', '', string) + return len(no_html_string) + + +def insert_br_after_x_chars(text, x=70): + """ + Insert <br> into a string after a word that increases its length above x characters. + Use proper HTML tags for code and new lines. + """ + if count_chars_without_html(text) < x: + return text + + # replace odd instances of ` with <code> and even instances of ` with </code> + text = replace_code_tags(text) + + # convert list items to <li> + if text.startswith("- ") or text.startswith("* "): + text = "<li>" + text[2:] + text = text.replace("\n- ", '<br><li> ').replace("\n - ", '<br><li> ') + text = text.replace("\n* ", '<br><li> ').replace("\n * ", '<br><li> ') + + # convert new lines to <br> + text = text.replace("\n", '<br>') + + # split text into lines + lines = text.split('<br>') + words = [] + for i, line in enumerate(lines): + words += line.split(' ') + if i < len(lines) - 1: + words[-1] += "<br>" + + new_text = [] + is_inside_code = False + current_length = 0 + for word in words: + is_saved_word = False + if word == "<code>" or word == "</code>" or word == "<li>" or word == "<br>": + is_saved_word = True + + len_word = count_chars_without_html(word) + if not is_saved_word and (current_length + len_word > x): + if is_inside_code: + new_text.append("</code><br><code>") + else: + new_text.append("<br>") + current_length = 0 # Reset counter + new_text.append(word + " ") + + if not is_saved_word: + current_length += len_word + 1 # Add 1 for the space + + if word == "<li>" or word == "<br>": + current_length = 0 + + if "<code>" in word: + is_inside_code = True + if "</code>" in word: + is_inside_code = False + return ''.join(new_text).strip() - if get_settings().config.verbosity_level >= 2: - logging.info(f"title:\n{title}\n{pr_body}") - return title, pr_body, pr_types, markdown_text \ No newline at end of file +def replace_code_tags(text): + """ + Replace odd instances of ` with <code> and even instances of ` with </code> + """ + parts = text.split('`') + for i in range(1, len(parts), 2): + parts[i] = '<code>' + parts[i] + '</code>' + return ''.join(parts) diff --git a/pr_agent/tools/pr_generate_labels.py b/pr_agent/tools/pr_generate_labels.py new file mode 100644 index 000000000..1911e0cd9 --- /dev/null +++ b/pr_agent/tools/pr_generate_labels.py @@ -0,0 +1,180 @@ +import copy +import re +from functools import partial +from typing import List, Tuple + +from jinja2 import Environment, StrictUndefined + +from pr_agent.algo.ai_handlers.base_ai_handler import BaseAiHandler +from pr_agent.algo.ai_handlers.litellm_ai_handler import LiteLLMAIHandler +from pr_agent.algo.pr_processing import get_pr_diff, retry_with_fallback_models +from pr_agent.algo.token_handler import TokenHandler +from pr_agent.algo.utils import load_yaml, set_custom_labels, get_user_labels +from pr_agent.config_loader import get_settings +from pr_agent.git_providers import get_git_provider +from pr_agent.git_providers.git_provider import get_main_pr_language +from pr_agent.log import get_logger + + +class PRGenerateLabels: + def __init__(self, pr_url: str, args: list = None, + ai_handler: partial[BaseAiHandler,] = LiteLLMAIHandler): + """ + Initialize the PRGenerateLabels object with the necessary attributes and objects for generating labels + corresponding to the PR using an AI model. 
+        Args:
+            pr_url (str): The URL of the pull request.
+            args (list, optional): List of arguments passed to the PRGenerateLabels class. Defaults to None.
+        """
+        # Initialize the git provider and main PR language
+        self.git_provider = get_git_provider()(pr_url)
+        self.main_pr_language = get_main_pr_language(
+            self.git_provider.get_languages(), self.git_provider.get_files()
+        )
+        self.pr_id = self.git_provider.get_pr_id()
+
+        # Initialize the AI handler
+        self.ai_handler = ai_handler()
+        self.ai_handler.main_pr_language = self.main_pr_language
+
+        # Initialize the variables dictionary
+        self.vars = {
+            "title": self.git_provider.pr.title,
+            "branch": self.git_provider.get_pr_branch(),
+            "description": self.git_provider.get_pr_description(full=False),
+            "language": self.main_pr_language,
+            "diff": "",  # empty diff for initial calculation
+            "extra_instructions": get_settings().pr_description.extra_instructions,
+            "commit_messages_str": self.git_provider.get_commit_messages(),
+            "enable_custom_labels": get_settings().config.enable_custom_labels,
+            "custom_labels_class": "",  # will be filled if necessary in 'set_custom_labels' function
+        }
+
+        # Initialize the token handler
+        self.token_handler = TokenHandler(
+            self.git_provider.pr,
+            self.vars,
+            get_settings().pr_custom_labels_prompt.system,
+            get_settings().pr_custom_labels_prompt.user,
+        )
+
+        # Initialize patches_diff and prediction attributes
+        self.patches_diff = None
+        self.prediction = None
+
+    async def run(self):
+        """
+        Generates PR labels using an AI model and publishes them to the PR.
+        """
+
+        try:
+            get_logger().info(f"Generating a PR labels {self.pr_id}")
+            if get_settings().config.publish_output:
+                self.git_provider.publish_comment("Preparing PR labels...", is_temporary=True)
+
+            await retry_with_fallback_models(self._prepare_prediction)
+
+            get_logger().info(f"Preparing answer {self.pr_id}")
+            if self.prediction:
+                self._prepare_data()
+            else:
+                return None
+
+            pr_labels = self._prepare_labels()
+
+            if get_settings().config.publish_output:
+                get_logger().info(f"Pushing labels {self.pr_id}")
+
+                current_labels = self.git_provider.get_pr_labels()
+                user_labels = get_user_labels(current_labels)
+                pr_labels = pr_labels + user_labels
+
+                if self.git_provider.is_supported("get_labels"):
+                    self.git_provider.publish_labels(pr_labels)
+                elif pr_labels:
+                    value = ', '.join(v for v in pr_labels)
+                    pr_labels_text = f"## PR Labels:\n{value}\n"
+                    self.git_provider.publish_comment(pr_labels_text, is_temporary=False)
+                self.git_provider.remove_initial_comment()
+        except Exception as e:
+            get_logger().error(f"Error generating PR labels {self.pr_id}: {e}")
+
+        return ""
+
+    async def _prepare_prediction(self, model: str) -> None:
+        """
+        Prepare the AI prediction for the PR labels based on the provided model.
+
+        Args:
+            model (str): The name of the model to be used for generating the prediction.
+
+        Returns:
+            None
+
+        Raises:
+            Any exceptions raised by the 'get_pr_diff' and '_get_prediction' functions.
+
+        """
+
+        get_logger().info(f"Getting PR diff {self.pr_id}")
+        self.patches_diff = get_pr_diff(self.git_provider, self.token_handler, model)
+        get_logger().info(f"Getting AI prediction {self.pr_id}")
+        self.prediction = await self._get_prediction(model)
+
+    async def _get_prediction(self, model: str) -> str:
+        """
+        Generate an AI prediction for the PR labels based on the provided model.
+
+        Args:
+            model (str): The name of the model to be used for generating the prediction.
+
+        Returns:
+            str: The generated AI prediction.
+ """ + variables = copy.deepcopy(self.vars) + variables["diff"] = self.patches_diff # update diff + + environment = Environment(undefined=StrictUndefined) + set_custom_labels(variables, self.git_provider) + self.variables = variables + + system_prompt = environment.from_string(get_settings().pr_custom_labels_prompt.system).render(self.variables) + user_prompt = environment.from_string(get_settings().pr_custom_labels_prompt.user).render(self.variables) + + response, finish_reason = await self.ai_handler.chat_completion( + model=model, + temperature=get_settings().config.temperature, + system=system_prompt, + user=user_prompt + ) + + return response + + def _prepare_data(self): + # Load the AI prediction data into a dictionary + self.data = load_yaml(self.prediction.strip()) + + + + def _prepare_labels(self) -> List[str]: + pr_types = [] + + # If the 'labels' key is present in the dictionary, split its value by comma and assign it to 'pr_types' + if 'labels' in self.data: + if type(self.data['labels']) == list: + pr_types = self.data['labels'] + elif type(self.data['labels']) == str: + pr_types = self.data['labels'].split(',') + pr_types = [label.strip() for label in pr_types] + + # convert lowercase labels to original case + try: + if "labels_minimal_to_labels_dict" in self.variables: + d: dict = self.variables["labels_minimal_to_labels_dict"] + for i, label_i in enumerate(pr_types): + if label_i in d: + pr_types[i] = d[label_i] + except Exception as e: + get_logger().error(f"Error converting labels to original case {self.pr_id}: {e}") + + return pr_types diff --git a/pr_agent/tools/pr_help_message.py b/pr_agent/tools/pr_help_message.py new file mode 100644 index 000000000..5c909ea68 --- /dev/null +++ b/pr_agent/tools/pr_help_message.py @@ -0,0 +1,360 @@ +import os +import traceback +import zipfile +import tempfile +import copy +from functools import partial + +from jinja2 import Environment, StrictUndefined + +from pr_agent.algo.ai_handlers.base_ai_handler import BaseAiHandler +from pr_agent.algo.ai_handlers.litellm_ai_handler import LiteLLMAIHandler +from pr_agent.algo.pr_processing import retry_with_fallback_models +from pr_agent.algo.token_handler import TokenHandler +from pr_agent.algo.utils import ModelType, load_yaml +from pr_agent.config_loader import get_settings +from pr_agent.git_providers import get_git_provider, GithubProvider, BitbucketServerProvider, \ + get_git_provider_with_context +from pr_agent.log import get_logger + + +def extract_header(snippet): + res = '' + lines = snippet.split('===Snippet content===')[0].split('\n') + highest_header = '' + highest_level = float('inf') + for line in lines[::-1]: + line = line.strip() + if line.startswith('Header '): + highest_header = line.split(': ')[1] + if highest_header: + res = f"#{highest_header.lower().replace(' ', '-')}" + return res + +class PRHelpMessage: + def __init__(self, pr_url: str, args=None, ai_handler: partial[BaseAiHandler,] = LiteLLMAIHandler): + self.git_provider = get_git_provider_with_context(pr_url) + self.ai_handler = ai_handler() + self.question_str = self.parse_args(args) + self.num_retrieved_snippets = get_settings().get('pr_help.num_retrieved_snippets', 5) + if self.question_str: + self.vars = { + "question": self.question_str, + "snippets": "", + } + self.token_handler = TokenHandler(None, + self.vars, + get_settings().pr_help_prompts.system, + get_settings().pr_help_prompts.user) + + async def _prepare_prediction(self, model: str): + try: + variables = copy.deepcopy(self.vars) + environment = 
Environment(undefined=StrictUndefined) + system_prompt = environment.from_string(get_settings().pr_help_prompts.system).render(variables) + user_prompt = environment.from_string(get_settings().pr_help_prompts.user).render(variables) + response, finish_reason = await self.ai_handler.chat_completion( + model=model, temperature=get_settings().config.temperature, system=system_prompt, user=user_prompt) + return response + except Exception as e: + get_logger().error(f"Error while preparing prediction: {e}") + return "" + + def parse_args(self, args): + if args and len(args) > 0: + question_str = " ".join(args) + else: + question_str = "" + return question_str + + def get_sim_results_from_s3_db(self, embeddings): + get_logger().info("Loading the S3 index...") + sim_results = [] + try: + from langchain_chroma import Chroma + from urllib import request + with tempfile.TemporaryDirectory() as temp_dir: + # Define the local file path within the temporary directory + local_file_path = os.path.join(temp_dir, 'chroma_db.zip') + + bucket = 'pr-agent' + file_name = 'chroma_db.zip' + s3_url = f'https://{bucket}.s3.amazonaws.com/{file_name}' + request.urlretrieve(s3_url, local_file_path) + + # # Download the file from S3 to the temporary directory + # s3 = boto3.client('s3') + # s3.download_file(bucket, file_name, local_file_path) + + # Extract the contents of the zip file + with zipfile.ZipFile(local_file_path, 'r') as zip_ref: + zip_ref.extractall(temp_dir) + + vectorstore = Chroma(persist_directory=temp_dir + "/chroma_db", + embedding_function=embeddings) + sim_results = vectorstore.similarity_search_with_score(self.question_str, k=self.num_retrieved_snippets) + except Exception as e: + get_logger().error(f"Error while getting sim from S3: {e}", + artifact={"traceback": traceback.format_exc()}) + return sim_results + + def get_sim_results_from_local_db(self, embeddings): + get_logger().info("Loading the local index...") + sim_results = [] + try: + from langchain_chroma import Chroma + get_logger().info("Loading the Chroma index...") + db_path = "./docs/chroma_db.zip" + if not os.path.exists(db_path): + db_path= "/app/docs/chroma_db.zip" + if not os.path.exists(db_path): + get_logger().error("Local db not found") + return sim_results + with tempfile.TemporaryDirectory() as temp_dir: + + # Extract the ZIP file + with zipfile.ZipFile(db_path, 'r') as zip_ref: + zip_ref.extractall(temp_dir) + + vectorstore = Chroma(persist_directory=temp_dir + "/chroma_db", + embedding_function=embeddings) + + # Do similarity search + sim_results = vectorstore.similarity_search_with_score(self.question_str, k=self.num_retrieved_snippets) + except Exception as e: + get_logger().error(f"Error while getting sim from local db: {e}", + artifact={"traceback": traceback.format_exc()}) + return sim_results + + def get_sim_results_from_pinecone_db(self, embeddings): + get_logger().info("Loading the Pinecone index...") + sim_results = [] + try: + from langchain_pinecone import PineconeVectorStore + INDEX_NAME = "pr-agent-docs" + vectorstore = PineconeVectorStore( + index_name=INDEX_NAME, embedding=embeddings, + pinecone_api_key=get_settings().pinecone.api_key + ) + + # Do similarity search + sim_results = vectorstore.similarity_search_with_score(self.question_str, k=self.num_retrieved_snippets) + except Exception as e: + get_logger().error(f"Error while getting sim from Pinecone db: {e}", + artifact={"traceback": traceback.format_exc()}) + return sim_results + + async def run(self): + try: + if self.question_str: + 
get_logger().info(f'Answering a PR question about the PR {self.git_provider.pr_url} ') + + if not get_settings().get('openai.key'): + if get_settings().config.publish_output: + self.git_provider.publish_comment( + "The `Help` tool chat feature requires an OpenAI API key for calculating embeddings") + else: + get_logger().error("The `Help` tool chat feature requires an OpenAI API key for calculating embeddings") + return + + # Initialize embeddings + from langchain_openai import OpenAIEmbeddings + embeddings = OpenAIEmbeddings(model="text-embedding-3-small", + api_key=get_settings().openai.key) + + # Get similar snippets via similarity search + if get_settings().pr_help.force_local_db: + sim_results = self.get_sim_results_from_local_db(embeddings) + elif get_settings().get('pinecone.api_key'): + sim_results = self.get_sim_results_from_pinecone_db(embeddings) + else: + sim_results = self.get_sim_results_from_s3_db(embeddings) + if not sim_results: + get_logger().info("Failed to load the S3 index. Loading the local index...") + sim_results = self.get_sim_results_from_local_db(embeddings) + if not sim_results: + get_logger().error("Failed to retrieve similar snippets. Exiting...") + return + + # Prepare relevant snippets + relevant_pages_full, relevant_snippets_full_header, relevant_snippets_str =\ + await self.prepare_relevant_snippets(sim_results) + self.vars['snippets'] = relevant_snippets_str.strip() + + # run the AI model + response = await retry_with_fallback_models(self._prepare_prediction, model_type=ModelType.REGULAR) + response_yaml = load_yaml(response) + response_str = response_yaml.get('response') + relevant_snippets_numbers = response_yaml.get('relevant_snippets') + + if not relevant_snippets_numbers: + get_logger().info(f"Could not find relevant snippets for the question: {self.question_str}") + if get_settings().config.publish_output: + answer_str = f"### Question: \n{self.question_str}\n\n" + answer_str += f"### Answer:\n\n" + answer_str += f"Could not find relevant information to answer the question. Please provide more details and try again." 
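+                        # No relevant snippets were retrieved; publish a fallback answer before returning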
+ self.git_provider.publish_comment(answer_str) + return "" + + # prepare the answer + answer_str = "" + if response_str: + answer_str += f"### Question: \n{self.question_str}\n\n" + answer_str += f"### Answer:\n{response_str.strip()}\n\n" + answer_str += f"#### Relevant Sources:\n\n" + paged_published = [] + for page in relevant_snippets_numbers: + page = int(page - 1) + if page < len(relevant_pages_full) and page >= 0: + if relevant_pages_full[page] in paged_published: + continue + link = f"{relevant_pages_full[page]}{relevant_snippets_full_header[page]}" + # answer_str += f"> - [{relevant_pages_full[page]}]({link})\n" + answer_str += f"> - {link}\n" + paged_published.append(relevant_pages_full[page]) + + # publish the answer + if get_settings().config.publish_output: + self.git_provider.publish_comment(answer_str) + else: + get_logger().info(f"Answer:\n{answer_str}") + else: + if not isinstance(self.git_provider, BitbucketServerProvider) and not self.git_provider.is_supported("gfm_markdown"): + self.git_provider.publish_comment( + "The `Help` tool requires gfm markdown, which is not supported by your code platform.") + return + + get_logger().info('Getting PR Help Message...') + relevant_configs = {'pr_help': dict(get_settings().pr_help), + 'config': dict(get_settings().config)} + get_logger().debug("Relevant configs", artifacts=relevant_configs) + pr_comment = "## PR Agent Walkthrough ๐Ÿค–\n\n" + pr_comment += "Welcome to the PR Agent, an AI-powered tool for automated pull request analysis, feedback, suggestions and more.""" + pr_comment += "\n\nHere is a list of tools you can use to interact with the PR Agent:\n" + base_path = "https://pr-agent-docs.codium.ai/tools" + + tool_names = [] + tool_names.append(f"[DESCRIBE]({base_path}/describe/)") + tool_names.append(f"[REVIEW]({base_path}/review/)") + tool_names.append(f"[IMPROVE]({base_path}/improve/)") + tool_names.append(f"[UPDATE CHANGELOG]({base_path}/update_changelog/)") + tool_names.append(f"[ADD DOCS]({base_path}/documentation/) ๐Ÿ’Ž") + tool_names.append(f"[TEST]({base_path}/test/) ๐Ÿ’Ž") + tool_names.append(f"[IMPROVE COMPONENT]({base_path}/improve_component/) ๐Ÿ’Ž") + tool_names.append(f"[ANALYZE]({base_path}/analyze/) ๐Ÿ’Ž") + tool_names.append(f"[ASK]({base_path}/ask/)") + tool_names.append(f"[GENERATE CUSTOM LABELS]({base_path}/custom_labels/) ๐Ÿ’Ž") + tool_names.append(f"[CI FEEDBACK]({base_path}/ci_feedback/) ๐Ÿ’Ž") + tool_names.append(f"[CUSTOM PROMPT]({base_path}/custom_prompt/) ๐Ÿ’Ž") + tool_names.append(f"[SIMILAR ISSUE]({base_path}/similar_issues/)") + + descriptions = [] + descriptions.append("Generates PR description - title, type, summary, code walkthrough and labels") + descriptions.append("Adjustable feedback about the PR, possible issues, security concerns, review effort and more") + descriptions.append("Code suggestions for improving the PR") + descriptions.append("Automatically updates the changelog") + descriptions.append("Generates documentation to methods/functions/classes that changed in the PR") + descriptions.append("Generates unit tests for a specific component, based on the PR code change") + descriptions.append("Code suggestions for a specific component that changed in the PR") + descriptions.append("Identifies code components that changed in the PR, and enables to interactively generate tests, docs, and code suggestions for each component") + descriptions.append("Answering free-text questions about the PR") + descriptions.append("Generates custom labels for the PR, based on specific guidelines 
defined by the user") + descriptions.append("Generates feedback and analysis for a failed CI job") + descriptions.append("Generates custom suggestions for improving the PR code, derived only from a specific guidelines prompt defined by the user") + descriptions.append("Automatically retrieves and presents similar issues") + + commands =[] + commands.append("`/describe`") + commands.append("`/review`") + commands.append("`/improve`") + commands.append("`/update_changelog`") + commands.append("`/add_docs`") + commands.append("`/test`") + commands.append("`/improve_component`") + commands.append("`/analyze`") + commands.append("`/ask`") + commands.append("`/generate_labels`") + commands.append("`/checks`") + commands.append("`/custom_prompt`") + commands.append("`/similar_issue`") + + checkbox_list = [] + checkbox_list.append(" - [ ] Run <!-- /describe -->") + checkbox_list.append(" - [ ] Run <!-- /review -->") + checkbox_list.append(" - [ ] Run <!-- /improve -->") + checkbox_list.append(" - [ ] Run <!-- /update_changelog -->") + checkbox_list.append(" - [ ] Run <!-- /add_docs -->") + checkbox_list.append(" - [ ] Run <!-- /test -->") + checkbox_list.append(" - [ ] Run <!-- /improve_component -->") + checkbox_list.append(" - [ ] Run <!-- /analyze -->") + checkbox_list.append("[*]") + checkbox_list.append("[*]") + checkbox_list.append("[*]") + checkbox_list.append("[*]") + checkbox_list.append("[*]") + checkbox_list.append("[*]") + checkbox_list.append("[*]") + checkbox_list.append("[*]") + + if isinstance(self.git_provider, GithubProvider) and not get_settings().config.get('disable_checkboxes', False): + pr_comment += f"<table><tr align='left'><th align='left'>Tool</th><th align='left'>Description</th><th align='left'>Trigger Interactively :gem:</th></tr>" + for i in range(len(tool_names)): + pr_comment += f"\n<tr><td align='left'>\n\n<strong>{tool_names[i]}</strong></td>\n<td>{descriptions[i]}</td>\n<td>\n\n{checkbox_list[i]}\n</td></tr>" + pr_comment += "</table>\n\n" + pr_comment += f"""\n\n(1) Note that each tool be [triggered automatically](https://pr-agent-docs.codium.ai/usage-guide/automations_and_usage/#github-app-automatic-tools-when-a-new-pr-is-opened) when a new PR is opened, or called manually by [commenting on a PR](https://pr-agent-docs.codium.ai/usage-guide/automations_and_usage/#online-usage).""" + pr_comment += f"""\n\n(2) Tools marked with [*] require additional parameters to be passed. For example, to invoke the `/ask` tool, you need to comment on a PR: `/ask "<question content>"`. 
See the relevant documentation for each tool for more details.""" + elif isinstance(self.git_provider, BitbucketServerProvider): + # only support basic commands in BBDC + pr_comment = generate_bbdc_table(tool_names[:4], descriptions[:4]) + else: + pr_comment += f"<table><tr align='left'><th align='left'>Tool</th><th align='left'>Command</th><th align='left'>Description</th></tr>" + for i in range(len(tool_names)): + pr_comment += f"\n<tr><td align='left'>\n\n<strong>{tool_names[i]}</strong></td><td>{commands[i]}</td><td>{descriptions[i]}</td></tr>" + pr_comment += "</table>\n\n" + pr_comment += f"""\n\nNote that each tool be [invoked automatically](https://pr-agent-docs.codium.ai/usage-guide/automations_and_usage/) when a new PR is opened, or called manually by [commenting on a PR](https://pr-agent-docs.codium.ai/usage-guide/automations_and_usage/#online-usage).""" + + if get_settings().config.publish_output: + self.git_provider.publish_comment(pr_comment) + except Exception as e: + get_logger().exception(f"Error while running PRHelpMessage: {e}") + return "" + + async def prepare_relevant_snippets(self, sim_results): + # Get relevant snippets + relevant_snippets_full = [] + relevant_pages_full = [] + relevant_snippets_full_header = [] + th = 0.75 + for s in sim_results: + page = s[0].metadata['source'] + content = s[0].page_content + score = s[1] + relevant_snippets_full.append(content) + relevant_snippets_full_header.append(extract_header(content)) + relevant_pages_full.append(page) + # build the snippets string + relevant_snippets_str = "" + for i, s in enumerate(relevant_snippets_full): + relevant_snippets_str += f"Snippet {i+1}:\n\n{s}\n\n" + relevant_snippets_str += "-------------------\n\n" + return relevant_pages_full, relevant_snippets_full_header, relevant_snippets_str + + +def generate_bbdc_table(column_arr_1, column_arr_2): + # Generating header row + header_row = "| Tool | Description | \n" + + # Generating separator row + separator_row = "|--|--|\n" + + # Generating data rows + data_rows = "" + max_len = max(len(column_arr_1), len(column_arr_2)) + for i in range(max_len): + col1 = column_arr_1[i] if i < len(column_arr_1) else "" + col2 = column_arr_2[i] if i < len(column_arr_2) else "" + data_rows += f"| {col1} | {col2} |\n" + + # Combine all parts to form the complete table + markdown_table = header_row + separator_row + data_rows + return markdown_table diff --git a/pr_agent/tools/pr_information_from_user.py b/pr_agent/tools/pr_information_from_user.py index c049250fd..e5bd2f727 100644 --- a/pr_agent/tools/pr_information_from_user.py +++ b/pr_agent/tools/pr_information_from_user.py @@ -1,23 +1,28 @@ import copy -import logging +from functools import partial from jinja2 import Environment, StrictUndefined -from pr_agent.algo.ai_handler import AiHandler +from pr_agent.algo.ai_handlers.base_ai_handler import BaseAiHandler +from pr_agent.algo.ai_handlers.litellm_ai_handler import LiteLLMAIHandler from pr_agent.algo.pr_processing import get_pr_diff, retry_with_fallback_models from pr_agent.algo.token_handler import TokenHandler from pr_agent.config_loader import get_settings from pr_agent.git_providers import get_git_provider from pr_agent.git_providers.git_provider import get_main_pr_language +from pr_agent.log import get_logger class PRInformationFromUser: - def __init__(self, pr_url: str, args: list = None): + def __init__(self, pr_url: str, args: list = None, + ai_handler: partial[BaseAiHandler,] = LiteLLMAIHandler): self.git_provider = get_git_provider()(pr_url) 
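+        # Detect the main PR language from the repository languages and the changed files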
self.main_pr_language = get_main_pr_language( self.git_provider.get_languages(), self.git_provider.get_files() ) - self.ai_handler = AiHandler() + self.ai_handler = ai_handler() + self.ai_handler.main_pr_language = self.main_pr_language + self.vars = { "title": self.git_provider.pr.title, "branch": self.git_provider.get_pr_branch(), @@ -34,22 +39,22 @@ def __init__(self, pr_url: str, args: list = None): self.prediction = None async def run(self): - logging.info('Generating question to the user...') + get_logger().info('Generating question to the user...') if get_settings().config.publish_output: self.git_provider.publish_comment("Preparing questions...", is_temporary=True) await retry_with_fallback_models(self._prepare_prediction) - logging.info('Preparing questions...') + get_logger().info('Preparing questions...') pr_comment = self._prepare_pr_answer() if get_settings().config.publish_output: - logging.info('Pushing questions...') + get_logger().info('Pushing questions...') self.git_provider.publish_comment(pr_comment) self.git_provider.remove_initial_comment() return "" async def _prepare_prediction(self, model): - logging.info('Getting PR diff...') + get_logger().info('Getting PR diff...') self.patches_diff = get_pr_diff(self.git_provider, self.token_handler, model) - logging.info('Getting AI prediction...') + get_logger().info('Getting AI prediction...') self.prediction = await self._get_prediction(model) async def _get_prediction(self, model: str): @@ -59,16 +64,16 @@ async def _get_prediction(self, model: str): system_prompt = environment.from_string(get_settings().pr_information_from_user_prompt.system).render(variables) user_prompt = environment.from_string(get_settings().pr_information_from_user_prompt.user).render(variables) if get_settings().config.verbosity_level >= 2: - logging.info(f"\nSystem prompt:\n{system_prompt}") - logging.info(f"\nUser prompt:\n{user_prompt}") - response, finish_reason = await self.ai_handler.chat_completion(model=model, temperature=0.2, - system=system_prompt, user=user_prompt) + get_logger().info(f"\nSystem prompt:\n{system_prompt}") + get_logger().info(f"\nUser prompt:\n{user_prompt}") + response, finish_reason = await self.ai_handler.chat_completion( + model=model, temperature=get_settings().config.temperature, system=system_prompt, user=user_prompt) return response def _prepare_pr_answer(self) -> str: model_output = self.prediction.strip() if get_settings().config.verbosity_level >= 2: - logging.info(f"answer_str:\n{model_output}") + get_logger().info(f"answer_str:\n{model_output}") answer_str = f"{model_output}\n\n Please respond to the questions above in the following format:\n\n" +\ "\n>/answer\n>1) ...\n>2) ...\n>...\n" return answer_str diff --git a/pr_agent/tools/pr_line_questions.py b/pr_agent/tools/pr_line_questions.py new file mode 100644 index 000000000..56818e1cd --- /dev/null +++ b/pr_agent/tools/pr_line_questions.py @@ -0,0 +1,107 @@ +import argparse +import copy +from functools import partial + +from jinja2 import Environment, StrictUndefined + +from pr_agent.algo.ai_handlers.base_ai_handler import BaseAiHandler +from pr_agent.algo.ai_handlers.litellm_ai_handler import LiteLLMAIHandler +from pr_agent.algo.git_patch_processing import convert_to_hunks_with_lines_numbers, \ + extract_hunk_lines_from_patch +from pr_agent.algo.pr_processing import get_pr_diff, retry_with_fallback_models +from pr_agent.algo.token_handler import TokenHandler +from pr_agent.algo.utils import ModelType +from pr_agent.config_loader import get_settings +from 
pr_agent.git_providers import get_git_provider +from pr_agent.git_providers.git_provider import get_main_pr_language +from pr_agent.log import get_logger +from pr_agent.servers.help import HelpMessage + + +class PR_LineQuestions: + def __init__(self, pr_url: str, args=None, ai_handler: partial[BaseAiHandler,] = LiteLLMAIHandler): + self.question_str = self.parse_args(args) + self.git_provider = get_git_provider()(pr_url) + self.main_pr_language = get_main_pr_language( + self.git_provider.get_languages(), self.git_provider.get_files() + ) + self.ai_handler = ai_handler() + self.ai_handler.main_pr_language = self.main_pr_language + + self.vars = { + "title": self.git_provider.pr.title, + "branch": self.git_provider.get_pr_branch(), + "diff": "", # empty diff for initial calculation + "question": self.question_str, + "full_hunk": "", + "selected_lines": "", + } + self.token_handler = TokenHandler(self.git_provider.pr, + self.vars, + get_settings().pr_line_questions_prompt.system, + get_settings().pr_line_questions_prompt.user) + self.patches_diff = None + self.prediction = None + + def parse_args(self, args): + if args and len(args) > 0: + question_str = " ".join(args) + else: + question_str = "" + return question_str + + + async def run(self): + get_logger().info('Answering a PR lines question...') + # if get_settings().config.publish_output: + # self.git_provider.publish_comment("Preparing answer...", is_temporary=True) + + self.patch_with_lines = "" + ask_diff = get_settings().get('ask_diff_hunk', "") + line_start = get_settings().get('line_start', '') + line_end = get_settings().get('line_end', '') + side = get_settings().get('side', 'RIGHT') + file_name = get_settings().get('file_name', '') + comment_id = get_settings().get('comment_id', '') + if ask_diff: + self.patch_with_lines, self.selected_lines = extract_hunk_lines_from_patch(ask_diff, + file_name, + line_start=line_start, + line_end=line_end, + side=side + ) + else: + diff_files = self.git_provider.get_diff_files() + for file in diff_files: + if file.filename == file_name: + self.patch_with_lines, self.selected_lines = extract_hunk_lines_from_patch(file.patch, file.filename, + line_start=line_start, + line_end=line_end, + side=side) + if self.patch_with_lines: + response = await retry_with_fallback_models(self._get_prediction, model_type=ModelType.TURBO) + + get_logger().info('Preparing answer...') + if comment_id: + self.git_provider.reply_to_comment_from_comment_id(comment_id, response) + else: + self.git_provider.publish_comment(response) + + return "" + + async def _get_prediction(self, model: str): + variables = copy.deepcopy(self.vars) + variables["full_hunk"] = self.patch_with_lines # update diff + variables["selected_lines"] = self.selected_lines + environment = Environment(undefined=StrictUndefined) + system_prompt = environment.from_string(get_settings().pr_line_questions_prompt.system).render(variables) + user_prompt = environment.from_string(get_settings().pr_line_questions_prompt.user).render(variables) + if get_settings().config.verbosity_level >= 2: + # get_logger().info(f"\nSystem prompt:\n{system_prompt}") + # get_logger().info(f"\nUser prompt:\n{user_prompt}") + print(f"\nSystem prompt:\n{system_prompt}") + print(f"\nUser prompt:\n{user_prompt}") + + response, finish_reason = await self.ai_handler.chat_completion( + model=model, temperature=get_settings().config.temperature, system=system_prompt, user=user_prompt) + return response diff --git a/pr_agent/tools/pr_questions.py b/pr_agent/tools/pr_questions.py index 
959bebe71..8112510ea 100644 --- a/pr_agent/tools/pr_questions.py +++ b/pr_agent/tools/pr_questions.py @@ -1,24 +1,31 @@ import copy -import logging +from functools import partial from jinja2 import Environment, StrictUndefined -from pr_agent.algo.ai_handler import AiHandler +from pr_agent.algo.ai_handlers.base_ai_handler import BaseAiHandler +from pr_agent.algo.ai_handlers.litellm_ai_handler import LiteLLMAIHandler from pr_agent.algo.pr_processing import get_pr_diff, retry_with_fallback_models from pr_agent.algo.token_handler import TokenHandler +from pr_agent.algo.utils import ModelType from pr_agent.config_loader import get_settings from pr_agent.git_providers import get_git_provider from pr_agent.git_providers.git_provider import get_main_pr_language +from pr_agent.log import get_logger +from pr_agent.servers.help import HelpMessage class PRQuestions: - def __init__(self, pr_url: str, args=None): + def __init__(self, pr_url: str, args=None, ai_handler: partial[BaseAiHandler,] = LiteLLMAIHandler): question_str = self.parse_args(args) + self.pr_url = pr_url self.git_provider = get_git_provider()(pr_url) self.main_pr_language = get_main_pr_language( self.git_provider.get_languages(), self.git_provider.get_files() ) - self.ai_handler = AiHandler() + self.ai_handler = ai_handler() + self.ai_handler.main_pr_language = self.main_pr_language + self.question_str = question_str self.vars = { "title": self.git_provider.pr.title, @@ -44,23 +51,54 @@ def parse_args(self, args): return question_str async def run(self): - logging.info('Answering a PR question...') + get_logger().info(f'Answering a PR question about the PR {self.pr_url} ') + relevant_configs = {'pr_questions': dict(get_settings().pr_questions), + 'config': dict(get_settings().config)} + get_logger().debug("Relevant configs", artifacts=relevant_configs) if get_settings().config.publish_output: self.git_provider.publish_comment("Preparing answer...", is_temporary=True) - await retry_with_fallback_models(self._prepare_prediction) - logging.info('Preparing answer...') + + # identify image + img_path = self.identify_image_in_comment() + if img_path: + get_logger().debug(f"Image path identified", artifact=img_path) + + await retry_with_fallback_models(self._prepare_prediction, model_type=ModelType.TURBO) + pr_comment = self._prepare_pr_answer() + get_logger().debug(f"PR output", artifact=pr_comment) + + if self.git_provider.is_supported("gfm_markdown") and get_settings().pr_questions.enable_help_text: + pr_comment += "<hr>\n\n<details> <summary><strong>๐Ÿ’ก Tool usage guide:</strong></summary><hr> \n\n" + pr_comment += HelpMessage.get_ask_usage_guide() + pr_comment += "\n</details>\n" + if get_settings().config.publish_output: - logging.info('Pushing answer...') self.git_provider.publish_comment(pr_comment) self.git_provider.remove_initial_comment() return "" + def identify_image_in_comment(self): + img_path = '' + if '![image]' in self.question_str: + # assuming structure: + # /ask question ... 
> ![image](img_path) + img_path = self.question_str.split('![image]')[1].strip().strip('()') + self.vars['img_path'] = img_path + elif 'https://' in self.question_str and ('.png' in self.question_str or 'jpg' in self.question_str): # direct image link + # include https:// in the image path + img_path = 'https://' + self.question_str.split('https://')[1] + self.vars['img_path'] = img_path + return img_path + async def _prepare_prediction(self, model: str): - logging.info('Getting PR diff...') self.patches_diff = get_pr_diff(self.git_provider, self.token_handler, model) - logging.info('Getting AI prediction...') - self.prediction = await self._get_prediction(model) + if self.patches_diff: + get_logger().debug(f"PR diff", artifact=self.patches_diff) + self.prediction = await self._get_prediction(model) + else: + get_logger().error(f"Error getting PR diff") + self.prediction = "" async def _get_prediction(self, model: str): variables = copy.deepcopy(self.vars) @@ -68,16 +106,17 @@ async def _get_prediction(self, model: str): environment = Environment(undefined=StrictUndefined) system_prompt = environment.from_string(get_settings().pr_questions_prompt.system).render(variables) user_prompt = environment.from_string(get_settings().pr_questions_prompt.user).render(variables) - if get_settings().config.verbosity_level >= 2: - logging.info(f"\nSystem prompt:\n{system_prompt}") - logging.info(f"\nUser prompt:\n{user_prompt}") - response, finish_reason = await self.ai_handler.chat_completion(model=model, temperature=0.2, - system=system_prompt, user=user_prompt) + if 'img_path' in variables: + img_path = self.vars['img_path'] + response, finish_reason = await (self.ai_handler.chat_completion + (model=model, temperature=get_settings().config.temperature, + system=system_prompt, user=user_prompt, img_path=img_path)) + else: + response, finish_reason = await self.ai_handler.chat_completion( + model=model, temperature=get_settings().config.temperature, system=system_prompt, user=user_prompt) return response def _prepare_pr_answer(self) -> str: - answer_str = f"Question: {self.question_str}\n\n" - answer_str += f"Answer:\n{self.prediction.strip()}\n\n" - if get_settings().config.verbosity_level >= 2: - logging.info(f"answer_str:\n{answer_str}") + answer_str = f"### **Ask**โ“\n{self.question_str}\n\n" + answer_str += f"### **Answer:**\n{self.prediction.strip()}\n\n" return answer_str diff --git a/pr_agent/tools/pr_reviewer.py b/pr_agent/tools/pr_reviewer.py index fd6479aef..88799d987 100644 --- a/pr_agent/tools/pr_reviewer.py +++ b/pr_agent/tools/pr_reviewer.py @@ -1,68 +1,89 @@ import copy -import json -import logging +import datetime from collections import OrderedDict +from functools import partial from typing import List, Tuple - -import yaml from jinja2 import Environment, StrictUndefined -from yaml import SafeLoader - -from pr_agent.algo.ai_handler import AiHandler -from pr_agent.algo.pr_processing import get_pr_diff, retry_with_fallback_models, \ - find_line_number_of_relevant_line_in_file, clip_tokens +from pr_agent.algo.ai_handlers.base_ai_handler import BaseAiHandler +from pr_agent.algo.ai_handlers.litellm_ai_handler import LiteLLMAIHandler +from pr_agent.algo.pr_processing import get_pr_diff, retry_with_fallback_models, add_ai_metadata_to_diff_files from pr_agent.algo.token_handler import TokenHandler -from pr_agent.algo.utils import convert_to_markdown, try_fix_json, try_fix_yaml, load_yaml +from pr_agent.algo.utils import github_action_output, load_yaml, ModelType, \ + 
show_relevant_configurations, convert_to_markdown_v2, PRReviewHeader from pr_agent.config_loader import get_settings -from pr_agent.git_providers import get_git_provider +from pr_agent.git_providers import get_git_provider, get_git_provider_with_context from pr_agent.git_providers.git_provider import IncrementalPR, get_main_pr_language -from pr_agent.servers.help import actions_help_text, bot_help_text +from pr_agent.log import get_logger +from pr_agent.servers.help import HelpMessage class PRReviewer: """ The PRReviewer class is responsible for reviewing a pull request and generating feedback using an AI model. """ - def __init__(self, pr_url: str, is_answer: bool = False, args: list = None): + + def __init__(self, pr_url: str, is_answer: bool = False, is_auto: bool = False, args: list = None, + ai_handler: partial[BaseAiHandler,] = LiteLLMAIHandler): """ Initialize the PRReviewer object with the necessary attributes and objects to review a pull request. Args: pr_url (str): The URL of the pull request to be reviewed. is_answer (bool, optional): Indicates whether the review is being done in answer mode. Defaults to False. + is_auto (bool, optional): Indicates whether the review is being done in automatic mode. Defaults to False. + ai_handler (BaseAiHandler): The AI handler to be used for the review. Defaults to None. args (list, optional): List of arguments passed to the PRReviewer class. Defaults to None. """ - self.parse_args(args) # -i command + self.git_provider = get_git_provider_with_context(pr_url) + self.args = args + self.incremental = self.parse_incremental(args) # -i command + if self.incremental and self.incremental.is_incremental: + self.git_provider.get_incremental_commits(self.incremental) - self.git_provider = get_git_provider()(pr_url, incremental=self.incremental) self.main_language = get_main_pr_language( self.git_provider.get_languages(), self.git_provider.get_files() ) self.pr_url = pr_url self.is_answer = is_answer + self.is_auto = is_auto if self.is_answer and not self.git_provider.is_supported("get_issue_comments"): raise Exception(f"Answer mode is not supported for {get_settings().config.git_provider} for now") - self.ai_handler = AiHandler() + self.ai_handler = ai_handler() + self.ai_handler.main_pr_language = self.main_language self.patches_diff = None self.prediction = None - answer_str, question_str = self._get_user_answers() + self.pr_description, self.pr_description_files = ( + self.git_provider.get_pr_description(split_changes_walkthrough=True)) + if (self.pr_description_files and get_settings().get("config.is_auto_command", False) and + get_settings().get("config.enable_ai_metadata", False)): + add_ai_metadata_to_diff_files(self.git_provider, self.pr_description_files) + get_logger().debug(f"AI metadata added to the this command") + else: + get_settings().set("config.enable_ai_metadata", False) + get_logger().debug(f"AI metadata is disabled for this command") + self.vars = { "title": self.git_provider.pr.title, "branch": self.git_provider.get_pr_branch(), - "description": self.git_provider.get_pr_description(), + "description": self.pr_description, "language": self.main_language, "diff": "", # empty diff for initial calculation + "num_pr_files": self.git_provider.get_num_of_files(), "require_score": get_settings().pr_reviewer.require_score_review, "require_tests": get_settings().pr_reviewer.require_tests_review, - "require_security": get_settings().pr_reviewer.require_security_review, - "require_focused": get_settings().pr_reviewer.require_focused_review, + 
"require_estimate_effort_to_review": get_settings().pr_reviewer.require_estimate_effort_to_review, + 'require_can_be_split_review': get_settings().pr_reviewer.require_can_be_split_review, + 'require_security_review': get_settings().pr_reviewer.require_security_review, 'num_code_suggestions': get_settings().pr_reviewer.num_code_suggestions, 'question_str': question_str, 'answer_str': answer_str, "extra_instructions": get_settings().pr_reviewer.extra_instructions, "commit_messages_str": self.git_provider.get_commit_messages(), + "custom_labels": "", + "enable_custom_labels": get_settings().config.enable_custom_labels, + "is_ai_metadata": get_settings().get("config.enable_ai_metadata", False), } self.token_handler = TokenHandler( @@ -72,60 +93,85 @@ def __init__(self, pr_url: str, is_answer: bool = False, args: list = None): get_settings().pr_review_prompt.user ) - def parse_args(self, args: List[str]) -> None: - """ - Parse the arguments passed to the PRReviewer class and set the 'incremental' attribute accordingly. - - Args: - args: A list of arguments passed to the PRReviewer class. - - Returns: - None - """ + def parse_incremental(self, args: List[str]): is_incremental = False if args and len(args) >= 1: arg = args[0] if arg == "-i": is_incremental = True - self.incremental = IncrementalPR(is_incremental) + incremental = IncrementalPR(is_incremental) + return incremental async def run(self) -> None: - """ - Review the pull request and generate feedback. - """ - logging.info('Reviewing PR...') - - if get_settings().config.publish_output: - self.git_provider.publish_comment("Preparing review...", is_temporary=True) - - await retry_with_fallback_models(self._prepare_prediction) - - logging.info('Preparing PR review...') - pr_comment = self._prepare_pr_review() - - if get_settings().config.publish_output: - logging.info('Pushing PR review...') - self.git_provider.publish_comment(pr_comment) - self.git_provider.remove_initial_comment() - - if get_settings().pr_reviewer.inline_code_comments: - logging.info('Pushing inline code comments...') - self._publish_inline_code_comments() + try: + if not self.git_provider.get_files(): + get_logger().info(f"PR has no files: {self.pr_url}, skipping review") + return None + + if self.incremental.is_incremental and not self._can_run_incremental_review(): + return None + + if isinstance(self.args, list) and self.args and self.args[0] == 'auto_approve': + get_logger().info(f'Auto approve flow PR: {self.pr_url} ...') + self.auto_approve_logic() + return None + + get_logger().info(f'Reviewing PR: {self.pr_url} ...') + relevant_configs = {'pr_reviewer': dict(get_settings().pr_reviewer), + 'config': dict(get_settings().config)} + get_logger().debug("Relevant configs", artifacts=relevant_configs) + + if self.incremental.is_incremental and hasattr(self.git_provider, "unreviewed_files_set") and not self.git_provider.unreviewed_files_set: + get_logger().info(f"Incremental review is enabled for {self.pr_url} but there are no new files") + previous_review_url = "" + if hasattr(self.git_provider, "previous_review"): + previous_review_url = self.git_provider.previous_review.html_url + if get_settings().config.publish_output: + self.git_provider.publish_comment(f"Incremental Review Skipped\n" + f"No files were changed since the [previous PR Review]({previous_review_url})") + return None + + if get_settings().config.publish_output and not get_settings().config.get('is_auto_command', False): + self.git_provider.publish_comment("Preparing review...", is_temporary=True) + + 
await retry_with_fallback_models(self._prepare_prediction) + if not self.prediction: + self.git_provider.remove_initial_comment() + return None + + pr_review = self._prepare_pr_review() + get_logger().debug(f"PR output", artifact=pr_review) + + if get_settings().config.publish_output: + # publish the review + if get_settings().pr_reviewer.persistent_comment and not self.incremental.is_incremental: + final_update_message = get_settings().pr_reviewer.final_update_message + self.git_provider.publish_persistent_comment(pr_review, + initial_header=f"{PRReviewHeader.REGULAR.value} ๐Ÿ”", + update_header=True, + final_update_message=final_update_message, ) + else: + self.git_provider.publish_comment(pr_review) + + self.git_provider.remove_initial_comment() + if get_settings().pr_reviewer.inline_code_comments: + self._publish_inline_code_comments() + except Exception as e: + get_logger().error(f"Failed to review PR: {e}") async def _prepare_prediction(self, model: str) -> None: - """ - Prepare the AI prediction for the pull request review. - - Args: - model: A string representing the AI model to be used for the prediction. - - Returns: - None - """ - logging.info('Getting PR diff...') - self.patches_diff = get_pr_diff(self.git_provider, self.token_handler, model) - logging.info('Getting AI prediction...') - self.prediction = await self._get_prediction(model) + self.patches_diff = get_pr_diff(self.git_provider, + self.token_handler, + model, + add_line_numbers_to_hunks=True, + disable_extra_lines=False,) + + if self.patches_diff: + get_logger().debug(f"PR diff", diff=self.patches_diff) + self.prediction = await self._get_prediction(model) + else: + get_logger().warning(f"Empty diff for PR: {self.pr_url}") + self.prediction = None async def _get_prediction(self, model: str) -> str: """ @@ -144,13 +190,9 @@ async def _get_prediction(self, model: str) -> str: system_prompt = environment.from_string(get_settings().pr_review_prompt.system).render(variables) user_prompt = environment.from_string(get_settings().pr_review_prompt.user).render(variables) - if get_settings().config.verbosity_level >= 2: - logging.info(f"\nSystem prompt:\n{system_prompt}") - logging.info(f"\nUser prompt:\n{user_prompt}") - response, finish_reason = await self.ai_handler.chat_completion( model=model, - temperature=0.2, + temperature=get_settings().config.temperature, system=system_prompt, user=user_prompt ) @@ -162,67 +204,68 @@ def _prepare_pr_review(self) -> str: Prepare the PR review by processing the AI prediction and generating a markdown-formatted text that summarizes the feedback. 
""" - data = load_yaml(self.prediction.strip()) - - # Move 'Security concerns' key to 'PR Analysis' section for better display - pr_feedback = data.get('PR Feedback', {}) - security_concerns = pr_feedback.get('Security concerns') - if security_concerns is not None: - del pr_feedback['Security concerns'] - if type(security_concerns) == bool and security_concerns == False: - data.setdefault('PR Analysis', {})['Security concerns'] = 'No security concerns found' - else: - data.setdefault('PR Analysis', {})['Security concerns'] = security_concerns - - # - if 'Code feedback' in pr_feedback: - code_feedback = pr_feedback['Code feedback'] + first_key = 'review' + last_key = 'security_concerns' + data = load_yaml(self.prediction.strip(), + keys_fix_yaml=["estimated_effort_to_review_[1-5]:", "security_concerns:", "key_issues_to_review:", + "relevant_file:", "relevant_line:", "suggestion:"], + first_key=first_key, last_key=last_key) + github_action_output(data, 'review') + + # move data['review'] 'key_issues_to_review' key to the end of the dictionary + if 'key_issues_to_review' in data['review']: + key_issues_to_review = data['review'].pop('key_issues_to_review') + data['review']['key_issues_to_review'] = key_issues_to_review + + if 'code_feedback' in data: + code_feedback = data['code_feedback'] # Filter out code suggestions that can be submitted as inline comments if get_settings().pr_reviewer.inline_code_comments: - del pr_feedback['Code feedback'] + del data['code_feedback'] else: for suggestion in code_feedback: - if ('relevant file' in suggestion) and (not suggestion['relevant file'].startswith('``')): - suggestion['relevant file'] = f"``{suggestion['relevant file']}``" + if ('relevant_file' in suggestion) and (not suggestion['relevant_file'].startswith('``')): + suggestion['relevant_file'] = f"``{suggestion['relevant_file']}``" - if 'relevant line' not in suggestion: - suggestion['relevant line'] = '' + if 'relevant_line' not in suggestion: + suggestion['relevant_line'] = '' - relevant_line_str = suggestion['relevant line'].split('\n')[0] + relevant_line_str = suggestion['relevant_line'].split('\n')[0] # removing '+' - suggestion['relevant line'] = relevant_line_str.lstrip('+').strip() + suggestion['relevant_line'] = relevant_line_str.lstrip('+').strip() # try to add line numbers link to code suggestions if hasattr(self.git_provider, 'generate_link_to_relevant_line_number'): link = self.git_provider.generate_link_to_relevant_line_number(suggestion) if link: - suggestion['relevant line'] = f"[{suggestion['relevant line']}]({link})" + suggestion['relevant_line'] = f"[{suggestion['relevant_line']}]({link})" + else: + pass + incremental_review_markdown_text = None # Add incremental review section if self.incremental.is_incremental: last_commit_url = f"{self.git_provider.get_pr_url()}/commits/" \ f"{self.git_provider.incremental.first_new_commit_sha}" - data = OrderedDict(data) - data.update({'Incremental PR Review': { - "โฎ๏ธ Review for commits since previous PR-Agent review": f"Starting from commit {last_commit_url}"}}) - data.move_to_end('Incremental PR Review', last=False) - - markdown_text = convert_to_markdown(data) - user = self.git_provider.get_user_id() - - # Add help text if not in CLI mode - if not get_settings().get("CONFIG.CLI_MODE", False): - markdown_text += "\n### How to use\n" - if user and '[bot]' not in user: - markdown_text += bot_help_text(user) - else: - markdown_text += actions_help_text + incremental_review_markdown_text = f"Starting from commit {last_commit_url}" + + 
markdown_text = convert_to_markdown_v2(data, self.git_provider.is_supported("gfm_markdown"), + incremental_review_markdown_text, git_provider=self.git_provider) + + # Add help text if gfm_markdown is supported + if self.git_provider.is_supported("gfm_markdown") and get_settings().pr_reviewer.enable_help_text: + markdown_text += "<hr>\n\n<details> <summary><strong>๐Ÿ’ก Tool usage guide:</strong></summary><hr> \n\n" + markdown_text += HelpMessage.get_review_usage_guide() + markdown_text += "\n</details>\n" - # Log markdown response if verbosity level is high - if get_settings().config.verbosity_level >= 2: - logging.info(f"Markdown response:\n{markdown_text}") + # Output the relevant configurations if enabled + if get_settings().get('config', {}).get('output_relevant_configurations', False): + markdown_text += show_relevant_configurations(relevant_section='pr_reviewer') + + # Add custom labels from the review prediction (effort, security) + self.set_review_labels(data) if markdown_text == None or len(markdown_text) == 0: markdown_text = "" @@ -236,21 +279,19 @@ def _publish_inline_code_comments(self) -> None: if get_settings().pr_reviewer.num_code_suggestions == 0: return - review_text = self.prediction.strip() - review_text = review_text.removeprefix('```yaml').rstrip('`') - try: - data = yaml.load(review_text, Loader=SafeLoader) - except Exception as e: - logging.error(f"Failed to parse AI prediction: {e}") - data = try_fix_yaml(review_text) - + first_key = 'review' + last_key = 'security_concerns' + data = load_yaml(self.prediction.strip(), + keys_fix_yaml=["estimated_effort_to_review_[1-5]:", "security_concerns:", "key_issues_to_review:", + "relevant_file:", "relevant_line:", "suggestion:"], + first_key=first_key, last_key=last_key) comments: List[str] = [] - for suggestion in data.get('PR Feedback', {}).get('Code feedback', []): - relevant_file = suggestion.get('relevant file', '').strip() - relevant_line_in_file = suggestion.get('relevant line', '').strip() + for suggestion in data.get('code_feedback', []): + relevant_file = suggestion.get('relevant_file', '').strip() + relevant_line_in_file = suggestion.get('relevant_line', '').strip() content = suggestion.get('suggestion', '') if not relevant_file or not relevant_line_in_file or not content: - logging.info("Skipping inline comment with missing file/line/content") + get_logger().info("Skipping inline comment with missing file/line/content") continue if self.git_provider.is_supported("create_inline_comment"): @@ -258,7 +299,7 @@ def _publish_inline_code_comments(self) -> None: if comment: comments.append(comment) else: - self.git_provider.publish_inline_comment(content, relevant_file, relevant_line_in_file) + self.git_provider.publish_inline_comment(content, relevant_file, relevant_line_in_file, suggestion) if comments: self.git_provider.publish_inline_comments(comments) @@ -286,3 +327,132 @@ def _get_user_answers(self) -> Tuple[str, str]: break return question_str, answer_str + + def _get_previous_review_comment(self): + """ + Get the previous review comment if it exists. + """ + try: + if hasattr(self.git_provider, "get_previous_review"): + return self.git_provider.get_previous_review( + full=not self.incremental.is_incremental, + incremental=self.incremental.is_incremental, + ) + except Exception as e: + get_logger().exception(f"Failed to get previous review comment, error: {e}") + + def _remove_previous_review_comment(self, comment): + """ + Remove the previous review comment if it exists. 
+ """ + try: + if comment: + self.git_provider.remove_comment(comment) + except Exception as e: + get_logger().exception(f"Failed to remove previous review comment, error: {e}") + + def _can_run_incremental_review(self) -> bool: + """Checks if we can run incremental review according the various configurations and previous review""" + # checking if running is auto mode but there are no new commits + if self.is_auto and not self.incremental.first_new_commit_sha: + get_logger().info(f"Incremental review is enabled for {self.pr_url} but there are no new commits") + return False + + if not hasattr(self.git_provider, "get_incremental_commits"): + get_logger().info(f"Incremental review is not supported for {get_settings().config.git_provider}") + return False + # checking if there are enough commits to start the review + num_new_commits = len(self.incremental.commits_range) + num_commits_threshold = get_settings().pr_reviewer.minimal_commits_for_incremental_review + not_enough_commits = num_new_commits < num_commits_threshold + # checking if the commits are not too recent to start the review + recent_commits_threshold = datetime.datetime.now() - datetime.timedelta( + minutes=get_settings().pr_reviewer.minimal_minutes_for_incremental_review + ) + last_seen_commit_date = ( + self.incremental.last_seen_commit.commit.author.date if self.incremental.last_seen_commit else None + ) + all_commits_too_recent = ( + last_seen_commit_date > recent_commits_threshold if self.incremental.last_seen_commit else False + ) + # check all the thresholds or just one to start the review + condition = any if get_settings().pr_reviewer.require_all_thresholds_for_incremental_review else all + if condition((not_enough_commits, all_commits_too_recent)): + get_logger().info( + f"Incremental review is enabled for {self.pr_url} but didn't pass the threshold check to run:" + f"\n* Number of new commits = {num_new_commits} (threshold is {num_commits_threshold})" + f"\n* Last seen commit date = {last_seen_commit_date} (threshold is {recent_commits_threshold})" + ) + return False + return True + + def set_review_labels(self, data): + if not get_settings().config.publish_output: + return + + if not get_settings().pr_reviewer.require_estimate_effort_to_review: + get_settings().pr_reviewer.enable_review_labels_effort = False # we did not generate this output + if not get_settings().pr_reviewer.require_security_review: + get_settings().pr_reviewer.enable_review_labels_security = False # we did not generate this output + + if (get_settings().pr_reviewer.enable_review_labels_security or + get_settings().pr_reviewer.enable_review_labels_effort): + try: + review_labels = [] + if get_settings().pr_reviewer.enable_review_labels_effort: + estimated_effort = data['review']['estimated_effort_to_review_[1-5]'] + estimated_effort_number = int(estimated_effort.split(',')[0]) + if 1 <= estimated_effort_number <= 5: # 1, because ... + review_labels.append(f'Review effort [1-5]: {estimated_effort_number}') + if get_settings().pr_reviewer.enable_review_labels_security and get_settings().pr_reviewer.require_security_review: + security_concerns = data['review']['security_concerns'] # yes, because ... 
+ security_concerns_bool = 'yes' in security_concerns.lower() or 'true' in security_concerns.lower() + if security_concerns_bool: + review_labels.append('Possible security concern') + + current_labels = self.git_provider.get_pr_labels(update=True) + if not current_labels: + current_labels = [] + get_logger().debug(f"Current labels:\n{current_labels}") + if current_labels: + current_labels_filtered = [label for label in current_labels if + not label.lower().startswith('review effort [1-5]:') and not label.lower().startswith( + 'possible security concern')] + else: + current_labels_filtered = [] + new_labels = review_labels + current_labels_filtered + if (current_labels or review_labels) and sorted(new_labels) != sorted(current_labels): + get_logger().info(f"Setting review labels:\n{review_labels + current_labels_filtered}") + self.git_provider.publish_labels(new_labels) + else: + get_logger().info(f"Review labels are already set:\n{review_labels + current_labels_filtered}") + except Exception as e: + get_logger().error(f"Failed to set review labels, error: {e}") + + def auto_approve_logic(self): + """ + Auto-approve a pull request if it meets the conditions for auto-approval. + """ + if get_settings().pr_reviewer.enable_auto_approval: + maximal_review_effort = get_settings().pr_reviewer.maximal_review_effort + if maximal_review_effort < 5: + current_labels = self.git_provider.get_pr_labels() + for label in current_labels: + if label.lower().startswith('review effort [1-5]:'): + effort = int(label.split(':')[1].strip()) + if effort > maximal_review_effort: + get_logger().info( + f"Auto-approve error: PR review effort ({effort}) is higher than the maximal review effort " + f"({maximal_review_effort}) allowed") + self.git_provider.publish_comment( + f"Auto-approve error: PR review effort ({effort}) is higher than the maximal review effort " + f"({maximal_review_effort}) allowed") + return + is_auto_approved = self.git_provider.auto_approve() + if is_auto_approved: + get_logger().info("Auto-approved PR") + self.git_provider.publish_comment("Auto-approved PR") + else: + get_logger().info("Auto-approval option is disabled") + self.git_provider.publish_comment("Auto-approval option for PR-Agent is disabled. 
" + "You can enable it via a [configuration file](https://github.com/Codium-ai/pr-agent/blob/main/docs/REVIEW.md#auto-approval-1)") diff --git a/pr_agent/tools/pr_similar_issue.py b/pr_agent/tools/pr_similar_issue.py new file mode 100644 index 000000000..1a4d794ab --- /dev/null +++ b/pr_agent/tools/pr_similar_issue.py @@ -0,0 +1,486 @@ +import time +from enum import Enum +from typing import List + +import openai +from pydantic import BaseModel, Field + +from pr_agent.algo import MAX_TOKENS +from pr_agent.algo.token_handler import TokenHandler +from pr_agent.algo.utils import get_max_tokens +from pr_agent.config_loader import get_settings +from pr_agent.git_providers import get_git_provider +from pr_agent.log import get_logger + +MODEL = "text-embedding-ada-002" + + +class PRSimilarIssue: + def __init__(self, issue_url: str, ai_handler, args: list = None): + if get_settings().config.git_provider != "github": + raise Exception("Only github is supported for similar issue tool") + + self.cli_mode = get_settings().CONFIG.CLI_MODE + self.max_issues_to_scan = get_settings().pr_similar_issue.max_issues_to_scan + self.issue_url = issue_url + self.git_provider = get_git_provider()() + repo_name, issue_number = self.git_provider._parse_issue_url(issue_url.split('=')[-1]) + self.git_provider.repo = repo_name + self.git_provider.repo_obj = self.git_provider.github_client.get_repo(repo_name) + self.token_handler = TokenHandler() + repo_obj = self.git_provider.repo_obj + repo_name_for_index = self.repo_name_for_index = repo_obj.full_name.lower().replace('/', '-').replace('_/', '-') + index_name = self.index_name = "codium-ai-pr-agent-issues" + + if get_settings().pr_similar_issue.vectordb == "pinecone": + try: + import pinecone + from pinecone_datasets import Dataset, DatasetMetadata + import pandas as pd + except: + raise Exception("Please install 'pinecone' and 'pinecone_datasets' to use pinecone as vectordb") + # assuming pinecone api key and environment are set in secrets file + try: + api_key = get_settings().pinecone.api_key + environment = get_settings().pinecone.environment + except Exception: + if not self.cli_mode: + repo_name, original_issue_number = self.git_provider._parse_issue_url(self.issue_url.split('=')[-1]) + issue_main = self.git_provider.repo_obj.get_issue(original_issue_number) + issue_main.create_comment("Please set pinecone api key and environment in secrets file") + raise Exception("Please set pinecone api key and environment in secrets file") + + # check if index exists, and if repo is already indexed + run_from_scratch = False + if run_from_scratch: # for debugging + pinecone.init(api_key=api_key, environment=environment) + if index_name in pinecone.list_indexes(): + get_logger().info('Removing index...') + pinecone.delete_index(index_name) + get_logger().info('Done') + + upsert = True + pinecone.init(api_key=api_key, environment=environment) + if not index_name in pinecone.list_indexes(): + run_from_scratch = True + upsert = False + else: + if get_settings().pr_similar_issue.force_update_dataset: + upsert = True + else: + pinecone_index = pinecone.Index(index_name=index_name) + res = pinecone_index.fetch([f"example_issue_{repo_name_for_index}"]).to_dict() + if res["vectors"]: + upsert = False + + if run_from_scratch or upsert: # index the entire repo + get_logger().info('Indexing the entire repo...') + + get_logger().info('Getting issues...') + issues = list(repo_obj.get_issues(state='all')) + get_logger().info('Done') + self._update_index_with_issues(issues, 
repo_name_for_index, upsert=upsert) + else: # update index if needed + pinecone_index = pinecone.Index(index_name=index_name) + issues_to_update = [] + issues_paginated_list = repo_obj.get_issues(state='all') + counter = 1 + for issue in issues_paginated_list: + if issue.pull_request: + continue + issue_str, comments, number = self._process_issue(issue) + issue_key = f"issue_{number}" + id = issue_key + "." + "issue" + res = pinecone_index.fetch([id]).to_dict() + is_new_issue = True + for vector in res["vectors"].values(): + if vector['metadata']['repo'] == repo_name_for_index: + is_new_issue = False + break + if is_new_issue: + counter += 1 + issues_to_update.append(issue) + else: + break + + if issues_to_update: + get_logger().info(f'Updating index with {counter} new issues...') + self._update_index_with_issues(issues_to_update, repo_name_for_index, upsert=True) + else: + get_logger().info('No new issues to update') + + elif get_settings().pr_similar_issue.vectordb == "lancedb": + try: + import lancedb # import lancedb only if needed + except: + raise Exception("Please install lancedb to use lancedb as vectordb") + self.db = lancedb.connect(get_settings().lancedb.uri) + self.table = None + + run_from_scratch = False + if run_from_scratch: # for debugging + if index_name in self.db.table_names(): + get_logger().info('Removing Table...') + self.db.drop_table(index_name) + get_logger().info('Done') + + ingest = True + if index_name not in self.db.table_names(): + run_from_scratch = True + ingest = False + else: + if get_settings().pr_similar_issue.force_update_dataset: + ingest = True + else: + self.table = self.db[index_name] + res = self.table.search().limit(len(self.table)).where(f"id='example_issue_{repo_name_for_index}'").to_list() + get_logger().info("result: ", res) + if res[0].get("vector"): + ingest = False + + if run_from_scratch or ingest: # indexing the entire repo + get_logger().info('Indexing the entire repo...') + + get_logger().info('Getting issues...') + issues = list(repo_obj.get_issues(state='all')) + get_logger().info('Done') + + self._update_table_with_issues(issues, repo_name_for_index, ingest=ingest) + else: # update table if needed + issues_to_update = [] + issues_paginated_list = repo_obj.get_issues(state='all') + counter = 1 + for issue in issues_paginated_list: + if issue.pull_request: + continue + issue_str, comments, number = self._process_issue(issue) + issue_key = f"issue_{number}" + issue_id = issue_key + "." 
+ "issue" + res = self.table.search().limit(len(self.table)).where(f"id='{issue_id}'").to_list() + is_new_issue = True + for r in res: + if r['metadata']['repo'] == repo_name_for_index: + is_new_issue = False + break + if is_new_issue: + counter += 1 + issues_to_update.append(issue) + else: + break + + if issues_to_update: + get_logger().info(f'Updating index with {counter} new issues...') + self._update_table_with_issues(issues_to_update, repo_name_for_index, ingest=True) + else: + get_logger().info('No new issues to update') + + + async def run(self): + get_logger().info('Getting issue...') + repo_name, original_issue_number = self.git_provider._parse_issue_url(self.issue_url.split('=')[-1]) + issue_main = self.git_provider.repo_obj.get_issue(original_issue_number) + issue_str, comments, number = self._process_issue(issue_main) + openai.api_key = get_settings().openai.key + get_logger().info('Done') + + get_logger().info('Querying...') + res = openai.Embedding.create(input=[issue_str], engine=MODEL) + embeds = [record['embedding'] for record in res['data']] + + relevant_issues_number_list = [] + relevant_comment_number_list = [] + score_list = [] + + if get_settings().pr_similar_issue.vectordb == "pinecone": + pinecone_index = pinecone.Index(index_name=self.index_name) + res = pinecone_index.query(embeds[0], + top_k=5, + filter={"repo": self.repo_name_for_index}, + include_metadata=True).to_dict() + + for r in res['matches']: + # skip example issue + if 'example_issue_' in r["id"]: + continue + + try: + issue_number = int(r["id"].split('.')[0].split('_')[-1]) + except: + get_logger().debug(f"Failed to parse issue number from {r['id']}") + continue + + if original_issue_number == issue_number: + continue + if issue_number not in relevant_issues_number_list: + relevant_issues_number_list.append(issue_number) + if 'comment' in r["id"]: + relevant_comment_number_list.append(int(r["id"].split('.')[1].split('_')[-1])) + else: + relevant_comment_number_list.append(-1) + score_list.append(str("{:.2f}".format(r['score']))) + get_logger().info('Done') + + elif get_settings().pr_similar_issue.vectordb == "lancedb": + res = self.table.search(embeds[0]).where(f"metadata.repo='{self.repo_name_for_index}'", prefilter=True).to_list() + + for r in res: + # skip example issue + if 'example_issue_' in r["id"]: + continue + + try: + issue_number = int(r["id"].split('.')[0].split('_')[-1]) + except: + get_logger().debug(f"Failed to parse issue number from {r['id']}") + continue + + if original_issue_number == issue_number: + continue + if issue_number not in relevant_issues_number_list: + relevant_issues_number_list.append(issue_number) + + if 'comment' in r["id"]: + relevant_comment_number_list.append(int(r["id"].split('.')[1].split('_')[-1])) + else: + relevant_comment_number_list.append(-1) + score_list.append(str("{:.2f}".format(1-r['_distance']))) + get_logger().info('Done') + + get_logger().info('Publishing response...') + similar_issues_str = "### Similar Issues\n___\n\n" + + for i, issue_number_similar in enumerate(relevant_issues_number_list): + issue = self.git_provider.repo_obj.get_issue(issue_number_similar) + title = issue.title + url = issue.html_url + if relevant_comment_number_list[i] != -1: + url = list(issue.get_comments())[relevant_comment_number_list[i]].html_url + similar_issues_str += f"{i + 1}. 
**[{title}]({url})** (score={score_list[i]})\n\n" + if get_settings().config.publish_output: + response = issue_main.create_comment(similar_issues_str) + get_logger().info(similar_issues_str) + get_logger().info('Done') + + def _process_issue(self, issue): + header = issue.title + body = issue.body + number = issue.number + if get_settings().pr_similar_issue.skip_comments: + comments = [] + else: + comments = list(issue.get_comments()) + issue_str = f"Issue Header: \"{header}\"\n\nIssue Body:\n{body}" + return issue_str, comments, number + + def _update_index_with_issues(self, issues_list, repo_name_for_index, upsert=False): + get_logger().info('Processing issues...') + corpus = Corpus() + example_issue_record = Record( + id=f"example_issue_{repo_name_for_index}", + text="example_issue", + metadata=Metadata(repo=repo_name_for_index) + ) + corpus.append(example_issue_record) + + counter = 0 + for issue in issues_list: + if issue.pull_request: + continue + + counter += 1 + if counter % 100 == 0: + get_logger().info(f"Scanned {counter} issues") + if counter >= self.max_issues_to_scan: + get_logger().info(f"Scanned {self.max_issues_to_scan} issues, stopping") + break + + issue_str, comments, number = self._process_issue(issue) + issue_key = f"issue_{number}" + username = issue.user.login + created_at = str(issue.created_at) + if len(issue_str) < 8000 or \ + self.token_handler.count_tokens(issue_str) < get_max_tokens(MODEL): # fast reject first + issue_record = Record( + id=issue_key + "." + "issue", + text=issue_str, + metadata=Metadata(repo=repo_name_for_index, + username=username, + created_at=created_at, + level=IssueLevel.ISSUE) + ) + corpus.append(issue_record) + if comments: + for j, comment in enumerate(comments): + comment_body = comment.body + num_words_comment = len(comment_body.split()) + if num_words_comment < 10 or not isinstance(comment_body, str): + continue + + if len(comment_body) < 8000 or \ + self.token_handler.count_tokens(comment_body) < MAX_TOKENS[MODEL]: + comment_record = Record( + id=issue_key + ".comment_" + str(j + 1), + text=comment_body, + metadata=Metadata(repo=repo_name_for_index, + username=username, # use issue username for all comments + created_at=created_at, + level=IssueLevel.COMMENT) + ) + corpus.append(comment_record) + df = pd.DataFrame(corpus.dict()["documents"]) + get_logger().info('Done') + + get_logger().info('Embedding...') + openai.api_key = get_settings().openai.key + list_to_encode = list(df["text"].values) + try: + res = openai.Embedding.create(input=list_to_encode, engine=MODEL) + embeds = [record['embedding'] for record in res['data']] + except: + embeds = [] + get_logger().error('Failed to embed entire list, embedding one by one...') + for i, text in enumerate(list_to_encode): + try: + res = openai.Embedding.create(input=[text], engine=MODEL) + embeds.append(res['data'][0]['embedding']) + except: + embeds.append([0] * 1536) + df["values"] = embeds + meta = DatasetMetadata.empty() + meta.dense_model.dimension = len(embeds[0]) + ds = Dataset.from_pandas(df, meta) + get_logger().info('Done') + + api_key = get_settings().pinecone.api_key + environment = get_settings().pinecone.environment + if not upsert: + get_logger().info('Creating index from scratch...') + ds.to_pinecone_index(self.index_name, api_key=api_key, environment=environment) + time.sleep(15) # wait for pinecone to finalize indexing before querying + else: + get_logger().info('Upserting index...') + namespace = "" + batch_size: int = 100 + concurrency: int = 10 + 
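A standalone sketch of the embedding fallback used above, with a hypothetical `embed_fn` standing in for the OpenAI call: try one batched request first, then fall back to per-item requests, substituting a zero vector so the row count stays aligned with the dataframe.

def embed_with_fallback(texts, embed_fn, dim=1536):
    try:
        return embed_fn(texts)                      # one batched call
    except Exception:
        vectors = []
        for text in texts:                          # per-item fallback
            try:
                vectors.append(embed_fn([text])[0])
            except Exception:
                vectors.append([0.0] * dim)         # placeholder keeps rows aligned
        return vectors


# Tiny fake embedder that rejects batches larger than 2, to exercise the fallback path.
def fake_embed(batch):
    if len(batch) > 2:
        raise RuntimeError("batch too large")
    return [[float(len(t))] * 3 for t in batch]


print(embed_with_fallback(["a", "bb", "ccc"], fake_embed, dim=3))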
pinecone.init(api_key=api_key, environment=environment) + ds._upsert_to_index(self.index_name, namespace, batch_size, concurrency) + time.sleep(5) # wait for pinecone to finalize upserting before querying + get_logger().info('Done') + + def _update_table_with_issues(self, issues_list, repo_name_for_index, ingest=False): + get_logger().info('Processing issues...') + + corpus = Corpus() + example_issue_record = Record( + id=f"example_issue_{repo_name_for_index}", + text="example_issue", + metadata=Metadata(repo=repo_name_for_index) + ) + corpus.append(example_issue_record) + + counter = 0 + for issue in issues_list: + if issue.pull_request: + continue + + counter += 1 + if counter % 100 == 0: + get_logger().info(f"Scanned {counter} issues") + if counter >= self.max_issues_to_scan: + get_logger().info(f"Scanned {self.max_issues_to_scan} issues, stopping") + break + + issue_str, comments, number = self._process_issue(issue) + issue_key = f"issue_{number}" + username = issue.user.login + created_at = str(issue.created_at) + if len(issue_str) < 8000 or \ + self.token_handler.count_tokens(issue_str) < get_max_tokens(MODEL): # fast reject first + issue_record = Record( + id=issue_key + "." + "issue", + text=issue_str, + metadata=Metadata(repo=repo_name_for_index, + username=username, + created_at=created_at, + level=IssueLevel.ISSUE) + ) + corpus.append(issue_record) + if comments: + for j, comment in enumerate(comments): + comment_body = comment.body + num_words_comment = len(comment_body.split()) + if num_words_comment < 10 or not isinstance(comment_body, str): + continue + + if len(comment_body) < 8000 or \ + self.token_handler.count_tokens(comment_body) < MAX_TOKENS[MODEL]: + comment_record = Record( + id=issue_key + ".comment_" + str(j + 1), + text=comment_body, + metadata=Metadata(repo=repo_name_for_index, + username=username, # use issue username for all comments + created_at=created_at, + level=IssueLevel.COMMENT) + ) + corpus.append(comment_record) + df = pd.DataFrame(corpus.dict()["documents"]) + get_logger().info('Done') + + get_logger().info('Embedding...') + openai.api_key = get_settings().openai.key + list_to_encode = list(df["text"].values) + try: + res = openai.Embedding.create(input=list_to_encode, engine=MODEL) + embeds = [record['embedding'] for record in res['data']] + except: + embeds = [] + get_logger().error('Failed to embed entire list, embedding one by one...') + for i, text in enumerate(list_to_encode): + try: + res = openai.Embedding.create(input=[text], engine=MODEL) + embeds.append(res['data'][0]['embedding']) + except: + embeds.append([0] * 1536) + df["vector"] = embeds + get_logger().info('Done') + + if not ingest: + get_logger().info('Creating table from scratch...') + self.table = self.db.create_table(self.index_name, data=df, mode="overwrite") + time.sleep(15) + else: + get_logger().info('Ingesting in Table...') + if self.index_name not in self.db.table_names(): + self.table.add(df) + else: + get_logger().info(f"Table {self.index_name} doesn't exists!") + time.sleep(5) + get_logger().info('Done') + + +class IssueLevel(str, Enum): + ISSUE = "issue" + COMMENT = "comment" + + +class Metadata(BaseModel): + repo: str + username: str = Field(default="@codium") + created_at: str = Field(default="01-01-1970 00:00:00.00000") + level: IssueLevel = Field(default=IssueLevel.ISSUE) + + class Config: + use_enum_values = True + + +class Record(BaseModel): + id: str + text: str + metadata: Metadata + + +class Corpus(BaseModel): + documents: List[Record] = Field(default=[]) + + 
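A short usage sketch of the `Record`/`Metadata`/`Corpus` models defined in this file (it assumes pr-agent and pydantic are installed; the issue values are invented), mirroring the `issue_<number>.issue` / `issue_<number>.comment_<j>` key convention and the `corpus.dict()["documents"]` rows that feed `pd.DataFrame` above.

from pr_agent.tools.pr_similar_issue import Corpus, IssueLevel, Metadata, Record

corpus = Corpus()
corpus.append(Record(
    id="issue_42.issue",                      # key convention used by _update_index_with_issues
    text='Issue Header: "CLI crashes"\n\nIssue Body:\nSteps to reproduce...',
    metadata=Metadata(repo="codium-ai-pr-agent", username="octocat",
                      created_at="2024-01-01 00:00:00", level=IssueLevel.ISSUE),
))
corpus.append(Record(
    id="issue_42.comment_1",                  # first comment of the same issue
    text="Reproduced on Python 3.11 as well.",
    metadata=Metadata(repo="codium-ai-pr-agent", username="octocat",
                      created_at="2024-01-01 00:00:00", level=IssueLevel.COMMENT),
))
rows = corpus.dict()["documents"]             # same shape fed into pd.DataFrame(...) above
print(len(rows), rows[0]["metadata"]["level"])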
def append(self, r: Record): + self.documents.append(r) diff --git a/pr_agent/tools/pr_update_changelog.py b/pr_agent/tools/pr_update_changelog.py index 1ec627095..dfe90a0df 100644 --- a/pr_agent/tools/pr_update_changelog.py +++ b/pr_agent/tools/pr_update_changelog.py @@ -1,31 +1,35 @@ import copy -import logging from datetime import date +from functools import partial from time import sleep from typing import Tuple - from jinja2 import Environment, StrictUndefined - -from pr_agent.algo.ai_handler import AiHandler +from pr_agent.algo.ai_handlers.base_ai_handler import BaseAiHandler +from pr_agent.algo.ai_handlers.litellm_ai_handler import LiteLLMAIHandler from pr_agent.algo.pr_processing import get_pr_diff, retry_with_fallback_models from pr_agent.algo.token_handler import TokenHandler +from pr_agent.algo.utils import ModelType, show_relevant_configurations from pr_agent.config_loader import get_settings -from pr_agent.git_providers import GithubProvider, get_git_provider +from pr_agent.git_providers import get_git_provider, GithubProvider from pr_agent.git_providers.git_provider import get_main_pr_language +from pr_agent.log import get_logger CHANGELOG_LINES = 50 class PRUpdateChangelog: - def __init__(self, pr_url: str, cli_mode=False, args=None): + def __init__(self, pr_url: str, cli_mode=False, args=None, ai_handler: partial[BaseAiHandler,] = LiteLLMAIHandler): self.git_provider = get_git_provider()(pr_url) self.main_language = get_main_pr_language( self.git_provider.get_languages(), self.git_provider.get_files() ) self.commit_changelog = get_settings().pr_update_changelog.push_changelog_changes - self._get_changlog_file() # self.changelog_file_str - self.ai_handler = AiHandler() + self._get_changelog_file() # self.changelog_file_str + + self.ai_handler = ai_handler() + self.ai_handler.main_pr_language = self.main_language + self.patches_diff = None self.prediction = None self.cli_mode = cli_mode @@ -46,29 +50,52 @@ def __init__(self, pr_url: str, cli_mode=False, args=None): get_settings().pr_update_changelog_prompt.user) async def run(self): - assert type(self.git_provider) == GithubProvider, "Currently only Github is supported" + get_logger().info('Updating the changelog...') + relevant_configs = {'pr_update_changelog': dict(get_settings().pr_update_changelog), + 'config': dict(get_settings().config)} + get_logger().debug("Relevant configs", artifacts=relevant_configs) + + # currently only GitHub is supported for pushing changelog changes + if get_settings().pr_update_changelog.push_changelog_changes and not hasattr( + self.git_provider, "create_or_update_pr_file" + ): + get_logger().error( + "Pushing changelog changes is not currently supported for this code platform" + ) + if get_settings().config.publish_output: + self.git_provider.publish_comment( + "Pushing changelog changes is not currently supported for this code platform" + ) + return - logging.info('Updating the changelog...') if get_settings().config.publish_output: self.git_provider.publish_comment("Preparing changelog updates...", is_temporary=True) - await retry_with_fallback_models(self._prepare_prediction) - logging.info('Preparing PR changelog updates...') + + await retry_with_fallback_models(self._prepare_prediction, model_type=ModelType.TURBO) + new_file_content, answer = self._prepare_changelog_update() + + # Output the relevant configurations if enabled + if get_settings().get('config', {}).get('output_relevant_configurations', False): + answer += 
show_relevant_configurations(relevant_section='pr_update_changelog') + + get_logger().debug(f"PR output", artifact=answer) + if get_settings().config.publish_output: self.git_provider.remove_initial_comment() - logging.info('Publishing changelog updates...') if self.commit_changelog: - logging.info('Pushing PR changelog updates to repo...') self._push_changelog_update(new_file_content, answer) else: - logging.info('Publishing PR changelog as comment...') - self.git_provider.publish_comment(f"**Changelog updates:**\n\n{answer}") + self.git_provider.publish_comment(f"**Changelog updates:** ๐Ÿ”„\n\n{answer}") async def _prepare_prediction(self, model: str): - logging.info('Getting PR diff...') self.patches_diff = get_pr_diff(self.git_provider, self.token_handler, model) - logging.info('Getting AI prediction...') - self.prediction = await self._get_prediction(model) + if self.patches_diff: + get_logger().debug(f"PR diff", artifact=self.patches_diff) + self.prediction = await self._get_prediction(model) + else: + get_logger().error(f"Error getting PR diff") + self.prediction = "" async def _get_prediction(self, model: str): variables = copy.deepcopy(self.vars) @@ -76,22 +103,19 @@ async def _get_prediction(self, model: str): environment = Environment(undefined=StrictUndefined) system_prompt = environment.from_string(get_settings().pr_update_changelog_prompt.system).render(variables) user_prompt = environment.from_string(get_settings().pr_update_changelog_prompt.user).render(variables) - if get_settings().config.verbosity_level >= 2: - logging.info(f"\nSystem prompt:\n{system_prompt}") - logging.info(f"\nUser prompt:\n{user_prompt}") - response, finish_reason = await self.ai_handler.chat_completion(model=model, temperature=0.2, - system=system_prompt, user=user_prompt) + response, finish_reason = await self.ai_handler.chat_completion( + model=model, system=system_prompt, user=user_prompt, temperature=get_settings().config.temperature) return response def _prepare_changelog_update(self) -> Tuple[str, str]: answer = self.prediction.strip().strip("```").strip() # noqa B005 if hasattr(self, "changelog_file"): - existing_content = self.changelog_file.decoded_content.decode() + existing_content = self.changelog_file else: existing_content = "" if existing_content: - new_file_content = answer + "\n\n" + self.changelog_file.decoded_content.decode() + new_file_content = answer + "\n\n" + self.changelog_file else: new_file_content = answer @@ -99,30 +123,30 @@ def _prepare_changelog_update(self) -> Tuple[str, str]: answer += "\n\n\n>to commit the new content to the CHANGELOG.md file, please type:" \ "\n>'/update_changelog --pr_update_changelog.push_changelog_changes=true'\n" - if get_settings().config.verbosity_level >= 2: - logging.info(f"answer:\n{answer}") - return new_file_content, answer def _push_changelog_update(self, new_file_content, answer): - self.git_provider.repo_obj.update_file(path=self.changelog_file.path, - message="Update CHANGELOG.md", - content=new_file_content, - sha=self.changelog_file.sha, - branch=self.git_provider.get_pr_branch()) - d = dict(body="CHANGELOG.md update", - path=self.changelog_file.path, - line=max(2, len(answer.splitlines())), - start_line=1) + self.git_provider.create_or_update_pr_file( + file_path="CHANGELOG.md", + branch=self.git_provider.get_pr_branch(), + contents=new_file_content, + message="[skip ci] Update CHANGELOG.md", + ) sleep(5) # wait for the file to be updated - last_commit_id = list(self.git_provider.pr.get_commits())[-1] try: - 
self.git_provider.pr.create_review(commit=last_commit_id, comments=[d]) + if get_settings().config.git_provider == "github": + last_commit_id = list(self.git_provider.pr.get_commits())[-1] + d = dict( + body="CHANGELOG.md update", + path="CHANGELOG.md", + line=max(2, len(answer.splitlines())), + start_line=1, + ) + self.git_provider.pr.create_review(commit=last_commit_id, comments=[d]) except Exception: # we can't create a review for some reason, let's just publish a comment - self.git_provider.publish_comment(f"**Changelog updates:**\n\n{answer}") - + self.git_provider.publish_comment(f"**Changelog updates: ๐Ÿ”„**\n\n{answer}") def _get_default_changelog(self): example_changelog = \ @@ -139,22 +163,17 @@ def _get_default_changelog(self): """ return example_changelog - def _get_changlog_file(self): + def _get_changelog_file(self): try: - self.changelog_file = self.git_provider.repo_obj.get_contents("CHANGELOG.md", - ref=self.git_provider.get_pr_branch()) - changelog_file_lines = self.changelog_file.decoded_content.decode().splitlines() + self.changelog_file = self.git_provider.get_pr_file_content( + "CHANGELOG.md", self.git_provider.get_pr_branch() + ) + changelog_file_lines = self.changelog_file.splitlines() changelog_file_lines = changelog_file_lines[:CHANGELOG_LINES] self.changelog_file_str = "\n".join(changelog_file_lines) except Exception: self.changelog_file_str = "" - if self.commit_changelog: - logging.info("No CHANGELOG.md file found in the repository. Creating one...") - changelog_file = self.git_provider.repo_obj.create_file(path="CHANGELOG.md", - message='add CHANGELOG.md', - content="", - branch=self.git_provider.get_pr_branch()) - self.changelog_file = changelog_file['content'] + self.changelog_file = "" if not self.changelog_file_str: self.changelog_file_str = self._get_default_changelog() diff --git a/pyproject.toml b/pyproject.toml index 2e8f2b5c2..86ce9a32a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,59 +3,44 @@ requires = ["setuptools>=61.0"] build-backend = "setuptools.build_meta" [project] -name = "pr_agent" -version = "0.0.1" +name = "pr-agent" +version = "0.2.4" + +authors = [{name= "CodiumAI", email = "tal.r@codium.ai"}] -authors = [ - {name = "Itamar Friedman", email = "itamar.f@codium.ai"}, -] maintainers = [ - {name = "Ori Kotek", email = "ori.k@codium.ai"}, {name = "Tal Ridnik", email = "tal.r@codium.ai"}, + {name = "Ori Kotek", email = "ori.k@codium.ai"}, {name = "Hussam Lawen", email = "hussam.l@codium.ai"}, - {name = "Sagi Medina", email = "sagi.m@codium.ai"} ] -description = "CodiumAI PR-Agent is an open-source tool to automatically analyze a pull request and provide several types of feedback" + +description = "CodiumAI PR-Agent aims to help efficiently review and handle pull requests, by providing AI feedbacks and suggestions." 
readme = "README.md" -requires-python = ">=3.9" -keywords = ["ai", "tool", "developer", "review", "agent"] -license = {file = "LICENSE", name = "Apache 2.0 License"} +requires-python = ">=3.10" +keywords = ["AI", "Agents", "Pull Request", "Automation", "Code Review"] +license = {name = "Apache 2.0", file = "LICENSE"} + classifiers = [ - "Development Status :: 3 - Alpha", "Intended Audience :: Developers", - "Operating System :: Independent", "Programming Language :: Python :: 3", ] +dynamic = ["dependencies"] -dependencies = [ - "dynaconf==3.1.12", - "fastapi==0.99.0", - "PyGithub==1.59.*", - "retry==0.9.2", - "openai==0.27.8", - "Jinja2==3.1.2", - "tiktoken==0.4.0", - "uvicorn==0.22.0", - "python-gitlab==3.15.0", - "pytest~=7.4.0", - "aiohttp~=3.8.4", - "atlassian-python-api==3.39.0", - "GitPython~=3.1.32", - "starlette-context==0.3.6", - "litellm~=0.1.351", - "PyYAML==6.0" -] + +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} [project.urls] "Homepage" = "https://github.com/Codium-ai/pr-agent" +"Documentation" = "https://pr-agent-docs.codium.ai/" [tool.setuptools] -include-package-data = false +include-package-data = true license-files = ["LICENSE"] [tool.setuptools.packages.find] where = ["."] -include = ["pr_agent"] +include = ["pr_agent*"] # include pr_agent and any sub-packages it finds under it. [project.scripts] pr-agent = "pr_agent.cli:run" diff --git a/requirements-dev.txt b/requirements-dev.txt index 70613be0c..1af82d005 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1 +1,3 @@ pytest==7.4.0 +poetry +twine \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 4920029b8..05f13ad76 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,18 +1,44 @@ -aiohttp==3.8.5 -aiosignal==1.3.1 -async-timeout==4.0.3 -attrs==23.1.0 -certifi==2023.7.22 -charset-normalizer==3.2.0 -frozenlist==1.4.0 -idna==3.4 --e git+ssh://git@github.com/kryptogo/litellm.git@a176786de8c4338838671b62d341ff310d2773fb#egg=litellm -multidict==6.0.4 -openai==0.27.8 -python-dotenv==1.0.0 -regex==2023.8.8 -requests==2.31.0 -tiktoken==0.4.0 -tqdm==4.66.1 -urllib3==2.0.4 -yarl==1.9.2 +aiohttp==3.9.5 +anthropic[vertex]==0.21.3 +atlassian-python-api==3.41.4 +azure-devops==7.1.0b3 +azure-identity==1.15.0 +boto3==1.33.6 +dynaconf==3.2.4 +fastapi==0.111.0 +GitPython==3.1.41 +google-cloud-aiplatform==1.38.0 +google-cloud-storage==2.10.0 +Jinja2==3.1.2 +litellm==1.43.13 +loguru==0.7.2 +msrest==0.7.1 +openai==1.46.0 +pytest==7.4.0 +PyGithub==1.59.* +PyYAML==6.0.1 +python-gitlab==3.15.0 +retry==0.9.2 +starlette-context==0.3.6 +tiktoken==0.7.0 +ujson==5.8.0 +uvicorn==0.22.0 +tenacity==8.2.3 +gunicorn==22.0.0 +pytest-cov==5.0.0 +pydantic==2.8.2 +html2text==2024.2.26 +# help bot +langchain==0.3.0 +langchain-openai==0.2.0 +langchain-pinecone==0.2.0 +langchain-chroma==0.1.4 +chromadb==0.5.7 +# Uncomment the following lines to enable the 'similar issue' tool +# pinecone-client +# pinecone-datasets @ git+https://github.com/mrT23/pinecone-datasets.git@main +# lancedb==0.5.1 +# uncomment this to support language LangChainOpenAIHandler +# langchain==0.2.0 +# langchain-core==0.2.28 +# langchain-openai==0.1.20 diff --git a/tests/e2e_tests/e2e_utils.py b/tests/e2e_tests/e2e_utils.py new file mode 100644 index 000000000..5251d83e4 --- /dev/null +++ b/tests/e2e_tests/e2e_utils.py @@ -0,0 +1,35 @@ +FILE_PATH = "pr_agent/cli_pip.py" + +PR_HEADER_START_WITH = '### **User description**\nupdate cli_pip.py\n\n\n___\n\n### **PR Type**' +REVIEW_START_WITH = '## PR Reviewer Guide 
๐Ÿ”\n\n<table>\n<tr><td>โฑ๏ธ <strong>Estimated effort to review</strong>:' +IMPROVE_START_WITH_REGEX_PATTERN = r'^## PR Code Suggestions โœจ\n\n<!-- [a-z0-9]+ -->\n\n<table><thead><tr><td>Category</td>' + +NUM_MINUTES = 5 + +NEW_FILE_CONTENT = """\ +from pr_agent import cli +from pr_agent.config_loader import get_settings + + +def main(): + # Fill in the following values + provider = "github" # GitHub provider + user_token = "..." # GitHub user token + openai_key = "ghs_afsdfasdfsdf" # Example OpenAI key + pr_url = "..." # PR URL, for example 'https://github.com/Codium-ai/pr-agent/pull/809' + command = "/improve" # Command to run (e.g. '/review', '/describe', 'improve', '/ask="What is the purpose of this PR?"') + + # Setting the configurations + get_settings().set("CONFIG.git_provider", provider) + get_settings().set("openai.key", openai_key) + get_settings().set("github.user_token", user_token) + + # Run the command. Feedback will appear in GitHub PR comments + output = cli.run_command(pr_url, command) + + print(output) + +if __name__ == '__main__': + main() +""" + diff --git a/tests/e2e_tests/test_bitbucket_app.py b/tests/e2e_tests/test_bitbucket_app.py new file mode 100644 index 000000000..47b9ca4f1 --- /dev/null +++ b/tests/e2e_tests/test_bitbucket_app.py @@ -0,0 +1,100 @@ +import hashlib +import os +import re +import time +from datetime import datetime + +import jwt +from atlassian.bitbucket import Cloud + +import requests +from requests.auth import HTTPBasicAuth + +from pr_agent.config_loader import get_settings +from pr_agent.log import setup_logger, get_logger +from tests.e2e_tests.e2e_utils import NEW_FILE_CONTENT, FILE_PATH, PR_HEADER_START_WITH, REVIEW_START_WITH, \ + IMPROVE_START_WITH_REGEX_PATTERN, NUM_MINUTES + + +log_level = os.environ.get("LOG_LEVEL", "INFO") +setup_logger(log_level) +logger = get_logger() + +def test_e2e_run_bitbucket_app(): + repo_slug = 'pr-agent-tests' + project_key = 'codiumai' + base_branch = "main" # or any base branch you want + new_branch = f"bitbucket_app_e2e_test-{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}" + get_settings().config.git_provider = "bitbucket" + + try: + # Add username and password for authentication + username = get_settings().get("BITBUCKET.USERNAME", None) + password = get_settings().get("BITBUCKET.PASSWORD", None) + s = requests.Session() + s.auth = (username, password) # Use HTTP Basic Auth + bitbucket_client = Cloud(session=s) + repo = bitbucket_client.workspaces.get(workspace=project_key).repositories.get(repo_slug) + + # Create a new branch from the base branch + logger.info(f"Creating a new branch {new_branch} from {base_branch}") + source_branch = repo.branches.get(base_branch) + target_repo = repo.branches.create(new_branch,source_branch.hash) + + # Update the file content + url = (f"https://api.bitbucket.org/2.0/repositories/{project_key}/{repo_slug}/src") + files={FILE_PATH: NEW_FILE_CONTENT} + data={ + "message": "update cli_pip.py", + "branch": new_branch, + } + requests.request("POST", url, auth=HTTPBasicAuth(username, password), data=data, files=files) + + + # Create a pull request + logger.info(f"Creating a pull request from {new_branch} to {base_branch}") + pr = repo.pullrequests.create( + title=f'{new_branch}', + description="update cli_pip.py", + source_branch=new_branch, + destination_branch=base_branch + ) + + # check every 1 minute, for 5 minutes if the PR has all the tool results + for i in range(NUM_MINUTES): + logger.info(f"Waiting for the PR to get all the tool results...") + time.sleep(60) + 
comments = list(pr.comments()) + comments_raw = [c.raw for c in comments] + if len(comments) >= 5: # header, 3 suggestions, 1 review + valid_review = False + for comment_raw in comments_raw: + if comment_raw.startswith('## PR Reviewer Guide ๐Ÿ”'): + valid_review = True + break + if valid_review: + break + else: + logger.error(f"REVIEW feedback is invalid") + raise Exception("REVIEW feedback is invalid") + else: + logger.info(f"Waiting for the PR to get all the tool results. {i + 1} minute(s) passed") + else: + assert False, f"After {NUM_MINUTES} minutes, the PR did not get all the tool results" + + # cleanup - delete the branch + pr.decline() + repo.branches.delete(new_branch) + + # If we reach here, the test is successful + logger.info(f"Succeeded in running e2e test for Bitbucket app on the PR") + except Exception as e: + logger.error(f"Failed to run e2e test for Bitbucket app: {e}") + # delete the branch + pr.decline() + repo.branches.delete(new_branch) + assert False + + +if __name__ == '__main__': + test_e2e_run_bitbucket_app() diff --git a/tests/e2e_tests/test_github_app.py b/tests/e2e_tests/test_github_app.py new file mode 100644 index 000000000..6ecb1da8d --- /dev/null +++ b/tests/e2e_tests/test_github_app.py @@ -0,0 +1,96 @@ +import os +import re +import time +from datetime import datetime + +from pr_agent.config_loader import get_settings +from pr_agent.git_providers import get_git_provider +from pr_agent.log import setup_logger, get_logger +from tests.e2e_tests.e2e_utils import NEW_FILE_CONTENT, FILE_PATH, PR_HEADER_START_WITH, REVIEW_START_WITH, \ + IMPROVE_START_WITH_REGEX_PATTERN, NUM_MINUTES + +log_level = os.environ.get("LOG_LEVEL", "INFO") +setup_logger(log_level) +logger = get_logger() + + +def test_e2e_run_github_app(): + """ + What we want to do: + (1) open a PR in a repo 'https://github.com/Codium-ai/pr-agent-tests' + (2) wait for 5 minutes until the PR is processed by the GitHub app + (3) check that the relevant tools have been executed + """ + base_branch = "main" # or any base branch you want + new_branch = f"github_app_e2e_test-{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}" + repo_url = 'Codium-ai/pr-agent-tests' + get_settings().config.git_provider = "github" + git_provider = get_git_provider()() + github_client = git_provider.github_client + repo = github_client.get_repo(repo_url) + + try: + # Create a new branch from the base branch + source = repo.get_branch(base_branch) + logger.info(f"Creating a new branch {new_branch} from {base_branch}") + repo.create_git_ref(ref=f"refs/heads/{new_branch}", sha=source.commit.sha) + + # Get the file you want to edit + file = repo.get_contents(FILE_PATH, ref=base_branch) + # content = file.decoded_content.decode() + + # Update the file content + logger.info(f"Updating the file {FILE_PATH}") + commit_message = "update cli_pip.py" + repo.update_file( + file.path, + commit_message, + NEW_FILE_CONTENT, + file.sha, + branch=new_branch + ) + + # Create a pull request + logger.info(f"Creating a pull request from {new_branch} to {base_branch}") + pr = repo.create_pull( + title=new_branch, + body="update cli_pip.py", + head=new_branch, + base=base_branch + ) + + # check every 1 minute, for 5, minutes if the PR has all the tool results + for i in range(NUM_MINUTES): + logger.info(f"Waiting for the PR to get all the tool results...") + time.sleep(60) + logger.info(f"Checking the PR {pr.html_url} after {i + 1} minute(s)") + pr.update() + pr_header_body = pr.body + comments = list(pr.get_issue_comments()) + if len(comments) == 2: + 
comments_body = [comment.body for comment in comments]
+                assert pr_header_body.startswith(PR_HEADER_START_WITH), "DESCRIBE feedback is invalid"
+                assert comments_body[0].startswith(REVIEW_START_WITH), "REVIEW feedback is invalid"
+                assert re.match(IMPROVE_START_WITH_REGEX_PATTERN, comments_body[1]), "IMPROVE feedback is invalid"
+                break
+            else:
+                logger.info(f"Waiting for the PR to get all the tool results. {i + 1} minute(s) passed")
+        else:
+            assert False, f"After {NUM_MINUTES} minutes, the PR did not get all the tool results"
+
+        # cleanup - delete the branch
+        logger.info(f"Deleting the branch {new_branch}")
+        repo.get_git_ref(f"heads/{new_branch}").delete()
+
+        # If we reach here, the test is successful
+        logger.info(f"Succeeded in running e2e test for GitHub app on the PR {pr.html_url}")
+    except Exception as e:
+        logger.error(f"Failed to run e2e test for GitHub app: {e}")
+        # delete the branch
+        logger.info(f"Deleting the branch {new_branch}")
+        repo.get_git_ref(f"heads/{new_branch}").delete()
+        assert False
+
+
+if __name__ == '__main__':
+    test_e2e_run_github_app()
diff --git a/tests/e2e_tests/test_gitlab_webhook.py b/tests/e2e_tests/test_gitlab_webhook.py
new file mode 100644
index 000000000..053bb0dc2
--- /dev/null
+++ b/tests/e2e_tests/test_gitlab_webhook.py
@@ -0,0 +1,91 @@
+import os
+import re
+import time
+from datetime import datetime
+
+import gitlab
+
+from pr_agent.config_loader import get_settings
+from pr_agent.git_providers import get_git_provider
+from pr_agent.log import setup_logger, get_logger
+from tests.e2e_tests.e2e_utils import NEW_FILE_CONTENT, FILE_PATH, PR_HEADER_START_WITH, REVIEW_START_WITH, \
+    IMPROVE_START_WITH_REGEX_PATTERN, NUM_MINUTES
+
+log_level = os.environ.get("LOG_LEVEL", "INFO")
+setup_logger(log_level)
+logger = get_logger()
+
+def test_e2e_run_gitlab_webhook():
+    # GitLab setup
+    GITLAB_URL = "https://gitlab.com"
+    GITLAB_TOKEN = get_settings().gitlab.PERSONAL_ACCESS_TOKEN
+    gl = gitlab.Gitlab(GITLAB_URL, private_token=GITLAB_TOKEN)
+    repo_url = 'codiumai/pr-agent-tests'
+    project = gl.projects.get(repo_url)
+
+    base_branch = "main" # or any base branch you want
+    new_branch = f"gitlab_webhook_e2e_test-{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}"
+
+    try:
+        # Create a new branch from the base branch
+        logger.info(f"Creating a new branch {new_branch} from {base_branch}")
+        project.branches.create({'branch': new_branch, 'ref': base_branch})
+
+        # Get the file you want to edit
+        file = project.files.get(file_path=FILE_PATH, ref=base_branch)
+        # content = file.decode()
+
+        # Update the file content
+        logger.info(f"Updating the file {FILE_PATH}")
+        commit_message = "update cli_pip.py"
+        file.content = NEW_FILE_CONTENT
+        file.save(branch=new_branch, commit_message=commit_message)
+
+        # Create a merge request
+        logger.info(f"Creating a merge request from {new_branch} to {base_branch}")
+        mr = project.mergerequests.create({
+            'source_branch': new_branch,
+            'target_branch': base_branch,
+            'title': new_branch,
+            'description': "update cli_pip.py"
+        })
+        logger.info(f"Merge request created: {mr.web_url}")
+
+        # check every 1 minute, for 5 minutes if the MR has all the tool results
+        for i in range(NUM_MINUTES):
+            logger.info(f"Waiting for the MR to get all the tool results...")
+            time.sleep(60)
+            logger.info(f"Checking the MR {mr.web_url} after {i + 1} minute(s)")
+            mr = project.mergerequests.get(mr.iid)
+            mr_header_body = mr.description
+            comments = mr.notes.list()[::-1]
+            # clean all system comments
+            comments = [comment for comment in comments if 
comment.system is False]
+            if len(comments) == 2: # "changed the description" is received as the first comment
+                comments_body = [comment.body for comment in comments]
+                if 'Work in progress' in comments_body[1]:
+                    continue
+                assert mr_header_body.startswith(PR_HEADER_START_WITH), "DESCRIBE feedback is invalid"
+                assert comments_body[0].startswith(REVIEW_START_WITH), "REVIEW feedback is invalid"
+                assert re.match(IMPROVE_START_WITH_REGEX_PATTERN, comments_body[1]), "IMPROVE feedback is invalid"
+                break
+            else:
+                logger.info(f"Waiting for the MR to get all the tool results. {i + 1} minute(s) passed")
+        else:
+            assert False, f"After {NUM_MINUTES} minutes, the MR did not get all the tool results"
+
+        # cleanup - delete the branch
+        logger.info(f"Deleting the branch {new_branch}")
+        project.branches.delete(new_branch)
+
+        # If we reach here, the test is successful
+        logger.info(f"Succeeded in running e2e test for GitLab app on the MR {mr.web_url}")
+    except Exception as e:
+        logger.error(f"Failed to run e2e test for GitLab webhook: {e}")
+        logger.info(f"Deleting the branch {new_branch}")
+        project.branches.delete(new_branch)
+        assert False
+
+
+if __name__ == '__main__':
+    test_e2e_run_gitlab_webhook()
diff --git a/tests/unittest/test_azure_devops_parsing.py b/tests/unittest/test_azure_devops_parsing.py
new file mode 100644
index 000000000..84c0ad08c
--- /dev/null
+++ b/tests/unittest/test_azure_devops_parsing.py
@@ -0,0 +1,15 @@
+from pr_agent.git_providers import AzureDevopsProvider
+
+
+class TestAzureDevOpsParsing():
+    def test_regular_address(self):
+        pr_url = "https://dev.azure.com/organization/project/_git/repo/pullrequest/1"
+
+        # workspace_slug, repo_slug, pr_number
+        assert AzureDevopsProvider._parse_pr_url(pr_url) == ("project", "repo", 1)
+
+    def test_visualstudio_address(self):
+        pr_url = "https://organization.visualstudio.com/project/_git/repo/pullrequest/1"
+
+        # workspace_slug, repo_slug, pr_number
+        assert AzureDevopsProvider._parse_pr_url(pr_url) == ("project", "repo", 1)
\ No newline at end of file
diff --git a/tests/unittest/test_bitbucket_provider.py b/tests/unittest/test_bitbucket_provider.py
new file mode 100644
index 000000000..1ea99931c
--- /dev/null
+++ b/tests/unittest/test_bitbucket_provider.py
@@ -0,0 +1,297 @@
+from pr_agent.git_providers import BitbucketServerProvider
+from pr_agent.git_providers.bitbucket_provider import BitbucketProvider
+from unittest.mock import MagicMock
+from atlassian.bitbucket import Bitbucket
+from pr_agent.algo.types import EDIT_TYPE, FilePatchInfo
+
+
+class TestBitbucketProvider:
+    def test_parse_pr_url(self):
+        url = "https://bitbucket.org/WORKSPACE_XYZ/MY_TEST_REPO/pull-requests/321"
+        workspace_slug, repo_slug, pr_number = BitbucketProvider._parse_pr_url(url)
+        assert workspace_slug == "WORKSPACE_XYZ"
+        assert repo_slug == "MY_TEST_REPO"
+        assert pr_number == 321
+
+
+class TestBitbucketServerProvider:
+    def test_parse_pr_url(self):
+        url = "https://git.onpreminstance.com/projects/AAA/repos/my-repo/pull-requests/1"
+        workspace_slug, repo_slug, pr_number = BitbucketServerProvider._parse_pr_url(url)
+        assert workspace_slug == "AAA"
+        assert repo_slug == "my-repo"
+        assert pr_number == 1
+
+    def mock_get_content_of_file(self, project_key, repository_slug, filename, at=None, markup=None):
+        content_map = {
+            '9c1cffdd9f276074bfb6fb3b70fbee62d298b058': 'file\nwith\nsome\nlines\nto\nemulate\na\nreal\nfile\n',
+            '2a1165446bdf991caf114d01f7c88d84ae7399cf': 'file\nwith\nmultiple \nlines\nto\nemulate\na\nfake\nfile\n',
+            
'f617708826cdd0b40abb5245eda71630192a17e3': 'file\nwith\nmultiple \nlines\nto\nemulate\na\nreal\nfile\n', + 'cb68a3027d6dda065a7692ebf2c90bed1bcdec28': 'file\nwith\nsome\nchanges\nto\nemulate\na\nreal\nfile\n', + '1905dcf16c0aac6ac24f7ab617ad09c73dc1d23b': 'file\nwith\nsome\nlines\nto\nemulate\na\nfake\ntest\n', + 'ae4eca7f222c96d396927d48ab7538e2ee13ca63': 'readme\nwithout\nsome\nlines\nto\nsimulate\na\nreal\nfile', + '548f8ba15abc30875a082156314426806c3f4d97': 'file\nwith\nsome\nlines\nto\nemulate\na\nreal\nfile', + '0e898cb355a5170d8c8771b25d43fcaa1d2d9489': 'file\nwith\nmultiple\nlines\nto\nemulate\na\nreal\nfile' + } + return content_map.get(at, '') + + def mock_get_from_bitbucket_60(self, url): + response_map = { + "rest/api/1.0/application-properties": { + "version": "6.0" + } + } + return response_map.get(url, '') + + def mock_get_from_bitbucket_70(self, url): + response_map = { + "rest/api/1.0/application-properties": { + "version": "7.0" + } + } + return response_map.get(url, '') + + def mock_get_from_bitbucket_816(self, url): + response_map = { + "rest/api/1.0/application-properties": { + "version": "8.16" + }, + "rest/api/latest/projects/AAA/repos/my-repo/pull-requests/1/merge-base": { + 'id': '548f8ba15abc30875a082156314426806c3f4d97' + } + } + return response_map.get(url, '') + + + ''' + tests the 2-way diff functionality where the diff should be between the HEAD of branch b and node c + NOT between the HEAD of main and the HEAD of branch b + + - o branch b + / + o - o - o main + ^ node c + ''' + def test_get_diff_files_simple_diverge_70(self): + bitbucket_client = MagicMock(Bitbucket) + bitbucket_client.get_pull_request.return_value = { + 'toRef': {'latestCommit': '9c1cffdd9f276074bfb6fb3b70fbee62d298b058'}, + 'fromRef': {'latestCommit': '2a1165446bdf991caf114d01f7c88d84ae7399cf'} + } + bitbucket_client.get_pull_requests_commits.return_value = [ + {'id': '2a1165446bdf991caf114d01f7c88d84ae7399cf', + 'parents': [{'id': 'f617708826cdd0b40abb5245eda71630192a17e3'}]} + ] + bitbucket_client.get_commits.return_value = [ + {'id': '9c1cffdd9f276074bfb6fb3b70fbee62d298b058'}, + {'id': 'dbca09554567d2e4bee7f07993390153280ee450'} + ] + bitbucket_client.get_pull_requests_changes.return_value = [ + { + 'path': {'toString': 'Readme.md'}, + 'type': 'MODIFY', + } + ] + + bitbucket_client.get.side_effect = self.mock_get_from_bitbucket_70 + bitbucket_client.get_content_of_file.side_effect = self.mock_get_content_of_file + + provider = BitbucketServerProvider( + "https://git.onpreminstance.com/projects/AAA/repos/my-repo/pull-requests/1", + bitbucket_client=bitbucket_client + ) + + expected = [ + FilePatchInfo( + 'file\nwith\nmultiple \nlines\nto\nemulate\na\nreal\nfile\n', + 'file\nwith\nmultiple \nlines\nto\nemulate\na\nfake\nfile\n', + '--- \n+++ \n@@ -5,5 +5,5 @@\n to\n emulate\n a\n-real\n+fake\n file\n', + 'Readme.md', + edit_type=EDIT_TYPE.MODIFIED, + ) + ] + + actual = provider.get_diff_files() + + assert actual == expected + + + ''' + tests the 2-way diff functionality where the diff should be between the HEAD of branch b and node c + NOT between the HEAD of main and the HEAD of branch b + + - o - o - o branch b + / / + o - o -- o - o main + ^ node c + ''' + def test_get_diff_files_diverge_with_merge_commit_70(self): + bitbucket_client = MagicMock(Bitbucket) + bitbucket_client.get_pull_request.return_value = { + 'toRef': {'latestCommit': 'cb68a3027d6dda065a7692ebf2c90bed1bcdec28'}, + 'fromRef': {'latestCommit': '1905dcf16c0aac6ac24f7ab617ad09c73dc1d23b'} + } + 
bitbucket_client.get_pull_requests_commits.return_value = [ + {'id': '1905dcf16c0aac6ac24f7ab617ad09c73dc1d23b', + 'parents': [{'id': '692772f456c3db77a90b11ce39ea516f8c2bad93'}]}, + {'id': '692772f456c3db77a90b11ce39ea516f8c2bad93', 'parents': [ + {'id': '2a1165446bdf991caf114d01f7c88d84ae7399cf'}, + {'id': '9c1cffdd9f276074bfb6fb3b70fbee62d298b058'}, + ]}, + {'id': '2a1165446bdf991caf114d01f7c88d84ae7399cf', + 'parents': [{'id': 'f617708826cdd0b40abb5245eda71630192a17e3'}]} + ] + bitbucket_client.get_commits.return_value = [ + {'id': 'cb68a3027d6dda065a7692ebf2c90bed1bcdec28'}, + {'id': '9c1cffdd9f276074bfb6fb3b70fbee62d298b058'}, + {'id': 'dbca09554567d2e4bee7f07993390153280ee450'} + ] + bitbucket_client.get_pull_requests_changes.return_value = [ + { + 'path': {'toString': 'Readme.md'}, + 'type': 'MODIFY', + } + ] + + bitbucket_client.get.side_effect = self.mock_get_from_bitbucket_70 + bitbucket_client.get_content_of_file.side_effect = self.mock_get_content_of_file + + provider = BitbucketServerProvider( + "https://git.onpreminstance.com/projects/AAA/repos/my-repo/pull-requests/1", + bitbucket_client=bitbucket_client + ) + + expected = [ + FilePatchInfo( + 'file\nwith\nsome\nlines\nto\nemulate\na\nreal\nfile\n', + 'file\nwith\nsome\nlines\nto\nemulate\na\nfake\ntest\n', + '--- \n+++ \n@@ -5,5 +5,5 @@\n to\n emulate\n a\n-real\n-file\n+fake\n+test\n', + 'Readme.md', + edit_type=EDIT_TYPE.MODIFIED, + ) + ] + + actual = provider.get_diff_files() + + assert actual == expected + + + ''' + tests the 2-way diff functionality where the diff should be between the HEAD of branch c and node d + NOT between the HEAD of main and the HEAD of branch c + + ---- o - o branch c + / / + ---- o branch b + / / + o - o - o main + ^ node d + ''' + def get_multi_merge_diverge_mock_client(self, api_version): + bitbucket_client = MagicMock(Bitbucket) + bitbucket_client.get_pull_request.return_value = { + 'toRef': {'latestCommit': '9569922b22fe4fd0968be6a50ed99f71efcd0504'}, + 'fromRef': {'latestCommit': 'ae4eca7f222c96d396927d48ab7538e2ee13ca63'} + } + bitbucket_client.get_pull_requests_commits.return_value = [ + {'id': 'ae4eca7f222c96d396927d48ab7538e2ee13ca63', + 'parents': [{'id': 'bbf300fb3af5129af8c44659f8cc7a526a6a6f31'}]}, + {'id': 'bbf300fb3af5129af8c44659f8cc7a526a6a6f31', 'parents': [ + {'id': '10b7b8e41cb370b48ceda8da4e7e6ad033182213'}, + {'id': 'd1bb183c706a3ebe4c2b1158c25878201a27ad8c'}, + ]}, + {'id': 'd1bb183c706a3ebe4c2b1158c25878201a27ad8c', 'parents': [ + {'id': '5bd76251866cb415fc5ff232f63a581e89223bda'}, + {'id': '548f8ba15abc30875a082156314426806c3f4d97'} + ]}, + {'id': '5bd76251866cb415fc5ff232f63a581e89223bda', + 'parents': [{'id': '0e898cb355a5170d8c8771b25d43fcaa1d2d9489'}]}, + {'id': '10b7b8e41cb370b48ceda8da4e7e6ad033182213', + 'parents': [{'id': '0e898cb355a5170d8c8771b25d43fcaa1d2d9489'}]} + ] + bitbucket_client.get_commits.return_value = [ + {'id': '9569922b22fe4fd0968be6a50ed99f71efcd0504'}, + {'id': '548f8ba15abc30875a082156314426806c3f4d97'} + ] + bitbucket_client.get_pull_requests_changes.return_value = [ + { + 'path': {'toString': 'Readme.md'}, + 'type': 'MODIFY', + } + ] + + bitbucket_client.get_content_of_file.side_effect = self.mock_get_content_of_file + if api_version == 60: + bitbucket_client.get.side_effect = self.mock_get_from_bitbucket_60 + elif api_version == 70: + bitbucket_client.get.side_effect = self.mock_get_from_bitbucket_70 + elif api_version == 816: + bitbucket_client.get.side_effect = self.mock_get_from_bitbucket_816 + + return bitbucket_client + + def 
test_get_diff_files_multi_merge_diverge_60(self): + bitbucket_client = self.get_multi_merge_diverge_mock_client(60) + + provider = BitbucketServerProvider( + "https://git.onpreminstance.com/projects/AAA/repos/my-repo/pull-requests/1", + bitbucket_client=bitbucket_client + ) + + expected = [ + FilePatchInfo( + 'file\nwith\nmultiple\nlines\nto\nemulate\na\nreal\nfile', + 'readme\nwithout\nsome\nlines\nto\nsimulate\na\nreal\nfile', + '--- \n+++ \n@@ -1,9 +1,9 @@\n-file\n-with\n-multiple\n+readme\n+without\n+some\n lines\n to\n-emulate\n+simulate\n a\n real\n file', + 'Readme.md', + edit_type=EDIT_TYPE.MODIFIED, + ) + ] + + actual = provider.get_diff_files() + + assert actual == expected + + def test_get_diff_files_multi_merge_diverge_70(self): + bitbucket_client = self.get_multi_merge_diverge_mock_client(70) + + provider = BitbucketServerProvider( + "https://git.onpreminstance.com/projects/AAA/repos/my-repo/pull-requests/1", + bitbucket_client=bitbucket_client + ) + + expected = [ + FilePatchInfo( + 'file\nwith\nsome\nlines\nto\nemulate\na\nreal\nfile', + 'readme\nwithout\nsome\nlines\nto\nsimulate\na\nreal\nfile', + '--- \n+++ \n@@ -1,9 +1,9 @@\n-file\n-with\n+readme\n+without\n some\n lines\n to\n-emulate\n+simulate\n a\n real\n file', + 'Readme.md', + edit_type=EDIT_TYPE.MODIFIED, + ) + ] + + actual = provider.get_diff_files() + + assert actual == expected + + def test_get_diff_files_multi_merge_diverge_816(self): + bitbucket_client = self.get_multi_merge_diverge_mock_client(816) + + provider = BitbucketServerProvider( + "https://git.onpreminstance.com/projects/AAA/repos/my-repo/pull-requests/1", + bitbucket_client=bitbucket_client + ) + + expected = [ + FilePatchInfo( + 'file\nwith\nsome\nlines\nto\nemulate\na\nreal\nfile', + 'readme\nwithout\nsome\nlines\nto\nsimulate\na\nreal\nfile', + '--- \n+++ \n@@ -1,9 +1,9 @@\n-file\n-with\n+readme\n+without\n some\n lines\n to\n-emulate\n+simulate\n a\n real\n file', + 'Readme.md', + edit_type=EDIT_TYPE.MODIFIED, + ) + ] + + actual = provider.get_diff_files() + + assert actual == expected \ No newline at end of file diff --git a/tests/unittest/test_clip_tokens.py b/tests/unittest/test_clip_tokens.py new file mode 100644 index 000000000..79de62940 --- /dev/null +++ b/tests/unittest/test_clip_tokens.py @@ -0,0 +1,19 @@ + +# Generated by CodiumAI + +import pytest + +from pr_agent.algo.utils import clip_tokens + + +class TestClipTokens: + def test_clip(self): + text = "line1\nline2\nline3\nline4\nline5\nline6" + max_tokens = 25 + result = clip_tokens(text, max_tokens) + assert result == text + + max_tokens = 10 + result = clip_tokens(text, max_tokens) + expected_results = 'line1\nline2\nline3\n\n...(truncated)' + assert result == expected_results diff --git a/tests/unittest/test_codecommit_client.py b/tests/unittest/test_codecommit_client.py new file mode 100644 index 000000000..a81e4b326 --- /dev/null +++ b/tests/unittest/test_codecommit_client.py @@ -0,0 +1,136 @@ +from unittest.mock import MagicMock +from pr_agent.git_providers.codecommit_client import CodeCommitClient + + +class TestCodeCommitProvider: + def test_get_differences(self): + # Create a mock CodeCommitClient instance and codecommit_client member + api = CodeCommitClient() + api.boto_client = MagicMock() + + # Mock the response from the AWS client for get_differences method + api.boto_client.get_paginator.return_value.paginate.return_value = [ + { + "differences": [ + { + "beforeBlob": { + "path": "file1.py", + "blobId": "291b15c3ab4219e43a5f4f9091e5a97ee9d7400b", + }, + "afterBlob": { 
+ "path": "file1.py", + "blobId": "46ad86582da03cc34c804c24b17976571bca1eba", + }, + "changeType": "M", + }, + { + "beforeBlob": {"path": "", "blobId": ""}, + "afterBlob": { + "path": "file2.py", + "blobId": "2404c7874fcbd684d6779c1420072f088647fd79", + }, + "changeType": "A", + }, + { + "beforeBlob": { + "path": "file3.py", + "blobId": "9af7989045ce40e9478ebb8089dfbadac19a9cde", + }, + "afterBlob": {"path": "", "blobId": ""}, + "changeType": "D", + }, + { + "beforeBlob": { + "path": "file5.py", + "blobId": "738e36eec120ef9d6393a149252698f49156d5b4", + }, + "afterBlob": { + "path": "file6.py", + "blobId": "faecdb85f7ba199df927a783b261378a1baeca85", + }, + "changeType": "R", + }, + ] + } + ] + + diffs = api.get_differences("my_test_repo", "commit1", "commit2") + + assert len(diffs) == 4 + assert diffs[0].before_blob_path == "file1.py" + assert diffs[0].before_blob_id == "291b15c3ab4219e43a5f4f9091e5a97ee9d7400b" + assert diffs[0].after_blob_path == "file1.py" + assert diffs[0].after_blob_id == "46ad86582da03cc34c804c24b17976571bca1eba" + assert diffs[0].change_type == "M" + assert diffs[1].before_blob_path == "" + assert diffs[1].before_blob_id == "" + assert diffs[1].after_blob_path == "file2.py" + assert diffs[1].after_blob_id == "2404c7874fcbd684d6779c1420072f088647fd79" + assert diffs[1].change_type == "A" + assert diffs[2].before_blob_path == "file3.py" + assert diffs[2].before_blob_id == "9af7989045ce40e9478ebb8089dfbadac19a9cde" + assert diffs[2].after_blob_path == "" + assert diffs[2].after_blob_id == "" + assert diffs[2].change_type == "D" + assert diffs[3].before_blob_path == "file5.py" + assert diffs[3].before_blob_id == "738e36eec120ef9d6393a149252698f49156d5b4" + assert diffs[3].after_blob_path == "file6.py" + assert diffs[3].after_blob_id == "faecdb85f7ba199df927a783b261378a1baeca85" + assert diffs[3].change_type == "R" + + def test_get_file(self): + # Create a mock CodeCommitClient instance and codecommit_client member + api = CodeCommitClient() + api.boto_client = MagicMock() + + # Mock the response from the AWS client for get_pull_request method + # def get_file(self, repo_name: str, file_path: str, sha_hash: str): + api.boto_client.get_file.return_value = { + "commitId": "6335d6d4496e8d50af559560997604bb03abc122", + "blobId": "c172209495d7968a8fdad76469564fb708460bc1", + "filePath": "requirements.txt", + "fileSize": 65, + "fileContent": b"boto3==1.28.25\ndynaconf==3.1.12\nfastapi==0.99.0\nPyGithub==1.59.*\n", + } + + repo_name = "my_test_repo" + file_path = "requirements.txt" + sha_hash = "84114a356ece1e5b7637213c8e486fea7c254656" + content = api.get_file(repo_name, file_path, sha_hash) + + assert len(content) == 65 + assert content == b"boto3==1.28.25\ndynaconf==3.1.12\nfastapi==0.99.0\nPyGithub==1.59.*\n" + assert content.decode("utf-8") == "boto3==1.28.25\ndynaconf==3.1.12\nfastapi==0.99.0\nPyGithub==1.59.*\n" + + def test_get_pr(self): + # Create a mock CodeCommitClient instance and codecommit_client member + api = CodeCommitClient() + api.boto_client = MagicMock() + + # Mock the response from the AWS client for get_pull_request method + api.boto_client.get_pull_request.return_value = { + "pullRequest": { + "pullRequestId": "321", + "title": "My PR", + "description": "My PR description", + "pullRequestTargets": [ + { + "sourceCommit": "commit1", + "sourceReference": "branch1", + "destinationCommit": "commit2", + "destinationReference": "branch2", + "repositoryName": "my_test_repo", + } + ], + } + } + + pr = api.get_pr("my_test_repo", 321) + + assert pr.title == "My PR" 
+ assert pr.description == "My PR description" + assert len(pr.targets) == 1 + assert pr.targets[0].source_commit == "commit1" + assert pr.targets[0].source_branch == "branch1" + assert pr.targets[0].destination_commit == "commit2" + assert pr.targets[0].destination_branch == "branch2" diff --git a/tests/unittest/test_codecommit_provider.py b/tests/unittest/test_codecommit_provider.py new file mode 100644 index 000000000..56312d732 --- /dev/null +++ b/tests/unittest/test_codecommit_provider.py @@ -0,0 +1,189 @@ +import pytest +from unittest.mock import patch +from pr_agent.git_providers.codecommit_provider import CodeCommitFile +from pr_agent.git_providers.codecommit_provider import CodeCommitProvider +from pr_agent.git_providers.codecommit_provider import PullRequestCCMimic +from pr_agent.algo.types import EDIT_TYPE, FilePatchInfo + + +class TestCodeCommitFile: + # Test that a CodeCommitFile object is created successfully with valid parameters. + # Generated by CodiumAI + def test_valid_parameters(self): + a_path = "path/to/file_a" + a_blob_id = "12345" + b_path = "path/to/file_b" + b_blob_id = "67890" + edit_type = EDIT_TYPE.ADDED + + file = CodeCommitFile(a_path, a_blob_id, b_path, b_blob_id, edit_type) + + assert file.a_path == a_path + assert file.a_blob_id == a_blob_id + assert file.b_path == b_path + assert file.b_blob_id == b_blob_id + assert file.edit_type == edit_type + assert file.filename == b_path + + +class TestCodeCommitProvider: + def test_get_title(self): + # Test that the get_title() function returns the PR title + with patch.object(CodeCommitProvider, "__init__", lambda x, y: None): + provider = CodeCommitProvider(None) + provider.pr = PullRequestCCMimic("My Test PR Title", []) + assert provider.get_title() == "My Test PR Title" + + def test_get_pr_id(self): + # Test that the get_pr_id() function returns the correct ID + with patch.object(CodeCommitProvider, "__init__", lambda x, y: None): + provider = CodeCommitProvider(None) + provider.repo_name = "my_test_repo" + provider.pr_num = 321 + assert provider.get_pr_id() == "my_test_repo/321" + + def test_parse_pr_url(self): + # Test that the _parse_pr_url() function can extract the repo name and PR number from a CodeCommit URL + url = "https://us-east-1.console.aws.amazon.com/codesuite/codecommit/repositories/my_test_repo/pull-requests/321" + repo_name, pr_number = CodeCommitProvider._parse_pr_url(url) + assert repo_name == "my_test_repo" + assert pr_number == 321 + + def test_is_valid_codecommit_hostname(self): + # Test the various AWS regions + assert CodeCommitProvider._is_valid_codecommit_hostname("af-south-1.console.aws.amazon.com") + assert CodeCommitProvider._is_valid_codecommit_hostname("ap-east-1.console.aws.amazon.com") + assert CodeCommitProvider._is_valid_codecommit_hostname("ap-northeast-1.console.aws.amazon.com") + assert CodeCommitProvider._is_valid_codecommit_hostname("ap-northeast-2.console.aws.amazon.com") + assert CodeCommitProvider._is_valid_codecommit_hostname("ap-northeast-3.console.aws.amazon.com") + assert CodeCommitProvider._is_valid_codecommit_hostname("ap-south-1.console.aws.amazon.com") + assert CodeCommitProvider._is_valid_codecommit_hostname("ap-south-2.console.aws.amazon.com") + assert CodeCommitProvider._is_valid_codecommit_hostname("ap-southeast-1.console.aws.amazon.com") + assert CodeCommitProvider._is_valid_codecommit_hostname("ap-southeast-2.console.aws.amazon.com") + assert CodeCommitProvider._is_valid_codecommit_hostname("ap-southeast-3.console.aws.amazon.com") + assert 
CodeCommitProvider._is_valid_codecommit_hostname("ap-southeast-4.console.aws.amazon.com") + assert CodeCommitProvider._is_valid_codecommit_hostname("ca-central-1.console.aws.amazon.com") + assert CodeCommitProvider._is_valid_codecommit_hostname("eu-central-1.console.aws.amazon.com") + assert CodeCommitProvider._is_valid_codecommit_hostname("eu-central-2.console.aws.amazon.com") + assert CodeCommitProvider._is_valid_codecommit_hostname("eu-north-1.console.aws.amazon.com") + assert CodeCommitProvider._is_valid_codecommit_hostname("eu-south-1.console.aws.amazon.com") + assert CodeCommitProvider._is_valid_codecommit_hostname("eu-south-2.console.aws.amazon.com") + assert CodeCommitProvider._is_valid_codecommit_hostname("eu-west-1.console.aws.amazon.com") + assert CodeCommitProvider._is_valid_codecommit_hostname("eu-west-2.console.aws.amazon.com") + assert CodeCommitProvider._is_valid_codecommit_hostname("eu-west-3.console.aws.amazon.com") + assert CodeCommitProvider._is_valid_codecommit_hostname("il-central-1.console.aws.amazon.com") + assert CodeCommitProvider._is_valid_codecommit_hostname("me-central-1.console.aws.amazon.com") + assert CodeCommitProvider._is_valid_codecommit_hostname("me-south-1.console.aws.amazon.com") + assert CodeCommitProvider._is_valid_codecommit_hostname("sa-east-1.console.aws.amazon.com") + assert CodeCommitProvider._is_valid_codecommit_hostname("us-east-1.console.aws.amazon.com") + assert CodeCommitProvider._is_valid_codecommit_hostname("us-east-2.console.aws.amazon.com") + assert CodeCommitProvider._is_valid_codecommit_hostname("us-gov-east-1.console.aws.amazon.com") + assert CodeCommitProvider._is_valid_codecommit_hostname("us-gov-west-1.console.aws.amazon.com") + assert CodeCommitProvider._is_valid_codecommit_hostname("us-west-1.console.aws.amazon.com") + assert CodeCommitProvider._is_valid_codecommit_hostname("us-west-2.console.aws.amazon.com") + # Test non-AWS regions + assert not CodeCommitProvider._is_valid_codecommit_hostname("no-such-region.console.aws.amazon.com") + assert not CodeCommitProvider._is_valid_codecommit_hostname("console.aws.amazon.com") + + # Test that an error is raised when an invalid CodeCommit URL is provided to the set_pr() method of the CodeCommitProvider class. + # Generated by CodiumAI + def test_invalid_codecommit_url(self): + provider = CodeCommitProvider() + with pytest.raises(ValueError): + provider.set_pr("https://example.com/codecommit/repositories/my_test_repo/pull-requests/4321") + + def test_get_file_extensions(self): + filenames = [ + "app.py", + "cli.py", + "composer.json", + "composer.lock", + "hello.py", + "image1.jpg", + "image2.JPG", + "index.js", + "provider.py", + "README", + "test.py", + ] + expected_extensions = [ + ".py", + ".py", + ".json", + ".lock", + ".py", + ".jpg", + ".jpg", + ".js", + ".py", + "", + ".py", + ] + extensions = CodeCommitProvider._get_file_extensions(filenames) + assert extensions == expected_extensions + + def test_get_language_percentages(self): + extensions = [ + ".py", + ".py", + ".json", + ".lock", + ".py", + ".jpg", + ".jpg", + ".js", + ".py", + "", + ".py", + ] + percentages = CodeCommitProvider._get_language_percentages(extensions) + assert percentages[".py"] == 45 + assert percentages[".json"] == 9 + assert percentages[".lock"] == 9 + assert percentages[".jpg"] == 18 + assert percentages[".js"] == 9 + assert percentages[""] == 9 + + # The _get_file_extensions function needs the "." prefix on the extension, + # but the _get_language_percentages function will work with or without the "." 
prefix + extensions = [ + "txt", + "py", + "py", + ] + percentages = CodeCommitProvider._get_language_percentages(extensions) + assert percentages["py"] == 67 + assert percentages["txt"] == 33 + + # test an empty list + percentages = CodeCommitProvider._get_language_percentages([]) + assert percentages == {} + + def test_get_edit_type(self): + # Test that the _get_edit_type() function can convert a CodeCommit letter to an EDIT_TYPE enum + assert CodeCommitProvider._get_edit_type("A") == EDIT_TYPE.ADDED + assert CodeCommitProvider._get_edit_type("D") == EDIT_TYPE.DELETED + assert CodeCommitProvider._get_edit_type("M") == EDIT_TYPE.MODIFIED + assert CodeCommitProvider._get_edit_type("R") == EDIT_TYPE.RENAMED + + assert CodeCommitProvider._get_edit_type("a") == EDIT_TYPE.ADDED + assert CodeCommitProvider._get_edit_type("d") == EDIT_TYPE.DELETED + assert CodeCommitProvider._get_edit_type("m") == EDIT_TYPE.MODIFIED + assert CodeCommitProvider._get_edit_type("r") == EDIT_TYPE.RENAMED + + assert CodeCommitProvider._get_edit_type("X") is None + + def test_add_additional_newlines(self): + # a short string to test adding double newlines + input = "abc\ndef\n\n___\nghi\njkl\nmno\n\npqr\n" + expect = "abc\n\ndef\n\n___\n\nghi\n\njkl\n\nmno\n\npqr\n\n" + assert CodeCommitProvider._add_additional_newlines(input) == expect + # a test example from a real PR + input = "## PR Type:\nEnhancement\n\n___\n## PR Description:\nThis PR introduces a new feature to the script, allowing users to filter servers by name.\n\n___\n## PR Main Files Walkthrough:\n`foo`: The foo script has been updated to include a new command line option `-f` or `--filter`.\n`bar`: The bar script has been updated to list stopped servers.\n" + expect = "## PR Type:\n\nEnhancement\n\n___\n\n## PR Description:\n\nThis PR introduces a new feature to the script, allowing users to filter servers by name.\n\n___\n\n## PR Main Files Walkthrough:\n\n`foo`: The foo script has been updated to include a new command line option `-f` or `--filter`.\n\n`bar`: The bar script has been updated to list stopped servers.\n\n" + assert CodeCommitProvider._add_additional_newlines(input) == expect + + def test_remove_markdown_html(self): + input = "## PR Feedback\n<details><summary>Code feedback:</summary>\nfile foo\n</summary>\n" + expect = "## PR Feedback\nCode feedback:\nfile foo\n\n" + assert CodeCommitProvider._remove_markdown_html(input) == expect diff --git a/tests/unittest/test_convert_to_markdown.py b/tests/unittest/test_convert_to_markdown.py index 4463513f7..5d9142030 100644 --- a/tests/unittest/test_convert_to_markdown.py +++ b/tests/unittest/test_convert_to_markdown.py @@ -1,5 +1,6 @@ # Generated by CodiumAI -from pr_agent.algo.utils import convert_to_markdown +from pr_agent.algo.utils import PRReviewHeader, convert_to_markdown_v2 +from pr_agent.tools.pr_description import insert_br_after_x_chars """ Code Analysis @@ -44,74 +45,63 @@ class TestConvertToMarkdown: # Tests that the function works correctly with a simple dictionary input def test_simple_dictionary_input(self): - input_data = { - 'Main theme': 'Test', - 'Type of PR': 'Test type', - 'Relevant tests added': 'no', - 'Unrelated changes': 'n/a', # won't be included in the output - 'Focused PR': 'Yes', - 'General PR suggestions': 'general suggestion...', - 'Code feedback': [ - { - 'Code example': { - 'Before': 'Code before', - 'After': 'Code after' - } - }, - { - 'Code example': { - 'Before': 'Code before 2', - 'After': 'Code after 2' - } - } - ] - } - expected_output = """\ -- ๐ŸŽฏ **Main 
theme:** Test -- ๐Ÿ“Œ **Type of PR:** Test type -- ๐Ÿงช **Relevant tests added:** no -- โœจ **Focused PR:** Yes -- ๐Ÿ’ก **General PR suggestions:** general suggestion... - -- ๐Ÿค– **Code feedback:** - - - **Code example:** - - **Before:** - ``` - Code before - ``` - - **After:** - ``` - Code after - ``` - - - **Code example:** - - **Before:** - ``` - Code before 2 - ``` - - **After:** - ``` - Code after 2 - ``` -""" - assert convert_to_markdown(input_data).strip() == expected_output.strip() + input_data = {'review': { + 'estimated_effort_to_review_[1-5]': '1, because the changes are minimal and straightforward, focusing on a single functionality addition.\n', + 'relevant_tests': 'No\n', 'possible_issues': 'No\n', 'security_concerns': 'No\n'}, 'code_feedback': [ + {'relevant_file': '``pr_agent/git_providers/git_provider.py\n``', 'language': 'python\n', + 'suggestion': "Consider raising an exception or logging a warning when 'pr_url' attribute is not found. This can help in debugging issues related to the absence of 'pr_url' in instances where it's expected. [important]\n", + 'relevant_line': '[return ""](https://github.com/Codium-ai/pr-agent-pro/pull/102/files#diff-52d45f12b836f77ed1aef86e972e65404634ea4e2a6083fb71a9b0f9bb9e062fR199)'}]} + + + expected_output = f'{PRReviewHeader.REGULAR.value} ๐Ÿ”\n\n<table>\n<tr><td>โฑ๏ธ <strong>Estimated effort to review</strong>: 1 ๐Ÿ”ตโšชโšชโšชโšช</td></tr>\n<tr><td>๐Ÿงช <strong>No relevant tests</strong></td></tr>\n<tr><td>โšก <strong>Possible issues</strong>: No\n</td></tr>\n<tr><td>๐Ÿ”’ <strong>No security concerns identified</strong></td></tr>\n</table>\n\n\n<details><summary> <strong>Code feedback:</strong></summary>\n\n<hr><table><tr><td>relevant file</td><td>pr_agent/git_providers/git_provider.py\n</td></tr><tr><td>suggestion      </td><td>\n\n<strong>\n\nConsider raising an exception or logging a warning when \'pr_url\' attribute is not found. This can help in debugging issues related to the absence of \'pr_url\' in instances where it\'s expected. [important]\n\n</strong>\n</td></tr><tr><td>relevant line</td><td><a href=\'https://github.com/Codium-ai/pr-agent-pro/pull/102/files#diff-52d45f12b836f77ed1aef86e972e65404634ea4e2a6083fb71a9b0f9bb9e062fR199\'>return ""</a></td></tr></table><hr>\n\n</details>' + + assert convert_to_markdown_v2(input_data).strip() == expected_output.strip() # Tests that the function works correctly with an empty dictionary input def test_empty_dictionary_input(self): input_data = {} - expected_output = "" - assert convert_to_markdown(input_data).strip() == expected_output.strip() - - def test_dictionary_input_containing_only_empty_dictionaries(self): - input_data = { - 'Main theme': {}, - 'Type of PR': {}, - 'Relevant tests added': {}, - 'Unrelated changes': {}, - 'Focused PR': {}, - 'General PR suggestions': {}, - 'Code suggestions': {} - } - expected_output = "" - assert convert_to_markdown(input_data).strip() == expected_output.strip() + + expected_output = '' + + + assert convert_to_markdown_v2(input_data).strip() == expected_output.strip() + + def test_dictionary_with_empty_dictionaries(self): + input_data = {'review': {}, 'code_feedback': [{}]} + + expected_output = '' + + + assert convert_to_markdown_v2(input_data).strip() == expected_output.strip() + +class TestBR: + def test_br1(self): + file_change_description = '- Imported `FilePatchInfo` and `EDIT_TYPE` from `pr_agent.algo.types` instead of `pr_agent.git_providers.git_provider`.' 
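# NOTE (descriptive comment, not part of the patch): judging from the expected outputs in these
# TestBR cases, insert_br_after_x_chars appears to turn a leading '- ' bullet into <li>, convert
# `backtick` spans into <code> tags, and insert <br> once a rendered line passes the function's
# internal length threshold; the exact threshold lives in pr_agent/tools/pr_description.py.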
+ file_change_description_br = insert_br_after_x_chars(file_change_description) + expected_output = ('<li>Imported <code>FilePatchInfo</code> and <code>EDIT_TYPE</code> from ' + '<code>pr_agent.algo.types</code> instead <br>of ' + '<code>pr_agent.git_providers.git_provider</code>.') + assert file_change_description_br == expected_output + # print("-----") + # print(file_change_description_br) + + def test_br2(self): + file_change_description = ( + '- Created a - new -class `ColorPaletteResourcesCollection ColorPaletteResourcesCollection ' + 'ColorPaletteResourcesCollection ColorPaletteResourcesCollection`') + file_change_description_br = insert_br_after_x_chars(file_change_description) + expected_output = ('<li>Created a - new -class <code>ColorPaletteResourcesCollection </code><br><code>' + 'ColorPaletteResourcesCollection ColorPaletteResourcesCollection ' + '</code><br><code>ColorPaletteResourcesCollection</code>') + assert file_change_description_br == expected_output + # print("-----") + # print(file_change_description_br) + + def test_br3(self): + file_change_description = 'Created a new class `ColorPaletteResourcesCollection` which extends `AvaloniaDictionary<ThemeVariant, ColorPaletteResources>` and implements aaa' + file_change_description_br = insert_br_after_x_chars(file_change_description) + assert file_change_description_br == ('Created a new class <code>ColorPaletteResourcesCollection</code> which ' + 'extends <br><code>AvaloniaDictionary<ThemeVariant, ColorPaletteResources>' + '</code> and implements <br>aaa') + # print("-----") + # print(file_change_description_br) diff --git a/tests/unittest/test_extend_patch.py b/tests/unittest/test_extend_patch.py index ba0af881b..2d8913f39 100644 --- a/tests/unittest/test_extend_patch.py +++ b/tests/unittest/test_extend_patch.py @@ -1,54 +1,22 @@ - -# Generated by CodiumAI - - +import pytest from pr_agent.algo.git_patch_processing import extend_patch - -""" -Code Analysis - -Objective: -The objective of the 'extend_patch' function is to extend a given patch to include a specified number of surrounding -lines. This function takes in an original file string, a patch string, and the number of lines to extend the patch by, -and returns the extended patch string. - -Inputs: -- original_file_str: a string representing the original file -- patch_str: a string representing the patch to be extended -- num_lines: an integer representing the number of lines to extend the patch by - -Flow: -1. Split the original file string and patch string into separate lines -2. Initialize variables to keep track of the current hunk's start and size for both the original file and the patch -3. Iterate through each line in the patch string -4. If the line starts with '@@', extract the start and size values for both the original file and the patch, and -calculate the extended start and size values -5. Append the extended hunk header to the extended patch lines list -6. Append the specified number of lines before the hunk to the extended patch lines list -7. Append the current line to the extended patch lines list -8. If the line is not a hunk header, append it to the extended patch lines list -9. 
Return the extended patch string - -Outputs: -- extended_patch_str: a string representing the extended patch - -Additional aspects: -- The function uses regular expressions to extract the start and size values from the hunk header -- The function handles cases where the start value of a hunk is less than the number of lines to extend by by setting -the extended start value to 1 -- The function handles cases where the hunk extends beyond the end of the original file by only including lines up to -the end of the original file in the extended patch -""" +from pr_agent.algo.pr_processing import pr_generate_extended_diff +from pr_agent.algo.token_handler import TokenHandler +from pr_agent.config_loader import get_settings class TestExtendPatch: + def setUp(self): + get_settings().config.allow_dynamic_context = False + # Tests that the function works correctly with valid input def test_happy_path(self): original_file_str = 'line1\nline2\nline3\nline4\nline5' - patch_str = '@@ -2,2 +2,2 @@ init()\n-line2\n+new_line2\nline3' + patch_str = '@@ -2,2 +2,2 @@ init()\n-line2\n+new_line2\n line3' num_lines = 1 - expected_output = '@@ -1,4 +1,4 @@ init()\nline1\n-line2\n+new_line2\nline3\nline4' - actual_output = extend_patch(original_file_str, patch_str, num_lines) + expected_output = '\n@@ -1,4 +1,4 @@ init()\n line1\n-line2\n+new_line2\n line3\n line4' + actual_output = extend_patch(original_file_str, patch_str, + patch_extra_lines_before=num_lines, patch_extra_lines_after=num_lines) assert actual_output == expected_output # Tests that the function returns an empty string when patch_str is empty @@ -57,14 +25,16 @@ def test_empty_patch(self): patch_str = '' num_lines = 1 expected_output = '' - assert extend_patch(original_file_str, patch_str, num_lines) == expected_output + assert extend_patch(original_file_str, patch_str, + patch_extra_lines_before=num_lines, patch_extra_lines_after=num_lines) == expected_output # Tests that the function returns the original patch when num_lines is 0 def test_zero_num_lines(self): original_file_str = 'line1\nline2\nline3\nline4\nline5' patch_str = '@@ -2,2 +2,2 @@ init()\n-line2\n+new_line2\nline3' num_lines = 0 - assert extend_patch(original_file_str, patch_str, num_lines) == patch_str + assert extend_patch(original_file_str, patch_str, + patch_extra_lines_before=num_lines, patch_extra_lines_after=num_lines) == patch_str # Tests that the function returns the original patch when patch_str contains no hunks def test_no_hunks(self): @@ -77,17 +47,111 @@ def test_no_hunks(self): # Tests that the function extends a patch with a single hunk correctly def test_single_hunk(self): original_file_str = 'line1\nline2\nline3\nline4\nline5' - patch_str = '@@ -2,3 +2,3 @@ init()\n-line2\n+new_line2\nline3\nline4' - num_lines = 1 - expected_output = '@@ -1,5 +1,5 @@ init()\nline1\n-line2\n+new_line2\nline3\nline4\nline5' - actual_output = extend_patch(original_file_str, patch_str, num_lines) - assert actual_output == expected_output + patch_str = '@@ -2,3 +2,3 @@ init()\n-line2\n+new_line2\n line3\n line4' + + for num_lines in [1, 2, 3]: # check that even if we are over the number of lines in the file, the function still works + expected_output = '\n@@ -1,5 +1,5 @@ init()\n line1\n-line2\n+new_line2\n line3\n line4\n line5' + actual_output = extend_patch(original_file_str, patch_str, + patch_extra_lines_before=num_lines, patch_extra_lines_after=num_lines) + assert actual_output == expected_output # Tests the functionality of extending a patch with multiple hunks. 
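# NOTE (illustrative sketch, not part of the patch): the updated tests above show that extend_patch
# now takes separate patch_extra_lines_before / patch_extra_lines_after keyword arguments instead of
# the old single num_lines parameter. A minimal standalone call, reusing the values from
# test_happy_path, would look like this (assumes the pr_agent package is importable):

from pr_agent.algo.git_patch_processing import extend_patch

original_file_str = 'line1\nline2\nline3\nline4\nline5'
patch_str = '@@ -2,2 +2,2 @@ init()\n-line2\n+new_line2\n line3'
extended = extend_patch(original_file_str, patch_str,
                        patch_extra_lines_before=1, patch_extra_lines_after=1)
# Per test_happy_path, `extended` is expected to equal:
# '\n@@ -1,4 +1,4 @@ init()\n line1\n-line2\n+new_line2\n line3\n line4'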
def test_multiple_hunks(self): original_file_str = 'line1\nline2\nline3\nline4\nline5\nline6' - patch_str = '@@ -2,3 +2,3 @@ init()\n-line2\n+new_line2\nline3\nline4\n@@ -4,1 +4,1 @@ init2()\n-line4\n+new_line4' # noqa: E501 + patch_str = '@@ -2,3 +2,3 @@ init()\n-line2\n+new_line2\n line3\n line4\n@@ -4,1 +4,1 @@ init2()\n-line4\n+new_line4' # noqa: E501 num_lines = 1 - expected_output = '@@ -1,5 +1,5 @@ init()\nline1\n-line2\n+new_line2\nline3\nline4\nline5\n@@ -3,3 +3,3 @@ init2()\nline3\n-line4\n+new_line4\nline5' # noqa: E501 - actual_output = extend_patch(original_file_str, patch_str, num_lines) + original_allow_dynamic_context = get_settings().config.allow_dynamic_context + + get_settings().config.allow_dynamic_context = False + expected_output = '\n@@ -1,5 +1,5 @@ init()\n line1\n-line2\n+new_line2\n line3\n line4\n line5\n\n@@ -3,3 +3,3 @@ init2()\n line3\n-line4\n+new_line4\n line5' # noqa: E501 + actual_output = extend_patch(original_file_str, patch_str, + patch_extra_lines_before=num_lines, patch_extra_lines_after=num_lines) assert actual_output == expected_output + + get_settings().config.allow_dynamic_context = True + expected_output = '\n@@ -1,5 +1,5 @@ init()\n line1\n-line2\n+new_line2\n line3\n line4\n line5\n\n@@ -3,3 +3,3 @@ init2()\n line3\n-line4\n+new_line4\n line5' # noqa: E501 + actual_output = extend_patch(original_file_str, patch_str, + patch_extra_lines_before=num_lines, patch_extra_lines_after=num_lines) + assert actual_output == expected_output + get_settings().config.allow_dynamic_context = original_allow_dynamic_context + + + def test_dynamic_context(self): + get_settings().config.max_extra_lines_before_dynamic_context = 10 + original_file_str = "def foo():" + for i in range(9): + original_file_str += f"\n line({i})" + patch_str ="@@ -11,1 +11,1 @@ def foo():\n- line(9)\n+ new_line(9)" + num_lines=1 + + get_settings().config.allow_dynamic_context = True + actual_output = extend_patch(original_file_str, patch_str, + patch_extra_lines_before=num_lines, patch_extra_lines_after=num_lines) + expected_output='\n@@ -1,10 +1,10 @@ \n def foo():\n line(0)\n line(1)\n line(2)\n line(3)\n line(4)\n line(5)\n line(6)\n line(7)\n line(8)\n- line(9)\n+ new_line(9)' + assert actual_output == expected_output + + get_settings().config.allow_dynamic_context = False + actual_output2 = extend_patch(original_file_str, patch_str, + patch_extra_lines_before=num_lines, patch_extra_lines_after=num_lines) + expected_output_no_dynamic_context = '\n@@ -10,1 +10,1 @@ def foo():\n line(8)\n- line(9)\n+ new_line(9)' + assert actual_output2 == expected_output_no_dynamic_context + + + + + +class TestExtendedPatchMoreLines: + def setUp(self): + get_settings().config.allow_dynamic_context = False + + class File: + def __init__(self, base_file, patch, filename, ai_file_summary=None): + self.base_file = base_file + self.patch = patch + self.filename = filename + self.ai_file_summary = ai_file_summary + + @pytest.fixture + def token_handler(self): + # Create a TokenHandler instance with dummy data + th = TokenHandler(system="System prompt", user="User prompt") + th.prompt_tokens = 100 + return th + + @pytest.fixture + def pr_languages(self): + # Create a list of languages with files containing base_file and patch data + return [ + { + 'files': [ + self.File(base_file="line000\nline00\nline0\nline1\noriginal content\nline2\nline3\nline4\nline5\nline6\nline7\nline8\nline9\nline10", + patch="@@ -5,5 +5,5 @@\n-original content\n+modified content\n line2\n line3\n line4\n line5", + filename="file1"), 
+ self.File(base_file="original content\nline2\nline3\nline4\nline5\nline6\nline7\nline8\nline9\nline10", + patch="@@ -6,5 +6,5 @@\nline6\nline7\nline8\n-line9\n+modified line9\nline10", + filename="file2") + ] + } + ] + + def test_extend_patches_with_extra_lines(self, token_handler, pr_languages): + patches_extended_no_extra_lines, total_tokens, patches_extended_tokens = pr_generate_extended_diff( + pr_languages, token_handler, add_line_numbers_to_hunks=False, + patch_extra_lines_before=0, + patch_extra_lines_after=0 + ) + + # Check that with no extra lines, the patches are the same as the original patches + p0 = patches_extended_no_extra_lines[0].strip() + p1 = patches_extended_no_extra_lines[1].strip() + assert p0 == '## file1\n' + pr_languages[0]['files'][0].patch.strip() + assert p1 == '## file2\n' + pr_languages[0]['files'][1].patch.strip() + + patches_extended_with_extra_lines, total_tokens, patches_extended_tokens = pr_generate_extended_diff( + pr_languages, token_handler, add_line_numbers_to_hunks=False, + patch_extra_lines_before=2, + patch_extra_lines_after=1 + ) + + p0_extended = patches_extended_with_extra_lines[0].strip() + assert p0_extended == '## file1\n\n@@ -3,8 +3,8 @@ \n line0\n line1\n-original content\n+modified content\n line2\n line3\n line4\n line5\n line6' diff --git a/tests/unittest/test_file_filter.py b/tests/unittest/test_file_filter.py new file mode 100644 index 000000000..43e9c9b4f --- /dev/null +++ b/tests/unittest/test_file_filter.py @@ -0,0 +1,80 @@ +import pytest +from pr_agent.algo.file_filter import filter_ignored +from pr_agent.config_loader import global_settings + +class TestIgnoreFilter: + def test_no_ignores(self): + """ + Test no files are ignored when no patterns are specified. + """ + files = [ + type('', (object,), {'filename': 'file1.py'})(), + type('', (object,), {'filename': 'file2.java'})(), + type('', (object,), {'filename': 'file3.cpp'})(), + type('', (object,), {'filename': 'file4.py'})(), + type('', (object,), {'filename': 'file5.py'})() + ] + assert filter_ignored(files) == files, "Expected all files to be returned when no ignore patterns are given." + + def test_glob_ignores(self, monkeypatch): + """ + Test files are ignored when glob patterns are specified. + """ + monkeypatch.setattr(global_settings.ignore, 'glob', ['*.py']) + + files = [ + type('', (object,), {'filename': 'file1.py'})(), + type('', (object,), {'filename': 'file2.java'})(), + type('', (object,), {'filename': 'file3.cpp'})(), + type('', (object,), {'filename': 'file4.py'})(), + type('', (object,), {'filename': 'file5.py'})() + ] + expected = [ + files[1], + files[2] + ] + + filtered_files = filter_ignored(files) + assert filtered_files == expected, f"Expected {[file.filename for file in expected]}, but got {[file.filename for file in filtered_files]}." + + def test_regex_ignores(self, monkeypatch): + """ + Test files are ignored when regex patterns are specified. + """ + monkeypatch.setattr(global_settings.ignore, 'regex', ['^file[2-4]\..*$']) + + files = [ + type('', (object,), {'filename': 'file1.py'})(), + type('', (object,), {'filename': 'file2.java'})(), + type('', (object,), {'filename': 'file3.cpp'})(), + type('', (object,), {'filename': 'file4.py'})(), + type('', (object,), {'filename': 'file5.py'})() + ] + expected = [ + files[0], + files[4] + ] + + filtered_files = filter_ignored(files) + assert filtered_files == expected, f"Expected {[file.filename for file in expected]}, but got {[file.filename for file in filtered_files]}." 
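# NOTE (illustrative sketch, not part of the patch): outside of tests, the ignore patterns that the
# monkeypatch above simulates would presumably be configured through the settings object, following
# the same get_settings().set(...) pattern used elsewhere in this test suite. The "ignore.glob" /
# "ignore.regex" key names are inferred from the global_settings.ignore attribute used here and
# should be verified against pr_agent/settings/configuration.toml.

from pr_agent.config_loader import get_settings

get_settings().set("ignore.glob", ["*.py"])
get_settings().set("ignore.regex", [r"^file[2-4]\..*$"])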
+ + def test_invalid_regex(self, monkeypatch): + """ + Test invalid patterns are quietly ignored. + """ + monkeypatch.setattr(global_settings.ignore, 'regex', ['(((||', '^file[2-4]\..*$']) + + files = [ + type('', (object,), {'filename': 'file1.py'})(), + type('', (object,), {'filename': 'file2.java'})(), + type('', (object,), {'filename': 'file3.cpp'})(), + type('', (object,), {'filename': 'file4.py'})(), + type('', (object,), {'filename': 'file5.py'})() + ] + expected = [ + files[0], + files[4] + ] + + filtered_files = filter_ignored(files) + assert filtered_files == expected, f"Expected {[file.filename for file in expected]}, but got {[file.filename for file in filtered_files]}." diff --git a/tests/unittest/test_find_line_number_of_relevant_line_in_file.py b/tests/unittest/test_find_line_number_of_relevant_line_in_file.py index 7488c6dff..fcb028cae 100644 --- a/tests/unittest/test_find_line_number_of_relevant_line_in_file.py +++ b/tests/unittest/test_find_line_number_of_relevant_line_in_file.py @@ -1,8 +1,7 @@ # Generated by CodiumAI -from pr_agent.git_providers.git_provider import FilePatchInfo -from pr_agent.algo.pr_processing import find_line_number_of_relevant_line_in_file - +from pr_agent.algo.types import FilePatchInfo +from pr_agent.algo.utils import find_line_number_of_relevant_line_in_file import pytest diff --git a/tests/unittest/test_github_action_output.py b/tests/unittest/test_github_action_output.py new file mode 100644 index 000000000..2b8e0db19 --- /dev/null +++ b/tests/unittest/test_github_action_output.py @@ -0,0 +1,50 @@ +import os +import json +from pr_agent.algo.utils import get_settings, github_action_output + +class TestGitHubOutput: + def test_github_action_output_enabled(self, monkeypatch, tmp_path): + get_settings().set('GITHUB_ACTION_CONFIG.ENABLE_OUTPUT', True) + monkeypatch.setenv('GITHUB_OUTPUT', str(tmp_path / 'output')) + output_data = {'key1': {'value1': 1, 'value2': 2}} + key_name = 'key1' + + github_action_output(output_data, key_name) + + with open(str(tmp_path / 'output'), 'r') as f: + env_value = f.read() + + actual_key = env_value.split('=')[0] + actual_data = json.loads(env_value.split('=')[1]) + + assert actual_key == key_name + assert actual_data == output_data[key_name] + + def test_github_action_output_disabled(self, monkeypatch, tmp_path): + get_settings().set('GITHUB_ACTION_CONFIG.ENABLE_OUTPUT', False) + monkeypatch.setenv('GITHUB_OUTPUT', str(tmp_path / 'output')) + output_data = {'key1': {'value1': 1, 'value2': 2}} + key_name = 'key1' + + github_action_output(output_data, key_name) + + assert not os.path.exists(str(tmp_path / 'output')) + + def test_github_action_output_notset(self, monkeypatch, tmp_path): + # not set config + monkeypatch.setenv('GITHUB_OUTPUT', str(tmp_path / 'output')) + output_data = {'key1': {'value1': 1, 'value2': 2}} + key_name = 'key1' + + github_action_output(output_data, key_name) + + assert not os.path.exists(str(tmp_path / 'output')) + + def test_github_action_output_error_case(self, monkeypatch, tmp_path): + monkeypatch.setenv('GITHUB_OUTPUT', str(tmp_path / 'output')) + output_data = None # invalid data + key_name = 'key1' + + github_action_output(output_data, key_name) + + assert not os.path.exists(str(tmp_path / 'output')) \ No newline at end of file diff --git a/tests/unittest/test_handle_patch_deletions.py b/tests/unittest/test_handle_patch_deletions.py index 152ea4b23..e44c0d771 100644 --- a/tests/unittest/test_handle_patch_deletions.py +++ b/tests/unittest/test_handle_patch_deletions.py @@ -43,18 +43,6 
@@ def test_handle_patch_deletions_happy_path_new_file_content_exists(self): assert handle_patch_deletions(patch, original_file_content_str, new_file_content_str, file_name) == patch.rstrip() - # Tests that handle_patch_deletions logs a message when verbosity_level is greater than 0 - def test_handle_patch_deletions_happy_path_verbosity_level_greater_than_0(self, caplog): - patch = '--- a/file.py\n+++ b/file.py\n@@ -1,2 +1,2 @@\n-foo\n-bar\n+baz\n' - original_file_content_str = 'foo\nbar\n' - new_file_content_str = '' - file_name = 'file.py' - get_settings().config.verbosity_level = 1 - - with caplog.at_level(logging.INFO): - handle_patch_deletions(patch, original_file_content_str, new_file_content_str, file_name) - assert any("Processing file" in message for message in caplog.messages) - # Tests that handle_patch_deletions returns 'File was deleted' when new_file_content_str is empty def test_handle_patch_deletions_edge_case_new_file_content_empty(self): patch = '--- a/file.py\n+++ b/file.py\n@@ -1,2 +1,2 @@\n-foo\n-bar\n' diff --git a/tests/unittest/test_language_handler.py b/tests/unittest/test_language_handler.py index 875ec1a72..fdde7bb03 100644 --- a/tests/unittest/test_language_handler.py +++ b/tests/unittest/test_language_handler.py @@ -61,7 +61,7 @@ def test_edge_case_empty_languages(self): type('', (object,), {'filename': 'file1.py'})(), type('', (object,), {'filename': 'file2.java'})() ] - expected_output = [{'language': 'Other', 'files': []}] + expected_output = [{'language': 'Other', 'files': files}] assert sort_files_by_main_languages(languages, files) == expected_output # Tests that function handles empty files list diff --git a/tests/unittest/test_load_yaml.py b/tests/unittest/test_load_yaml.py index a345aee23..17d469360 100644 --- a/tests/unittest/test_load_yaml.py +++ b/tests/unittest/test_load_yaml.py @@ -2,6 +2,9 @@ # Generated by CodiumAI import pytest +import yaml +from yaml.scanner import ScannerError + from pr_agent.algo.utils import load_yaml @@ -12,13 +15,13 @@ def test_load_valid_yaml(self): expected_output = {'name': 'John Smith', 'age': 35} assert load_yaml(yaml_str) == expected_output - def test_load_complicated_yaml(self): + def test_load_invalid_yaml1(self): yaml_str = \ '''\ PR Analysis: Main theme: Enhancing the `/describe` command prompt by adding title and description Type of PR: Enhancement - Relevant tests added: No + Relevant tests: No Focused PR: Yes, the PR is focused on enhancing the `/describe` command prompt. PR Feedback: @@ -26,7 +29,25 @@ def test_load_complicated_yaml(self): Code feedback: - relevant file: pr_agent/settings/pr_description_prompts.toml suggestion: Consider using a more descriptive variable name than 'user' for the command prompt. A more descriptive name would make the code more readable and maintainable. [medium] - relevant line: 'user="""PR Info:' + relevant line: user="""PR Info: aaa Security concerns: No''' - expected_output = {'PR Analysis': {'Main theme': 'Enhancing the `/describe` command prompt by adding title and description', 'Type of PR': 'Enhancement', 'Relevant tests added': False, 'Focused PR': 'Yes, the PR is focused on enhancing the `/describe` command prompt.'}, 'PR Feedback': {'General suggestions': 'The PR seems to be well-structured and focused on a specific enhancement. 
However, it would be beneficial to add tests to ensure the new feature works as expected.', 'Code feedback': [{'relevant file': 'pr_agent/settings/pr_description_prompts.toml', 'suggestion': "Consider using a more descriptive variable name than 'user' for the command prompt. A more descriptive name would make the code more readable and maintainable. [medium]", 'relevant line': 'user="""PR Info:'}], 'Security concerns': False}} + with pytest.raises(ScannerError): + yaml.safe_load(yaml_str) + + expected_output = {'PR Analysis': {'Main theme': 'Enhancing the `/describe` command prompt by adding title and description', 'Type of PR': 'Enhancement', 'Relevant tests': False, 'Focused PR': 'Yes, the PR is focused on enhancing the `/describe` command prompt.'}, 'PR Feedback': {'General suggestions': 'The PR seems to be well-structured and focused on a specific enhancement. However, it would be beneficial to add tests to ensure the new feature works as expected.', 'Code feedback': [{'relevant file': 'pr_agent/settings/pr_description_prompts.toml\n', 'suggestion': "Consider using a more descriptive variable name than 'user' for the command prompt. A more descriptive name would make the code more readable and maintainable. [medium]", 'relevant line': 'user="""PR Info: aaa\n'}], 'Security concerns': False}} + assert load_yaml(yaml_str) == expected_output + + def test_load_invalid_yaml2(self): + yaml_str = '''\ +- relevant file: src/app.py: + suggestion content: The print statement is outside inside the if __name__ ==: \ +''' + with pytest.raises(ScannerError): + yaml.safe_load(yaml_str) + + expected_output = [{'relevant file': 'src/app.py:\n', 'suggestion content': 'The print statement is outside inside the if __name__ ==:'}] assert load_yaml(yaml_str) == expected_output + + + + diff --git a/tests/unittest/test_parse_code_suggestion.py b/tests/unittest/test_parse_code_suggestion.py index aaa03f72f..5ffd9f3e1 100644 --- a/tests/unittest/test_parse_code_suggestion.py +++ b/tests/unittest/test_parse_code_suggestion.py @@ -61,7 +61,7 @@ def test_no_code_example_key(self): 'before': 'Before 1', 'after': 'After 1' } - expected_output = " **suggestion:** Suggestion 1\n **description:** Description 1\n **before:** Before 1\n **after:** After 1\n\n" # noqa: E501 + expected_output = ' **suggestion:** Suggestion 1 \n **description:** Description 1 \n **before:** Before 1 \n **after:** After 1 \n\n' # noqa: E501 assert parse_code_suggestion(code_suggestions) == expected_output # Tests that function returns correct output when input dictionary has 'code example' key @@ -74,5 +74,5 @@ def test_with_code_example_key(self): 'after': 'After 2' } } - expected_output = " **suggestion:** Suggestion 2\n **description:** Description 2\n - **code example:**\n - **before:**\n ```\n Before 2\n ```\n - **after:**\n ```\n After 2\n ```\n\n" # noqa: E501 + expected_output = ' **suggestion:** Suggestion 2 \n **description:** Description 2 \n - **code example:**\n - **before:**\n ```\n Before 2\n ```\n - **after:**\n ```\n After 2\n ```\n\n' # noqa: E501 assert parse_code_suggestion(code_suggestions) == expected_output diff --git a/tests/unittest/test_try_fix_yaml.py b/tests/unittest/test_try_fix_yaml.py new file mode 100644 index 000000000..90fa63f12 --- /dev/null +++ b/tests/unittest/test_try_fix_yaml.py @@ -0,0 +1,90 @@ + +# Generated by CodiumAI +from pr_agent.algo.utils import try_fix_yaml + + +import pytest + +class TestTryFixYaml: + + # The function successfully parses a valid YAML string. 
+ def test_valid_yaml(self): + review_text = "key: value\n" + expected_output = {"key": "value"} + assert try_fix_yaml(review_text) == expected_output + + # The function adds '|-' to 'relevant line:' if it is not already present and successfully parses the YAML string. + def test_add_relevant_line(self): + review_text = "relevant line: value: 3\n" + expected_output = {'relevant line': 'value: 3\n'} + assert try_fix_yaml(review_text) == expected_output + + # The function extracts YAML snippet + def test_extract_snippet(self): + review_text = '''\ +Here is the answer in YAML format: + +```yaml +name: John Smith +age: 35 +``` +''' + expected_output = {'name': 'John Smith', 'age': 35} + assert try_fix_yaml(review_text) == expected_output + + # The function removes the last line(s) of the YAML string and successfully parses the YAML string. + def test_remove_last_line(self): + review_text = "key: value\nextra invalid line\n" + expected_output = {"key": "value"} + assert try_fix_yaml(review_text) == expected_output + + # The YAML string is empty. + def test_empty_yaml_fixed(self): + review_text = "" + assert try_fix_yaml(review_text) is None + + + # The function extracts YAML snippet + def test_no_initial_yaml(self): + review_text = '''\ +I suggest the following: + +code_suggestions: +- relevant_file: | + src/index.ts + label: | + best practice + +- relevant_file: | + src/index2.ts + label: | + enhancment +``` + +We can further improve the code by using the `const` keyword instead of `var` in the `src/index.ts` file. +''' + expected_output = {'code_suggestions': [{'relevant_file': 'src/index.ts\n', 'label': 'best practice\n'}, {'relevant_file': 'src/index2.ts\n', 'label': 'enhancment'}]} + + assert try_fix_yaml(review_text, first_key='code_suggestions', last_key='label') == expected_output + + def test_with_initial_yaml(self): + review_text = '''\ +I suggest the following: + +``` +code_suggestions: +- relevant_file: | + src/index.ts + label: | + best practice + +- relevant_file: | + src/index2.ts + label: | + enhancment +``` + +We can further improve the code by using the `const` keyword instead of `var` in the `src/index.ts` file. +''' + expected_output = {'code_suggestions': [{'relevant_file': 'src/index.ts\n', 'label': 'best practice\n'}, {'relevant_file': 'src/index2.ts\n', 'label': 'enhancment'}]} + assert try_fix_yaml(review_text, first_key='code_suggestions', last_key='label') == expected_output \ No newline at end of file
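# NOTE (illustrative sketch, not part of the patch): a minimal end-to-end use of try_fix_yaml,
# mirroring test_extract_snippet above. The import path, the review text, and the expected result
# are all taken from the tests in tests/unittest/test_try_fix_yaml.py.

from pr_agent.algo.utils import try_fix_yaml

model_reply = '''\
Here is the answer in YAML format:

```yaml
name: John Smith
age: 35
```
'''

data = try_fix_yaml(model_reply)
print(data)  # expected, per test_extract_snippet: {'name': 'John Smith', 'age': 35}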