From aa1778132e04415dcbb0c48aeb93c2b1d82cd9fb Mon Sep 17 00:00:00 2001 From: Nicolas Schweitzer Date: Sat, 1 Jun 2024 01:03:38 +0200 Subject: [PATCH 1/8] commit the model usage, missing model load --- .github/workflows/assign_issue.yml | 34 +++++++ tasks/issue.py | 21 ++++ tasks/libs/common/utils.py | 2 - tasks/libs/issue/assign.py | 146 ++++++++++++++++++++++++++++ tasks/libs/issue/model/actions.py | 117 ++++++++++++++++++++++ tasks/libs/issue/model/constants.py | 42 ++++++++ 6 files changed, 360 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/assign_issue.yml create mode 100644 tasks/issue.py create mode 100644 tasks/libs/issue/assign.py create mode 100644 tasks/libs/issue/model/actions.py create mode 100644 tasks/libs/issue/model/constants.py diff --git a/.github/workflows/assign_issue.yml b/.github/workflows/assign_issue.yml new file mode 100644 index 0000000000000..bd8f4a94458ad --- /dev/null +++ b/.github/workflows/assign_issue.yml @@ -0,0 +1,34 @@ +--- +name: "Assign issue to a team" + +on: + issues: + types: [opened, reopened] + +jobs: + auto_assign_issue: + runs-on: ubuntu-latest + permissions: + issues: write + steps: + - name: Checkout repository + uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4.1.4 + with: + ref: ${{ github.head_ref }} + - name: Install python + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 + with: + python-version: 3.11 + cache: "pip" + - name: Install Python dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt -r tasks/libs/requirements-github.txt + pip install pytorch==1.0.2 transformers==4.41.2 + - name: Assign issue + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + team=`inv -e issue.assign --issue ${{ github.event.issue.number }}` + echo "Issue ${{ github.event.issue.number }} assigned to $team" + gh issue edit "${{ github.event.issue.number }}" --add-label "team/$team" diff --git a/tasks/issue.py b/tasks/issue.py new file mode 100644 index 0000000000000..cd0d2d9005110 --- /dev/null +++ b/tasks/issue.py @@ -0,0 +1,21 @@ +from invoke import task + +from tasks.libs.ciproviders.github_api import GithubAPI +from tasks.libs.issue.assign import assign_with_model, assign_with_rules +from tasks.libs.issue.model.actions import generate_model + + +@task +def assign_owner(_, issue_id): + gh = GithubAPI('DataDog/datadog-agent') + issue = gh.repo.get_issue(int(issue_id)) + owner, confidence = assign_with_model(issue) + if confidence < 0.5: + owner = assign_with_rules(issue, gh) + print(owner) + return owner + + +@task +def generate_the_model(_): + generate_model() diff --git a/tasks/libs/common/utils.py b/tasks/libs/common/utils.py index 1bbd88910a8e5..a2643c3bce2a1 100644 --- a/tasks/libs/common/utils.py +++ b/tasks/libs/common/utils.py @@ -11,11 +11,9 @@ import tempfile import time import traceback -from collections import Counter from contextlib import contextmanager from dataclasses import dataclass from functools import wraps -from pathlib import Path from subprocess import CalledProcessError, check_output from types import SimpleNamespace diff --git a/tasks/libs/issue/assign.py b/tasks/libs/issue/assign.py new file mode 100644 index 0000000000000..7c196070cbc7f --- /dev/null +++ b/tasks/libs/issue/assign.py @@ -0,0 +1,146 @@ +from collections import Counter +from pathlib import Path + +from tasks.libs.ciproviders.github_api import get_github_teams +from tasks.libs.issue.model.constants import BASE_MODEL, MODEL, TEAMS +from 
tasks.libs.owners.parsing import most_frequent_agent_team, search_owners + + +def assign_with_model(issue): + import torch + from transformers import AutoModelForSequenceClassification, AutoTokenizer + + m = AutoModelForSequenceClassification.from_pretrained( + f"{MODEL}", ignore_mismatched_sizes=True, local_files_only=True + ) + m.eval() + tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL) + inputs = tokenizer( + f"{issue.title} {issue.body}".casefold(), + padding='max_length', + truncation=True, + max_length=64, + return_tensors='pt', + ) + with torch.no_grad(): + outputs = m(**inputs) + logits = outputs.logits + proba = torch.softmax(logits, dim=1) + predicted_class = torch.argmax(proba).item() + confidence = proba[0][predicted_class].item() + return TEAMS[torch.argmax(outputs.logits).item()], confidence + + +def assign_with_rules(issue, gh): + owner = guess_from_labels(issue) + if owner == 'triage': + users = [user for user in issue.assignees if gh.is_organization_member(user)] + teams = get_github_teams(users) + owner = most_frequent_agent_team(teams) + if owner == 'triage': + commenters = [c.user for c in issue.get_comments() if gh.is_organization_member(c.user)] + teams = get_github_teams(commenters) + owner = most_frequent_agent_team(teams) + if owner == 'triage': + owner = guess_from_keywords(issue) + return team_to_label(owner) + + +def guess_from_labels(issue): + for label in issue.labels: + if label.name.startswith("team/") and "triage" not in label.name: + return label.name.split("/")[-1] + return 'triage' + + +def guess_from_keywords(issue): + text = f"{issue.title} {issue.body}".casefold().split() + c = Counter(text) + for word in c.most_common(): + team = simple_match(word[0]) + if team: + return team + team = file_match(word[0]) + if team: + return team + return "triage" + + +def simple_match(word): + pattern_matching = { + "agent-apm": ['apm', 'java', 'dotnet', 'ruby', 'trace'], + "containers": [ + 'container', + 'pod', + 'kubernetes', + 'orchestrator', + 'docker', + 'k8s', + 'kube', + 'cluster', + 'kubelet', + 'helm', + ], + "agent-metrics-logs": ['logs', 'metric', 'log-ag', 'statsd', 'tags', 'hostnam'], + "agent-build-and-releases": ['omnibus', 'packaging', 'script'], + "remote-config": ['installer', 'oci'], + "agent-cspm": ['cspm'], + "ebpf-platform": ['ebpf', 'system-prob', 'sys-prob'], + "agent-security": ['security', 'vuln', 'security-agent'], + "agent-shared-components": ['fips', 'inventory', 'payload', 'jmx', 'intak', 'gohai'], + "fleet": ['fleet', 'fleet-automation'], + "opentelemetry": ['otel', 'opentelemetry'], + "windows-agent": ['windows', 'sys32', 'powershell'], + "networks": ['tcp', 'udp', 'socket', 'network'], + "serverless": ['serverless'], + "integrations": ['integration', 'python', 'checks'], + } + for team, words in pattern_matching.items(): + if any(w in word for w in words): + return team + return None + + +def file_match(word): + dd_folders = [ + 'chocolatey', + 'cmd', + 'comp', + 'dev', + 'devenv', + 'docs', + 'internal', + 'omnibus', + 'pkg', + 'pkg-config', + 'rtloader', + 'tasks', + 'test', + 'tools', + ] + p = Path(word) + if len(p.parts) > 1 and p.suffix: + path_folder = next((f for f in dd_folders if f in p.parts), None) + if path_folder: + file = '/'.join(p.parts[p.parts.index(path_folder) :]) + return ( + search_owners(file, ".github/CODEOWNERS")[0].casefold().replace("@datadog/", "") + ) # only return the first owner + return None + + +def team_to_label(team): + dico = { + 'apm-core-reliability-and-performance': "agent-apm", + 
'universal-service-monitoring': "usm", + 'software-integrity-and-trust': "agent-security", + 'agent-all': "triage", + 'telemetry-and-analytics': "agent-apm", + 'fleet': "fleet-automation", + 'debugger': "dynamic-intrumentation", + 'container-integrations': "containers", + 'agent-e2e-testing': "agent-e2e-test", + 'agent-integrations': "integrations", + 'asm-go': "agent-security", + } + return dico.get(team, team) diff --git a/tasks/libs/issue/model/actions.py b/tasks/libs/issue/model/actions.py new file mode 100644 index 0000000000000..265166e6b2055 --- /dev/null +++ b/tasks/libs/issue/model/actions.py @@ -0,0 +1,117 @@ +from time import sleep + +from tasks.libs.ciproviders.github_api import GithubAPI +from tasks.libs.issue.assign import assign_with_rules +from tasks.libs.issue.model.constants import BASE_MODEL, MODEL, TEAMS + + +def generate_model(): + gh = GithubAPI('DataDog/datadog-agent') + d = gh.repo + issues = [] + teams = [] + n = 0 + for i in d.get_issues(state='all'): + issues.append(f"{i.title} {i.body}".casefold()) + teams.append(assign_with_rules(i, gh)) + # Sleep to avoid hitting the rate limit + n += 1 + if n % 2000 == 0: + sleep(3600) + + train_the_model(teams, issues, "issue_auto_assign_model", 64, 5) + + +def train_the_model(teams, issues, batch_size, epochs): + import torch + from sklearn.model_selection import train_test_split + from torch.utils.data import DataLoader, Dataset + from transformers import AutoModelForSequenceClassification, AutoTokenizer + + class IssueDataset(Dataset): + def __init__(self, issues, labels, tokenizer, max_length=64): + self.issues = issues + self.labels = labels + self.tokenizer = tokenizer + self.max_length = max_length + + def __len__(self): + return len(self.issues) + + def __getitem__(self, idx): + issue = self.issues[idx] + label = self.labels[idx] + inputs = self.tokenizer( + issue, max_length=self.max_length, padding="max_length", truncation=True, return_tensors="pt" + ) + return { + "input_ids": inputs["input_ids"].flatten(), + "attention_mask": inputs["attention_mask"].flatten(), + "labels": torch.tensor(label, dtype=torch.long), + } + + # Split the dataset into training and validation sets + train_issues, val_issues, train_teams, val_teams = train_test_split(issues, teams, test_size=0.2, random_state=42) + + # Define hyperparameters + learning_rate = 2e-5 + + # Load pre-trained BERT model and tokenizer + tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL) + model = AutoModelForSequenceClassification.from_pretrained( + BASE_MODEL, num_labels=len(set(teams)), ignore_mismatched_sizes=True + ) + + # Prepare dataset and dataloaders + train_teams = [TEAMS.index(t) for t in train_teams] + val_teams = [TEAMS.index(t) for t in val_teams] + train_dataset = IssueDataset(train_issues, train_teams, tokenizer, max_length=batch_size) + val_dataset = IssueDataset(val_issues, val_teams, tokenizer, max_length=batch_size) + print(f"set sizes : {len(train_dataset)} {len(val_dataset)} {len(set(teams))}") + + print(f"train_dataset {train_dataset[0]}") + print(f"train_dataset {train_dataset[1]}") + train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True) + val_loader = DataLoader(val_dataset, batch_size=batch_size) + + # Define optimizer and loss function + optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate) + + print("Start training...") + # Fine-tune the model + for epoch in range(epochs): + print(f"Epoch {epoch+1}/{epochs}") + model.train() + train_loss = 0.0 + for batch in train_loader: + optimizer.zero_grad() 
+ input_ids, attention_mask, labels = batch['input_ids'], batch['attention_mask'], batch['labels'] + outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels) + loss = outputs.loss + train_loss += loss.item() + loss.backward() + optimizer.step() + train_loss /= len(train_loader) + + # Evaluate on validation set + model.eval() + val_loss = 0.0 + correct = 0 + total = 0 + print("validate") + with torch.no_grad(): + for batch in val_loader: + input_ids, attention_mask, labels = batch['input_ids'], batch['attention_mask'], batch['labels'] + outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels) + loss = outputs.loss + val_loss += loss.item() + _, predicted = torch.max(outputs.logits, 1) + total += labels.size(0) + correct += (predicted == labels).sum().item() + val_loss /= len(val_loader) + val_accuracy = correct / total + + print( + f"Epoch {epoch+1}/{epochs}: Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}" + ) + model.save_pretrained(MODEL) diff --git a/tasks/libs/issue/model/constants.py b/tasks/libs/issue/model/constants.py new file mode 100644 index 0000000000000..67ec1d5535109 --- /dev/null +++ b/tasks/libs/issue/model/constants.py @@ -0,0 +1,42 @@ +MODEL = "issue_auto_assign_model" +BASE_MODEL = "distilbert-base-uncased-finetuned-sst-2-english" +TEAMS = ( + 'container-ecosystems', + 'windows-agent', + 'remote-config', + 'container-platform', + 'documentation', + 'agent-security', + 'container-app', + 'agent-all', + 'processes', + 'agent-platform', + 'agent-release-management', + 'networks', + 'ebpf-platform', + 'agent-apm', + 'single-machine-performance', + 'agent-e2e-testing', + 'agent-developer-tools', + 'triage', + 'windows-kernel-integrations', + 'container-integrations', + 'software-integrity-and-trust', + 'opentelemetry', + 'universal-service-monitoring', + 'agent-build-and-releases', + 'agent-shared-components', + 'agent-integrations', + 'agent-metrics-logs', + 'platform-integrations', + 'agent-ci-experience', + 'asm-go', + 'agent-cspm', + 'debugger', + 'database-monitoring', + 'network-device-monitoring', + 'serverless', + 'apm-onboarding', + 'fleet', + 'agent-processing-and-routing', +) From 352de06ec67354e8b9437b6550cb1eb9670ad409 Mon Sep 17 00:00:00 2001 From: Nicolas Schweitzer Date: Mon, 2 Dec 2024 14:47:15 +0100 Subject: [PATCH 2/8] Run the job on a container with the model and required dependencies --- .github/workflows/assign_issue.yml | 15 ++------------- tasks/__init__.py | 2 ++ tasks/issue.py | 20 ++++++++++++++++++-- tasks/libs/issue/model/constants.py | 2 +- 4 files changed, 23 insertions(+), 16 deletions(-) diff --git a/.github/workflows/assign_issue.yml b/.github/workflows/assign_issue.yml index bd8f4a94458ad..dc52cad820600 100644 --- a/.github/workflows/assign_issue.yml +++ b/.github/workflows/assign_issue.yml @@ -8,6 +8,7 @@ on: jobs: auto_assign_issue: runs-on: ubuntu-latest + container: ghcr.io/datadog/agent-issue-auto-assign:latest permissions: issues: write steps: @@ -15,20 +16,8 @@ jobs: uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4.1.4 with: ref: ${{ github.head_ref }} - - name: Install python - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 - with: - python-version: 3.11 - cache: "pip" - - name: Install Python dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements.txt -r tasks/libs/requirements-github.txt - pip install pytorch==1.0.2 transformers==4.41.2 - name: Assign issue 
env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | - team=`inv -e issue.assign --issue ${{ github.event.issue.number }}` - echo "Issue ${{ github.event.issue.number }} assigned to $team" - gh issue edit "${{ github.event.issue.number }}" --add-label "team/$team" + team=`inv -e issue.assign-owner --issue ${{ github.event.issue.number }}` diff --git a/tasks/__init__.py b/tasks/__init__.py index 93ce84f1c37b2..3ae9ae75f6749 100644 --- a/tasks/__init__.py +++ b/tasks/__init__.py @@ -35,6 +35,7 @@ go_deps, installer, invoke_unit_tests, + issue, kmt, linter, modules, @@ -180,6 +181,7 @@ ns.add_collection(git) ns.add_collection(github_tasks, "github") ns.add_collection(gitlab_helpers, "gitlab") +ns.add_collection(issue) ns.add_collection(package) ns.add_collection(pipeline) ns.add_collection(notes) diff --git a/tasks/issue.py b/tasks/issue.py index cd0d2d9005110..edbef166ca920 100644 --- a/tasks/issue.py +++ b/tasks/issue.py @@ -1,18 +1,34 @@ +import os + from invoke import task from tasks.libs.ciproviders.github_api import GithubAPI from tasks.libs.issue.assign import assign_with_model, assign_with_rules from tasks.libs.issue.model.actions import generate_model +from tasks.libs.pipeline.notifications import GITHUB_SLACK_MAP @task -def assign_owner(_, issue_id): +def assign_owner(_, issue_id, dry_run=False): gh = GithubAPI('DataDog/datadog-agent') issue = gh.repo.get_issue(int(issue_id)) + assignment = "model" owner, confidence = assign_with_model(issue) if confidence < 0.5: + assignment = "rules" owner = assign_with_rules(issue, gh) - print(owner) + print(f"Issue assigned to team/{owner} with {assignment}") + if not dry_run: + # Edit issue label + issue.add_to_labels(f"team/{owner}") + # Post message + from slack_sdk import WebClient + + client = WebClient(os.environ['SLACK_API_TOKEN']) + channel = GITHUB_SLACK_MAP.get(owner.lower(), '#agent') + message = f':githubstatus_partial_outage: *New Community Issue*\n{issue.title} <{issue.html_url}|{gh.repo.name}#{issue_id}>' + message += "\nThe assignation to your team was done automatically, using issue content and title. Please redirect if needed." 
+ client.chat_postMessage(channel=channel, text=message) return owner diff --git a/tasks/libs/issue/model/constants.py b/tasks/libs/issue/model/constants.py index 67ec1d5535109..986d201bcf138 100644 --- a/tasks/libs/issue/model/constants.py +++ b/tasks/libs/issue/model/constants.py @@ -1,4 +1,4 @@ -MODEL = "issue_auto_assign_model" +MODEL = "/issue_auto_assign_model" BASE_MODEL = "distilbert-base-uncased-finetuned-sst-2-english" TEAMS = ( 'container-ecosystems', From c9940f0fdaead1c7b9e407a9d041ccff2af71b1b Mon Sep 17 00:00:00 2001 From: Nicolas Schweitzer Date: Mon, 2 Dec 2024 15:35:00 +0100 Subject: [PATCH 3/8] Move all issue-related code to an issue lib --- tasks/libs/common/utils.py | 100 -------------------------------- tasks/owners.py | 24 +------- tasks/unit_tests/issue_tests.py | 36 ++++++++++++ tasks/unit_tests/utils_tests.py | 35 +---------- 4 files changed, 38 insertions(+), 157 deletions(-) create mode 100644 tasks/unit_tests/issue_tests.py diff --git a/tasks/libs/common/utils.py b/tasks/libs/common/utils.py index a2643c3bce2a1..70eb60f20a5f6 100644 --- a/tasks/libs/common/utils.py +++ b/tasks/libs/common/utils.py @@ -24,7 +24,6 @@ from tasks.libs.common.color import Color, color_message from tasks.libs.common.constants import ALLOWED_REPO_ALL_BRANCHES, REPO_PATH from tasks.libs.common.git import get_commit_sha, get_default_branch -from tasks.libs.owners.parsing import search_owners from tasks.libs.releasing.version import get_version from tasks.libs.types.arch import Arch @@ -585,105 +584,6 @@ def parse_kernel_version(version: str) -> tuple[int, int, int, int]: return (int(match.group(1)), int(match.group(2)), int(match.group(4) or "0"), int(match.group(6) or "0")) -def guess_from_labels(issue): - for label in issue.labels: - if label.name.startswith("team/") and "triage" not in label.name: - return label.name.split("/")[-1] - return 'triage' - - -def guess_from_keywords(issue): - text = f"{issue.title} {issue.body}".casefold().split() - c = Counter(text) - for word in c.most_common(): - team = simple_match(word[0]) - if team: - return team - team = file_match(word[0]) - if team: - return team - return "triage" - - -def simple_match(word): - pattern_matching = { - "agent-apm": ['apm', 'java', 'dotnet', 'ruby', 'trace'], - "containers": [ - 'container', - 'pod', - 'kubernetes', - 'orchestrator', - 'docker', - 'k8s', - 'kube', - 'cluster', - 'kubelet', - 'helm', - ], - "agent-metrics-logs": ['logs', 'metric', 'log-ag', 'statsd', 'tags', 'hostnam'], - "agent-delivery": ['omnibus', 'packaging', 'script'], - "remote-config": ['installer', 'oci'], - "agent-cspm": ['cspm'], - "ebpf-platform": ['ebpf', 'system-prob', 'sys-prob'], - "agent-security": ['security', 'vuln', 'security-agent'], - "agent-shared-components": ['fips', 'inventory', 'payload', 'jmx', 'intak', 'gohai'], - "fleet": ['fleet', 'fleet-automation'], - "opentelemetry": ['otel', 'opentelemetry'], - "windows-agent": ['windows', 'sys32', 'powershell'], - "networks": ['tcp', 'udp', 'socket', 'network'], - "serverless": ['serverless'], - "integrations": ['integration', 'python', 'checks'], - } - for team, words in pattern_matching.items(): - if any(w in word for w in words): - return team - return None - - -def file_match(word): - dd_folders = [ - 'chocolatey', - 'cmd', - 'comp', - 'dev', - 'devenv', - 'docs', - 'internal', - 'omnibus', - 'pkg', - 'rtloader', - 'tasks', - 'test', - 'tools', - ] - p = Path(word) - if len(p.parts) > 1 and p.suffix: - path_folder = next((f for f in dd_folders if f in p.parts), None) - if 
path_folder: - file = '/'.join(p.parts[p.parts.index(path_folder) :]) - return ( - search_owners(file, ".github/CODEOWNERS")[0].casefold().replace("@datadog/", "") - ) # only return the first owner - return None - - -def team_to_label(team): - dico = { - 'apm-core-reliability-and-performance': "agent-apm", - 'universal-service-monitoring': "usm", - 'software-integrity-and-trust': "agent-security", - 'agent-all': "triage", - 'telemetry-and-analytics': "agent-apm", - 'fleet': "fleet-automation", - 'debugger': "dynamic-intrumentation", - 'container-integrations': "containers", - 'agent-e2e-testing': "agent-e2e-test", - 'agent-integrations': "integrations", - 'asm-go': "agent-security", - } - return dico.get(team, team) - - @contextmanager def download_to_tempfile(url, checksum=None): """ diff --git a/tasks/owners.py b/tasks/owners.py index 11c5f2c27d7ba..b2ecceffd581a 100644 --- a/tasks/owners.py +++ b/tasks/owners.py @@ -2,9 +2,7 @@ from invoke import task -from tasks.libs.ciproviders.github_api import GithubAPI, get_github_teams -from tasks.libs.common.utils import guess_from_keywords, guess_from_labels, team_to_label -from tasks.libs.owners.parsing import most_frequent_agent_team, read_owners, search_owners +from tasks.libs.owners.parsing import read_owners, search_owners from tasks.libs.pipeline.notifications import GITHUB_SLACK_MAP @@ -18,26 +16,6 @@ def find_codeowners(_, path, owners_file=".github/CODEOWNERS"): print(", ".join(search_owners(path, owners_file))) -@task -def guess_responsible(_, issue_id): - gh = GithubAPI('DataDog/datadog-agent') - issue = gh.repo.get_issue(int(issue_id)) - owner = guess_from_labels(issue) - if owner == 'triage': - users = [user for user in issue.assignees if gh.is_organization_member(user)] - teams = get_github_teams(users) - owner = most_frequent_agent_team(teams) - if owner == 'triage': - commenters = [c.user for c in issue.get_comments() if gh.is_organization_member(c.user)] - teams = get_github_teams(commenters) - owner = most_frequent_agent_team(teams) - if owner == 'triage': - owner = guess_from_keywords(issue) - owner = team_to_label(owner) - print(owner) - return owner - - def make_partition(names: list[str], owners_file: str, get_channels: bool = False) -> dict[str, set[str]]: """ From a list of job / file names, will create a dictionary with the teams as keys and the names as values. 
diff --git a/tasks/unit_tests/issue_tests.py b/tasks/unit_tests/issue_tests.py new file mode 100644 index 0000000000000..e72676fc50e3f --- /dev/null +++ b/tasks/unit_tests/issue_tests.py @@ -0,0 +1,36 @@ +import unittest +from unittest.mock import MagicMock + +from tasks.libs.issue.assign import guess_from_keywords, guess_from_labels + + +# We must define this class as we cannot override the name attribute in MagicMock +class Label: + def __init__(self, name): + self.name = name + + +class TestGuessFromLabels(unittest.TestCase): + def test_with_team(self): + issue = MagicMock(labels=[Label(name="team/triage"), Label(name="team/core")]) + + self.assertEqual(guess_from_labels(issue), "core") + + def test_without_team(self): + issue = MagicMock(labels=[Label(name="team/triage"), Label(name="team:burton")]) + + self.assertEqual(guess_from_labels(issue), "triage") + + +class TestGuessFromKeywords(unittest.TestCase): + def test_from_simple_match(self): + issue = MagicMock(title="I have an issue", body="I can't get any logs from the agent.") + self.assertEqual(guess_from_keywords(issue), "agent-metrics-logs") + + def test_with_a_file(self): + issue = MagicMock(title="fix bug", body="It comes from the file pkg/agent/build.py") + self.assertEqual(guess_from_keywords(issue), "agent-shared-components") + + def test_no_match(self): + issue = MagicMock(title="fix bug", body="It comes from the file... hm I don't know.") + self.assertEqual(guess_from_keywords(issue), "triage") diff --git a/tasks/unit_tests/utils_tests.py b/tasks/unit_tests/utils_tests.py index 02aae11c5fc80..3d867d240b2b9 100644 --- a/tasks/unit_tests/utils_tests.py +++ b/tasks/unit_tests/utils_tests.py @@ -1,7 +1,6 @@ import unittest -from unittest.mock import MagicMock -from tasks.libs.common.utils import clean_nested_paths, guess_from_keywords, guess_from_labels +from tasks.libs.common.utils import clean_nested_paths class TestUtils(unittest.TestCase): @@ -33,35 +32,3 @@ def test_clean_nested_paths_2(self): ] expected_paths = ["."] self.assertEqual(clean_nested_paths(paths), expected_paths) - - -# We must define this class as we cannot override the name attribute in MagicMock -class Label: - def __init__(self, name): - self.name = name - - -class TestGuessFromLabels(unittest.TestCase): - def test_with_team(self): - issue = MagicMock(labels=[Label(name="team/triage"), Label(name="team/core")]) - - self.assertEqual(guess_from_labels(issue), "core") - - def test_without_team(self): - issue = MagicMock(labels=[Label(name="team/triage"), Label(name="team:burton")]) - - self.assertEqual(guess_from_labels(issue), "triage") - - -class TestGuessFromKeywords(unittest.TestCase): - def test_from_simple_match(self): - issue = MagicMock(title="I have an issue", body="I can't get any logs from the agent.") - self.assertEqual(guess_from_keywords(issue), "agent-metrics-logs") - - def test_with_a_file(self): - issue = MagicMock(title="fix bug", body="It comes from the file pkg/agent/build.py") - self.assertEqual(guess_from_keywords(issue), "agent-shared-components") - - def test_no_match(self): - issue = MagicMock(title="fix bug", body="It comes from the file... 
hm I don't know.") - self.assertEqual(guess_from_keywords(issue), "triage") From 81b70ae710af145359411519ad7c9adbbc786913 Mon Sep 17 00:00:00 2001 From: Nicolas Schweitzer Date: Mon, 2 Dec 2024 15:37:09 +0100 Subject: [PATCH 4/8] Default to a shared channel --- tasks/issue.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/issue.py b/tasks/issue.py index edbef166ca920..aea8810dae60c 100644 --- a/tasks/issue.py +++ b/tasks/issue.py @@ -25,7 +25,7 @@ def assign_owner(_, issue_id, dry_run=False): from slack_sdk import WebClient client = WebClient(os.environ['SLACK_API_TOKEN']) - channel = GITHUB_SLACK_MAP.get(owner.lower(), '#agent') + channel = GITHUB_SLACK_MAP.get(owner.lower(), '#agent-ask-anything') message = f':githubstatus_partial_outage: *New Community Issue*\n{issue.title} <{issue.html_url}|{gh.repo.name}#{issue_id}>' message += "\nThe assignation to your team was done automatically, using issue content and title. Please redirect if needed." client.chat_postMessage(channel=channel, text=message) From 4f12187af3809a9777056a814eb1369ddc87e943 Mon Sep 17 00:00:00 2001 From: Nicolas Schweitzer Date: Mon, 2 Dec 2024 16:28:26 +0100 Subject: [PATCH 5/8] codereview: Checkout main and beautify python code --- .github/workflows/assign_issue.yml | 2 -- tasks/issue.py | 6 +++--- tasks/libs/issue/model/actions.py | 12 +++++------- 3 files changed, 8 insertions(+), 12 deletions(-) diff --git a/.github/workflows/assign_issue.yml b/.github/workflows/assign_issue.yml index dc52cad820600..0a8f155d91d5f 100644 --- a/.github/workflows/assign_issue.yml +++ b/.github/workflows/assign_issue.yml @@ -14,8 +14,6 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4.1.4 - with: - ref: ${{ github.head_ref }} - name: Assign issue env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/tasks/issue.py b/tasks/issue.py index aea8810dae60c..3d3f80161fdd4 100644 --- a/tasks/issue.py +++ b/tasks/issue.py @@ -4,7 +4,7 @@ from tasks.libs.ciproviders.github_api import GithubAPI from tasks.libs.issue.assign import assign_with_model, assign_with_rules -from tasks.libs.issue.model.actions import generate_model +from tasks.libs.issue.model.actions import fetch_data_and_train_model from tasks.libs.pipeline.notifications import GITHUB_SLACK_MAP @@ -33,5 +33,5 @@ def assign_owner(_, issue_id, dry_run=False): @task -def generate_the_model(_): - generate_model() +def generate_model(_): + fetch_data_and_train_model() diff --git a/tasks/libs/issue/model/actions.py b/tasks/libs/issue/model/actions.py index 265166e6b2055..5e35a7a08f452 100644 --- a/tasks/libs/issue/model/actions.py +++ b/tasks/libs/issue/model/actions.py @@ -5,18 +5,16 @@ from tasks.libs.issue.model.constants import BASE_MODEL, MODEL, TEAMS -def generate_model(): +def fetch_data_and_train_model(): gh = GithubAPI('DataDog/datadog-agent') d = gh.repo issues = [] teams = [] - n = 0 - for i in d.get_issues(state='all'): - issues.append(f"{i.title} {i.body}".casefold()) - teams.append(assign_with_rules(i, gh)) + for id, issue in enumerate(d.get_issues(state='all')): + issues.append(f"{issue.title} {issue.body}".casefold()) + teams.append(assign_with_rules(issue, gh)) # Sleep to avoid hitting the rate limit - n += 1 - if n % 2000 == 0: + if id % 2000 == 0: sleep(3600) train_the_model(teams, issues, "issue_auto_assign_model", 64, 5) From 7d4c9d0da9f94453b4f4c8f08cb92ad5e55bbf71 Mon Sep 17 00:00:00 2001 From: Nicolas Schweitzer Date: Tue, 3 Dec 2024 10:19:00 +0100 Subject: [PATCH 6/8] add 
permissions to login into ghcr.io --- .github/workflows/assign_issue.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/assign_issue.yml b/.github/workflows/assign_issue.yml index 0a8f155d91d5f..7f99066b73af2 100644 --- a/.github/workflows/assign_issue.yml +++ b/.github/workflows/assign_issue.yml @@ -8,8 +8,13 @@ on: jobs: auto_assign_issue: runs-on: ubuntu-latest - container: ghcr.io/datadog/agent-issue-auto-assign:latest + container: + image: ghcr.io/datadog/agent-issue-auto-assign:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} permissions: + packages: read issues: write steps: - name: Checkout repository From 336b31b84f54c9f3c18c4cf86e464ae589b56bdc Mon Sep 17 00:00:00 2001 From: Nicolas Schweitzer Date: Tue, 3 Dec 2024 16:10:24 +0100 Subject: [PATCH 7/8] with a small image, install dependencies --- .github/workflows/assign_issue.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/assign_issue.yml b/.github/workflows/assign_issue.yml index 7f99066b73af2..b8e11289336ac 100644 --- a/.github/workflows/assign_issue.yml +++ b/.github/workflows/assign_issue.yml @@ -19,6 +19,10 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4.1.4 + - name: Install dependencies + # Dependencies are installed at runtime. Otherwise it would create a huge image see https://hub.docker.com/r/pytorch/pytorch/tags + run: | + pip install --upgrade pip && pip install --no-compile --no-cache-dir torch transformers invoke codeowners slack-sdk PyGithub python-gitlab semver - name: Assign issue env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} From 22e287536c2ebbfcd1a03ec4a7d57d929b5e7ab2 Mon Sep 17 00:00:00 2001 From: Nicolas Schweitzer Date: Tue, 3 Dec 2024 16:23:04 +0100 Subject: [PATCH 8/8] fix github token name --- .github/workflows/assign_issue.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/assign_issue.yml b/.github/workflows/assign_issue.yml index b8e11289336ac..be9db6209604b 100644 --- a/.github/workflows/assign_issue.yml +++ b/.github/workflows/assign_issue.yml @@ -2,8 +2,8 @@ name: "Assign issue to a team" on: - issues: - types: [opened, reopened] + issues: + types: [opened, reopened] jobs: auto_assign_issue: @@ -25,6 +25,6 @@ jobs: pip install --upgrade pip && pip install --no-compile --no-cache-dir torch transformers invoke codeowners slack-sdk PyGithub python-gitlab semver - name: Assign issue env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | - team=`inv -e issue.assign-owner --issue ${{ github.event.issue.number }}` + inv -e issue.assign-owner --issue ${{ github.event.issue.number }}
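
The series adds two invoke entry points: issue.assign-owner, which the workflow above runs for every opened or reopened issue, and issue.generate-model, which retrains the classifier from historical issues. A minimal local sketch of exercising them, assuming the runtime dependencies from the last patch are installed, a GitHub token is exported, and 12345 is a placeholder issue number; the --dry-run flag assumes invoke's usual underscore-to-dash mapping of the task's dry_run parameter:

    # Install the same runtime dependencies the workflow installs in its container step.
    pip install --no-compile --no-cache-dir torch transformers invoke codeowners slack-sdk PyGithub python-gitlab semver

    # Print the predicted owning team for an issue without adding the team/ label or posting to Slack.
    inv -e issue.assign-owner --issue 12345 --dry-run

    # Rebuild the classifier: walks the full issue history (sleeping periodically to respect the
    # GitHub rate limit), fine-tunes the base model, and saves it under the MODEL path.
    inv -e issue.generate-model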