Skip to content

Commit

Permalink
Merge branch 'main' into SOLENG-974-check-driver-blacklisting
Browse files Browse the repository at this point in the history
  • Loading branch information
aieri authored Dec 13, 2024
2 parents 4724626 + a6e9391 commit 59ec5f1
Show file tree
Hide file tree
Showing 10 changed files with 157 additions and 125 deletions.
5 changes: 2 additions & 3 deletions .github/workflows/promote.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,10 @@ jobs:
destination=$(echo "$channel_promotion" | sed 's/.*->\s*//')
echo "destination-channel=$destination" >> $GITHUB_OUTPUT
echo "origin-channel=$origin" >> $GITHUB_OUTPUT
- name: Release charm to channel
uses: canonical/charming-actions/release-charm@2.6.2
- name: Promote charm to channel
uses: canonical/charming-actions/promote-charm@2.6.3
with:
credentials: ${{ secrets.CHARMHUB_TOKEN }}
github-token: ${{ secrets.GITHUB_TOKEN }}
destination-channel: ${{ steps.set-channels.outputs.destination-channel }}
origin-channel: ${{ steps.set-channels.outputs.origin-channel }}
charmcraft-channel: "2.x/stable"
16 changes: 16 additions & 0 deletions SECURITY.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
<!-- This file is centrally managed as a template file in https://github.com/canonical/solutions-engineering-automation -->
<!-- To update the file: -->
<!-- - Edit it in the canonical/solutions-engineering-automation repository. -->
<!-- - Open a PR with the changes. -->
<!-- - When the PR merges, the soleng-terraform bot will open a PR to the target repositories with the changes. -->

# Security policy


## Reporting a vulnerability
To report a security issue, file a [Private Security Report](https://github.com/canonical/hardware-observer-operator/security/advisories/new)
with a description of the issue, the steps you took to create the issue, affected versions, and,
if known, mitigations for the issue.

The [Ubuntu Security disclosure and embargo policy](https://ubuntu.com/security/disclosure-policy)
contains more information about what you can expect when you contact us and what we expect from you.
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ max-complexity = 10

[tool.black]
line-length = 99
target-version = ["py38", "py310"]
exclude = '''
/(
| .eggs
Expand Down
44 changes: 27 additions & 17 deletions src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def __init__(self, *args: Any) -> None:
def exporters(self) -> List[BaseExporter]:
"""Return list of exporters based on detected hardware."""
exporters: List[BaseExporter] = []
stored_tools = self.get_stored_tools()
stored_tools = self.stored_tools
if stored_tools & HardwareExporter.hw_tools():
exporters.append(
HardwareExporter(
Expand All @@ -88,34 +88,44 @@ def exporters(self) -> List[BaseExporter]:

return exporters

def get_stored_tools(self) -> Set[HWTool]:
@property
def stored_tools(self) -> Set[HWTool]:
"""Get the current hardware tools from stored or from machine if not present.
This function stores the current hardware tools as strings because StoredState cannot store
arbitrary objects. HWTool objects can however be re-instantiated from tool names.
Since StoredState cannot store arbitrary objects, re-instantiate tools from tool names.
"""
if not self._stored.stored_tools: # type: ignore[truthy-function]
available_tools = detect_available_tools() # type: ignore[unreachable]
self._stored.stored_tools = {tool.value for tool in available_tools}
if "smartctl" in self._stored.stored_tools: # type: ignore[operator]
self._stored.stored_tools.remove("smartctl") # type: ignore[attr-defined]
self._stored.stored_tools = { # type: ignore[unreachable]
tool.value for tool in detect_available_tools()
}
# remove legacy smartctl tool if present
# See https://github.com/canonical/hardware-observer-operator/pull/327
self._stored.stored_tools.discard("smartctl") # type: ignore[attr-defined]
return {HWTool(value) for value in self._stored.stored_tools} # type: ignore[attr-defined]

@stored_tools.setter
def stored_tools(self, tools: Set[HWTool]) -> None:
"""Record the tools via StoredState.
StoredState cannot store arbitrary objects so we convert Set[HWTool] into Set[str].
This is reversible by re-instantiating tools from tool names.
"""
self._stored.stored_tools = {tool.value for tool in tools}

def _on_redetect_hardware(self, event: ops.ActionEvent) -> None:
"""Redetect available hardware tools and option to rerun the install hook."""
stored_tools = self.get_stored_tools()
available_tools = detect_available_tools()

hw_change_detected = stored_tools != available_tools
hw_change_detected = self.stored_tools != available_tools

sorted_stored_tools = ",".join(map(lambda member: member.value, sorted(stored_tools)))
sorted_stored_tools = ",".join(map(lambda member: member.value, sorted(self.stored_tools)))
sorted_available_tools = ",".join(
map(lambda member: member.value, sorted(available_tools))
)

if event.params["apply"] and hw_change_detected:
# Update the value in local Store
self._stored.stored_tools = {tool.value for tool in available_tools}
self.stored_tools = available_tools
event.log(f"Run install hook with enable tools: {sorted_available_tools}")
self._on_install_or_upgrade(event=event)

Expand All @@ -134,13 +144,13 @@ def _on_install_or_upgrade(self, event: EventBase) -> None:

remove_legacy_smartctl_exporter()

stored_tools = self.get_stored_tools()

msg: str
resource_installed: bool

# Install hw tools
resource_installed, msg = self.hw_tool_helper.install(self.model.resources, stored_tools)
resource_installed, msg = self.hw_tool_helper.install(
self.model.resources, self.stored_tools
)

self._stored.resource_installed = resource_installed
if not resource_installed:
Expand All @@ -167,7 +177,7 @@ def _on_remove(self, _: EventBase) -> None:
# Remove binary tool
self.hw_tool_helper.remove(
self.model.resources,
self.get_stored_tools(),
self.stored_tools,
)
self._stored.resource_installed = False

Expand All @@ -192,7 +202,7 @@ def _on_update_status(self, _: EventBase) -> None: # noqa: C901
self.model.unit.status = BlockedStatus(config_valid_message)
return

hw_tool_ok, error_msg = self.hw_tool_helper.check_installed(self.get_stored_tools())
hw_tool_ok, error_msg = self.hw_tool_helper.check_installed(self.stored_tools)
if not hw_tool_ok:
self.model.unit.status = BlockedStatus(error_msg)
return
Expand Down
54 changes: 27 additions & 27 deletions src/grafana_dashboards/GPU.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
"uid": "${prometheusds}"
},
"fieldConfig": {
"defaults": {
Expand Down Expand Up @@ -114,7 +114,7 @@
"targets": [
{
"datasource": {
"uid": "$datasource"
"uid": "${prometheusds}"
},
"editorMode": "code",
"expr": "DCGM_FI_DEV_GPU_TEMP{Hostname=~\"$instance\", gpu=~\"$nvidia_gpu\"}",
Expand All @@ -126,7 +126,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
"uid": "${prometheusds}"
},
"editorMode": "code",
"expr": "label_replace(node_hwmon_temp_celsius{instance=~\"^$instance.*\", chip=~\"$amd_gpu\"} * on(chip) group_right() node_drm_card_info{chip=~\"$amd_gpu\"}, \"gpu\", \"$1\", \"card\", \"card([0-9]+)\")\n",
Expand All @@ -144,7 +144,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
"uid": "${prometheusds}"
},
"fieldConfig": {
"defaults": {
Expand Down Expand Up @@ -198,7 +198,7 @@
"targets": [
{
"datasource": {
"uid": "$datasource"
"uid": "${prometheusds}"
},
"editorMode": "code",
"expr": "avg(DCGM_FI_DEV_GPU_TEMP{Hostname=~\"$instance\", gpu=~\"$nvidia_gpu\"})",
Expand All @@ -210,7 +210,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
"uid": "${prometheusds}"
},
"editorMode": "code",
"expr": "avg(node_hwmon_temp_celsius{instance=~\"^$instance.*\", chip=~\"$amd_gpu\"})",
Expand All @@ -226,7 +226,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
"uid": "${prometheusds}"
},
"fieldConfig": {
"defaults": {
Expand Down Expand Up @@ -309,7 +309,7 @@
"targets": [
{
"datasource": {
"uid": "$datasource"
"uid": "${prometheusds}"
},
"editorMode": "code",
"expr": "DCGM_FI_DEV_POWER_USAGE{Hostname=~\"$instance\", gpu=~\"$nvidia_gpu\"}",
Expand All @@ -321,7 +321,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
"uid": "${prometheusds}"
},
"editorMode": "code",
"expr": "label_replace(node_hwmon_power_average_watt{agent_hostname=~\"$instance\", chip=~\"$amd_gpu\"} * on(chip) group_right() node_drm_card_info{chip=~\"$amd_gpu\"}, \"gpu\", \"$1\", \"card\", \"card([0-9]+)\")",
Expand All @@ -337,7 +337,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
"uid": "${prometheusds}"
},
"fieldConfig": {
"defaults": {
Expand Down Expand Up @@ -405,7 +405,7 @@
"targets": [
{
"datasource": {
"uid": "$datasource"
"uid": "${prometheusds}"
},
"editorMode": "code",
"expr": "sum(DCGM_FI_DEV_POWER_USAGE{Hostname=~\"$instance\", gpu=~\"$nvidia_gpu\"})",
Expand All @@ -418,7 +418,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
"uid": "${prometheusds}"
},
"editorMode": "code",
"exemplar": false,
Expand Down Expand Up @@ -447,7 +447,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
"uid": "${prometheusds}"
},
"fieldConfig": {
"defaults": {
Expand Down Expand Up @@ -532,7 +532,7 @@
"targets": [
{
"datasource": {
"uid": "$datasource"
"uid": "${prometheusds}"
},
"editorMode": "code",
"expr": "DCGM_FI_DEV_GPU_UTIL{Hostname=~\"$instance\", gpu=~\"$nvidia_gpu\"}",
Expand All @@ -544,7 +544,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
"uid": "${prometheusds}"
},
"editorMode": "code",
"expr": "label_replace(node_drm_gpu_busy_percent{instance=~\"^$instance.*\"} * on(card) group_right() node_drm_card_info{chip=~\"$amd_gpu\"}, \"gpu\", \"$1\", \"card\", \"card([0-9]+)\")\n",
Expand All @@ -560,7 +560,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
"uid": "${prometheusds}"
},
"fieldConfig": {
"defaults": {
Expand Down Expand Up @@ -645,7 +645,7 @@
"targets": [
{
"datasource": {
"uid": "$datasource"
"uid": "${prometheusds}"
},
"editorMode": "code",
"expr": "DCGM_FI_DEV_FAN_SPEED{Hostname=~\"$instance\", gpu=~\"$nvidia_gpu\"}",
Expand All @@ -657,7 +657,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
"uid": "${prometheusds}"
},
"editorMode": "code",
"expr": "label_replace(\n (\n node_hwmon_fan_rpm{instance=~\"^$instance.*\", chip=~\"$amd_gpu\"} * on(chip) group_right() node_drm_card_info{chip=~\"$amd_gpu\"}\n ) /\n (\n node_hwmon_fan_max_rpm{instance=~\"^$instance.*\", chip=~\"$amd_gpu\"} * on(chip) group_right() node_drm_card_info{chip=~\"$amd_gpu\"}\n ) * 100,\n \"gpu\", \"$1\", \"card\", \"card([0-9]+)\"\n)",
Expand All @@ -673,7 +673,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
"uid": "${prometheusds}"
},
"fieldConfig": {
"defaults": {
Expand Down Expand Up @@ -757,7 +757,7 @@
"targets": [
{
"datasource": {
"uid": "$datasource"
"uid": "${prometheusds}"
},
"editorMode": "code",
"expr": "DCGM_FI_DEV_MEM_CLOCK{Hostname=~\"$instance\", gpu=~\"$nvidia_gpu\"} * 1000000",
Expand All @@ -771,7 +771,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
"uid": "${prometheusds}"
},
"editorMode": "code",
"expr": "label_replace(node_hwmon_freq_freq_mhz{agent_hostname=~\"^$instance.*\", chip=~\"$amd_gpu\", sensor=\"sclk\"} * on(chip) group_right() node_drm_card_info{chip=~\"$amd_gpu\"} * 1000000, \"gpu\", \"$1\", \"card\", \"card([0-9]+)\")",
Expand All @@ -787,7 +787,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
"uid": "${prometheusds}"
},
"fieldConfig": {
"defaults": {
Expand Down Expand Up @@ -872,7 +872,7 @@
"targets": [
{
"datasource": {
"uid": "$datasource"
"uid": "${prometheusds}"
},
"editorMode": "code",
"expr": "DCGM_FI_DEV_MEM_COPY_UTIL{Hostname=~\"$instance\", gpu=~\"$nvidia_gpu\"}",
Expand All @@ -884,7 +884,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
"uid": "${prometheusds}"
},
"editorMode": "code",
"expr": "label_replace(\n (\n node_drm_memory_vram_used_bytes{instance=~\"^$instance.*\"} * on(card) group_right() node_drm_card_info{chip=~\"$amd_gpu\"}\n ) \n / \n (\n node_drm_memory_vram_size_bytes{instance=~\"^$instance.*\"} * on(card) group_right() node_drm_card_info{chip=~\"$amd_gpu\"}\n ) * 100,\n \"gpu\", \"$1\", \"card\", \"card([0-9]+)\"\n)",
Expand Down Expand Up @@ -933,7 +933,7 @@
]
},
"datasource": {
"uid": "$datasource"
"uid": "${prometheusds}"
},
"definition": "label_values(node_hwmon_chip_names,agent_hostname)",
"hide": 0,
Expand Down Expand Up @@ -966,7 +966,7 @@
]
},
"datasource": {
"uid": "$datasource"
"uid": "${prometheusds}"
},
"definition": "label_values(DCGM_FI_DEV_GPU_TEMP, gpu)",
"hide": 2,
Expand Down Expand Up @@ -996,7 +996,7 @@
]
},
"datasource": {
"uid": "$datasource"
"uid": "${prometheusds}"
},
"definition": "label_values(node_drm_card_info,chip)",
"hide": 2,
Expand Down
Loading

0 comments on commit 59ec5f1

Please sign in to comment.