Skip to content

Commit

Permalink
Merge pull request #38 from run-ai/revert-37-shaibi/RUN-7801-mig-bugf…
Browse files Browse the repository at this point in the history
…ix-2

Revert "RUN-7801 MIG Faker bugfix"
  • Loading branch information
SaraNachmias authored Mar 14, 2023
2 parents 86d71b3 + c6c5811 commit 07f6e03
Show file tree
Hide file tree
Showing 15 changed files with 50 additions and 174 deletions.
10 changes: 0 additions & 10 deletions internal/common/kubeclient/kubeclient.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ import (
// KubeClientInterface is the set of Kubernetes operations the fake GPU operator
// components depend on. Coding against the interface lets tests pass a
// KubeClientMock instead of the real KubeClient.
type KubeClientInterface interface {
	// SetNodeLabels sets the given labels on the node.
	// NOTE(review): the implementation is not visible here; presumably it targets
	// the node named by NODE_NAME, as GetNodeLabels does — confirm.
	SetNodeLabels(labels map[string]string) error
	// SetNodeAnnotations sets the given annotations on the node.
	SetNodeAnnotations(annotations map[string]string) error
	// GetNodeLabels returns the labels of the node named by the NODE_NAME setting.
	GetNodeLabels() (map[string]string, error)
	// WatchConfigMap returns a channel of ConfigMap values for the ConfigMap
	// identified by namespace and name.
	WatchConfigMap(namespace string, configmapName string) (chan *corev1.ConfigMap, error)
	// GetConfigMap fetches the ConfigMap identified by namespace and name.
	// NOTE(review): the bool presumably reports whether it was found — confirm
	// against the implementation.
	GetConfigMap(namespace string, configmapName string) (*corev1.ConfigMap, bool)
}
Expand Down Expand Up @@ -72,15 +71,6 @@ func (client *KubeClient) SetNodeAnnotations(annotations map[string]string) erro
return err
}

// GetNodeLabels fetches the node whose name is configured under NODE_NAME and
// returns its labels. Any error from the API call is returned unchanged, with a
// nil map.
func (client *KubeClient) GetNodeLabels() (map[string]string, error) {
	nodes := client.ClientSet.CoreV1().Nodes()

	current, err := nodes.Get(context.TODO(), viper.GetString("NODE_NAME"), metav1.GetOptions{})
	if err != nil {
		return nil, err
	}

	return current.Labels, nil
}

func (client *KubeClient) GetConfigMap(namespace string, configmapName string) (*corev1.ConfigMap, bool) {
cm, err := client.ClientSet.CoreV1().ConfigMaps(
namespace).Get(
Expand Down
5 changes: 0 additions & 5 deletions internal/common/kubeclient/kubeclient_mock.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ import (
// KubeClientMock is a test double for the kube client. Each Actual* field is a
// callback supplied by the test; the mock's methods forward to them.
type KubeClientMock struct {
// Callback for SetNodeLabels.
// NOTE(review): the forwarding call is not visible in this view — confirm.
ActualSetNodeLabels func(labels map[string]string)
// Callback for SetNodeAnnotations.
// NOTE(review): the forwarding call is not visible in this view — confirm.
ActualSetNodeAnnotations func(annotations map[string]string)
// Callback whose result GetNodeLabels returns directly.
ActualGetNodeLabels func() (map[string]string, error)
// Callback invoked by WatchConfigMap with the namespace and ConfigMap name.
ActualWatchConfigMap func(namespace string, configmapName string)
}

Expand All @@ -22,10 +21,6 @@ func (client *KubeClientMock) SetNodeLabels(labels map[string]string) error {
return nil
}

// GetNodeLabels forwards to the test-supplied ActualGetNodeLabels callback and
// returns its labels and error unchanged.
func (client *KubeClientMock) GetNodeLabels() (map[string]string, error) {
	labels, err := client.ActualGetNodeLabels()
	return labels, err
}

func (client *KubeClientMock) WatchConfigMap(namespace string, configmapName string) (chan *corev1.ConfigMap, error) {
client.ActualWatchConfigMap(namespace, configmapName)
return nil, nil
Expand Down
4 changes: 4 additions & 0 deletions internal/migfaker/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ type MigFakeApp struct {

func (app *MigFakeApp) Run() {
ContinuouslySyncMigConfigChanges(app.KubeClient.ClientSet, app.SyncableMigConfig, app.stopCh)
err := app.MigFaker.FakeNodeLabels()
if err != nil {
log.Fatalf("Error faking node labels: %e", err)
}
for {
select {
case <-app.stopCh:
Expand Down
105 changes: 20 additions & 85 deletions internal/migfaker/migfaker.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,17 @@ import (
"encoding/json"
"fmt"
"log"
"strconv"
"strings"

"github.com/google/uuid"
"github.com/run-ai/fake-gpu-operator/internal/common/constants"
"github.com/run-ai/fake-gpu-operator/internal/common/kubeclient"
)

// fakeLables is the fixed set of node labels that FakeNodeLabels hands to the
// kube client: a PCI device-presence feature label and two Run:ai MIG node-role
// labels, all set to "true".
// NOTE(review): the identifier is misspelled ("Lables"); renaming it also
// requires updating its use in FakeNodeLabels.
var fakeLables = map[string]string{
"feature.node.kubernetes.io/pci-10de.present": "true",
"node-role.kubernetes.io/runai-dynamic-mig": "true",
"node-role.kubernetes.io/runai-mig-enabled": "true",
}

// GenerateUuid produces the UUIDs embedded in fake MIG device IDs ("MIG-<uuid>").
// It is a package-level variable rather than a direct call to uuid.New,
// presumably so tests can swap in a deterministic generator — confirm.
var GenerateUuid = uuid.New

type MigFaker struct {
Expand All @@ -25,33 +28,23 @@ func NewMigFaker(kubeclient kubeclient.KubeClientInterface) *MigFaker {
}
}

func (faker *MigFaker) FakeMapping(config *MigConfigs) error {
mappings := MigMapping{}
for _, selectedDevice := range config.SelectedDevices {
if len(selectedDevice.Devices) == 0 {
continue
}

gpuIdx, err := strconv.Atoi(selectedDevice.Devices[0])
if err != nil {
return fmt.Errorf("failed to parse gpu index %s: %w", selectedDevice.Devices[0], err)
}

migDeviceMappingInfo, err := faker.getGpuMigDeviceMappingInfo(selectedDevice)
if err != nil {
return fmt.Errorf("failed to get gpu mig device mapping info: %w", err)
}
// FakeNodeLabels applies the package's static fake label set (fakeLables) to
// the node through the kube client and returns the client's error, if any.
func (faker *MigFaker) FakeNodeLabels() error {
	if err := faker.kubeclient.SetNodeLabels(fakeLables); err != nil {
		return err
	}
	return nil
}

mappings[gpuIdx] = migDeviceMappingInfo
func (faker *MigFaker) FakeMapping(config *MigConfigs) error {
mappings := map[string]map[string]string{}
for id, selectedDevice := range config.SelectedDevices {
mappings[fmt.Sprint(id)] = faker.copyMigDevices(selectedDevice)
}

smappings, _ := json.Marshal(mappings)

labels := map[string]string{
constants.MigConfigStateLabel: "success",
"nvidia.com/mig.config.state": "success",
}
annotations := map[string]string{
constants.MigMappingAnnotation: base64.StdEncoding.EncodeToString(smappings),
"run.ai/mig-mapping": base64.StdEncoding.EncodeToString(smappings),
}

err := faker.kubeclient.SetNodeLabels(labels)
Expand All @@ -67,68 +60,10 @@ func (faker *MigFaker) FakeMapping(config *MigConfigs) error {
return nil
}

// getGpuMigDeviceMappingInfo builds one mapping entry per MIG device configured
// in the given selection.
//
// The GPU product is looked up once via getGpuProduct and used to translate each
// MIG instance name into a GPU instance id. Every entry receives a freshly
// generated "MIG-<uuid>" device UUID. The first lookup failure aborts the whole
// call and returns a nil slice together with the wrapped error.
func (faker *MigFaker) getGpuMigDeviceMappingInfo(devices SelectedDevices) ([]MigDeviceMappingInfo, error) {
	product, err := faker.getGpuProduct()
	if err != nil {
		return nil, fmt.Errorf("failed to get gpu product: %w", err)
	}

	// Start from an empty, non-nil slice so a selection without MIG devices
	// still yields an empty list rather than nil.
	infos := make([]MigDeviceMappingInfo, 0, len(devices.MigDevices))
	for i := range devices.MigDevices {
		device := &devices.MigDevices[i]

		instanceID, err := migInstanceNameToGpuInstanceId(product, device.Name)
		if err != nil {
			return nil, fmt.Errorf("failed to get gpu instance id: %w", err)
		}

		info := MigDeviceMappingInfo{
			Position:      device.Position,
			DeviceUUID:    fmt.Sprintf("MIG-%s", GenerateUuid()),
			GpuInstanceId: instanceID,
		}
		infos = append(infos, info)
	}

	return infos, nil
}

func (faker *MigFaker) getGpuProduct() (string, error) {
nodeLabels, err := faker.kubeclient.GetNodeLabels()
if err != nil {
return "", fmt.Errorf("failed to get node labels: %w", err)
func (*MigFaker) copyMigDevices(devices SelectedDevices) map[string]string {
migDevices := map[string]string{}
for key := range devices.MigDevices {
migDevices[key] = fmt.Sprintf("MIG-%s", GenerateUuid())
}

return nodeLabels[constants.GpuProductLabel], nil
}

func migInstanceNameToGpuInstanceId(gpuProduct string, migInstanceName string) (int, error) {
var gpuInstanceId int
var ok bool
switch {
case strings.Contains(gpuProduct, "40GB"):
gpuInstanceId, ok = map[string]int{
"1g.5gb": 19,
"1g.5gb+me": 20,
"1g.10gb": 15,
"2g.10gb": 14,
"3g.20gb": 9,
"4g.20gb": 5,
"7g.40gb": 0,
}[migInstanceName]
case strings.Contains(gpuProduct, "80GB"):
gpuInstanceId, ok = map[string]int{
"1g.10gb": 19,
"1g.10gb+me": 20,
"1g.20gb": 15,
"2g.20gb": 14,
"3g.40gb": 9,
"4g.40gb": 5,
"7g.80gb": 0,
}[migInstanceName]
default:
return -1, fmt.Errorf("gpuProduct %s not supported", gpuProduct)
}

if !ok {
return -1, fmt.Errorf("failed mapping mig instance name %s to gpu instance id", migInstanceName)
}

return gpuInstanceId, nil
return migDevices
}
32 changes: 4 additions & 28 deletions internal/migfaker/migfaker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,8 @@ import (
"testing"

"encoding/base64"
"encoding/json"

"github.com/google/uuid"
"github.com/run-ai/fake-gpu-operator/internal/common/constants"
"github.com/run-ai/fake-gpu-operator/internal/common/kubeclient"
"github.com/run-ai/fake-gpu-operator/internal/migfaker"
"github.com/stretchr/testify/assert"
Expand All @@ -21,12 +19,8 @@ func TestFakeMapping(t *testing.T) {
{
Devices: []string{"0"},
MigEnabled: true,
MigDevices: []migfaker.MigDevice{
{
Name: "4g.20gb",
Position: 0,
Size: 4,
},
MigDevices: map[string]string{
"4": uid.String(),
},
},
},
Expand All @@ -36,29 +30,11 @@ func TestFakeMapping(t *testing.T) {
kubeClientMock.ActualSetNodeLabels = func(labels map[string]string) {
assert.Equal(t, labels["nvidia.com/mig.config.state"], "success")
}
kubeClientMock.ActualGetNodeLabels = func() (map[string]string, error) {
return map[string]string{
constants.GpuProductLabel: "NVIDIA-A100-SXM4-40GB",
}, nil
}

kubeClientMock.ActualSetNodeAnnotations = func(labels map[string]string) {
b64mapping := labels["run.ai/mig-mapping"]
actualMappingJson, _ := base64.StdEncoding.DecodeString(b64mapping)

expectedMapping := migfaker.MigMapping{
0: []migfaker.MigDeviceMappingInfo{
{
Position: 0,
DeviceUUID: fmt.Sprintf("MIG-%s", uid),
GpuInstanceId: 5,
},
},
}
expectedMappingJson, err := json.Marshal(expectedMapping)

assert.NoError(t, err)
assert.JSONEq(t, string(expectedMappingJson), string(actualMappingJson))
mapping, _ := base64.StdEncoding.DecodeString(b64mapping)
assert.JSONEq(t, string(mapping), fmt.Sprintf(`{"0":{"4":"MIG-%s"}}`, uid))
}

migFaker := migfaker.NewMigFaker(kubeClientMock)
Expand Down
24 changes: 3 additions & 21 deletions internal/migfaker/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,30 +5,12 @@ type AnnotationMigConfig struct {
MigConfigs MigConfigs `yaml:"mig-configs"`
}

// MigConfigs holds the list of device selections read from the "selected" YAML key.
//
// A copy of github.com/run-ai/runai-operator/mig-parted/api/spec/v1.Spec
// (not imported to reduce dependencies)
type MigConfigs struct {
// One entry per selection; FakeMapping iterates over these.
SelectedDevices []SelectedDevices `yaml:"selected"`
}

type SelectedDevices struct {
Devices []string `yaml:"devices"`
MigEnabled bool `yaml:"mig-enabled"`
MigDevices []MigDevice `yaml:"mig-devices"`
}

// MigDevice is a single entry of a selection's "mig-devices" YAML list.
type MigDevice struct {
// Name is the MIG instance name (e.g. "4g.20gb"); it is the value that
// migInstanceNameToGpuInstanceId translates into a GPU instance id.
Name string `yaml:"name"`
// Position is copied as-is into the generated mapping entry.
Position int `yaml:"position"`
// Size is parsed from YAML; no reader of it is visible in this view.
Size int `yaml:"size"`
}

// MigMapping associates a GPU index with the mapping entries of its MIG devices.
// FakeMapping JSON-encodes it into the node's MIG mapping annotation.
//
// A copy of github.com/run-ai/runai-operator/mig-provisioner/pkg/node.MigMapping
// (not imported to reduce dependencies)
type MigMapping map[int][]MigDeviceMappingInfo

type MigDeviceMappingInfo struct {
Position int `json:"position"`
DeviceUUID string `json:"device_uuid"`
GpuInstanceId int `json:"gpu_instance_id"`
Devices []string `yaml:"devices"`
MigEnabled bool `yaml:"mig-enabled"`
MigDevices map[string]string `yaml:"mig-devices"`
}
18 changes: 8 additions & 10 deletions internal/status-exporter/app_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -183,11 +183,10 @@ func getTestCases() map[string]testCase {
},
},
expectedLabels: map[string]string{
"feature.node.kubernetes.io/pci-10de.present": "true",
"nvidia.com/gpu.memory": "20000",
"nvidia.com/gpu.count": "1",
"nvidia.com/mig.strategy": "mixed",
"nvidia.com/gpu.product": "Tesla P100",
"nvidia.com/gpu.memory": "20000",
"nvidia.com/gpu.count": "1",
"nvidia.com/mig.strategy": "mixed",
"nvidia.com/gpu.product": "Tesla P100",
},
expectedMetrics: []*dto.MetricFamily{
{
Expand Down Expand Up @@ -309,11 +308,10 @@ func getTestCases() map[string]testCase {
},
},
expectedLabels: map[string]string{
"feature.node.kubernetes.io/pci-10de.present": "true",
"nvidia.com/gpu.memory": "20000",
"nvidia.com/gpu.count": "2",
"nvidia.com/mig.strategy": "mixed",
"nvidia.com/gpu.product": "Tesla P100",
"nvidia.com/gpu.memory": "20000",
"nvidia.com/gpu.count": "2",
"nvidia.com/mig.strategy": "mixed",
"nvidia.com/gpu.product": "Tesla P100",
},
expectedMetrics: []*dto.MetricFamily{
{
Expand Down
2 changes: 1 addition & 1 deletion internal/status-exporter/export/fs/exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@ import (
"path/filepath"
"strconv"

"github.com/run-ai/fake-gpu-operator/internal/common/constants"
"github.com/run-ai/fake-gpu-operator/internal/common/topology"
"github.com/run-ai/fake-gpu-operator/internal/status-exporter/export"
"github.com/run-ai/fake-gpu-operator/internal/status-exporter/watch"
"github.com/run-ai/fake-gpu-operator/internal/status-updater/common/constants"
"github.com/spf13/viper"
)

Expand Down
9 changes: 4 additions & 5 deletions internal/status-exporter/export/labels/exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,10 @@ func (e *LabelsExporter) export(clusterTopology *topology.Cluster) {
}

labels := map[string]string{
"nvidia.com/gpu.memory": strconv.Itoa(node.GpuMemory),
"nvidia.com/gpu.product": node.GpuProduct,
"nvidia.com/mig.strategy": clusterTopology.MigStrategy,
"nvidia.com/gpu.count": strconv.Itoa(len(node.Gpus)),
"feature.node.kubernetes.io/pci-10de.present": "true",
"nvidia.com/gpu.memory": strconv.Itoa(node.GpuMemory),
"nvidia.com/gpu.product": node.GpuProduct,
"nvidia.com/mig.strategy": clusterTopology.MigStrategy,
"nvidia.com/gpu.count": strconv.Itoa(len(node.Gpus)),
}

err := e.kubeclient.SetNodeLabels(labels)
Expand Down
2 changes: 1 addition & 1 deletion internal/status-updater/app_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@ import (
"github.com/google/uuid"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
"github.com/run-ai/fake-gpu-operator/internal/common/constants"
"github.com/run-ai/fake-gpu-operator/internal/common/topology"
status_updater "github.com/run-ai/fake-gpu-operator/internal/status-updater"
"github.com/run-ai/fake-gpu-operator/internal/status-updater/common/constants"
"gopkg.in/yaml.v3"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,8 @@ const (
GpuFractionAnnotation = "gpu-fraction"
PodGroupNameAnnotation = "pod-group-name"
ReservationPodGpuIdxAnnotation = "run.ai/reserve_for_gpu_index"
MigMappingAnnotation = "run.ai/mig-mapping"

GpuGroupLabel = "runai-pod-group"
GpuProductLabel = "nvidia.com/gpu.product"
MigConfigStateLabel = "nvidia.com/mig.config.state"
GpuGroupLabel = "runai-pod-group"

ReservationNs = "runai-reservation"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ import (
"fmt"
"log"

"github.com/run-ai/fake-gpu-operator/internal/common/constants"
"github.com/run-ai/fake-gpu-operator/internal/common/topology"
"github.com/run-ai/fake-gpu-operator/internal/status-updater/common/constants"
"github.com/run-ai/fake-gpu-operator/internal/status-updater/util"
v1 "k8s.io/api/core/v1"
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ import (
"strconv"
"strings"

"github.com/run-ai/fake-gpu-operator/internal/common/constants"
"github.com/run-ai/fake-gpu-operator/internal/common/topology"
"github.com/run-ai/fake-gpu-operator/internal/status-updater/common/constants"
"github.com/run-ai/fake-gpu-operator/internal/status-updater/util"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ import (
"strconv"

"github.com/hashicorp/go-multierror"
"github.com/run-ai/fake-gpu-operator/internal/common/constants"
"github.com/run-ai/fake-gpu-operator/internal/common/topology"
"github.com/run-ai/fake-gpu-operator/internal/status-updater/common/constants"
"github.com/run-ai/fake-gpu-operator/internal/status-updater/util"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand Down
Loading

0 comments on commit 07f6e03

Please sign in to comment.