Skip to content

Commit

Permalink
Rename most "Gpu" instances to "GPU" (#226)
Browse files Browse the repository at this point in the history
Signed-off-by: Douglas Wightman <[email protected]>
  • Loading branch information
glowkey authored Dec 20, 2023
1 parent b06fd90 commit 56aa52e
Show file tree
Hide file tree
Showing 7 changed files with 86 additions and 86 deletions.
6 changes: 3 additions & 3 deletions pkg/cmd/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ var (
CLISwitchDevices = "switch-devices"
CLICPUDevices = "cpu-devices"
CLINoHostname = "no-hostname"
CLIUseFakeGpus = "fake-gpus"
CLIUseFakeGPUs = "fake-gpus"
CLIConfigMapData = "configmap-data"
CLIWebSystemdSocket = "web-systemd-socket"
CLIWebConfigFile = "web-config-file"
Expand Down Expand Up @@ -163,7 +163,7 @@ func NewApp(buildVersion ...string) *cli.App {
EnvVars: []string{"DCGM_EXPORTER_OTHER_DEVICES_STR"},
},
&cli.BoolFlag{
Name: CLIUseFakeGpus,
Name: CLIUseFakeGPUs,
Value: false,
Usage: "Accept GPUs that are fake, for testing purposes only",
EnvVars: []string{"DCGM_EXPORTER_USE_FAKE_GPUS"},
Expand Down Expand Up @@ -371,7 +371,7 @@ func contextToConfig(c *cli.Context) (*dcgmexporter.Config, error) {
SwitchDevices: sOpt,
CPUDevices: cOpt,
NoHostname: c.Bool(CLINoHostname),
UseFakeGpus: c.Bool(CLIUseFakeGpus),
UseFakeGPUs: c.Bool(CLIUseFakeGPUs),
ConfigMapData: c.String(CLIConfigMapData),
WebSystemdSocket: c.Bool(CLIWebSystemdSocket),
WebConfigFile: c.String(CLIWebConfigFile),
Expand Down
4 changes: 2 additions & 2 deletions pkg/dcgmexporter/gpu_collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ import (
)

func NewDCGMCollector(c []Counter, config *Config, entityType dcgm.Field_Entity_Group) (*DCGMCollector, func(), error) {
sysInfo, err := InitializeSystemInfo(config.GPUDevices, config.SwitchDevices, config.CPUDevices, config.UseFakeGpus, entityType)
sysInfo, err := InitializeSystemInfo(config.GPUDevices, config.SwitchDevices, config.CPUDevices, config.UseFakeGPUs, entityType)
if err != nil {
return nil, func() {}, err
}
Expand Down Expand Up @@ -207,7 +207,7 @@ func ToCPUMetric(values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, us
return metrics
}

func ToMetric(values []dcgm.FieldValue_v1, c []Counter, d dcgm.Device, instanceInfo *GpuInstanceInfo, useOld bool, hostname string) []Metric {
func ToMetric(values []dcgm.FieldValue_v1, c []Counter, d dcgm.Device, instanceInfo *GPUInstanceInfo, useOld bool, hostname string) []Metric {
var metrics []Metric
var labels = map[string]string{}

Expand Down
4 changes: 2 additions & 2 deletions pkg/dcgmexporter/gpu_collector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ func testDCGMGPUCollector(t *testing.T, counters []Counter) (*DCGMCollector, fun
GPUDevices: dOpt,
NoHostname: false,
UseOldNamespace: false,
UseFakeGpus: false,
UseFakeGPUs: false,
}

dcgmGetAllDeviceCount = func() (uint, error) {
Expand Down Expand Up @@ -152,7 +152,7 @@ func testDCGMCPUCollector(t *testing.T, counters []Counter) (*DCGMCollector, fun
CPUDevices: dOpt,
NoHostname: false,
UseOldNamespace: false,
UseFakeGpus: false,
UseFakeGPUs: false,
}

dcgmGetAllDeviceCount = func() (uint, error) {
Expand Down
2 changes: 1 addition & 1 deletion pkg/dcgmexporter/kubernetes.go
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ func ToDeviceToPod(devicePods *podresourcesapi.ListPodResourcesResponse, sysInfo
if strings.HasPrefix(deviceid, MIG_UUID_PREFIX) {
gpuUuid, gi, _, err := nvml.ParseMigDeviceUUID(deviceid)
if err == nil {
giIdentifier := GetGpuInstanceIdentifier(sysInfo, gpuUuid, gi)
giIdentifier := GetGPUInstanceIdentifier(sysInfo, gpuUuid, gi)
deviceToPodMap[giIdentifier] = podInfo
} else {
gpuUuid = deviceid[len(MIG_UUID_PREFIX):]
Expand Down
132 changes: 66 additions & 66 deletions pkg/dcgmexporter/system_info.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,16 +47,16 @@ type ComputeInstanceInfo struct {
EntityId uint
}

// GPUInstanceInfo (named GpuInstanceInfo before this commit; both the old and
// the new declaration line appear below) describes one GPU instance found
// while walking the DCGM GPU instance hierarchy.
type GpuInstanceInfo struct {
type GPUInstanceInfo struct {
// Info is copied from the hierarchy entry's Info when the instance is recorded.
Info dcgm.MigEntityInfo
// ProfileName is "" when the instance is first recorded and is filled in
// afterwards by SetGPUInstanceProfileName.
ProfileName string
// EntityId is the id that SetGPUInstanceProfileName and GPUInstanceIdExists
// match against.
EntityId uint
// ComputeInstances collects the hierarchy entries whose parent is a GPU
// instance (FE_GPU_I), appended in the order they are encountered.
ComputeInstances []ComputeInstanceInfo
}

// GPUInfo (named GpuInfo before this commit; old and new lines are both shown)
// is the per-GPU record stored in SystemInfo.GPUs.
type GpuInfo struct {
type GPUInfo struct {
// DeviceInfo is the result of dcgmGetDeviceInfo for this GPU; with fake GPUs
// enabled and that call failing, only its GPU and UUID fields are populated.
DeviceInfo dcgm.Device
// GPUInstances (formerly GpuInstances) lists the GPU instances discovered
// under this GPU.
GpuInstances []GpuInstanceInfo
GPUInstances []GPUInstanceInfo
// MigEnabled defaults to false and is set to true once a GPU instance is
// added for this GPU.
MigEnabled bool
}

Expand All @@ -71,8 +71,8 @@ type CPUInfo struct {
}

type SystemInfo struct {
GpuCount uint
Gpus [dcgm.MAX_NUM_DEVICES]GpuInfo
GPUCount uint
GPUs [dcgm.MAX_NUM_DEVICES]GPUInfo
gOpt DeviceOptions
sOpt DeviceOptions
cOpt DeviceOptions
Expand All @@ -84,15 +84,15 @@ type SystemInfo struct {
// MonitoringInfo describes a single entity selected for monitoring, as built
// by the AddAll* and GetMonitoringInfoFor* helpers.
type MonitoringInfo struct {
// Entity is the DCGM group/id pair, e.g. {FE_GPU, GPU id} for a whole GPU or
// {FE_GPU_I, instance entity id} for a GPU instance.
Entity dcgm.GroupEntityPair
// DeviceInfo is the device record of the GPU the entity belongs to.
DeviceInfo dcgm.Device
// InstanceInfo is nil for whole-GPU entries and points at the matching GPU
// instance for FE_GPU_I entries. The pointee type was renamed from
// GpuInstanceInfo to GPUInstanceInfo in this commit (both lines shown).
InstanceInfo *GpuInstanceInfo
InstanceInfo *GPUInstanceInfo
// ParentId is set to PARENT_ID_IGNORED by the GPU and GPU-instance helpers
// visible in this diff; other uses are not shown here.
ParentId uint
}

func SetGpuInstanceProfileName(sysInfo *SystemInfo, entityId uint, profileName string) bool {
for i := uint(0); i < sysInfo.GpuCount; i++ {
for j := range sysInfo.Gpus[i].GpuInstances {
if sysInfo.Gpus[i].GpuInstances[j].EntityId == entityId {
sysInfo.Gpus[i].GpuInstances[j].ProfileName = profileName
func SetGPUInstanceProfileName(sysInfo *SystemInfo, entityId uint, profileName string) bool {
for i := uint(0); i < sysInfo.GPUCount; i++ {
for j := range sysInfo.GPUs[i].GPUInstances {
if sysInfo.GPUs[i].GPUInstances[j].EntityId == entityId {
sysInfo.GPUs[i].GPUInstances[j].ProfileName = profileName
return true
}
}
Expand All @@ -105,7 +105,7 @@ func SetMigProfileNames(sysInfo *SystemInfo, values []dcgm.FieldValue_v2) error
notFound := false
err := fmt.Errorf("Cannot find match for entities:")
for _, v := range values {
found := SetGpuInstanceProfileName(sysInfo, v.EntityId, dcgm.Fv2_String(v))
found := SetGPUInstanceProfileName(sysInfo, v.EntityId, dcgm.Fv2_String(v))
if found == false {
err = fmt.Errorf("%s group %d, id %d", err, v.EntityGroupId, v.EntityId)
notFound = true
Expand Down Expand Up @@ -137,9 +137,9 @@ func PopulateMigProfileNames(sysInfo *SystemInfo, entities []dcgm.GroupEntityPai
return SetMigProfileNames(sysInfo, values)
}

func GpuIdExists(sysInfo *SystemInfo, gpuId int) bool {
for i := uint(0); i < sysInfo.GpuCount; i++ {
if sysInfo.Gpus[i].DeviceInfo.GPU == uint(gpuId) {
func GPUIdExists(sysInfo *SystemInfo, gpuId int) bool {
for i := uint(0); i < sysInfo.GPUCount; i++ {
if sysInfo.GPUs[i].DeviceInfo.GPU == uint(gpuId) {
return true
}
}
Expand All @@ -164,9 +164,9 @@ func CPUIdExists(sysInfo *SystemInfo, cpuId int) bool {
return false
}

func GpuInstanceIdExists(sysInfo *SystemInfo, gpuInstanceId int) bool {
for i := uint(0); i < sysInfo.GpuCount; i++ {
for _, instance := range sysInfo.Gpus[i].GpuInstances {
func GPUInstanceIdExists(sysInfo *SystemInfo, gpuInstanceId int) bool {
for i := uint(0); i < sysInfo.GPUCount; i++ {
for _, instance := range sysInfo.GPUs[i].GPUInstances {
if instance.EntityId == uint(gpuInstanceId) {
return true
}
Expand Down Expand Up @@ -255,15 +255,15 @@ func VerifyDevicePresence(sysInfo *SystemInfo, gOpt DeviceOptions) error {
if len(gOpt.MajorRange) > 0 && gOpt.MajorRange[0] != -1 {
// Verify we can find all the specified GPUs
for _, gpuId := range gOpt.MajorRange {
if GpuIdExists(sysInfo, gpuId) == false {
if GPUIdExists(sysInfo, gpuId) == false {
return fmt.Errorf("Couldn't find requested GPU id %d", gpuId)
}
}
}

if len(gOpt.MinorRange) > 0 && gOpt.MinorRange[0] != -1 {
for _, gpuInstanceId := range gOpt.MinorRange {
if GpuInstanceIdExists(sysInfo, gpuInstanceId) == false {
if GPUInstanceIdExists(sysInfo, gpuInstanceId) == false {
return fmt.Errorf("Couldn't find requested GPU instance id %d", gpuInstanceId)
}
}
Expand Down Expand Up @@ -359,21 +359,21 @@ func InitializeNvSwitchInfo(sysInfo SystemInfo, sOpt DeviceOptions) (SystemInfo,
return sysInfo, nil
}

func InitializeGpuInfo(sysInfo SystemInfo, gOpt DeviceOptions, useFakeGpus bool) (SystemInfo, error) {
func InitializeGPUInfo(sysInfo SystemInfo, gOpt DeviceOptions, useFakeGPUs bool) (SystemInfo, error) {
gpuCount, err := dcgmGetAllDeviceCount()
if err != nil {
return sysInfo, err
}
sysInfo.GpuCount = gpuCount
sysInfo.GPUCount = gpuCount

for i := uint(0); i < sysInfo.GpuCount; i++ {
for i := uint(0); i < sysInfo.GPUCount; i++ {
// Default mig enabled to false
sysInfo.Gpus[i].MigEnabled = false
sysInfo.Gpus[i].DeviceInfo, err = dcgmGetDeviceInfo(i)
sysInfo.GPUs[i].MigEnabled = false
sysInfo.GPUs[i].DeviceInfo, err = dcgmGetDeviceInfo(i)
if err != nil {
if useFakeGpus {
sysInfo.Gpus[i].DeviceInfo.GPU = i
sysInfo.Gpus[i].DeviceInfo.UUID = fmt.Sprintf("fake%d", i)
if useFakeGPUs {
sysInfo.GPUs[i].DeviceInfo.GPU = i
sysInfo.GPUs[i].DeviceInfo.UUID = fmt.Sprintf("fake%d", i)
} else {
return sysInfo, err
}
Expand All @@ -395,20 +395,20 @@ func InitializeGpuInfo(sysInfo SystemInfo, gOpt DeviceOptions, useFakeGpus bool)
// We are adding a GPU instance
gpuId = hierarchy.EntityList[i].Parent.EntityId
entityId := hierarchy.EntityList[i].Entity.EntityId
instanceInfo := GpuInstanceInfo{
instanceInfo := GPUInstanceInfo{
Info: hierarchy.EntityList[i].Info,
ProfileName: "",
EntityId: entityId,
}
sysInfo.Gpus[gpuId].MigEnabled = true
sysInfo.Gpus[gpuId].GpuInstances = append(sysInfo.Gpus[gpuId].GpuInstances, instanceInfo)
sysInfo.GPUs[gpuId].MigEnabled = true
sysInfo.GPUs[gpuId].GPUInstances = append(sysInfo.GPUs[gpuId].GPUInstances, instanceInfo)
entities = append(entities, dcgm.GroupEntityPair{dcgm.FE_GPU_I, entityId})
instanceIndex = len(sysInfo.Gpus[gpuId].GpuInstances) - 1
instanceIndex = len(sysInfo.GPUs[gpuId].GPUInstances) - 1
} else if hierarchy.EntityList[i].Parent.EntityGroupId == dcgm.FE_GPU_I {
// Add the compute instance, gpuId is recorded previously
entityId := hierarchy.EntityList[i].Entity.EntityId
ciInfo := ComputeInstanceInfo{hierarchy.EntityList[i].Info, "", entityId}
sysInfo.Gpus[gpuId].GpuInstances[instanceIndex].ComputeInstances = append(sysInfo.Gpus[gpuId].GpuInstances[instanceIndex].ComputeInstances, ciInfo)
sysInfo.GPUs[gpuId].GPUInstances[instanceIndex].ComputeInstances = append(sysInfo.GPUs[gpuId].GPUInstances[instanceIndex].ComputeInstances, ciInfo)
}
}

Expand All @@ -424,7 +424,7 @@ func InitializeGpuInfo(sysInfo SystemInfo, gOpt DeviceOptions, useFakeGpus bool)
return sysInfo, err
}

func InitializeSystemInfo(gOpt DeviceOptions, sOpt DeviceOptions, cOpt DeviceOptions, useFakeGpus bool, entityType dcgm.Field_Entity_Group) (SystemInfo, error) {
func InitializeSystemInfo(gOpt DeviceOptions, sOpt DeviceOptions, cOpt DeviceOptions, useFakeGPUs bool, entityType dcgm.Field_Entity_Group) (SystemInfo, error) {
sysInfo := SystemInfo{}

logrus.Info("Initializing system entities of type: ", entityType)
Expand All @@ -437,7 +437,7 @@ func InitializeSystemInfo(gOpt DeviceOptions, sOpt DeviceOptions, cOpt DeviceOpt
return InitializeNvSwitchInfo(sysInfo, sOpt)
case dcgm.FE_GPU:
sysInfo.InfoType = dcgm.FE_GPU
return InitializeGpuInfo(sysInfo, gOpt, useFakeGpus)
return InitializeGPUInfo(sysInfo, gOpt, useFakeGPUs)
case dcgm.FE_CPU:
sysInfo.InfoType = dcgm.FE_CPU
return InitializeCPUInfo(sysInfo, cOpt)
Expand Down Expand Up @@ -541,13 +541,13 @@ func CreateGroupFromSystemInfo(sysInfo SystemInfo) (dcgm.GroupHandle, func(), er
return groupId, func() { dcgm.DestroyGroup(groupId) }, nil
}

func AddAllGpus(sysInfo SystemInfo) []MonitoringInfo {
func AddAllGPUs(sysInfo SystemInfo) []MonitoringInfo {
var monitoring []MonitoringInfo

for i := uint(0); i < sysInfo.GpuCount; i++ {
for i := uint(0); i < sysInfo.GPUCount; i++ {
mi := MonitoringInfo{
dcgm.GroupEntityPair{dcgm.FE_GPU, sysInfo.Gpus[i].DeviceInfo.GPU},
sysInfo.Gpus[i].DeviceInfo,
dcgm.GroupEntityPair{dcgm.FE_GPU, sysInfo.GPUs[i].DeviceInfo.GPU},
sysInfo.GPUs[i].DeviceInfo,
nil,
PARENT_ID_IGNORED,
}
Expand Down Expand Up @@ -759,24 +759,24 @@ func AddAllCPUCores(sysInfo SystemInfo) []MonitoringInfo {
return monitoring
}

func AddAllGpuInstances(sysInfo SystemInfo, addFlexibly bool) []MonitoringInfo {
func AddAllGPUInstances(sysInfo SystemInfo, addFlexibly bool) []MonitoringInfo {
var monitoring []MonitoringInfo

for i := uint(0); i < sysInfo.GpuCount; i++ {
if addFlexibly == true && len(sysInfo.Gpus[i].GpuInstances) == 0 {
for i := uint(0); i < sysInfo.GPUCount; i++ {
if addFlexibly == true && len(sysInfo.GPUs[i].GPUInstances) == 0 {
mi := MonitoringInfo{
dcgm.GroupEntityPair{dcgm.FE_GPU, sysInfo.Gpus[i].DeviceInfo.GPU},
sysInfo.Gpus[i].DeviceInfo,
dcgm.GroupEntityPair{dcgm.FE_GPU, sysInfo.GPUs[i].DeviceInfo.GPU},
sysInfo.GPUs[i].DeviceInfo,
nil,
PARENT_ID_IGNORED,
}
monitoring = append(monitoring, mi)
} else {
for j := 0; j < len(sysInfo.Gpus[i].GpuInstances); j++ {
for j := 0; j < len(sysInfo.GPUs[i].GPUInstances); j++ {
mi := MonitoringInfo{
dcgm.GroupEntityPair{dcgm.FE_GPU_I, sysInfo.Gpus[i].GpuInstances[j].EntityId},
sysInfo.Gpus[i].DeviceInfo,
&sysInfo.Gpus[i].GpuInstances[j],
dcgm.GroupEntityPair{dcgm.FE_GPU_I, sysInfo.GPUs[i].GPUInstances[j].EntityId},
sysInfo.GPUs[i].DeviceInfo,
&sysInfo.GPUs[i].GPUInstances[j],
PARENT_ID_IGNORED,
}
monitoring = append(monitoring, mi)
Expand All @@ -787,12 +787,12 @@ func AddAllGpuInstances(sysInfo SystemInfo, addFlexibly bool) []MonitoringInfo {
return monitoring
}

func GetMonitoringInfoForGpu(sysInfo SystemInfo, gpuId int) *MonitoringInfo {
for i := uint(0); i < sysInfo.GpuCount; i++ {
if sysInfo.Gpus[i].DeviceInfo.GPU == uint(gpuId) {
func GetMonitoringInfoForGPU(sysInfo SystemInfo, gpuId int) *MonitoringInfo {
for i := uint(0); i < sysInfo.GPUCount; i++ {
if sysInfo.GPUs[i].DeviceInfo.GPU == uint(gpuId) {
return &MonitoringInfo{
dcgm.GroupEntityPair{dcgm.FE_GPU, sysInfo.Gpus[i].DeviceInfo.GPU},
sysInfo.Gpus[i].DeviceInfo,
dcgm.GroupEntityPair{dcgm.FE_GPU, sysInfo.GPUs[i].DeviceInfo.GPU},
sysInfo.GPUs[i].DeviceInfo,
nil,
PARENT_ID_IGNORED,
}
Expand All @@ -802,13 +802,13 @@ func GetMonitoringInfoForGpu(sysInfo SystemInfo, gpuId int) *MonitoringInfo {
return nil
}

func GetMonitoringInfoForGpuInstance(sysInfo SystemInfo, gpuInstanceId int) *MonitoringInfo {
for i := uint(0); i < sysInfo.GpuCount; i++ {
for _, instance := range sysInfo.Gpus[i].GpuInstances {
func GetMonitoringInfoForGPUInstance(sysInfo SystemInfo, gpuInstanceId int) *MonitoringInfo {
for i := uint(0); i < sysInfo.GPUCount; i++ {
for _, instance := range sysInfo.GPUs[i].GPUInstances {
if instance.EntityId == uint(gpuInstanceId) {
return &MonitoringInfo{
dcgm.GroupEntityPair{dcgm.FE_GPU_I, uint(gpuInstanceId)},
sysInfo.Gpus[i].DeviceInfo,
sysInfo.GPUs[i].DeviceInfo,
&instance,
PARENT_ID_IGNORED,
}
Expand All @@ -831,34 +831,34 @@ func GetMonitoredEntities(sysInfo SystemInfo) []MonitoringInfo {
} else if sysInfo.InfoType == dcgm.FE_CPU_CORE {
monitoring = AddAllCPUCores(sysInfo)
} else if sysInfo.gOpt.Flex == true {
monitoring = AddAllGpuInstances(sysInfo, true)
monitoring = AddAllGPUInstances(sysInfo, true)
} else {
if len(sysInfo.gOpt.MajorRange) > 0 && sysInfo.gOpt.MajorRange[0] == -1 {
monitoring = AddAllGpus(sysInfo)
monitoring = AddAllGPUs(sysInfo)
} else {
for _, gpuId := range sysInfo.gOpt.MajorRange {
// We've already verified that everything in the options list exists
monitoring = append(monitoring, *GetMonitoringInfoForGpu(sysInfo, gpuId))
monitoring = append(monitoring, *GetMonitoringInfoForGPU(sysInfo, gpuId))
}
}

if len(sysInfo.gOpt.MinorRange) > 0 && sysInfo.gOpt.MinorRange[0] == -1 {
monitoring = AddAllGpuInstances(sysInfo, false)
monitoring = AddAllGPUInstances(sysInfo, false)
} else {
for _, gpuInstanceId := range sysInfo.gOpt.MinorRange {
// We've already verified that everything in the options list exists
monitoring = append(monitoring, *GetMonitoringInfoForGpuInstance(sysInfo, gpuInstanceId))
monitoring = append(monitoring, *GetMonitoringInfoForGPUInstance(sysInfo, gpuInstanceId))
}
}
}

return monitoring
}

func GetGpuInstanceIdentifier(sysInfo SystemInfo, gpuuuid string, gpuInstanceId uint) string {
for i := uint(0); i < sysInfo.GpuCount; i++ {
if sysInfo.Gpus[i].DeviceInfo.UUID == gpuuuid {
identifier := fmt.Sprintf("%d-%d", sysInfo.Gpus[i].DeviceInfo.GPU, gpuInstanceId)
func GetGPUInstanceIdentifier(sysInfo SystemInfo, gpuuuid string, gpuInstanceId uint) string {
for i := uint(0); i < sysInfo.GPUCount; i++ {
if sysInfo.GPUs[i].DeviceInfo.UUID == gpuuuid {
identifier := fmt.Sprintf("%d-%d", sysInfo.GPUs[i].DeviceInfo.GPU, gpuInstanceId)
return identifier
}
}
Expand Down
Loading

0 comments on commit 56aa52e

Please sign in to comment.