From ee57d650a7692c1ac03f74ac073d0fd150c9f030 Mon Sep 17 00:00:00 2001 From: Maciej Kempin Date: Mon, 24 Jul 2023 13:45:56 +0100 Subject: [PATCH] Override defaults if GPU (#664) * first commit * missing comma * missing function * simplification * updates tables * logging no 1 * more debug * fixing things and log them * fixing * accelerator type is just a string not url * Revert "accelerator type is just a string not url" This reverts commit c09b6fe13a7d5b480922c9e7fd409c8b4c5ee8a3. * more logging and stuff * more logs * typo * more and more logging * use zone instead of region * styles * do no stack accelerator type * removing debug * always load defaults for gpu plan * forgotten line * Some nice logs * pasing only count * adding statement * disk spart way * cleanup * do not assign VMsize if gpu VM Type * Add gpu to query tags (#670) * added gpu to the tags list for quering images in api-selector * added debug lines * add gpu_vm_type to params in api selector * removed debug lines * Update CHANGELOG.md --- CHANGELOG.md | 3 ++ backend/gce.go | 110 +++++++++++++++++++++++++++++++++++++----- image/api_selector.go | 8 +++ image/params.go | 1 + 4 files changed, 109 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4a698364c..d0069e0da 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/). ### Removed +### Added +- Adding GPU Support + ### Fixed ## [6.2.4] - 2019-10-29 diff --git a/backend/gce.go b/backend/gce.go index faafe93b7..c4bc91dda 100644 --- a/backend/gce.go +++ b/backend/gce.go @@ -151,9 +151,20 @@ Set-LocalUser -Name travis -Password $pw "large": "n2-standard-4", "x-large": "n2-standard-8", "2x-large": "n2-standard-16", + "gpu-medium": "n1-standard-8", + "gpu-xlarge": "n1-standard-8", } ) +func stringInSlice(a string, list []string) bool { + for _, b := range list { + if b == a { + return true + } + } + return false +} + type gceStartupScriptData struct { AutoImplode bool HardTimeoutMinutes int64 @@ -180,6 +191,53 @@ func (oe *gceOpError) Error() string { return strings.Join(errStrs, ", ") } +type singleGpuMapping struct { + GpuCount int64 + GpuType string + DiskSize int64 +} + +var gpuMedium = singleGpuMapping{ + GpuCount: 1, + GpuType: "nvidia-tesla-t4", + DiskSize: 300,} +var gpuXLarge = singleGpuMapping{ + GpuCount: 1, + GpuType: "nvidia-tesla-v100", + DiskSize: 300,} + +func GpuMapping(vmSize string) (value singleGpuMapping) { + gpuMapping := map[string] singleGpuMapping{ + "gpu-medium": gpuMedium, + "gpu-xlarge": gpuXLarge, + } + return gpuMapping[vmSize] +} + + +func GpuDefaultGpuCount(vmSize string) (gpuCountInt int64) { + return GpuMapping(vmSize).GpuCount +} + +func GpuDefaultGpuDiskSize(vmSize string) (gpuDiskSizeInt int64) { + return GpuMapping(vmSize).DiskSize +} + +func GpuDefaultGpuType(vmSize string) (gpuTypeString string) { + return GpuMapping(vmSize).GpuType +} + +func GPUType(varSize string) string { + switch varSize { + case "gpu-medium": + return "gpu-medium" + case "gpu-xlarge": + return "gpu-xlarge" + default: + return "" + } +} + type gceAccountJSON struct { ClientEmail string `json:"client_email"` PrivateKey string `json:"private_key"` @@ -827,7 +885,9 @@ func (p *gceProvider) Setup(ctx gocontext.Context) error { machineTypes := []string{p.ic.MachineType, p.ic.PremiumMachineType} for _, machineType := range gceVMSizeMapping { - machineTypes = append(machineTypes, machineType); + if !stringInSlice(machineType, machineTypes) { + machineTypes = append(machineTypes, machineType); + } } for _, zoneName := range append(zoneNames, p.alternateZones...) { for _, machineType := range machineTypes { @@ -1421,6 +1481,7 @@ func (p *gceProvider) imageSelect(ctx gocontext.Context, startAttributes *StartA jobID, _ := context.JobIDFromContext(ctx) repo, _ := context.RepositoryFromContext(ctx) + var gpuVMType = GPUType(startAttributes.VMSize) if startAttributes.ImageName != "" { imageName = startAttributes.ImageName @@ -1434,6 +1495,7 @@ func (p *gceProvider) imageSelect(ctx gocontext.Context, startAttributes *StartA OS: startAttributes.OS, JobID: jobID, Repo: repo, + GpuVMType: gpuVMType, }) if err != nil { @@ -1485,11 +1547,31 @@ func (p *gceProvider) buildInstance(ctx gocontext.Context, c *gceStartContext) ( Zone: c.zoneName, } + var gpuVMType = GPUType(c.startAttributes.VMSize) + + machineType := p.ic.MachineType + if c.startAttributes.VMType == "premium" { + c.startAttributes.VMSize = "premium" + machineType = p.ic.PremiumMachineType + } else if c.startAttributes.VMSize != "" { + if mtype, ok := gceVMSizeMapping[c.startAttributes.VMSize]; ok { + machineType = mtype; + //storing converted machine type for instance size identification + if gpuVMType == "" { + c.startAttributes.VMSize = machineType + } + } + } + diskSize := p.ic.DiskSize if c.startAttributes.OS == "windows" { diskSize = p.ic.DiskSizeWindows } + if gpuVMType != "" { + diskSize = GpuDefaultGpuDiskSize(gpuVMType) + } + diskInitParams := &compute.AttachedDiskInitializeParams{ SourceImage: c.image.SelfLink, DiskType: gcePdSSDForZone(c.zoneName), @@ -1506,18 +1588,6 @@ func (p *gceProvider) buildInstance(ctx gocontext.Context, c *gceStartContext) ( }, } - machineType := p.ic.MachineType - if c.startAttributes.VMType == "premium" { - c.startAttributes.VMSize = "premium" - machineType = p.ic.PremiumMachineType - } else if c.startAttributes.VMSize != "" { - if mtype, ok := gceVMSizeMapping[c.startAttributes.VMSize]; ok { - machineType = mtype; - //storing converted machine type for instance size identification - c.startAttributes.VMSize = machineType - } - } - var ok bool inst.MachineType, ok = p.machineTypeSelfLinks[gceMtKey(c.zoneName, machineType)] if !ok { @@ -1532,6 +1602,19 @@ func (p *gceProvider) buildInstance(ctx gocontext.Context, c *gceStartContext) ( p.projectID, c.startAttributes.VMConfig.Zone, c.startAttributes.VMConfig.GpuType) + } else if gpuVMType != "" { + logger.WithField("acceleratorConfig.AcceleratorType", acceleratorConfig.AcceleratorType).Debug("Setting AcceleratorConfig") + if !strings.HasPrefix(acceleratorConfig.AcceleratorType, "https") { + notUrlAcceleratorType := GpuDefaultGpuType(gpuVMType) + logger.WithField("notUrlAcceleratorType", notUrlAcceleratorType).Debug("Retrieving AcceleratorType from defaults") + logger.WithField("AcceleratorCount", p.ic.AcceleratorConfig.AcceleratorCount).Debug("Retrieving AcceleratorCount from defaults") + acceleratorConfig.AcceleratorCount = GpuDefaultGpuCount(gpuVMType) + acceleratorConfig.AcceleratorType = fmt.Sprintf("https://www.googleapis.com/compute/v1/projects/%s/zones/%s/acceleratorTypes/%s", + p.projectID, + c.zoneName, + notUrlAcceleratorType) + logger.WithField("acceleratorConfig.AcceleratorType", acceleratorConfig.AcceleratorType).Debug("Url for Accelerator Type is:") + } } var subnetwork string @@ -1595,6 +1678,7 @@ func (p *gceProvider) buildInstance(ctx gocontext.Context, c *gceStartContext) ( } inst.GuestAccelerators = []*compute.AcceleratorConfig{} + if acceleratorConfig.AcceleratorCount > 0 { logger.Debug("GPU requested, setting acceleratorConfig") inst.GuestAccelerators = append(inst.GuestAccelerators, acceleratorConfig) diff --git a/image/api_selector.go b/image/api_selector.go index 436717308..e6f440b17 100644 --- a/image/api_selector.go +++ b/image/api_selector.go @@ -119,6 +119,7 @@ func (as *APISelector) queryWithTags(ctx gocontext.Context, infra string, tags [ bodyLines := []string{} lastJobID := uint64(0) lastRepo := "" + gpuVMType := "" for _, ts := range tags { qs := url.Values{} @@ -127,6 +128,7 @@ func (as *APISelector) queryWithTags(ctx gocontext.Context, infra string, tags [ qs.Set("limit", "1") qs.Set("job_id", fmt.Sprintf("%v", ts.JobID)) qs.Set("repo", ts.Repo) + qs.Set("gpu_vm_type", ts.GpuVMType) qs.Set("is_default", fmt.Sprintf("%v", ts.IsDefault)) if len(ts.Tags) > 0 { qs.Set("tags", strings.Join(ts.Tags, ",")) @@ -135,6 +137,7 @@ func (as *APISelector) queryWithTags(ctx gocontext.Context, infra string, tags [ bodyLines = append(bodyLines, qs.Encode()) lastJobID = ts.JobID lastRepo = ts.Repo + gpuVMType = ts.GpuVMType } qs := url.Values{} @@ -144,6 +147,7 @@ func (as *APISelector) queryWithTags(ctx gocontext.Context, infra string, tags [ qs.Set("limit", "1") qs.Set("job_id", fmt.Sprintf("%v", lastJobID)) qs.Set("repo", lastRepo) + qs.Set("gpu_vm_type", gpuVMType) bodyLines = append(bodyLines, qs.Encode()) @@ -233,6 +237,7 @@ type tagSet struct { JobID uint64 Repo string + GpuVMType string } func (ts *tagSet) GoString() string { @@ -244,6 +249,7 @@ func (as *APISelector) buildCandidateTags(params *Params) ([]*tagSet, error) { Tags: []string{}, JobID: params.JobID, Repo: params.Repo, + GpuVMType: params.GpuVMType, } candidateTags := []*tagSet{} @@ -255,6 +261,7 @@ func (as *APISelector) buildCandidateTags(params *Params) ([]*tagSet, error) { Tags: []string{tag}, JobID: params.JobID, Repo: params.Repo, + GpuVMType: params.GpuVMType, }) } @@ -265,6 +272,7 @@ func (as *APISelector) buildCandidateTags(params *Params) ([]*tagSet, error) { Tags: tags, JobID: params.JobID, Repo: params.Repo, + GpuVMType: params.GpuVMType, }) } diff --git a/image/params.go b/image/params.go index df3f0a58f..dc9c217ad 100644 --- a/image/params.go +++ b/image/params.go @@ -10,4 +10,5 @@ type Params struct { JobID uint64 Repo string + GpuVMType string }