Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Nfd refractoring #55

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
22b938a
adding nfd config change utill
Dec 25, 2024
ad921a7
NFD vars seperated in gpu and nno
AdamKaabyia Jan 23, 2025
ea554a7
NFD consts seperated in gpu and nno
AdamKaabyia Jan 23, 2025
49e6673
1.nfd package created
AdamKaabyia Jan 23, 2025
b4f7680
1.common consts added to nfd consts even tho theyre not nfd
AdamKaabyia Jan 23, 2025
73d13c8
put the consts of nfd package in its correct directory not in the tes…
AdamKaabyia Jan 24, 2025
18a07f1
put the vars of nfd package in its correct directory not in the tests…
AdamKaabyia Jan 24, 2025
f7ee7d1
gpu consts put in the gpu package and names adjusted to remove redund…
AdamKaabyia Jan 24, 2025
5b4adea
added some time consts in the consts file of the package of nvidiagpu…
AdamKaabyia Jan 24, 2025
6f851da
added a 2 consts to the nfd package and used them
AdamKaabyia Jan 25, 2025
b7aa1ea
1. seperated global variables from nfd vars
AdamKaabyia Jan 25, 2025
6122a06
adding a PCI whitelists set util
Dec 30, 2024
cd52b1d
Bump github.com/golang/glog from 1.2.0 to 1.2.4 in the go_modules group
dependabot[bot] Jan 28, 2025
a26a2dd
Added Untitled Diagram.drawio
AdamKaabyia Jan 29, 2025
e238d74
Revert "1. seperated global variables from nfd vars"
AdamKaabyia Jan 29, 2025
82a9b05
Merge remote-tracking branch 'origin/nfd-refractoring' into nfd-refra…
AdamKaabyia Jan 29, 2025
fa13690
removed the diagram i added by accident
AdamKaabyia Jan 29, 2025
d2f0fd5
Merge branch 'rh-ecosystem-edge:main' into nfd-refractoring
AdamKaabyia Jan 29, 2025
8f8535f
Making Must gather function a generic one
TomerNewman Jan 21, 2025
a3ada2b
Merge pull request #49 from rh-ecosystem-edge/add_white_list
ggordaniRed Jan 29, 2025
c7e0ed9
Merge branch 'rh-ecosystem-edge:main' into nfd-refractoring
AdamKaabyia Jan 29, 2025
3eac99c
fixed the double declaration of nfd config, the vars that are used in…
AdamKaabyia Feb 4, 2025
3b7e187
fixed error:
AdamKaabyia Feb 6, 2025
d03b8d6
fixed error in NNO testing
AdamKaabyia Feb 9, 2025
fd2409c
1. the CheckNfdInstallation seperated and put in the nfd package
AdamKaabyia Feb 10, 2025
ab36fbb
created a gpuBurnConfig.go that has the default configurations of a b…
AdamKaabyia Feb 13, 2025
c47b3f8
ranamed the file from vars to config
AdamKaabyia Feb 13, 2025
98d26b2
seperated the nfdccheck from the nfd package for SOC(seperation of co…
AdamKaabyia Feb 14, 2025
f186566
Update pkg/nfd/config.go
AdamKaabyia Feb 14, 2025
e2fdaf7
put all the funcs that use nfd in nfd pkg, next step is to seperate i…
AdamKaabyia Feb 16, 2025
eadfb14
Merge remote-tracking branch 'origin/nfd-refractoring' into nfd-refra…
AdamKaabyia Feb 16, 2025
ba84157
added the EnsureNFDIsInstalled in deplo-nno-testing
AdamKaabyia Feb 16, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ require (
github.com/Mellanox/network-operator v1.4.0
github.com/NVIDIA/gpu-operator v1.8.3-0.20240429200431-0fe1e8db32b0
github.com/NVIDIA/k8s-operator-libs v0.0.0-20240214071211-ea58a3ada15c
github.com/golang/glog v1.2.0
github.com/golang/glog v1.2.4
github.com/kelseyhightower/envconfig v1.4.0
github.com/onsi/ginkgo/v2 v2.17.1
github.com/onsi/gomega v1.32.0
Expand All @@ -53,6 +53,7 @@ require (
github.com/operator-framework/operator-lifecycle-manager v0.22.0
go.uber.org/mock v0.4.0
gopkg.in/k8snetworkplumbingwg/multus-cni.v4 v4.0.2
gopkg.in/yaml.v2 v2.4.0
k8s.io/api v0.30.1
k8s.io/apiextensions-apiserver v0.29.3
k8s.io/apimachinery v0.30.1
Expand Down Expand Up @@ -147,7 +148,6 @@ require (
gopkg.in/evanphx/json-patch.v5 v5.7.0 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/natefinch/lumberjack.v2 v2.2.1 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
k8s.io/cli-runtime v0.29.1 // indirect
k8s.io/component-base v0.29.3 // indirect
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -136,8 +136,8 @@ github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q=
github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q=
github.com/golang-migrate/migrate/v4 v4.17.0 h1:rd40H3QXU0AA4IoLllFcEAEo9dYKRHYND2gB4p7xcaU=
github.com/golang-migrate/migrate/v4 v4.17.0/go.mod h1:+Cp2mtLP4/aXDTKb9wmXYitdrNx2HGs45rbWAo6OsKM=
github.com/golang/glog v1.2.0 h1:uCdmnmatrKCgMBlM4rMuJZWOkPDqdbZPnrMXDY4gI68=
github.com/golang/glog v1.2.0/go.mod h1:6AhwSGph0fcJtXVM/PEHPqZlFeoLxhs7/t5UDAwmO+w=
github.com/golang/glog v1.2.4 h1:CNNw5U8lSiiBk7druxtSHHTsRWcxKoac6kZKm2peBBc=
github.com/golang/glog v1.2.4/go.mod h1:6AhwSGph0fcJtXVM/PEHPqZlFeoLxhs7/t5UDAwmO+w=
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE=
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
Expand Down
2 changes: 1 addition & 1 deletion internal/nvidiagpuconfig/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ type NvidiaGPUConfig struct {
BundleImage string `envconfig:"NVIDIAGPU_BUNDLE_IMAGE"`
OperatorUpgradeToChannel string `envconfig:"NVIDIAGPU_SUBSCRIPTION_UPGRADE_TO_CHANNEL"`
GPUFallbackCatalogsourceIndexImage string `envconfig:"NVIDIAGPU_GPU_FALLBACK_CATALOGSOURCE_INDEX_IMAGE"`
NFDFallbackCatalogsourceIndexImage string `envconfig:"NVIDIAGPU_NFD_FALLBACK_CATALOGSOURCE_INDEX_IMAGE"`
NFDFallbackCatalogsourceIndexImage string `envconfig:"NVIDIAGPU_NFD_FALLBACK_CATALOGSOURCE_INDEX_IMAGE"` // should this be here?
Copy link
Member

@empovit empovit Feb 24, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, it shouldn't. It belongs to the NFD package and shouldn't be specific to GPU; it must cover any scenario that involves NFD without ambiguity - NFD + GPU, NFD + NNO, NFD + GPU + NNO. This means we'll have a single NFD_FALLBACK_CATALOGSOURCE_INDEX_IMAGE instead of NVIDIAGPU_NFD_FALLBACK_CATALOGSOURCE_INDEX_IMAGE and NVIDIANETWORK_NFD_FALLBACK_CATALOGSOURCE_INDEX_IMAGE.
For backward compatibility, we may have a translation NVIDIAGPU_NFD_FALLBACK_CATALOGSOURCE_INDEX_IMAGE > NFD_FALLBACK_CATALOGSOURCE_INDEX_IMAGE (and a similar one for NNO) until the CI jobs are fixed.

}

// NewNvidiaGPUConfig returns instance of NvidiaGPUConfig type.
Expand Down
2 changes: 1 addition & 1 deletion internal/nvidianetworkconfig/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ type NvidiaNetworkConfig struct {
BundleImage string `envconfig:"NVIDIANETWORK_BUNDLE_IMAGE"`
OperatorUpgradeToChannel string `envconfig:"NVIDIANETWORK_SUBSCRIPTION_UPGRADE_TO_CHANNEL"`
NNOFallbackCatalogsourceIndexImage string `envconfig:"NVIDIANETWORK_NNO_FALLBACK_CATALOGSOURCE_INDEX_IMAGE"`
NFDFallbackCatalogsourceIndexImage string `envconfig:"NVIDIANETWORK_NFD_FALLBACK_CATALOGSOURCE_INDEX_IMAGE"`
NFDFallbackCatalogsourceIndexImage string `envconfig:"NVIDIANETWORK_NFD_FALLBACK_CATALOGSOURCE_INDEX_IMAGE"` // should this be here?
}

// NewNvidiaNetworkConfig returns instance of NvidiaNetworkConfig type.
Expand Down
8 changes: 8 additions & 0 deletions pkg/global/const.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
package global

const (
//not related to NFD but common consts between gpu and nno
UndefinedValue = "undefined"
OperatorVersionFile = "operator.version"
OpenShiftVersionFile = "ocp.version"
)
58 changes: 58 additions & 0 deletions pkg/nfd/config.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
package nfd

import (
. "github.com/rh-ecosystem-edge/nvidia-ci/pkg/global"
"time"
)

type CustomConfig struct {
CustomCatalogSourceIndexImage string
CreateCustomCatalogsource bool

CustomCatalogSource string
CatalogSource string
CleanupAfterInstall bool
}

// NewCustomConfig creates a new CustomConfig instance with default settings.
// All string fields are initialized to UndefinedValue and boolean fields to false.
func NewCustomConfig() *CustomConfig {
return &CustomConfig{
CustomCatalogSourceIndexImage: UndefinedValue,
CreateCustomCatalogsource: false,

CustomCatalogSource: UndefinedValue,
CatalogSource: UndefinedValue,
CleanupAfterInstall: false,
}
}
Comment on lines +8 to +28
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like this can be shared by all operators that might need a custom catalog source, not limited to NFD.


// NfdParams holds all the configuration details required to install or manage
// the Node Feature Discovery (NFD) operator on a cluster.
type NfdParams struct {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AFAIK this has to be NFDParams according to Go conventions.

// OLM package name (as seen in the package manifest)
Package string

// Where the NFD operator is typically found in the default OperatorHub
CatalogSourceDefault string
CatalogSourceNamespace string

// Whether to create a custom CatalogSource if the default one doesn't contain NFD
CreateCustomCatalogsource bool

// Custom CatalogSource details (used if CreateCustomCatalogsource is true)
CustomCatalogSource string
CustomCatalogSourceIndexImage string
CustomCatalogSourceDisplayName string

// Operator installation details
OperatorDeploymentName string
OperatorNamespace string

// Time intervals for checking operator readiness
OperatorCheckInterval time.Duration
OperatorTimeout time.Duration

// Flag indicating whether to remove/clean up NFD after the installation/test
CleanupAfterInstall bool
}
21 changes: 21 additions & 0 deletions pkg/nfd/consts.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
package nfd

import "time"

const (
CustomNFDCatalogSourcePublisherName = "Red Hat"
CustomCatalogSourceDisplayName = "Redhat Operators Custom"
RhcosLabel = "feature.node.kubernetes.io/system-os_release.ID"
RhcosLabelValue = "rhcos"
OperatorNamespace = "openshift-nfd"
CatalogSourceDefault = "redhat-operators"
CatalogSourceNamespace = "openshift-marketplace"
OperatorDeploymentName = "nfd-controller-manager"
Package = "nfd"
CRName = "nfd-instance"

NFDOperatorCheckInterval = 30 * time.Second
NFDOperatorTimeout = 5 * time.Minute
resourceCRD = "NodeFeatureDiscovery"
LogLevel = 100
)
38 changes: 28 additions & 10 deletions internal/deploy/deploy-nfd.go → pkg/nfd/deploy-nfd.go
Original file line number Diff line number Diff line change
@@ -1,20 +1,21 @@
package deploy
package nfd

import (
"context"
"encoding/json"
"fmt"
"gopkg.in/k8snetworkplumbingwg/multus-cni.v4/pkg/logging"
"time"

"github.com/golang/glog"
. "github.com/onsi/gomega"
"github.com/operator-framework/api/pkg/operators/v1alpha1"
"github.com/rh-ecosystem-edge/nvidia-ci/internal/get"
"github.com/rh-ecosystem-edge/nvidia-ci/internal/gpuparams"
nvidiagpuwait "github.com/rh-ecosystem-edge/nvidia-ci/internal/wait"
"github.com/rh-ecosystem-edge/nvidia-ci/pkg/clients"
"github.com/rh-ecosystem-edge/nvidia-ci/pkg/deployment"
"github.com/rh-ecosystem-edge/nvidia-ci/pkg/namespace"
"github.com/rh-ecosystem-edge/nvidia-ci/pkg/nfd"
"github.com/rh-ecosystem-edge/nvidia-ci/pkg/olm"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
Expand Down Expand Up @@ -44,8 +45,7 @@ func CreateNFDNamespace(apiClient *clients.Settings) error {
createdNfdNsBuilder, err := nfdNsBuilder.Create()

if err != nil {
glog.V(gpuparams.GpuLogLevel).Infof("error creating NFD namespace '%s' : %v ",
createdNfdNsBuilder.Definition.Name, err)
glog.V(gpuparams.GpuLogLevel).Infof("error creating NFD namespace '%s' : %v ", createdNfdNsBuilder.Definition.Name, err)

return err
}
Expand All @@ -64,8 +64,7 @@ func CreateNFDNamespace(apiClient *clients.Settings) error {
newLabeledNfdNsBuilder, err := labeledNfdNsBuilder.Update()

if err != nil {
glog.V(gpuparams.GpuLogLevel).Infof("error labeling NFD namespace %S : %v ",
newLabeledNfdNsBuilder.Definition.Name, err)
glog.V(gpuparams.GpuLogLevel).Infof("error labeling NFD namespace %S : %v ", newLabeledNfdNsBuilder.Definition.Name, err)

return err
}
Expand Down Expand Up @@ -263,7 +262,7 @@ func DeployCRInstance(apiClient *clients.Settings) error {

glog.V(gpuparams.GpuLogLevel).Infof("Creating NodeFeatureDiscovery instance from CSV almExamples")

nodeFeatureDiscoveryBuilder := nfd.NewBuilderFromObjectString(apiClient, almExamples)
nodeFeatureDiscoveryBuilder := NewBuilderFromObjectString(apiClient, almExamples)

_, err = nodeFeatureDiscoveryBuilder.Create()

Expand Down Expand Up @@ -313,7 +312,7 @@ func GetNFDCRJson(apiClient *clients.Settings, nfdCRName string, nfdNamespace st
glog.V(gpuparams.GpuLogLevel).Infof("Pull the NodeFeatureDiscovery just created from cluster, " +
"with updated fields")

pulledNodeFeatureDiscovery, err := nfd.Pull(apiClient, nfdCRName, nfdNamespace)
pulledNodeFeatureDiscovery, err := Pull(apiClient, nfdCRName, nfdNamespace)

if err != nil {
glog.V(gpuparams.GpuLogLevel).Infof("error pulling NodeFeatureDiscovery %s from "+
Expand Down Expand Up @@ -343,7 +342,7 @@ func NFDCRDeleteAndWait(apiClient *clients.Settings, nfdCRName string, nfdCRName
// return wait.PollImmediate(pollInterval, timeout, func() (bool, error) {
return wait.PollUntilContextTimeout(
context.TODO(), pollInterval, timeout, false, func(ctx context.Context) (bool, error) {
nfdCR, err := nfd.Pull(apiClient, nfdCRName, nfdCRNamespace)
nfdCR, err := Pull(apiClient, nfdCRName, nfdCRNamespace)

if err != nil {
glog.V(gpuparams.GpuLogLevel).Infof("NodeFeatureDiscovery pull from cluster error: %s\n", err)
Expand Down Expand Up @@ -472,6 +471,25 @@ func DeleteAnyNFDCSV(apiClient *clients.Settings) error {
return err
}
}

return nil
}

func CreateNFDDeployment(apiClient *clients.Settings, catalogSource, operatorDeploymentName, operatorNamespace string, checkInterval, timeout time.Duration, logLevel logging.Level) bool {
glog.V(glog.Level(logLevel)).Info("Deploying NFD Subscription")
err := CreateNFDSubscription(apiClient, catalogSource)
Expect(err).ToNot(HaveOccurred(), "error creating NFD Subscription: %v", err)

glog.V(glog.Level(logLevel)).Info("Sleeping for 2 minutes to allow the NFD Operator deployment to stabilize")
time.Sleep(2 * time.Minute)

glog.V(glog.Level(logLevel)).Infof("Waiting up to %v for NFD Operator deployment to be fully created", timeout)
nfdDeploymentCreated := nvidiagpuwait.DeploymentCreated(apiClient, operatorDeploymentName, operatorNamespace, checkInterval, timeout)
Expect(nfdDeploymentCreated).ToNot(BeFalse(), "timed out waiting for NFD operator deployment")

glog.V(glog.Level(logLevel)).Info("Checking if NFD Operator deployment is active")
nfdDeployed, err := CheckNFDOperatorDeployed(apiClient, 4*time.Minute)
Expect(err).ToNot(HaveOccurred(), "error deploying NFD Operator in NFD namespace: %v", err)

return nfdDeployed
}
Loading