dmwm · amaltaro · Aug 15, 2023 · Aug 15, 2023 · Aug 16, 2023 · Aug 16, 2023
diff --git a/src/python/Utils/Utilities.py b/src/python/Utils/Utilities.py
@@ -10,6 +10,8 @@
 import sys
 from types import ModuleType, FunctionType
 from gc import get_referents
+from distutils.version import StrictVersion
+
 
 def lowerCmsHeaders(headers):
     """
@@ -295,3 +297,21 @@ def encodeUnicodeToBytesConditional(value, errors="ignore", condition=True):
     if condition:
         return encodeUnicodeToBytes(value, errors)
     return value
+
+
+def orderVersionList(versionList):
+    """
+    Return a new list with the version-style strings in ascending order.
+    The order of precedence digits is from left to right. E.g.:
+      from: ["2.3.1", "1.2.3", "3.2.1", "1.3.2"]
+      to:   ["1.2.3", "1.3.2", "2.3.1", "3.2.1"]
+    The input is left untouched; a bare major like "1" is ranked as "1.0".
+    :param versionList: list of strings
+    :return: an ordered list; or the initial data if different than list.
+    NOTE: implementation suggested in:
+    https://stackoverflow.com/questions/2574080/sorting-a-list-of-dot-separated-numbers-like-software-versions
+    """
+    if not isinstance(versionList, list):
+        return versionList
+    # StrictVersion rejects a lone major number, hence the ".0" padding
+    return sorted(versionList, key=lambda ver: StrictVersion(ver if "." in ver else ver + ".0"))
diff --git a/src/python/WMCore/BossAir/Plugins/BasePlugin.py b/src/python/WMCore/BossAir/Plugins/BasePlugin.py
@@ -8,7 +8,7 @@
 from builtins import object, str, bytes
 from future.utils import viewvalues
 
-from Utils.Utilities import decodeBytesToUnicode
+from Utils.Utilities import decodeBytesToUnicode, orderVersionList
 from WMCore.WMException import WMException
 from WMCore.WMRuntime.Tools.Scram import ARCH_TO_OS, SCRAM_TO_ARCH
 
@@ -181,3 +181,31 @@ def scramArchtoRequiredArch(scramArch=None):
             archs = defaultArch
 
         return archs
+
+    @staticmethod
+    def cudaCapabilityToSingleVersion(capabilities=None):
+        """
+        Given a list of CUDA capabilities (with strings in a version style),
+        finds the smallest version required and converts it to a single integer
+        for comparison/job matchmaking purposes.
+        Version conversion formula is: (1000 * major + 10 * medium + minor)
+        :param capabilities: a list of string versions
+        :return: an integer with the version value; 0 in case of failure
+                 (empty, None or non-list input)
+
+        For further details:
+        https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART____VERSION.html
+        """
+        defaultRes = 0
+        # orderVersionList hands non-list input back untouched, so reject it
+        # here instead of indexing into a string, tuple or dict
+        if not capabilities or not isinstance(capabilities, list):
+            return defaultRes
+
+        # get an ordered list of the versions and use the very first element
+        smallestVersion = orderVersionList(capabilities)[0].split(".")
+        # pad short versions such as "1.2" with zeros up to three components
+        smallestVersion.extend(["0"] * (3 - len(smallestVersion)))
+
+        major, medium, minor = (int(item) for item in smallestVersion[:3])
+        return major * 1000 + medium * 10 + minor
diff --git a/src/python/WMCore/BossAir/Plugins/SimpleCondorPlugin.py b/src/python/WMCore/BossAir/Plugins/SimpleCondorPlugin.py
@@ -566,8 +566,11 @@ def getJobParameters(self, jobList):
             if job.get('gpuRequirements', None):
                 ad['My.GPUMemoryMB'] = str(job['gpuRequirements']['GPUMemoryMB'])
                 cudaCapabilities = ','.join(sorted(job['gpuRequirements']['CUDACapabilities']))
-                ad['My.CUDACapability'] = classad.quote(str(cudaCapabilities))
-                ad['My.CUDARuntime'] = classad.quote(job['gpuRequirements']['CUDARuntime'])
+                minimalCapability = self.cudaCapabilityToSingleVersion(job['gpuRequirements']['CUDACapabilities'])
+                ad['My.CUDACapability'] = classad.quote(str(minimalCapability))
+                ad['My.OriginalCUDACapability'] = classad.quote(str(cudaCapabilities))
+                cudaRuntime = ','.join(sorted(job['gpuRequirements']['CUDARuntime']))
+                ad['My.CUDARuntime'] = classad.quote(str(cudaRuntime))
             else:
                 ad['My.GPUMemoryMB'] = undefined
                 ad['My.CUDACapability'] = undefined

diff --git a/src/python/WMCore/WMSpec/WMTask.py b/src/python/WMCore/WMSpec/WMTask.py
@@ -1525,16 +1525,31 @@ def getRequiresGPU(self):
     def getGPURequirements(self):
         """
         Return the GPU requirements for this task.
-        If it's a multi-step task, the first step with a meaningful
-        dictionary value will be returned
+        For multi-step tasks, the following logic is applied:
+          * GPUMemoryMB: return the max of them
+          * CUDARuntime: returns a sorted list of unique runtime versions
+          * CUDACapabilities: returns a sorted list of unique capabilities
         :return: a dictionary with the GPU requirements for this task
         """
-        gpuRequirements = {}
+        gpuRequirements = []
         for stepName in sorted(self.listAllStepNames()):
             stepHelper = self.getStep(stepName)
             if stepHelper.stepType() == "CMSSW" and stepHelper.getGPURequirements():
-                return stepHelper.getGPURequirements()
-        return gpuRequirements
+                gpuRequirements.append(stepHelper.getGPURequirements())
+        if not gpuRequirements:
+            return {}
+
+        # in this case, it requires GPUs and it can be multi-steps GPU
+        bestGPUParams = {"GPUMemoryMB": 0, "CUDARuntime": [], "CUDACapabilities": []}
+        for params in gpuRequirements:
+            if params["GPUMemoryMB"] > bestGPUParams["GPUMemoryMB"]:
+                bestGPUParams["GPUMemoryMB"] = params["GPUMemoryMB"]
+            bestGPUParams["CUDARuntime"].append(params["CUDARuntime"])
+            bestGPUParams["CUDACapabilities"].extend(params["CUDACapabilities"])
+        # unique and string-sorted: bare set() ordering of strings varies between runs
+        bestGPUParams["CUDARuntime"] = sorted(set(bestGPUParams["CUDARuntime"]))
+        bestGPUParams["CUDACapabilities"] = sorted(set(bestGPUParams["CUDACapabilities"]))
+        return bestGPUParams
 
     def _getStepValue(self, keyDict, defaultValue):
         """

diff --git a/test/python/Utils_t/Utilities_t.py b/test/python/Utils_t/Utilities_t.py
@@ -8,7 +8,7 @@
 
 from Utils.Utilities import makeList, makeNonEmptyList, strToBool, \
     safeStr, rootUrlJoin, zipEncodeStr, lowerCmsHeaders, getSize, \
-    encodeUnicodeToBytes, diskUse, numberCouchProcess
+    encodeUnicodeToBytes, diskUse, numberCouchProcess, orderVersionList
 
 
 class UtilitiesTests(unittest.TestCase):
@@ -180,6 +180,16 @@ def testNumberCouchProcess(self):
         # there should be at least one process, but who knows...
         self.assertTrue(data >= 0)
 
+    def testOrderVersionList(self):
+        """
+        Check that `orderVersionList` returns the versions in ascending order.
+        """
+        unsorted = ["2.3.1", "1.2.3", "3.2.1", "1.3.2", "1.2"]
+        expected = ["1.2", "1.2.3", "1.3.2", "2.3.1", "3.2.1"]
+        # sanity check: the input must not already be in the expected order
+        self.assertNotEqual(unsorted, expected)
+        self.assertListEqual(orderVersionList(unsorted), expected)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/test/python/WMCore_t/BossAir_t/BasePlugin_t.py b/test/python/WMCore_t/BossAir_t/BasePlugin_t.py
@@ -78,5 +78,26 @@ def testScramArchtoRequiredArch(self):
 
         return
 
+    def testCudaCapabilityToSingleVersion(self):
+        """
+        Test conversion of a list of version strings to a single integer version,
+        i.e. 1000 * major + 10 * medium + minor of the smallest version listed
+        """
+        plugin = BasePlugin(config=None)
+
+        # empty or missing input yields the default value
+        for badInput in ([], {}, None):
+            self.assertEqual(plugin.cudaCapabilityToSingleVersion(badInput), 0)
+
+        # the smallest version wins, regardless of the input order
+        scenarios = [(["2.3.1", "1.2.3", "3.2.1", "1.3.2", "1.2"], 1020),
+                     (["1.2", "1.2.3", "1.3.2", "2.3.1", "3.2.1"], 1020),
+                     (["1.2.3", "1.3.2", "2.3.1", "3.2.1"], 1023),
+                     (["2.3.1", "3.2.1"], 2031)]
+        for versions, expected in scenarios:
+            self.assertEqual(plugin.cudaCapabilityToSingleVersion(versions), expected)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/test/python/WMCore_t/WMSpec_t/StdSpecs_t/StepChain_t.py b/test/python/WMCore_t/WMSpec_t/StdSpecs_t/StepChain_t.py
@@ -2489,9 +2489,10 @@ def testGPUStepChainsTasks(self):
             testArguments[s]['ConfigCacheID'] = configDocs[s]
         testArguments['Step2']['KeepOutput'] = False
 
-        gpuParams = {"GPUMemoryMB": 1234, "CUDARuntime": "11.2.3", "CUDACapabilities": ["7.5", "8.0"]}
-        testArguments['Step1'].update({"RequiresGPU": "optional", "GPUParams": json.dumps(gpuParams)})
-        testArguments['Step2'].update({"RequiresGPU": "required", "GPUParams": json.dumps(gpuParams)})
+        gpuParams1 = {"GPUMemoryMB": 1234, "CUDARuntime": "11.2.3", "CUDACapabilities": ["7.5", "8.0"]}
+        testArguments['Step1'].update({"RequiresGPU": "optional", "GPUParams": json.dumps(gpuParams1)})
+        gpuParams2 = {"GPUMemoryMB": 2345, "CUDARuntime": "9.6", "CUDACapabilities": ["7.4"]}
+        testArguments['Step2'].update({"RequiresGPU": "required", "GPUParams": json.dumps(gpuParams2)})
         factory = StepChainWorkloadFactory()
         testWorkload = factory.factoryWorkloadConstruction("TestWorkload", testArguments)
 
@@ -2503,8 +2504,8 @@ def testGPUStepChainsTasks(self):
 
         # validate GPU parameters
         self.assertEqual(testArguments['GPUParams'], json.dumps(None))
-        self.assertEqual(testArguments["Step1"]['GPUParams'], json.dumps(gpuParams))
-        self.assertEqual(testArguments["Step2"]['GPUParams'], json.dumps(gpuParams))
+        self.assertEqual(testArguments["Step1"]['GPUParams'], json.dumps(gpuParams1))
+        self.assertEqual(testArguments["Step2"]['GPUParams'], json.dumps(gpuParams2))
         self.assertTrue("GPUParams" not in testArguments["Step3"])
 
         for taskName in testWorkload.listAllTaskNames():
@@ -2520,10 +2521,10 @@ def testGPUStepChainsTasks(self):
                 elif stepHelper.stepType() == "CMSSW" and taskName == "GENSIM":
                     if stepHelper.name() == "cmsRun1":
                         self.assertEqual(stepHelper.data.application.gpu.gpuRequired, testArguments["Step1"]['RequiresGPU'])
-                        self.assertItemsEqual(stepHelper.data.application.gpu.gpuRequirements, gpuParams)
+                        self.assertItemsEqual(stepHelper.data.application.gpu.gpuRequirements, gpuParams1)
                     elif stepHelper.name() == "cmsRun2":
                         self.assertEqual(stepHelper.data.application.gpu.gpuRequired, testArguments["Step2"]['RequiresGPU'])
-                        self.assertItemsEqual(stepHelper.data.application.gpu.gpuRequirements, gpuParams)
+                        self.assertItemsEqual(stepHelper.data.application.gpu.gpuRequirements, gpuParams2)
                     elif stepHelper.name() == "cmsRun3":
                         self.assertEqual(stepHelper.data.application.gpu.gpuRequired, "forbidden")
                         self.assertIsNone(stepHelper.data.application.gpu.gpuRequirements)
@@ -2535,18 +2536,17 @@ def testGPUStepChainsTasks(self):
         prodTask = testWorkload.getTask('GENSIM')
         gpuRequired, gpuRequirements = prodTask.getStepHelper('cmsRun1').getGPUSettings()
         self.assertEqual(gpuRequired, testArguments["Step1"]['RequiresGPU'])
-        self.assertItemsEqual(gpuRequirements, gpuParams)
+        self.assertItemsEqual(gpuRequirements, gpuParams1)
 
         gpuRequired, gpuRequirements = prodTask.getStepHelper('cmsRun2').getGPUSettings()
         self.assertEqual(gpuRequired, testArguments["Step2"]['RequiresGPU'])
-        self.assertItemsEqual(gpuRequirements, gpuParams)
+        self.assertItemsEqual(gpuRequirements, gpuParams2)
 
         gpuRequired, gpuRequirements = prodTask.getStepHelper('cmsRun3').getGPUSettings()
         self.assertEqual(gpuRequired, testArguments["Step3"].get('RequiresGPU', "forbidden"))
         self.assertIsNone(gpuRequirements)
 
 
-
         # test assignment with wrong Trust flags
         assignDict = {"SiteWhitelist": ["T2_US_Nebraska"], "Team": "The-A-Team",
                       "RequestStatus": "assigned"}
@@ -2560,18 +2560,18 @@ def testGPUStepChainsTasks(self):
 
         # validate GPU parameters
         self.assertEqual(testArguments['GPUParams'], json.dumps(None))
-        self.assertEqual(testArguments["Step1"]['GPUParams'], json.dumps(gpuParams))
-        self.assertEqual(testArguments["Step2"]['GPUParams'], json.dumps(gpuParams))
+        self.assertEqual(testArguments["Step1"]['GPUParams'], json.dumps(gpuParams1))
+        self.assertEqual(testArguments["Step2"]['GPUParams'], json.dumps(gpuParams2))
         self.assertTrue("GPUParams" not in testArguments["Step3"])
 
         prodTask = testWorkload.getTask('GENSIM')
         gpuRequired, gpuRequirements = prodTask.getStepHelper('cmsRun1').getGPUSettings()
         self.assertEqual(gpuRequired, testArguments["Step1"]['RequiresGPU'])
-        self.assertItemsEqual(gpuRequirements, gpuParams)
+        self.assertItemsEqual(gpuRequirements, gpuParams1)
 
         gpuRequired, gpuRequirements = prodTask.getStepHelper('cmsRun2').getGPUSettings()
         self.assertEqual(gpuRequired, testArguments["Step2"]['RequiresGPU'])
-        self.assertItemsEqual(gpuRequirements, gpuParams)
+        self.assertItemsEqual(gpuRequirements, gpuParams2)
 
         gpuRequired, gpuRequirements = prodTask.getStepHelper('cmsRun3').getGPUSettings()
         self.assertEqual(gpuRequired, testArguments["Step3"].get('RequiresGPU', "forbidden"))

diff --git a/test/python/WMCore_t/WMSpec_t/WMTask_t.py b/test/python/WMCore_t/WMSpec_t/WMTask_t.py
@@ -858,6 +858,8 @@ def testGPUTaskSettings(self):
         ### Now set a single value for both tasks
         gpuParams = {"GPUMemoryMB": 1234, "CUDARuntime": "11.2.3", "CUDACapabilities": ["7.5", "8.0"]}
         task1.setTaskGPUSettings("required", json.dumps(gpuParams))
+        # CUDARuntime returns as a list
+        gpuParams["CUDARuntime"] = [gpuParams["CUDARuntime"]]
         for taskObj in task1.taskIterator():
             # task level check
             self.assertEqual(taskObj.getRequiresGPU(), "required")