From a4da73dc43c25800b944b43695df23878f9634c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9A=D0=B8=D1=80=D0=B8=D0=BB=D0=BB=20=D0=93=D0=B0=D1=80?= =?UTF-8?q?=D0=B1=D0=B0=D1=80?= Date: Sat, 1 Jun 2024 19:18:16 +0300 Subject: [PATCH 1/6] Count atomic --- src/GraphBLAS-sharp.Backend/Common/ClArray.fs | 36 ++++++++++++++----- src/GraphBLAS-sharp.Backend/Common/Utils.fs | 4 +++ 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/src/GraphBLAS-sharp.Backend/Common/ClArray.fs b/src/GraphBLAS-sharp.Backend/Common/ClArray.fs index 9c21781e..7c3d5594 100644 --- a/src/GraphBLAS-sharp.Backend/Common/ClArray.fs +++ b/src/GraphBLAS-sharp.Backend/Common/ClArray.fs @@ -902,22 +902,40 @@ module ClArray = let count<'a> (predicate: Expr<'a -> bool>) (clContext: ClContext) workGroupSize = - let sum = - Reduce.reduce <@ (+) @> clContext workGroupSize + let count = + <@ fun (ndRange: Range1D) (length: int) (array: ClArray<'a>) (count: ClCell) -> + let gid = ndRange.GlobalID0 + let mutable countLocal = 0 + let gSize = ndRange.GlobalWorkSize - let getBitmap = - Map.map<'a, int> (Map.predicateBitmap predicate) clContext workGroupSize + let mutable i = gid + + while i < length do + let res = (%predicate) array.[i] + if res then countLocal <- countLocal + 1 + i <- i + gSize + + atomic (+) count.Value countLocal |> ignore @> + + let count = clContext.Compile count fun (processor: RawCommandQueue) (array: ClArray<'a>) -> - let bitmap = getBitmap processor DeviceOnly array + let result = clContext.CreateClCell(0) - let result = - (sum processor bitmap).ToHostAndFree processor + let numberOfGroups = + Utils.divUpClamp array.Length workGroupSize 1 1024 - bitmap.Free() + let ndRange = + Range1D.CreateValid(workGroupSize * numberOfGroups, workGroupSize) - result + let kernel = count.GetKernel() + + kernel.KernelFunc ndRange array.Length array result + + processor.RunKernel kernel + + result.ToHostAndFree processor /// /// Builds a new array whose elements are the results of applying the given function diff --git a/src/GraphBLAS-sharp.Backend/Common/Utils.fs b/src/GraphBLAS-sharp.Backend/Common/Utils.fs index 3ef10555..ef4c3371 100644 --- a/src/GraphBLAS-sharp.Backend/Common/Utils.fs +++ b/src/GraphBLAS-sharp.Backend/Common/Utils.fs @@ -19,6 +19,10 @@ module internal Utils = >> fun x -> x ||| (x >>> 16) >> fun x -> x + 1 + let divUp x y = x / y + (if x % y = 0 then 0 else 1) + + let divUpClamp x y left right = min (max (divUp x y) left) right + let floorToMultiple multiple x = x / multiple * multiple let ceilToMultiple multiple x = ((x - 1) / multiple + 1) * multiple From b6e13fd5af1cbc60f118b4053fdda032c45422dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9A=D0=B8=D1=80=D0=B8=D0=BB=D0=BB=20=D0=93=D0=B0=D1=80?= =?UTF-8?q?=D0=B1=D0=B0=D1=80?= Date: Sat, 1 Jun 2024 19:54:17 +0300 Subject: [PATCH 2/6] SpMSpVMasked --- src/GraphBLAS-sharp.Backend/Algorithms/BFS.fs | 34 +-- src/GraphBLAS-sharp.Backend/Common/ClArray.fs | 4 +- .../Operations/Operations.fs | 46 +++- .../Operations/SpMSpV.fs | 215 ++++++++++++++++++ .../Vector/Dense/Vector.fs | 56 ----- 5 files changed, 274 insertions(+), 81 deletions(-) diff --git a/src/GraphBLAS-sharp.Backend/Algorithms/BFS.fs b/src/GraphBLAS-sharp.Backend/Algorithms/BFS.fs index 2c2e3011..f5387bb0 100644 --- a/src/GraphBLAS-sharp.Backend/Algorithms/BFS.fs +++ b/src/GraphBLAS-sharp.Backend/Algorithms/BFS.fs @@ -135,7 +135,7 @@ module internal BFS = Operations.SpMVInPlace add mul clContext workGroupSize let spMSpV = - Operations.SpMSpVBool add mul clContext workGroupSize + Operations.SpMSpVMaskedBool add mul clContext workGroupSize let zeroCreate = Vector.zeroCreate clContext workGroupSize @@ -145,9 +145,6 @@ module internal BFS = let maskComplementedInPlace = Vector.map2InPlace Mask.complementedOp clContext workGroupSize - let maskComplemented = - Vector.map2Sparse Mask.complementedOp clContext workGroupSize - let fillSubVectorInPlace = Vector.assignByMaskInPlace (Mask.assign) clContext workGroupSize @@ -190,28 +187,21 @@ module internal BFS = match frontier with | ClVector.Sparse _ -> //Getting new frontier - match spMSpV queue matrix frontier with + match spMSpV queue matrix frontier levels with | None -> frontier.Dispose() stop <- true - | Some newFrontier -> + | Some newMaskedFrontier -> frontier.Dispose() - //Filtering visited vertices - match maskComplemented queue DeviceOnly newFrontier levels with - | None -> - stop <- true - newFrontier.Dispose() - | Some newMaskedFrontier -> - newFrontier.Dispose() - - //Push/pull - let NNZ = getNNZ queue newMaskedFrontier - - if (push NNZ newMaskedFrontier.Size) then - frontier <- newMaskedFrontier - else - frontier <- toDense queue DeviceOnly newMaskedFrontier - newMaskedFrontier.Dispose() + + //Push/pull + let NNZ = getNNZ queue newMaskedFrontier + + if (push NNZ newMaskedFrontier.Size) then + frontier <- newMaskedFrontier + else + frontier <- toDense queue DeviceOnly newMaskedFrontier + newMaskedFrontier.Dispose() | ClVector.Dense oldFrontier -> //Getting new frontier spMVInPlace queue matrix frontier frontier diff --git a/src/GraphBLAS-sharp.Backend/Common/ClArray.fs b/src/GraphBLAS-sharp.Backend/Common/ClArray.fs index 7c3d5594..d48863ae 100644 --- a/src/GraphBLAS-sharp.Backend/Common/ClArray.fs +++ b/src/GraphBLAS-sharp.Backend/Common/ClArray.fs @@ -906,14 +906,14 @@ module ClArray = <@ fun (ndRange: Range1D) (length: int) (array: ClArray<'a>) (count: ClCell) -> let gid = ndRange.GlobalID0 let mutable countLocal = 0 - let gSize = ndRange.GlobalWorkSize + let step = ndRange.GlobalWorkSize let mutable i = gid while i < length do let res = (%predicate) array.[i] if res then countLocal <- countLocal + 1 - i <- i + gSize + i <- i + step atomic (+) count.Value countLocal |> ignore @> diff --git a/src/GraphBLAS-sharp.Backend/Operations/Operations.fs b/src/GraphBLAS-sharp.Backend/Operations/Operations.fs index ff06f1d5..bc0f2c1d 100644 --- a/src/GraphBLAS-sharp.Backend/Operations/Operations.fs +++ b/src/GraphBLAS-sharp.Backend/Operations/Operations.fs @@ -331,7 +331,7 @@ module Operations = | _ -> failwith "Not implemented yet" /// - /// CSR Matrix - sparse vector multiplication. Optimized for bool OR and AND operations. + /// CSR Matrix - sparse vector multiplication. Optimized for bool OR and AND operations by skipping reduction stage. /// /// Type of binary function to reduce entries. /// Type of binary function to combine entries. @@ -352,6 +352,50 @@ module Operations = | ClMatrix.CSR m, ClVector.Sparse v -> Option.map ClVector.Sparse (run queue m v) | _ -> failwith "Not implemented yet" + /// + /// CSR Matrix - sparse vector multiplication with mask. Mask is complemented. + /// + /// Type of binary function to reduce entries. + /// Type of binary function to combine entries. + /// OpenCL context. + /// Should be a power of 2 and greater than 1. + let SpMSpVMasked + (add: Expr<'c option -> 'c option -> 'c option>) + (mul: Expr<'a option -> 'b option -> 'c option>) + (clContext: ClContext) + workGroupSize + = + + let run = + SpMSpV.Masked.runMasked add mul clContext workGroupSize + + fun (queue: RawCommandQueue) (matrix: ClMatrix<'a>) (vector: ClVector<'b>) (mask: ClVector<'d>) -> + match matrix, vector, mask with + | ClMatrix.CSR m, ClVector.Sparse v, ClVector.Dense mask -> Option.map ClVector.Sparse (run queue m v mask) + | _ -> failwith "Not implemented yet" + + /// + /// CSR Matrix - sparse vector multiplication with mask. Mask is complemented. Optimized for bool OR and AND operations by skipping reduction stage. + /// + /// Type of binary function to reduce entries. + /// Type of binary function to combine entries. + /// OpenCL context. + /// Should be a power of 2 and greater than 1. + let SpMSpVMaskedBool + (add: Expr bool option -> bool option>) + (mul: Expr bool option -> bool option>) + (clContext: ClContext) + workGroupSize + = + + let run = + SpMSpV.Masked.runMaskedBoolStandard add mul clContext workGroupSize + + fun (queue: RawCommandQueue) (matrix: ClMatrix<'a>) (vector: ClVector<'b>) (mask: ClVector<'d>) -> + match matrix, vector, mask with + | ClMatrix.CSR m, ClVector.Sparse v, ClVector.Dense mask -> Option.map ClVector.Sparse (run queue m v mask) + | _ -> failwith "Not implemented yet" + /// /// CSR Matrix - sparse vector multiplication. /// diff --git a/src/GraphBLAS-sharp.Backend/Operations/SpMSpV.fs b/src/GraphBLAS-sharp.Backend/Operations/SpMSpV.fs index e4f61fea..3f6f0908 100644 --- a/src/GraphBLAS-sharp.Backend/Operations/SpMSpV.fs +++ b/src/GraphBLAS-sharp.Backend/Operations/SpMSpV.fs @@ -290,3 +290,218 @@ module SpMSpV = Indices = resultIndices Values = create queue DeviceOnly resultIndices.Length true Size = matrix.ColumnCount }) + + module Masked = + + let private count (clContext: ClContext) workGroupSize = + + let count = + <@ fun (ndRange: Range1D) vectorLength (vectorIndices: ClArray) (vectorMask: ClArray<'d option>) (matrixRowPointers: ClArray) (matrixColumns: ClArray) (result: ClCell) -> + let gid = ndRange.GlobalID0 + let step = ndRange.GlobalWorkSize + + let mutable idx = gid + + while idx < vectorLength do + let vectorIndex = vectorIndices.[idx] + + let rowStart = matrixRowPointers.[vectorIndex] + let rowEnd = matrixRowPointers.[vectorIndex + 1] + + let mutable count = 0 + + for i in rowStart .. rowEnd - 1 do + match vectorMask.[matrixColumns.[i]] with + | None -> count <- count + 1 + | Some _ -> () + + atomic (+) result.Value count |> ignore + + idx <- idx + step @> + + let count = clContext.Compile count + + fun (queue: RawCommandQueue) (matrix: ClMatrix.CSR<'a>) (vector: ClVector.Sparse<'b>) (vectorMask: ClArray<'d option>) -> + + let length = vector.NNZ + + let numberOfGroups = + Utils.divUpClamp length workGroupSize 1 1024 + + let result = clContext.CreateClCell(0) + + let ndRange = + Range1D.CreateValid(numberOfGroups * workGroupSize, workGroupSize) + + let count = count.GetKernel() + + count.KernelFunc ndRange length vector.Indices vectorMask matrix.RowPointers matrix.Columns result + + queue.RunKernel count + + result.ToHostAndFree queue + + let private multiplyValues + (clContext: ClContext) + (mul: Expr<'a option -> 'b option -> 'c option>) + workGroupSize + = + + let multiply = + <@ fun (ndRange: Range1D) resultLength (vectorIndices: ClArray) (vectorValues: ClArray<'b>) (vectorMask: ClArray<'d option>) (matrixRowPointers: ClArray) (matrixColumns: ClArray) (matrixValues: ClArray<'a>) (resultOffset: ClCell) (resultIndices: ClArray) (resultValues: ClArray<'c option>) -> + let gid = ndRange.GlobalID0 + let step = ndRange.GlobalWorkSize + + let mutable i = gid + + while i < resultLength do + let vectorIndex = vectorIndices.[i] + let vectorValue = vectorValues.[i] + + let rowStart = matrixRowPointers.[vectorIndex] + let rowEnd = matrixRowPointers.[vectorIndex + 1] + + let mutable count = 0 + + for i in rowStart .. rowEnd - 1 do + match vectorMask.[matrixColumns.[i]] with + | None -> count <- count + 1 + | Some _ -> () + + let mutable offset = atomic (+) resultOffset.Value count + + for i in rowStart .. rowEnd - 1 do + let columnIndex = matrixColumns.[i] + + // TODO: Pass mask operation + match vectorMask.[columnIndex] with + | None -> + resultIndices.[offset] <- columnIndex + resultValues.[offset] <- (%mul) (Some matrixValues.[i]) (Some vectorValue) + offset <- offset + 1 + | Some _ -> () + + i <- i + step @> + + let kernel = clContext.Compile multiply + + fun (queue: RawCommandQueue) (matrix: ClMatrix.CSR<'a>) (vector: ClVector.Sparse<'b>) (vectorMask: ClArray<'d option>) (resultSize: int) -> + + let multipliedIndices = + clContext.CreateClArrayWithSpecificAllocationMode(DeviceOnly, resultSize) + + let multipliedValues = + clContext.CreateClArrayWithSpecificAllocationMode<'c option>(DeviceOnly, resultSize) + + let offset = clContext.CreateClCell 0 + + let numberOfGroups = + Utils.divUpClamp vector.NNZ workGroupSize 1 1024 + + let ndRange = + Range1D.CreateValid(numberOfGroups * workGroupSize, workGroupSize) + + let kernel = kernel.GetKernel() + + kernel.KernelFunc + ndRange + vector.NNZ + vector.Indices + vector.Values + vectorMask + matrix.RowPointers + matrix.Columns + matrix.Values + offset + multipliedIndices + multipliedValues + + queue.RunKernel kernel + + offset.Free() + + multipliedIndices, multipliedValues + + let runMasked + (add: Expr<'c option -> 'c option -> 'c option>) + (mul: Expr<'a option -> 'b option -> 'c option>) + (clContext: ClContext) + workGroupSize + = + + let count = count clContext workGroupSize + + let multiplyValues = + multiplyValues clContext mul workGroupSize + + let sort = + Sort.Bitonic.sortKeyValuesInplace clContext workGroupSize + + let segReduce = + Reduce.ByKey.Option.segmentSequential add clContext workGroupSize + + fun (queue: RawCommandQueue) (matrix: ClMatrix.CSR<'a>) (vector: ClVector.Sparse<'b>) (mask: ClArray<'d option>) -> + + match count queue matrix vector mask with + | 0 -> None + | resultSize -> + let multipliedIndices, multipliedValues = + multiplyValues queue matrix vector mask resultSize + + sort queue multipliedIndices multipliedValues + + let result = + segReduce queue DeviceOnly multipliedIndices multipliedValues + |> Option.map + (fun (reducedValues, reducedKeys) -> + { Context = clContext + Indices = reducedKeys + Values = reducedValues + Size = matrix.ColumnCount }) + + multipliedIndices.Free() + multipliedValues.Free() + + result + + let runMaskedBoolStandard + (add: Expr<'c option -> 'c option -> 'c option>) + (mul: Expr<'a option -> 'b option -> 'c option>) + (clContext: ClContext) + workGroupSize + = + + let count = count clContext workGroupSize + + let multiplyValues = + multiplyValues clContext mul workGroupSize + + let sort = + Sort.Bitonic.sortKeyValuesInplace clContext workGroupSize + + let removeDuplicates = + GraphBLAS.FSharp.ClArray.removeDuplications clContext workGroupSize + + let create = + GraphBLAS.FSharp.ClArray.create clContext workGroupSize + + fun (queue: RawCommandQueue) (matrix: ClMatrix.CSR<'a>) (vector: ClVector.Sparse<'b>) (mask: ClArray<'d option>) -> + + match count queue matrix vector mask with + | 0 -> None + | resultSize -> + let multipliedIndices, multipliedValues = + multiplyValues queue matrix vector mask resultSize + + sort queue multipliedIndices multipliedValues + + let resultIndices = removeDuplicates queue multipliedIndices + + multipliedIndices.Free() + multipliedValues.Free() + + Some + <| { Context = clContext + Indices = resultIndices + Values = create queue DeviceOnly resultIndices.Length true + Size = matrix.ColumnCount } diff --git a/src/GraphBLAS-sharp.Backend/Vector/Dense/Vector.fs b/src/GraphBLAS-sharp.Backend/Vector/Dense/Vector.fs index 36a74c67..36dc6835 100644 --- a/src/GraphBLAS-sharp.Backend/Vector/Dense/Vector.fs +++ b/src/GraphBLAS-sharp.Backend/Vector/Dense/Vector.fs @@ -142,62 +142,6 @@ module Vector = let toSparse<'a when 'a: struct> (clContext: ClContext) workGroupSize = - let scatterValues = - Common.Scatter.lastOccurrence clContext workGroupSize - - let scatterIndices = - Common.Scatter.lastOccurrence clContext workGroupSize - - let getBitmap = - Map.map (Map.option 1 0) clContext workGroupSize - - let prefixSum = - Common.PrefixSum.standardExcludeInPlace clContext workGroupSize - - let allIndices = - ClArray.init Map.id clContext workGroupSize - - let allValues = - Map.map (Map.optionToValueOrZero Unchecked.defaultof<'a>) clContext workGroupSize - - fun (processor: RawCommandQueue) allocationMode (vector: ClArray<'a option>) -> - - let positions = getBitmap processor DeviceOnly vector - - let resultLength = - (prefixSum processor positions) - .ToHostAndFree(processor) - - // compute result indices - let resultIndices = - clContext.CreateClArrayWithSpecificAllocationMode(allocationMode, resultLength) - - let allIndices = - allIndices processor DeviceOnly vector.Length - - scatterIndices processor positions allIndices resultIndices - - allIndices.Free() - - // compute result values - let resultValues = - clContext.CreateClArrayWithSpecificAllocationMode<'a>(allocationMode, resultLength) - - let allValues = allValues processor DeviceOnly vector - - scatterValues processor positions allValues resultValues - - allValues.Free() - - positions.Free() - - { Context = clContext - Indices = resultIndices - Values = resultValues - Size = vector.Length } - - let toSparse2<'a when 'a: struct> (clContext: ClContext) workGroupSize = - let kernel = <@ fun (ndRange: Range1D) (inputLength: int) (inputValues: ClArray<'a option>) (resultSize: ClCell) (resultIndices: ClArray) (resultValues: ClArray<'a>) -> From 83ab36702fa2cb504f942a847e1f8544deb080ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9A=D0=B8=D1=80=D0=B8=D0=BB=D0=BB=20=D0=93=D0=B0=D1=80?= =?UTF-8?q?=D0=B1=D0=B0=D1=80?= Date: Sat, 1 Jun 2024 20:08:01 +0300 Subject: [PATCH 3/6] Small fixes --- src/GraphBLAS-sharp.Backend/Common/ClArray.fs | 2 +- src/GraphBLAS-sharp.Backend/Common/Sort/Bitonic.fs | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/GraphBLAS-sharp.Backend/Common/ClArray.fs b/src/GraphBLAS-sharp.Backend/Common/ClArray.fs index d48863ae..bc324850 100644 --- a/src/GraphBLAS-sharp.Backend/Common/ClArray.fs +++ b/src/GraphBLAS-sharp.Backend/Common/ClArray.fs @@ -362,7 +362,7 @@ module ClArray = let gid = ndRange.GlobalID0 - if gid < length then + if gid < length && not result.Value then let isExist = (%predicate) vector.[gid] if isExist then result.Value <- true @> diff --git a/src/GraphBLAS-sharp.Backend/Common/Sort/Bitonic.fs b/src/GraphBLAS-sharp.Backend/Common/Sort/Bitonic.fs index 724de235..c12538b2 100644 --- a/src/GraphBLAS-sharp.Backend/Common/Sort/Bitonic.fs +++ b/src/GraphBLAS-sharp.Backend/Common/Sort/Bitonic.fs @@ -257,6 +257,7 @@ module Bitonic = int (clContext.ClDevice.LocalMemSize) / (sizeof + sizeof<'a>) ) + / 2 let maxThreadsPerBlock = min (clContext.ClDevice.MaxWorkGroupSize) (localSize / 2) @@ -476,4 +477,4 @@ module Bitonic = kernelGlobal.KernelFunc ndRangeGlobal rows values values.Length (localSize * 2) - queue.RunKernel(kernelGlobal) \ No newline at end of file + queue.RunKernel(kernelGlobal) From fc0acd18d0cbba20344be9c1b28be15cc56e98b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9A=D0=B8=D1=80=D0=B8=D0=BB=D0=BB=20=D0=93=D0=B0=D1=80?= =?UTF-8?q?=D0=B1=D0=B0=D1=80?= Date: Sat, 1 Jun 2024 20:14:06 +0300 Subject: [PATCH 4/6] Small fixes 2 --- src/GraphBLAS-sharp.Backend/Common/Sort/Bitonic.fs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/GraphBLAS-sharp.Backend/Common/Sort/Bitonic.fs b/src/GraphBLAS-sharp.Backend/Common/Sort/Bitonic.fs index c12538b2..bf3572b7 100644 --- a/src/GraphBLAS-sharp.Backend/Common/Sort/Bitonic.fs +++ b/src/GraphBLAS-sharp.Backend/Common/Sort/Bitonic.fs @@ -10,8 +10,9 @@ module Bitonic = let localSize = Common.Utils.floorToPower2 ( int (clContext.ClDevice.LocalMemSize) - / (sizeof + sizeof<'a>) + / (sizeof + sizeof<'a>) ) + / 2 let maxThreadsPerBlock = min (clContext.ClDevice.MaxWorkGroupSize) (localSize / 2) From 050df59fecc500a1de5c8115def8dc9c6e310393 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9A=D0=B8=D1=80=D0=B8=D0=BB=D0=BB=20=D0=93=D0=B0=D1=80?= =?UTF-8?q?=D0=B1=D0=B0=D1=80?= Date: Sun, 2 Jun 2024 19:31:41 +0300 Subject: [PATCH 5/6] toSparse unsorted and tests --- src/GraphBLAS-sharp.Backend/Algorithms/BFS.fs | 3 +- .../Vector/Dense/Vector.fs | 56 +++++++++++++++++++ src/GraphBLAS-sharp.Backend/Vector/Vector.fs | 20 +++++++ .../Backend/Vector/Convert.fs | 34 +++++++++-- 4 files changed, 107 insertions(+), 6 deletions(-) diff --git a/src/GraphBLAS-sharp.Backend/Algorithms/BFS.fs b/src/GraphBLAS-sharp.Backend/Algorithms/BFS.fs index f5387bb0..2b9a3a83 100644 --- a/src/GraphBLAS-sharp.Backend/Algorithms/BFS.fs +++ b/src/GraphBLAS-sharp.Backend/Algorithms/BFS.fs @@ -148,7 +148,8 @@ module internal BFS = let fillSubVectorInPlace = Vector.assignByMaskInPlace (Mask.assign) clContext workGroupSize - let toSparse = Vector.toSparse clContext workGroupSize + let toSparse = + Vector.toSparseUnsorted clContext workGroupSize let toDense = Vector.toDense clContext workGroupSize diff --git a/src/GraphBLAS-sharp.Backend/Vector/Dense/Vector.fs b/src/GraphBLAS-sharp.Backend/Vector/Dense/Vector.fs index 36dc6835..6d910afd 100644 --- a/src/GraphBLAS-sharp.Backend/Vector/Dense/Vector.fs +++ b/src/GraphBLAS-sharp.Backend/Vector/Dense/Vector.fs @@ -140,7 +140,63 @@ module Vector = valueCell.Free() + // TODO: toSparseUnsorted + bitonic probably would work faster let toSparse<'a when 'a: struct> (clContext: ClContext) workGroupSize = + let scatterValues = + Common.Scatter.lastOccurrence clContext workGroupSize + + let scatterIndices = + Common.Scatter.lastOccurrence clContext workGroupSize + + let getBitmap = + Map.map (Map.option 1 0) clContext workGroupSize + + let prefixSum = + Common.PrefixSum.standardExcludeInPlace clContext workGroupSize + + let allIndices = + ClArray.init Map.id clContext workGroupSize + + let allValues = + Map.map (Map.optionToValueOrZero Unchecked.defaultof<'a>) clContext workGroupSize + + fun (processor: RawCommandQueue) allocationMode (vector: ClArray<'a option>) -> + + let positions = getBitmap processor DeviceOnly vector + + let resultLength = + (prefixSum processor positions) + .ToHostAndFree(processor) + + // compute result indices + let resultIndices = + clContext.CreateClArrayWithSpecificAllocationMode(allocationMode, resultLength) + + let allIndices = + allIndices processor DeviceOnly vector.Length + + scatterIndices processor positions allIndices resultIndices + + allIndices.Free() + + // compute result values + let resultValues = + clContext.CreateClArrayWithSpecificAllocationMode<'a>(allocationMode, resultLength) + + let allValues = allValues processor DeviceOnly vector + + scatterValues processor positions allValues resultValues + + allValues.Free() + + positions.Free() + + { Context = clContext + Indices = resultIndices + Values = resultValues + Size = vector.Length } + + let toSparseUnsorted<'a when 'a: struct> (clContext: ClContext) workGroupSize = let kernel = <@ fun (ndRange: Range1D) (inputLength: int) (inputValues: ClArray<'a option>) (resultSize: ClCell) (resultIndices: ClArray) (resultValues: ClArray<'a>) -> diff --git a/src/GraphBLAS-sharp.Backend/Vector/Vector.fs b/src/GraphBLAS-sharp.Backend/Vector/Vector.fs index 0d10dd08..1c9f05c4 100644 --- a/src/GraphBLAS-sharp.Backend/Vector/Vector.fs +++ b/src/GraphBLAS-sharp.Backend/Vector/Vector.fs @@ -129,6 +129,26 @@ module Vector = <| toSparse processor allocationMode vector | ClVector.Sparse _ -> copy processor allocationMode vector + /// + /// Sparsifies the given vector if it is in a dense format. + /// If the given vector is already sparse, copies it. + /// Works faster than regular version, but indices of the sparse vector are unsorted. + /// + /// OpenCL context. + /// Should be a power of 2 and greater than 1. + let toSparseUnsorted (clContext: ClContext) workGroupSize = + let toSparse = + Dense.Vector.toSparseUnsorted clContext workGroupSize + + let copy = copy clContext workGroupSize + + fun (processor: RawCommandQueue) allocationMode (vector: ClVector<'a>) -> + match vector with + | ClVector.Dense vector -> + ClVector.Sparse + <| toSparse processor allocationMode vector + | ClVector.Sparse _ -> copy processor allocationMode vector + /// /// Densifies the given vector if it is in a sparse format. /// If the given vector is already dense, copies it. diff --git a/tests/GraphBLAS-sharp.Tests/Backend/Vector/Convert.fs b/tests/GraphBLAS-sharp.Tests/Backend/Vector/Convert.fs index a755be58..a53691e1 100644 --- a/tests/GraphBLAS-sharp.Tests/Backend/Vector/Convert.fs +++ b/tests/GraphBLAS-sharp.Tests/Backend/Vector/Convert.fs @@ -22,6 +22,7 @@ let wgSize = Constants.Common.defaultWorkGroupSize let makeTest formatFrom (convertFun: RawCommandQueue -> AllocationFlag -> ClVector<'a> -> ClVector<'a>) + (convertFunUnsorted: option AllocationFlag -> ClVector<'a> -> ClVector<'a>>) isZero case (array: 'a []) @@ -37,7 +38,7 @@ let makeTest let actual = let clVector = vector.ToDevice context - let convertedVector = convertFun q HostInterop clVector + let convertedVector = convertFun q DeviceOnly clVector let res = convertedVector.ToHost q @@ -56,6 +57,27 @@ let makeTest Expect.equal actual expected "Vectors must be the same" + match convertFunUnsorted with + | None -> () + | Some convertFunUnsorted -> + let clVector = vector.ToDevice context + let convertedVector = convertFunUnsorted q DeviceOnly clVector + + let res = convertedVector.ToHost q + + match res, expected with + | Vector.Sparse res, Vector.Sparse expected -> + let iv = Array.zip res.Indices res.Values + let resSorted = Array.sortBy (fun (i, v) -> i) iv + let indices, values = Array.unzip resSorted + Expect.equal indices expected.Indices "Indices must be the same" + Expect.equal values expected.Values "Values must be the same" + Expect.equal res.Size expected.Size "Size must be the same" + | _ -> () + + clVector.Dispose() + convertedVector.Dispose() + let testFixtures case = let getCorrectnessTestName datatype formatFrom = sprintf $"Correctness on %s{datatype}, %A{formatFrom} -> %A{case.Format}" @@ -68,19 +90,21 @@ let testFixtures case = match case.Format with | Sparse -> [ let convertFun = Vector.toSparse context wgSize + let convertFunUnsorted = Vector.toSparseUnsorted context wgSize Utils.listOfUnionCases |> List.map (fun formatFrom -> - makeTest formatFrom convertFun ((=) 0) case + makeTest formatFrom convertFun (Some convertFunUnsorted) ((=) 0) case |> testPropertyWithConfig config (getCorrectnessTestName "int" formatFrom)) let convertFun = Vector.toSparse context wgSize + let convertFunUnsorted = Vector.toSparseUnsorted context wgSize Utils.listOfUnionCases |> List.map (fun formatFrom -> - makeTest formatFrom convertFun ((=) false) case + makeTest formatFrom convertFun (Some convertFunUnsorted) ((=) false) case |> testPropertyWithConfig config (getCorrectnessTestName "bool" formatFrom)) ] |> List.concat | Dense -> @@ -89,7 +113,7 @@ let testFixtures case = Utils.listOfUnionCases |> List.map (fun formatFrom -> - makeTest formatFrom convertFun ((=) 0) case + makeTest formatFrom convertFun None ((=) 0) case |> testPropertyWithConfig config (getCorrectnessTestName "int" formatFrom)) let convertFun = Vector.toDense context wgSize @@ -97,7 +121,7 @@ let testFixtures case = Utils.listOfUnionCases |> List.map (fun formatFrom -> - makeTest formatFrom convertFun ((=) false) case + makeTest formatFrom convertFun None ((=) false) case |> testPropertyWithConfig config (getCorrectnessTestName "bool" formatFrom)) ] |> List.concat From 12e733f3e456891b5910e9d18376d4768c04daab Mon Sep 17 00:00:00 2001 From: Kirill <71129570+kirillgarbar@users.noreply.github.com> Date: Sun, 2 Jun 2024 20:31:36 +0300 Subject: [PATCH 6/6] Fix bitonic localSize --- src/GraphBLAS-sharp.Backend/Common/Sort/Bitonic.fs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/GraphBLAS-sharp.Backend/Common/Sort/Bitonic.fs b/src/GraphBLAS-sharp.Backend/Common/Sort/Bitonic.fs index bf3572b7..11b6b62e 100644 --- a/src/GraphBLAS-sharp.Backend/Common/Sort/Bitonic.fs +++ b/src/GraphBLAS-sharp.Backend/Common/Sort/Bitonic.fs @@ -10,7 +10,7 @@ module Bitonic = let localSize = Common.Utils.floorToPower2 ( int (clContext.ClDevice.LocalMemSize) - / (sizeof + sizeof<'a>) + / (sizeof + sizeof<'a>) ) / 2