diff --git a/src/GraphBLAS-sharp.Backend/Algorithms/BFS.fs b/src/GraphBLAS-sharp.Backend/Algorithms/BFS.fs index 2c2e3011..2b9a3a83 100644 --- a/src/GraphBLAS-sharp.Backend/Algorithms/BFS.fs +++ b/src/GraphBLAS-sharp.Backend/Algorithms/BFS.fs @@ -135,7 +135,7 @@ module internal BFS = Operations.SpMVInPlace add mul clContext workGroupSize let spMSpV = - Operations.SpMSpVBool add mul clContext workGroupSize + Operations.SpMSpVMaskedBool add mul clContext workGroupSize let zeroCreate = Vector.zeroCreate clContext workGroupSize @@ -145,13 +145,11 @@ module internal BFS = let maskComplementedInPlace = Vector.map2InPlace Mask.complementedOp clContext workGroupSize - let maskComplemented = - Vector.map2Sparse Mask.complementedOp clContext workGroupSize - let fillSubVectorInPlace = Vector.assignByMaskInPlace (Mask.assign) clContext workGroupSize - let toSparse = Vector.toSparse clContext workGroupSize + let toSparse = + Vector.toSparseUnsorted clContext workGroupSize let toDense = Vector.toDense clContext workGroupSize @@ -190,28 +188,21 @@ module internal BFS = match frontier with | ClVector.Sparse _ -> //Getting new frontier - match spMSpV queue matrix frontier with + match spMSpV queue matrix frontier levels with | None -> frontier.Dispose() stop <- true - | Some newFrontier -> + | Some newMaskedFrontier -> frontier.Dispose() - //Filtering visited vertices - match maskComplemented queue DeviceOnly newFrontier levels with - | None -> - stop <- true - newFrontier.Dispose() - | Some newMaskedFrontier -> - newFrontier.Dispose() - - //Push/pull - let NNZ = getNNZ queue newMaskedFrontier - - if (push NNZ newMaskedFrontier.Size) then - frontier <- newMaskedFrontier - else - frontier <- toDense queue DeviceOnly newMaskedFrontier - newMaskedFrontier.Dispose() + + //Push/pull + let NNZ = getNNZ queue newMaskedFrontier + + if (push NNZ newMaskedFrontier.Size) then + frontier <- newMaskedFrontier + else + frontier <- toDense queue DeviceOnly newMaskedFrontier + newMaskedFrontier.Dispose() | ClVector.Dense oldFrontier -> //Getting new frontier spMVInPlace queue matrix frontier frontier diff --git a/src/GraphBLAS-sharp.Backend/Common/ClArray.fs b/src/GraphBLAS-sharp.Backend/Common/ClArray.fs index 9c21781e..bc324850 100644 --- a/src/GraphBLAS-sharp.Backend/Common/ClArray.fs +++ b/src/GraphBLAS-sharp.Backend/Common/ClArray.fs @@ -362,7 +362,7 @@ module ClArray = let gid = ndRange.GlobalID0 - if gid < length then + if gid < length && not result.Value then let isExist = (%predicate) vector.[gid] if isExist then result.Value <- true @> @@ -902,22 +902,40 @@ module ClArray = let count<'a> (predicate: Expr<'a -> bool>) (clContext: ClContext) workGroupSize = - let sum = - Reduce.reduce <@ (+) @> clContext workGroupSize + let count = + <@ fun (ndRange: Range1D) (length: int) (array: ClArray<'a>) (count: ClCell) -> + let gid = ndRange.GlobalID0 + let mutable countLocal = 0 + let step = ndRange.GlobalWorkSize + + let mutable i = gid - let getBitmap = - Map.map<'a, int> (Map.predicateBitmap predicate) clContext workGroupSize + while i < length do + let res = (%predicate) array.[i] + if res then countLocal <- countLocal + 1 + i <- i + step + + atomic (+) count.Value countLocal |> ignore @> + + let count = clContext.Compile count fun (processor: RawCommandQueue) (array: ClArray<'a>) -> - let bitmap = getBitmap processor DeviceOnly array + let result = clContext.CreateClCell(0) - let result = - (sum processor bitmap).ToHostAndFree processor + let numberOfGroups = + Utils.divUpClamp array.Length workGroupSize 1 1024 - bitmap.Free() + let ndRange = + Range1D.CreateValid(workGroupSize * numberOfGroups, workGroupSize) - result + let kernel = count.GetKernel() + + kernel.KernelFunc ndRange array.Length array result + + processor.RunKernel kernel + + result.ToHostAndFree processor /// /// Builds a new array whose elements are the results of applying the given function diff --git a/src/GraphBLAS-sharp.Backend/Common/Sort/Bitonic.fs b/src/GraphBLAS-sharp.Backend/Common/Sort/Bitonic.fs index 724de235..11b6b62e 100644 --- a/src/GraphBLAS-sharp.Backend/Common/Sort/Bitonic.fs +++ b/src/GraphBLAS-sharp.Backend/Common/Sort/Bitonic.fs @@ -12,6 +12,7 @@ module Bitonic = int (clContext.ClDevice.LocalMemSize) / (sizeof + sizeof<'a>) ) + / 2 let maxThreadsPerBlock = min (clContext.ClDevice.MaxWorkGroupSize) (localSize / 2) @@ -257,6 +258,7 @@ module Bitonic = int (clContext.ClDevice.LocalMemSize) / (sizeof + sizeof<'a>) ) + / 2 let maxThreadsPerBlock = min (clContext.ClDevice.MaxWorkGroupSize) (localSize / 2) @@ -476,4 +478,4 @@ module Bitonic = kernelGlobal.KernelFunc ndRangeGlobal rows values values.Length (localSize * 2) - queue.RunKernel(kernelGlobal) \ No newline at end of file + queue.RunKernel(kernelGlobal) diff --git a/src/GraphBLAS-sharp.Backend/Common/Utils.fs b/src/GraphBLAS-sharp.Backend/Common/Utils.fs index 3ef10555..ef4c3371 100644 --- a/src/GraphBLAS-sharp.Backend/Common/Utils.fs +++ b/src/GraphBLAS-sharp.Backend/Common/Utils.fs @@ -19,6 +19,10 @@ module internal Utils = >> fun x -> x ||| (x >>> 16) >> fun x -> x + 1 + let divUp x y = x / y + (if x % y = 0 then 0 else 1) + + let divUpClamp x y left right = min (max (divUp x y) left) right + let floorToMultiple multiple x = x / multiple * multiple let ceilToMultiple multiple x = ((x - 1) / multiple + 1) * multiple diff --git a/src/GraphBLAS-sharp.Backend/Operations/Operations.fs b/src/GraphBLAS-sharp.Backend/Operations/Operations.fs index ff06f1d5..bc0f2c1d 100644 --- a/src/GraphBLAS-sharp.Backend/Operations/Operations.fs +++ b/src/GraphBLAS-sharp.Backend/Operations/Operations.fs @@ -331,7 +331,7 @@ module Operations = | _ -> failwith "Not implemented yet" /// - /// CSR Matrix - sparse vector multiplication. Optimized for bool OR and AND operations. + /// CSR Matrix - sparse vector multiplication. Optimized for bool OR and AND operations by skipping reduction stage. /// /// Type of binary function to reduce entries. /// Type of binary function to combine entries. @@ -352,6 +352,50 @@ module Operations = | ClMatrix.CSR m, ClVector.Sparse v -> Option.map ClVector.Sparse (run queue m v) | _ -> failwith "Not implemented yet" + /// + /// CSR Matrix - sparse vector multiplication with mask. Mask is complemented. + /// + /// Type of binary function to reduce entries. + /// Type of binary function to combine entries. + /// OpenCL context. + /// Should be a power of 2 and greater than 1. + let SpMSpVMasked + (add: Expr<'c option -> 'c option -> 'c option>) + (mul: Expr<'a option -> 'b option -> 'c option>) + (clContext: ClContext) + workGroupSize + = + + let run = + SpMSpV.Masked.runMasked add mul clContext workGroupSize + + fun (queue: RawCommandQueue) (matrix: ClMatrix<'a>) (vector: ClVector<'b>) (mask: ClVector<'d>) -> + match matrix, vector, mask with + | ClMatrix.CSR m, ClVector.Sparse v, ClVector.Dense mask -> Option.map ClVector.Sparse (run queue m v mask) + | _ -> failwith "Not implemented yet" + + /// + /// CSR Matrix - sparse vector multiplication with mask. Mask is complemented. Optimized for bool OR and AND operations by skipping reduction stage. + /// + /// Type of binary function to reduce entries. + /// Type of binary function to combine entries. + /// OpenCL context. + /// Should be a power of 2 and greater than 1. + let SpMSpVMaskedBool + (add: Expr bool option -> bool option>) + (mul: Expr bool option -> bool option>) + (clContext: ClContext) + workGroupSize + = + + let run = + SpMSpV.Masked.runMaskedBoolStandard add mul clContext workGroupSize + + fun (queue: RawCommandQueue) (matrix: ClMatrix<'a>) (vector: ClVector<'b>) (mask: ClVector<'d>) -> + match matrix, vector, mask with + | ClMatrix.CSR m, ClVector.Sparse v, ClVector.Dense mask -> Option.map ClVector.Sparse (run queue m v mask) + | _ -> failwith "Not implemented yet" + /// /// CSR Matrix - sparse vector multiplication. /// diff --git a/src/GraphBLAS-sharp.Backend/Operations/SpMSpV.fs b/src/GraphBLAS-sharp.Backend/Operations/SpMSpV.fs index e4f61fea..3f6f0908 100644 --- a/src/GraphBLAS-sharp.Backend/Operations/SpMSpV.fs +++ b/src/GraphBLAS-sharp.Backend/Operations/SpMSpV.fs @@ -290,3 +290,218 @@ module SpMSpV = Indices = resultIndices Values = create queue DeviceOnly resultIndices.Length true Size = matrix.ColumnCount }) + + module Masked = + + let private count (clContext: ClContext) workGroupSize = + + let count = + <@ fun (ndRange: Range1D) vectorLength (vectorIndices: ClArray) (vectorMask: ClArray<'d option>) (matrixRowPointers: ClArray) (matrixColumns: ClArray) (result: ClCell) -> + let gid = ndRange.GlobalID0 + let step = ndRange.GlobalWorkSize + + let mutable idx = gid + + while idx < vectorLength do + let vectorIndex = vectorIndices.[idx] + + let rowStart = matrixRowPointers.[vectorIndex] + let rowEnd = matrixRowPointers.[vectorIndex + 1] + + let mutable count = 0 + + for i in rowStart .. rowEnd - 1 do + match vectorMask.[matrixColumns.[i]] with + | None -> count <- count + 1 + | Some _ -> () + + atomic (+) result.Value count |> ignore + + idx <- idx + step @> + + let count = clContext.Compile count + + fun (queue: RawCommandQueue) (matrix: ClMatrix.CSR<'a>) (vector: ClVector.Sparse<'b>) (vectorMask: ClArray<'d option>) -> + + let length = vector.NNZ + + let numberOfGroups = + Utils.divUpClamp length workGroupSize 1 1024 + + let result = clContext.CreateClCell(0) + + let ndRange = + Range1D.CreateValid(numberOfGroups * workGroupSize, workGroupSize) + + let count = count.GetKernel() + + count.KernelFunc ndRange length vector.Indices vectorMask matrix.RowPointers matrix.Columns result + + queue.RunKernel count + + result.ToHostAndFree queue + + let private multiplyValues + (clContext: ClContext) + (mul: Expr<'a option -> 'b option -> 'c option>) + workGroupSize + = + + let multiply = + <@ fun (ndRange: Range1D) resultLength (vectorIndices: ClArray) (vectorValues: ClArray<'b>) (vectorMask: ClArray<'d option>) (matrixRowPointers: ClArray) (matrixColumns: ClArray) (matrixValues: ClArray<'a>) (resultOffset: ClCell) (resultIndices: ClArray) (resultValues: ClArray<'c option>) -> + let gid = ndRange.GlobalID0 + let step = ndRange.GlobalWorkSize + + let mutable i = gid + + while i < resultLength do + let vectorIndex = vectorIndices.[i] + let vectorValue = vectorValues.[i] + + let rowStart = matrixRowPointers.[vectorIndex] + let rowEnd = matrixRowPointers.[vectorIndex + 1] + + let mutable count = 0 + + for i in rowStart .. rowEnd - 1 do + match vectorMask.[matrixColumns.[i]] with + | None -> count <- count + 1 + | Some _ -> () + + let mutable offset = atomic (+) resultOffset.Value count + + for i in rowStart .. rowEnd - 1 do + let columnIndex = matrixColumns.[i] + + // TODO: Pass mask operation + match vectorMask.[columnIndex] with + | None -> + resultIndices.[offset] <- columnIndex + resultValues.[offset] <- (%mul) (Some matrixValues.[i]) (Some vectorValue) + offset <- offset + 1 + | Some _ -> () + + i <- i + step @> + + let kernel = clContext.Compile multiply + + fun (queue: RawCommandQueue) (matrix: ClMatrix.CSR<'a>) (vector: ClVector.Sparse<'b>) (vectorMask: ClArray<'d option>) (resultSize: int) -> + + let multipliedIndices = + clContext.CreateClArrayWithSpecificAllocationMode(DeviceOnly, resultSize) + + let multipliedValues = + clContext.CreateClArrayWithSpecificAllocationMode<'c option>(DeviceOnly, resultSize) + + let offset = clContext.CreateClCell 0 + + let numberOfGroups = + Utils.divUpClamp vector.NNZ workGroupSize 1 1024 + + let ndRange = + Range1D.CreateValid(numberOfGroups * workGroupSize, workGroupSize) + + let kernel = kernel.GetKernel() + + kernel.KernelFunc + ndRange + vector.NNZ + vector.Indices + vector.Values + vectorMask + matrix.RowPointers + matrix.Columns + matrix.Values + offset + multipliedIndices + multipliedValues + + queue.RunKernel kernel + + offset.Free() + + multipliedIndices, multipliedValues + + let runMasked + (add: Expr<'c option -> 'c option -> 'c option>) + (mul: Expr<'a option -> 'b option -> 'c option>) + (clContext: ClContext) + workGroupSize + = + + let count = count clContext workGroupSize + + let multiplyValues = + multiplyValues clContext mul workGroupSize + + let sort = + Sort.Bitonic.sortKeyValuesInplace clContext workGroupSize + + let segReduce = + Reduce.ByKey.Option.segmentSequential add clContext workGroupSize + + fun (queue: RawCommandQueue) (matrix: ClMatrix.CSR<'a>) (vector: ClVector.Sparse<'b>) (mask: ClArray<'d option>) -> + + match count queue matrix vector mask with + | 0 -> None + | resultSize -> + let multipliedIndices, multipliedValues = + multiplyValues queue matrix vector mask resultSize + + sort queue multipliedIndices multipliedValues + + let result = + segReduce queue DeviceOnly multipliedIndices multipliedValues + |> Option.map + (fun (reducedValues, reducedKeys) -> + { Context = clContext + Indices = reducedKeys + Values = reducedValues + Size = matrix.ColumnCount }) + + multipliedIndices.Free() + multipliedValues.Free() + + result + + let runMaskedBoolStandard + (add: Expr<'c option -> 'c option -> 'c option>) + (mul: Expr<'a option -> 'b option -> 'c option>) + (clContext: ClContext) + workGroupSize + = + + let count = count clContext workGroupSize + + let multiplyValues = + multiplyValues clContext mul workGroupSize + + let sort = + Sort.Bitonic.sortKeyValuesInplace clContext workGroupSize + + let removeDuplicates = + GraphBLAS.FSharp.ClArray.removeDuplications clContext workGroupSize + + let create = + GraphBLAS.FSharp.ClArray.create clContext workGroupSize + + fun (queue: RawCommandQueue) (matrix: ClMatrix.CSR<'a>) (vector: ClVector.Sparse<'b>) (mask: ClArray<'d option>) -> + + match count queue matrix vector mask with + | 0 -> None + | resultSize -> + let multipliedIndices, multipliedValues = + multiplyValues queue matrix vector mask resultSize + + sort queue multipliedIndices multipliedValues + + let resultIndices = removeDuplicates queue multipliedIndices + + multipliedIndices.Free() + multipliedValues.Free() + + Some + <| { Context = clContext + Indices = resultIndices + Values = create queue DeviceOnly resultIndices.Length true + Size = matrix.ColumnCount } diff --git a/src/GraphBLAS-sharp.Backend/Vector/Dense/Vector.fs b/src/GraphBLAS-sharp.Backend/Vector/Dense/Vector.fs index 36a74c67..6d910afd 100644 --- a/src/GraphBLAS-sharp.Backend/Vector/Dense/Vector.fs +++ b/src/GraphBLAS-sharp.Backend/Vector/Dense/Vector.fs @@ -140,8 +140,8 @@ module Vector = valueCell.Free() + // TODO: toSparseUnsorted + bitonic probably would work faster let toSparse<'a when 'a: struct> (clContext: ClContext) workGroupSize = - let scatterValues = Common.Scatter.lastOccurrence clContext workGroupSize @@ -196,7 +196,7 @@ module Vector = Values = resultValues Size = vector.Length } - let toSparse2<'a when 'a: struct> (clContext: ClContext) workGroupSize = + let toSparseUnsorted<'a when 'a: struct> (clContext: ClContext) workGroupSize = let kernel = <@ fun (ndRange: Range1D) (inputLength: int) (inputValues: ClArray<'a option>) (resultSize: ClCell) (resultIndices: ClArray) (resultValues: ClArray<'a>) -> diff --git a/src/GraphBLAS-sharp.Backend/Vector/Vector.fs b/src/GraphBLAS-sharp.Backend/Vector/Vector.fs index 0d10dd08..1c9f05c4 100644 --- a/src/GraphBLAS-sharp.Backend/Vector/Vector.fs +++ b/src/GraphBLAS-sharp.Backend/Vector/Vector.fs @@ -129,6 +129,26 @@ module Vector = <| toSparse processor allocationMode vector | ClVector.Sparse _ -> copy processor allocationMode vector + /// + /// Sparsifies the given vector if it is in a dense format. + /// If the given vector is already sparse, copies it. + /// Works faster than regular version, but indices of the sparse vector are unsorted. + /// + /// OpenCL context. + /// Should be a power of 2 and greater than 1. + let toSparseUnsorted (clContext: ClContext) workGroupSize = + let toSparse = + Dense.Vector.toSparseUnsorted clContext workGroupSize + + let copy = copy clContext workGroupSize + + fun (processor: RawCommandQueue) allocationMode (vector: ClVector<'a>) -> + match vector with + | ClVector.Dense vector -> + ClVector.Sparse + <| toSparse processor allocationMode vector + | ClVector.Sparse _ -> copy processor allocationMode vector + /// /// Densifies the given vector if it is in a sparse format. /// If the given vector is already dense, copies it. diff --git a/tests/GraphBLAS-sharp.Tests/Backend/Vector/Convert.fs b/tests/GraphBLAS-sharp.Tests/Backend/Vector/Convert.fs index a755be58..a53691e1 100644 --- a/tests/GraphBLAS-sharp.Tests/Backend/Vector/Convert.fs +++ b/tests/GraphBLAS-sharp.Tests/Backend/Vector/Convert.fs @@ -22,6 +22,7 @@ let wgSize = Constants.Common.defaultWorkGroupSize let makeTest formatFrom (convertFun: RawCommandQueue -> AllocationFlag -> ClVector<'a> -> ClVector<'a>) + (convertFunUnsorted: option AllocationFlag -> ClVector<'a> -> ClVector<'a>>) isZero case (array: 'a []) @@ -37,7 +38,7 @@ let makeTest let actual = let clVector = vector.ToDevice context - let convertedVector = convertFun q HostInterop clVector + let convertedVector = convertFun q DeviceOnly clVector let res = convertedVector.ToHost q @@ -56,6 +57,27 @@ let makeTest Expect.equal actual expected "Vectors must be the same" + match convertFunUnsorted with + | None -> () + | Some convertFunUnsorted -> + let clVector = vector.ToDevice context + let convertedVector = convertFunUnsorted q DeviceOnly clVector + + let res = convertedVector.ToHost q + + match res, expected with + | Vector.Sparse res, Vector.Sparse expected -> + let iv = Array.zip res.Indices res.Values + let resSorted = Array.sortBy (fun (i, v) -> i) iv + let indices, values = Array.unzip resSorted + Expect.equal indices expected.Indices "Indices must be the same" + Expect.equal values expected.Values "Values must be the same" + Expect.equal res.Size expected.Size "Size must be the same" + | _ -> () + + clVector.Dispose() + convertedVector.Dispose() + let testFixtures case = let getCorrectnessTestName datatype formatFrom = sprintf $"Correctness on %s{datatype}, %A{formatFrom} -> %A{case.Format}" @@ -68,19 +90,21 @@ let testFixtures case = match case.Format with | Sparse -> [ let convertFun = Vector.toSparse context wgSize + let convertFunUnsorted = Vector.toSparseUnsorted context wgSize Utils.listOfUnionCases |> List.map (fun formatFrom -> - makeTest formatFrom convertFun ((=) 0) case + makeTest formatFrom convertFun (Some convertFunUnsorted) ((=) 0) case |> testPropertyWithConfig config (getCorrectnessTestName "int" formatFrom)) let convertFun = Vector.toSparse context wgSize + let convertFunUnsorted = Vector.toSparseUnsorted context wgSize Utils.listOfUnionCases |> List.map (fun formatFrom -> - makeTest formatFrom convertFun ((=) false) case + makeTest formatFrom convertFun (Some convertFunUnsorted) ((=) false) case |> testPropertyWithConfig config (getCorrectnessTestName "bool" formatFrom)) ] |> List.concat | Dense -> @@ -89,7 +113,7 @@ let testFixtures case = Utils.listOfUnionCases |> List.map (fun formatFrom -> - makeTest formatFrom convertFun ((=) 0) case + makeTest formatFrom convertFun None ((=) 0) case |> testPropertyWithConfig config (getCorrectnessTestName "int" formatFrom)) let convertFun = Vector.toDense context wgSize @@ -97,7 +121,7 @@ let testFixtures case = Utils.listOfUnionCases |> List.map (fun formatFrom -> - makeTest formatFrom convertFun ((=) false) case + makeTest formatFrom convertFun None ((=) false) case |> testPropertyWithConfig config (getCorrectnessTestName "bool" formatFrom)) ] |> List.concat