From a4da73dc43c25800b944b43695df23878f9634c4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=9A=D0=B8=D1=80=D0=B8=D0=BB=D0=BB=20=D0=93=D0=B0=D1=80?=
 =?UTF-8?q?=D0=B1=D0=B0=D1=80?= <st087492@student.spbu.ru>
Date: Sat, 1 Jun 2024 19:18:16 +0300
Subject: [PATCH 1/6] Count atomic

---
 src/GraphBLAS-sharp.Backend/Common/ClArray.fs | 36 ++++++++++++++-----
 src/GraphBLAS-sharp.Backend/Common/Utils.fs   |  4 +++
 2 files changed, 31 insertions(+), 9 deletions(-)
diff --git a/src/GraphBLAS-sharp.Backend/Common/ClArray.fs b/src/GraphBLAS-sharp.Backend/Common/ClArray.fs
index 9c21781e..7c3d5594 100644
--- a/src/GraphBLAS-sharp.Backend/Common/ClArray.fs
+++ b/src/GraphBLAS-sharp.Backend/Common/ClArray.fs
@@ -902,22 +902,40 @@ module ClArray =
 
     let count<'a> (predicate: Expr<'a -> bool>) (clContext: ClContext) workGroupSize =
 
-        let sum =
-            Reduce.reduce <@ (+) @> clContext workGroupSize
+        let count =
+            <@ fun (ndRange: Range1D) (length: int) (array: ClArray<'a>) (count: ClCell<int>) ->
+                let gid = ndRange.GlobalID0
+                let mutable countLocal = 0
+                let gSize = ndRange.GlobalWorkSize
 
-        let getBitmap =
-            Map.map<'a, int> (Map.predicateBitmap predicate) clContext workGroupSize
+                let mutable i = gid
+
+                while i < length do
+                    let res = (%predicate) array.[i]
+                    if res then countLocal <- countLocal + 1
+                    i <- i + gSize
+
+                atomic (+) count.Value countLocal |> ignore @>
+
+        let count = clContext.Compile count
 
         fun (processor: RawCommandQueue) (array: ClArray<'a>) ->
 
-            let bitmap = getBitmap processor DeviceOnly array
+            let result = clContext.CreateClCell<int>(0)
 
-            let result =
-                (sum processor bitmap).ToHostAndFree processor
+            let numberOfGroups =
+                Utils.divUpClamp array.Length workGroupSize 1 1024
 
-            bitmap.Free()
+            let ndRange =
+                Range1D.CreateValid(workGroupSize * numberOfGroups, workGroupSize)
 
-            result
+            let kernel = count.GetKernel()
+
+            kernel.KernelFunc ndRange array.Length array result
+
+            processor.RunKernel kernel
+
+            result.ToHostAndFree processor
 
     /// <summary>
     /// Builds a new array whose elements are the results of applying the given function
diff --git a/src/GraphBLAS-sharp.Backend/Common/Utils.fs b/src/GraphBLAS-sharp.Backend/Common/Utils.fs
index 3ef10555..ef4c3371 100644
--- a/src/GraphBLAS-sharp.Backend/Common/Utils.fs
+++ b/src/GraphBLAS-sharp.Backend/Common/Utils.fs
@@ -19,6 +19,10 @@ module internal Utils =
         >> fun x -> x ||| (x >>> 16)
         >> fun x -> x + 1
 
+    let divUp x y = x / y + (if x % y = 0 then 0 else 1) // ceiling of x / y for x >= 0 and y > 0
+
+    let divUpClamp x y left right = min (max (divUp x y) left) right // divUp x y limited to [left, right]; yields right if left > right
+
     let floorToMultiple multiple x = x / multiple * multiple
 
     let ceilToMultiple multiple x = ((x - 1) / multiple + 1) * multiple

From b6e13fd5af1cbc60f118b4053fdda032c45422dd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=9A=D0=B8=D1=80=D0=B8=D0=BB=D0=BB=20=D0=93=D0=B0=D1=80?=
 =?UTF-8?q?=D0=B1=D0=B0=D1=80?= <st087492@student.spbu.ru>
Date: Sat, 1 Jun 2024 19:54:17 +0300
Subject: [PATCH 2/6] SpMSpVMasked

---
 src/GraphBLAS-sharp.Backend/Algorithms/BFS.fs |  34 +--
 src/GraphBLAS-sharp.Backend/Common/ClArray.fs |   4 +-
 .../Operations/Operations.fs                  |  46 +++-
 .../Operations/SpMSpV.fs                      | 215 ++++++++++++++++++
 .../Vector/Dense/Vector.fs                    |  56 -----
 5 files changed, 274 insertions(+), 81 deletions(-)

diff --git a/src/GraphBLAS-sharp.Backend/Algorithms/BFS.fs b/src/GraphBLAS-sharp.Backend/Algorithms/BFS.fs
index 2c2e3011..f5387bb0 100644
--- a/src/GraphBLAS-sharp.Backend/Algorithms/BFS.fs
+++ b/src/GraphBLAS-sharp.Backend/Algorithms/BFS.fs
@@ -135,7 +135,7 @@ module internal BFS =
             Operations.SpMVInPlace add mul clContext workGroupSize
 
         let spMSpV =
-            Operations.SpMSpVBool add mul clContext workGroupSize
+            Operations.SpMSpVMaskedBool add mul clContext workGroupSize
 
         let zeroCreate =
             Vector.zeroCreate clContext workGroupSize
@@ -145,9 +145,6 @@ module internal BFS =
         let maskComplementedInPlace =
             Vector.map2InPlace Mask.complementedOp clContext workGroupSize
 
-        let maskComplemented =
-            Vector.map2Sparse Mask.complementedOp clContext workGroupSize
-
         let fillSubVectorInPlace =
             Vector.assignByMaskInPlace (Mask.assign) clContext workGroupSize
 
@@ -190,28 +187,21 @@ module internal BFS =
                 match frontier with
                 | ClVector.Sparse _ ->
                     //Getting new frontier
-                    match spMSpV queue matrix frontier with
+                    match spMSpV queue matrix frontier levels with
                     | None ->
                         frontier.Dispose()
                         stop <- true
-                    | Some newFrontier ->
+                    | Some newMaskedFrontier ->
                         frontier.Dispose()
-                        //Filtering visited vertices
-                        match maskComplemented queue DeviceOnly newFrontier levels with
-                        | None ->
-                            stop <- true
-                            newFrontier.Dispose()
-                        | Some newMaskedFrontier ->
-                            newFrontier.Dispose()
-
-                            //Push/pull
-                            let NNZ = getNNZ queue newMaskedFrontier
-
-                            if (push NNZ newMaskedFrontier.Size) then
-                                frontier <- newMaskedFrontier
-                            else
-                                frontier <- toDense queue DeviceOnly newMaskedFrontier
-                                newMaskedFrontier.Dispose()
+
+                        //Push/pull
+                        let NNZ = getNNZ queue newMaskedFrontier
+
+                        if (push NNZ newMaskedFrontier.Size) then
+                            frontier <- newMaskedFrontier
+                        else
+                            frontier <- toDense queue DeviceOnly newMaskedFrontier
+                            newMaskedFrontier.Dispose()
                 | ClVector.Dense oldFrontier ->
                     //Getting new frontier
                     spMVInPlace queue matrix frontier frontier
diff --git a/src/GraphBLAS-sharp.Backend/Common/ClArray.fs b/src/GraphBLAS-sharp.Backend/Common/ClArray.fs
index 7c3d5594..d48863ae 100644
--- a/src/GraphBLAS-sharp.Backend/Common/ClArray.fs
+++ b/src/GraphBLAS-sharp.Backend/Common/ClArray.fs
@@ -906,14 +906,14 @@ module ClArray =
             <@ fun (ndRange: Range1D) (length: int) (array: ClArray<'a>) (count: ClCell<int>) ->
                 let gid = ndRange.GlobalID0
                 let mutable countLocal = 0
-                let gSize = ndRange.GlobalWorkSize
+                let step = ndRange.GlobalWorkSize
 
                 let mutable i = gid
 
                 while i < length do
                     let res = (%predicate) array.[i]
                     if res then countLocal <- countLocal + 1
-                    i <- i + gSize
+                    i <- i + step
 
                 atomic (+) count.Value countLocal |> ignore @>
 
diff --git a/src/GraphBLAS-sharp.Backend/Operations/Operations.fs b/src/GraphBLAS-sharp.Backend/Operations/Operations.fs
index ff06f1d5..bc0f2c1d 100644
--- a/src/GraphBLAS-sharp.Backend/Operations/Operations.fs
+++ b/src/GraphBLAS-sharp.Backend/Operations/Operations.fs
@@ -331,7 +331,7 @@ module Operations =
             | _ -> failwith "Not implemented yet"
 
     /// <summary>
-    /// CSR Matrix - sparse vector multiplication. Optimized for bool OR and AND operations.
+    /// CSR Matrix - sparse vector multiplication. Optimized for bool OR and AND operations by skipping reduction stage.
     /// </summary>
     /// <param name="add">Type of binary function to reduce entries.</param>
     /// <param name="mul">Type of binary function to combine entries.</param>
@@ -352,6 +352,50 @@ module Operations =
             | ClMatrix.CSR m, ClVector.Sparse v -> Option.map ClVector.Sparse (run queue m v)
             | _ -> failwith "Not implemented yet"
 
+    /// <summary>
+    /// CSR matrix - sparse vector multiplication with a complemented dense mask: only positions where the mask holds None are produced.
+    /// </summary>
+    /// <param name="add">Quoted binary function used to reduce entries that share an index.</param>
+    /// <param name="mul">Quoted binary function used to combine a matrix entry with a vector entry.</param>
+    /// <param name="clContext">OpenCL context.</param>
+    /// <param name="workGroupSize">Should be a power of 2 and greater than 1.</param>
+    let SpMSpVMasked
+        (add: Expr<'c option -> 'c option -> 'c option>)
+        (mul: Expr<'a option -> 'b option -> 'c option>)
+        (clContext: ClContext)
+        workGroupSize
+        =
+
+        let run =
+            SpMSpV.Masked.runMasked add mul clContext workGroupSize
+        // Only CSR matrix + sparse vector + dense mask is handled; any other combination fails with "Not implemented yet".
+        fun (queue: RawCommandQueue) (matrix: ClMatrix<'a>) (vector: ClVector<'b>) (mask: ClVector<'d>) ->
+            match matrix, vector, mask with
+            | ClMatrix.CSR m, ClVector.Sparse v, ClVector.Dense mask -> Option.map ClVector.Sparse (run queue m v mask)
+            | _ -> failwith "Not implemented yet"
+
+    /// <summary>
+    /// CSR matrix - sparse vector multiplication with a complemented dense mask (only positions where the mask holds None are produced). Optimized for bool OR and AND operations by skipping reduction stage.
+    /// </summary>
+    /// <param name="add">Quoted binary function used to reduce entries.</param>
+    /// <param name="mul">Quoted binary function used to combine entries.</param>
+    /// <param name="clContext">OpenCL context.</param>
+    /// <param name="workGroupSize">Should be a power of 2 and greater than 1.</param>
+    let SpMSpVMaskedBool
+        (add: Expr<bool option -> bool option -> bool option>)
+        (mul: Expr<bool option -> bool option -> bool option>)
+        (clContext: ClContext)
+        workGroupSize
+        =
+
+        let run =
+            SpMSpV.Masked.runMaskedBoolStandard add mul clContext workGroupSize
+        // Element types are bool because add/mul are bool-only; only CSR + sparse vector + dense mask is handled.
+        fun (queue: RawCommandQueue) (matrix: ClMatrix<bool>) (vector: ClVector<bool>) (mask: ClVector<'d>) ->
+            match matrix, vector, mask with
+            | ClMatrix.CSR m, ClVector.Sparse v, ClVector.Dense mask -> Option.map ClVector.Sparse (run queue m v mask)
+            | _ -> failwith "Not implemented yet"
+
     /// <summary>
     /// CSR Matrix - sparse vector multiplication.
     /// </summary>
diff --git a/src/GraphBLAS-sharp.Backend/Operations/SpMSpV.fs b/src/GraphBLAS-sharp.Backend/Operations/SpMSpV.fs
index e4f61fea..3f6f0908 100644
--- a/src/GraphBLAS-sharp.Backend/Operations/SpMSpV.fs
+++ b/src/GraphBLAS-sharp.Backend/Operations/SpMSpV.fs
@@ -290,3 +290,218 @@ module SpMSpV =
                       Indices = resultIndices
                       Values = create queue DeviceOnly resultIndices.Length true
                       Size = matrix.ColumnCount })
+
+    module Masked =
+
+        let private count (clContext: ClContext) workGroupSize =
+            // Counts matrix entries, over the rows named by the vector indices, whose column has no value (None) in the mask.
+            let count =
+                <@ fun (ndRange: Range1D) vectorLength (vectorIndices: ClArray<int>) (vectorMask: ClArray<'d option>) (matrixRowPointers: ClArray<int>) (matrixColumns: ClArray<int>) (result: ClCell<int>) ->
+                    let gid = ndRange.GlobalID0
+                    let step = ndRange.GlobalWorkSize
+                    // Each work-item handles vector entries gid, gid + step, gid + 2 * step, ...
+                    let mutable idx = gid
+
+                    while idx < vectorLength do
+                        let vectorIndex = vectorIndices.[idx]
+
+                        let rowStart = matrixRowPointers.[vectorIndex]
+                        let rowEnd = matrixRowPointers.[vectorIndex + 1]
+                        // Entries of row vectorIndex whose mask entry is None.
+                        let mutable count = 0
+
+                        for i in rowStart .. rowEnd - 1 do
+                            match vectorMask.[matrixColumns.[i]] with
+                            | None -> count <- count + 1
+                            | Some _ -> ()
+                        // Add this row's contribution to the shared counter.
+                        atomic (+) result.Value count |> ignore
+
+                        idx <- idx + step @>
+
+            let count = clContext.Compile count
+
+            fun (queue: RawCommandQueue) (matrix: ClMatrix.CSR<'a>) (vector: ClVector.Sparse<'b>) (vectorMask: ClArray<'d option>) ->
+                // Runs the kernel over vector.NNZ entries and returns the counter read back through ToHostAndFree.
+                let length = vector.NNZ
+                // Between 1 and 1024 work-groups; the kernel's strided loop covers any entries beyond the launched range.
+                let numberOfGroups =
+                    Utils.divUpClamp length workGroupSize 1 1024
+
+                let result = clContext.CreateClCell(0)
+
+                let ndRange =
+                    Range1D.CreateValid(numberOfGroups * workGroupSize, workGroupSize)
+
+                let count = count.GetKernel()
+
+                count.KernelFunc ndRange length vector.Indices vectorMask matrix.RowPointers matrix.Columns result
+
+                queue.RunKernel count
+
+                result.ToHostAndFree queue
+
+        let private multiplyValues
+            (clContext: ClContext)
+            (mul: Expr<'a option -> 'b option -> 'c option>)
+            workGroupSize
+            =
+            // Writes (column, mul matrixValue vectorValue) for every matrix entry in the selected rows whose mask entry is None.
+            let multiply =
+                <@ fun (ndRange: Range1D) vectorLength (vectorIndices: ClArray<int>) (vectorValues: ClArray<'b>) (vectorMask: ClArray<'d option>) (matrixRowPointers: ClArray<int>) (matrixColumns: ClArray<int>) (matrixValues: ClArray<'a>) (resultOffset: ClCell<int>) (resultIndices: ClArray<int>) (resultValues: ClArray<'c option>) ->
+                    let gid = ndRange.GlobalID0
+                    let step = ndRange.GlobalWorkSize
+                    // Each work-item handles vector entries gid, gid + step, gid + 2 * step, ...
+                    let mutable i = gid
+
+                    while i < vectorLength do
+                        let vectorIndex = vectorIndices.[i]
+                        let vectorValue = vectorValues.[i]
+
+                        let rowStart = matrixRowPointers.[vectorIndex]
+                        let rowEnd = matrixRowPointers.[vectorIndex + 1]
+                        // First pass: count unmasked entries so that a contiguous output range can be reserved.
+                        let mutable count = 0
+
+                        for j in rowStart .. rowEnd - 1 do
+                            match vectorMask.[matrixColumns.[j]] with
+                            | None -> count <- count + 1
+                            | Some _ -> ()
+                        // The value returned by the atomic add is used as the start of this row's output range.
+                        let mutable offset = atomic (+) resultOffset.Value count
+
+                        for j in rowStart .. rowEnd - 1 do
+                            let columnIndex = matrixColumns.[j]
+
+                            // TODO: Pass mask operation
+                            match vectorMask.[columnIndex] with
+                            | None ->
+                                resultIndices.[offset] <- columnIndex
+                                resultValues.[offset] <- (%mul) (Some matrixValues.[j]) (Some vectorValue)
+                                offset <- offset + 1
+                            | Some _ -> ()
+
+                        i <- i + step @>
+
+            let kernel = clContext.Compile multiply
+
+            fun (queue: RawCommandQueue) (matrix: ClMatrix.CSR<'a>) (vector: ClVector.Sparse<'b>) (vectorMask: ClArray<'d option>) (resultSize: int) ->
+                // resultSize must be at least the number of unmasked products, otherwise the kernel writes out of bounds.
+                let multipliedIndices =
+                    clContext.CreateClArrayWithSpecificAllocationMode<int>(DeviceOnly, resultSize)
+
+                let multipliedValues =
+                    clContext.CreateClArrayWithSpecificAllocationMode<'c option>(DeviceOnly, resultSize)
+                // Shared cursor into the output arrays, advanced atomically by the kernel.
+                let offset = clContext.CreateClCell 0
+
+                let numberOfGroups =
+                    Utils.divUpClamp vector.NNZ workGroupSize 1 1024
+
+                let ndRange =
+                    Range1D.CreateValid(numberOfGroups * workGroupSize, workGroupSize)
+
+                let kernel = kernel.GetKernel()
+
+                kernel.KernelFunc
+                    ndRange
+                    vector.NNZ
+                    vector.Indices
+                    vector.Values
+                    vectorMask
+                    matrix.RowPointers
+                    matrix.Columns
+                    matrix.Values
+                    offset
+                    multipliedIndices
+                    multipliedValues
+
+                queue.RunKernel kernel
+
+                offset.Free()
+                // Both arrays are returned without being freed here.
+                multipliedIndices, multipliedValues
+
+        let runMasked
+            (add: Expr<'c option -> 'c option -> 'c option>)
+            (mul: Expr<'a option -> 'b option -> 'c option>)
+            (clContext: ClContext)
+            workGroupSize
+            =
+            // Masked CSR matrix x sparse vector: count, multiply, sort by index, then reduce equal indices with `add`.
+            let countUnmasked = count clContext workGroupSize
+
+            let multiply =
+                multiplyValues clContext mul workGroupSize
+
+            let sortByIndex =
+                Sort.Bitonic.sortKeyValuesInplace clContext workGroupSize
+
+            let reduceByIndex =
+                Reduce.ByKey.Option.segmentSequential add clContext workGroupSize
+
+            fun (queue: RawCommandQueue) (matrix: ClMatrix.CSR<'a>) (vector: ClVector.Sparse<'b>) (mask: ClArray<'d option>) ->
+                // No unmasked products means there is no result vector at all.
+                let productCount = countUnmasked queue matrix vector mask
+                if productCount = 0 then None
+                else
+                    let indices, values =
+                        multiply queue matrix vector mask productCount
+
+                    sortByIndex queue indices values
+
+                    let reduced =
+                        reduceByIndex queue DeviceOnly indices values
+                        |> Option.map
+                            (fun (reducedValues, reducedKeys) ->
+                                { Context = clContext
+                                  Indices = reducedKeys
+                                  Values = reducedValues
+                                  Size = matrix.ColumnCount })
+
+                    indices.Free()
+                    values.Free()
+
+                    reduced
+
+        let runMaskedBoolStandard
+            (add: Expr<'c option -> 'c option -> 'c option>)
+            (mul: Expr<'a option -> 'b option -> 'c option>)
+            (clContext: ClContext)
+            workGroupSize
+            =
+            // Like runMasked, but duplicates are dropped instead of reduced: `add` is not used and every value is `true`.
+            let countUnmasked = count clContext workGroupSize
+
+            let multiply =
+                multiplyValues clContext mul workGroupSize
+
+            let sortByIndex =
+                Sort.Bitonic.sortKeyValuesInplace clContext workGroupSize
+
+            let distinctIndices =
+                GraphBLAS.FSharp.ClArray.removeDuplications clContext workGroupSize
+
+            let create =
+                GraphBLAS.FSharp.ClArray.create clContext workGroupSize
+
+            fun (queue: RawCommandQueue) (matrix: ClMatrix.CSR<'a>) (vector: ClVector.Sparse<'b>) (mask: ClArray<'d option>) ->
+                // No unmasked products means there is no result vector at all.
+                let productCount = countUnmasked queue matrix vector mask
+                if productCount = 0 then None
+                else
+                    let indices, values =
+                        multiply queue matrix vector mask productCount
+
+                    sortByIndex queue indices values
+
+                    let uniqueIndices = distinctIndices queue indices
+
+                    indices.Free()
+                    values.Free()
+
+                    Some
+                        { Context = clContext
+                          Indices = uniqueIndices
+                          Values = create queue DeviceOnly uniqueIndices.Length true
+                          Size = matrix.ColumnCount }
diff --git a/src/GraphBLAS-sharp.Backend/Vector/Dense/Vector.fs b/src/GraphBLAS-sharp.Backend/Vector/Dense/Vector.fs
index 36a74c67..36dc6835 100644
--- a/src/GraphBLAS-sharp.Backend/Vector/Dense/Vector.fs
+++ b/src/GraphBLAS-sharp.Backend/Vector/Dense/Vector.fs
@@ -142,62 +142,6 @@ module Vector =
 
     let toSparse<'a when 'a: struct> (clContext: ClContext) workGroupSize =
 
-        let scatterValues =
-            Common.Scatter.lastOccurrence clContext workGroupSize
-
-        let scatterIndices =
-            Common.Scatter.lastOccurrence clContext workGroupSize
-
-        let getBitmap =
-            Map.map (Map.option 1 0) clContext workGroupSize
-
-        let prefixSum =
-            Common.PrefixSum.standardExcludeInPlace clContext workGroupSize
-
-        let allIndices =
-            ClArray.init Map.id clContext workGroupSize
-
-        let allValues =
-            Map.map (Map.optionToValueOrZero Unchecked.defaultof<'a>) clContext workGroupSize
-
-        fun (processor: RawCommandQueue) allocationMode (vector: ClArray<'a option>) ->
-
-            let positions = getBitmap processor DeviceOnly vector
-
-            let resultLength =
-                (prefixSum processor positions)
-                    .ToHostAndFree(processor)
-
-            // compute result indices
-            let resultIndices =
-                clContext.CreateClArrayWithSpecificAllocationMode<int>(allocationMode, resultLength)
-
-            let allIndices =
-                allIndices processor DeviceOnly vector.Length
-
-            scatterIndices processor positions allIndices resultIndices
-
-            allIndices.Free()
-
-            // compute result values
-            let resultValues =
-                clContext.CreateClArrayWithSpecificAllocationMode<'a>(allocationMode, resultLength)
-
-            let allValues = allValues processor DeviceOnly vector
-
-            scatterValues processor positions allValues resultValues
-
-            allValues.Free()
-
-            positions.Free()
-
-            { Context = clContext
-              Indices = resultIndices
-              Values = resultValues
-              Size = vector.Length }
-
-    let toSparse2<'a when 'a: struct> (clContext: ClContext) workGroupSize =
-
         let kernel =
             <@ fun (ndRange: Range1D) (inputLength: int) (inputValues: ClArray<'a option>) (resultSize: ClCell<int>) (resultIndices: ClArray<int>) (resultValues: ClArray<'a>) ->
 

From 83ab36702fa2cb504f942a847e1f8544deb080ef Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=9A=D0=B8=D1=80=D0=B8=D0=BB=D0=BB=20=D0=93=D0=B0=D1=80?=
 =?UTF-8?q?=D0=B1=D0=B0=D1=80?= <st087492@student.spbu.ru>
Date: Sat, 1 Jun 2024 20:08:01 +0300
Subject: [PATCH 3/6] Small fixes

---
 src/GraphBLAS-sharp.Backend/Common/ClArray.fs      | 2 +-
 src/GraphBLAS-sharp.Backend/Common/Sort/Bitonic.fs | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/GraphBLAS-sharp.Backend/Common/ClArray.fs b/src/GraphBLAS-sharp.Backend/Common/ClArray.fs
index d48863ae..bc324850 100644
--- a/src/GraphBLAS-sharp.Backend/Common/ClArray.fs
+++ b/src/GraphBLAS-sharp.Backend/Common/ClArray.fs
@@ -362,7 +362,7 @@ module ClArray =
 
                 let gid = ndRange.GlobalID0
 
-                if gid < length then
+                if gid < length && not result.Value then
                     let isExist = (%predicate) vector.[gid]
 
                     if isExist then result.Value <- true @>
diff --git a/src/GraphBLAS-sharp.Backend/Common/Sort/Bitonic.fs b/src/GraphBLAS-sharp.Backend/Common/Sort/Bitonic.fs
index 724de235..c12538b2 100644
--- a/src/GraphBLAS-sharp.Backend/Common/Sort/Bitonic.fs
+++ b/src/GraphBLAS-sharp.Backend/Common/Sort/Bitonic.fs
@@ -257,6 +257,7 @@ module Bitonic =
                 int (clContext.ClDevice.LocalMemSize)
                 / (sizeof<int> + sizeof<'a>)
             )
+            / 2
 
         let maxThreadsPerBlock =
             min (clContext.ClDevice.MaxWorkGroupSize) (localSize / 2)
@@ -476,4 +477,4 @@ module Bitonic =
 
                 kernelGlobal.KernelFunc ndRangeGlobal rows values values.Length (localSize * 2)
 
-                queue.RunKernel(kernelGlobal)
\ No newline at end of file
+                queue.RunKernel(kernelGlobal)

From fc0acd18d0cbba20344be9c1b28be15cc56e98b1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=9A=D0=B8=D1=80=D0=B8=D0=BB=D0=BB=20=D0=93=D0=B0=D1=80?=
 =?UTF-8?q?=D0=B1=D0=B0=D1=80?= <st087492@student.spbu.ru>
Date: Sat, 1 Jun 2024 20:14:06 +0300
Subject: [PATCH 4/6] Small fixes 2

---
 src/GraphBLAS-sharp.Backend/Common/Sort/Bitonic.fs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/GraphBLAS-sharp.Backend/Common/Sort/Bitonic.fs b/src/GraphBLAS-sharp.Backend/Common/Sort/Bitonic.fs
index c12538b2..bf3572b7 100644
--- a/src/GraphBLAS-sharp.Backend/Common/Sort/Bitonic.fs
+++ b/src/GraphBLAS-sharp.Backend/Common/Sort/Bitonic.fs
@@ -10,8 +10,9 @@ module Bitonic =
         let localSize =
             Common.Utils.floorToPower2 (
                 int (clContext.ClDevice.LocalMemSize)
-                / (sizeof<uint64> + sizeof<'a>)
+                / (sizeof<int> + sizeof<'a>)
             )
+            / 2
 
         let maxThreadsPerBlock =
             min (clContext.ClDevice.MaxWorkGroupSize) (localSize / 2)

From 050df59fecc500a1de5c8115def8dc9c6e310393 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=9A=D0=B8=D1=80=D0=B8=D0=BB=D0=BB=20=D0=93=D0=B0=D1=80?=
 =?UTF-8?q?=D0=B1=D0=B0=D1=80?= <st087492@student.spbu.ru>
Date: Sun, 2 Jun 2024 19:31:41 +0300
Subject: [PATCH 5/6] toSparse unsorted and tests

---
 src/GraphBLAS-sharp.Backend/Algorithms/BFS.fs |  3 +-
 .../Vector/Dense/Vector.fs                    | 56 +++++++++++++++++++
 src/GraphBLAS-sharp.Backend/Vector/Vector.fs  | 20 +++++++
 .../Backend/Vector/Convert.fs                 | 34 +++++++++--
 4 files changed, 107 insertions(+), 6 deletions(-)

diff --git a/src/GraphBLAS-sharp.Backend/Algorithms/BFS.fs b/src/GraphBLAS-sharp.Backend/Algorithms/BFS.fs
index f5387bb0..2b9a3a83 100644
--- a/src/GraphBLAS-sharp.Backend/Algorithms/BFS.fs
+++ b/src/GraphBLAS-sharp.Backend/Algorithms/BFS.fs
@@ -148,7 +148,8 @@ module internal BFS =
         let fillSubVectorInPlace =
             Vector.assignByMaskInPlace (Mask.assign) clContext workGroupSize
 
-        let toSparse = Vector.toSparse clContext workGroupSize
+        let toSparse =
+            Vector.toSparseUnsorted clContext workGroupSize
 
         let toDense = Vector.toDense clContext workGroupSize
 
diff --git a/src/GraphBLAS-sharp.Backend/Vector/Dense/Vector.fs b/src/GraphBLAS-sharp.Backend/Vector/Dense/Vector.fs
index 36dc6835..6d910afd 100644
--- a/src/GraphBLAS-sharp.Backend/Vector/Dense/Vector.fs
+++ b/src/GraphBLAS-sharp.Backend/Vector/Dense/Vector.fs
@@ -140,7 +140,63 @@ module Vector =
 
             valueCell.Free()
 
+    // TODO: toSparseUnsorted followed by a bitonic sort would probably be faster than this bitmap / prefix-sum / scatter version
     let toSparse<'a when 'a: struct> (clContext: ClContext) workGroupSize =
+        let scatterValues =
+            Common.Scatter.lastOccurrence clContext workGroupSize
+
+        let scatterIndices =
+            Common.Scatter.lastOccurrence clContext workGroupSize
+
+        let getBitmap =
+            Map.map (Map.option 1 0) clContext workGroupSize
+
+        let prefixSum =
+            Common.PrefixSum.standardExcludeInPlace clContext workGroupSize
+
+        let allIndices =
+            ClArray.init Map.id clContext workGroupSize
+
+        let allValues =
+            Map.map (Map.optionToValueOrZero Unchecked.defaultof<'a>) clContext workGroupSize
+
+        fun (processor: RawCommandQueue) allocationMode (vector: ClArray<'a option>) ->
+            // NOTE(review): Map.option 1 0 presumably yields 1 for Some and 0 for None - confirm against Map.option.
+            let positions = getBitmap processor DeviceOnly vector
+            // positions is scanned in place; the cell returned by the scan is read as the result length.
+            let resultLength =
+                (prefixSum processor positions)
+                    .ToHostAndFree(processor)
+
+            // compute result indices
+            let resultIndices =
+                clContext.CreateClArrayWithSpecificAllocationMode<int>(allocationMode, resultLength)
+
+            let allIndices =
+                allIndices processor DeviceOnly vector.Length
+
+            scatterIndices processor positions allIndices resultIndices
+
+            allIndices.Free()
+
+            // compute result values
+            let resultValues =
+                clContext.CreateClArrayWithSpecificAllocationMode<'a>(allocationMode, resultLength)
+
+            let allValues = allValues processor DeviceOnly vector
+
+            scatterValues processor positions allValues resultValues
+
+            allValues.Free()
+
+            positions.Free()
+            // The result arrays use the caller's allocationMode; all temporaries above were DeviceOnly and are freed.
+            { Context = clContext
+              Indices = resultIndices
+              Values = resultValues
+              Size = vector.Length }
+
+    let toSparseUnsorted<'a when 'a: struct> (clContext: ClContext) workGroupSize =
 
         let kernel =
             <@ fun (ndRange: Range1D) (inputLength: int) (inputValues: ClArray<'a option>) (resultSize: ClCell<int>) (resultIndices: ClArray<int>) (resultValues: ClArray<'a>) ->
diff --git a/src/GraphBLAS-sharp.Backend/Vector/Vector.fs b/src/GraphBLAS-sharp.Backend/Vector/Vector.fs
index 0d10dd08..1c9f05c4 100644
--- a/src/GraphBLAS-sharp.Backend/Vector/Vector.fs
+++ b/src/GraphBLAS-sharp.Backend/Vector/Vector.fs
@@ -129,6 +129,26 @@ module Vector =
                 <| toSparse processor allocationMode vector
             | ClVector.Sparse _ -> copy processor allocationMode vector
 
+    /// <summary>
+    /// Converts a dense vector to the sparse format; a vector that is already sparse is copied.
+    /// The indices of the resulting sparse vector are not sorted.
+    /// Intended as a faster alternative to the regular (sorted) conversion.
+    /// </summary>
+    /// <param name="clContext">OpenCL context.</param>
+    /// <param name="workGroupSize">Should be a power of 2 and greater than 1.</param>
+    let toSparseUnsorted (clContext: ClContext) workGroupSize =
+        let sparsifyDense =
+            Dense.Vector.toSparseUnsorted clContext workGroupSize
+
+        let copySparse = copy clContext workGroupSize
+
+        fun (processor: RawCommandQueue) allocationMode (vector: ClVector<'a>) ->
+            match vector with
+            | ClVector.Sparse _ -> copySparse processor allocationMode vector
+            | ClVector.Dense dense ->
+                sparsifyDense processor allocationMode dense
+                |> ClVector.Sparse
+
     /// <summary>
     /// Densifies the given vector if it is in a sparse format.
     /// If the given vector is already dense, copies it.
diff --git a/tests/GraphBLAS-sharp.Tests/Backend/Vector/Convert.fs b/tests/GraphBLAS-sharp.Tests/Backend/Vector/Convert.fs
index a755be58..a53691e1 100644
--- a/tests/GraphBLAS-sharp.Tests/Backend/Vector/Convert.fs
+++ b/tests/GraphBLAS-sharp.Tests/Backend/Vector/Convert.fs
@@ -22,6 +22,7 @@ let wgSize = Constants.Common.defaultWorkGroupSize
 let makeTest
     formatFrom
     (convertFun: RawCommandQueue -> AllocationFlag -> ClVector<'a> -> ClVector<'a>)
+    (convertFunUnsorted: option<RawCommandQueue -> AllocationFlag -> ClVector<'a> -> ClVector<'a>>)
     isZero
     case
     (array: 'a [])
@@ -37,7 +38,7 @@ let makeTest
 
         let actual =
             let clVector = vector.ToDevice context
-            let convertedVector = convertFun q HostInterop clVector
+            let convertedVector = convertFun q DeviceOnly clVector
 
             let res = convertedVector.ToHost q
 
@@ -56,6 +57,27 @@ let makeTest
 
         Expect.equal actual expected "Vectors must be the same"
 
+        match convertFunUnsorted with // optional second check for the unsorted conversion
+        | None -> ()
+        | Some convertFunUnsorted ->
+            let clVector = vector.ToDevice context
+            let convertedVector = convertFunUnsorted q DeviceOnly clVector
+
+            let res = convertedVector.ToHost q
+            // Index order is unspecified for the unsorted conversion, so sort (index, value) pairs by index before comparing.
+            match res, expected with
+            | Vector.Sparse res, Vector.Sparse expected ->
+                let iv = Array.zip res.Indices res.Values
+                let resSorted = Array.sortBy (fun (i, v) -> i) iv
+                let indices, values = Array.unzip resSorted
+                Expect.equal indices expected.Indices "Indices must be the same"
+                Expect.equal values expected.Values "Values must be the same"
+                Expect.equal res.Size expected.Size "Size must be the same"
+            | _ -> () // NOTE(review): a non-sparse result or expectation passes silently here - confirm this is intended
+
+            clVector.Dispose()
+            convertedVector.Dispose()
+
 let testFixtures case =
     let getCorrectnessTestName datatype formatFrom =
         sprintf $"Correctness on %s{datatype}, %A{formatFrom} -> %A{case.Format}"
@@ -68,19 +90,21 @@ let testFixtures case =
     match case.Format with
     | Sparse ->
         [ let convertFun = Vector.toSparse context wgSize
+          let convertFunUnsorted = Vector.toSparseUnsorted context wgSize
 
           Utils.listOfUnionCases<VectorFormat>
           |> List.map
               (fun formatFrom ->
-                  makeTest formatFrom convertFun ((=) 0) case
+                  makeTest formatFrom convertFun (Some convertFunUnsorted) ((=) 0) case
                   |> testPropertyWithConfig config (getCorrectnessTestName "int" formatFrom))
 
           let convertFun = Vector.toSparse context wgSize
+          let convertFunUnsorted = Vector.toSparseUnsorted context wgSize
 
           Utils.listOfUnionCases<VectorFormat>
           |> List.map
               (fun formatFrom ->
-                  makeTest formatFrom convertFun ((=) false) case
+                  makeTest formatFrom convertFun (Some convertFunUnsorted) ((=) false) case
                   |> testPropertyWithConfig config (getCorrectnessTestName "bool" formatFrom)) ]
         |> List.concat
     | Dense ->
@@ -89,7 +113,7 @@ let testFixtures case =
           Utils.listOfUnionCases<VectorFormat>
           |> List.map
               (fun formatFrom ->
-                  makeTest formatFrom convertFun ((=) 0) case
+                  makeTest formatFrom convertFun None ((=) 0) case
                   |> testPropertyWithConfig config (getCorrectnessTestName "int" formatFrom))
 
           let convertFun = Vector.toDense context wgSize
@@ -97,7 +121,7 @@ let testFixtures case =
           Utils.listOfUnionCases<VectorFormat>
           |> List.map
               (fun formatFrom ->
-                  makeTest formatFrom convertFun ((=) false) case
+                  makeTest formatFrom convertFun None ((=) false) case
                   |> testPropertyWithConfig config (getCorrectnessTestName "bool" formatFrom)) ]
         |> List.concat
 

From 12e733f3e456891b5910e9d18376d4768c04daab Mon Sep 17 00:00:00 2001
From: Kirill <71129570+kirillgarbar@users.noreply.github.com>
Date: Sun, 2 Jun 2024 20:31:36 +0300
Subject: [PATCH 6/6] Fix bitonic localSize

---
 src/GraphBLAS-sharp.Backend/Common/Sort/Bitonic.fs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/GraphBLAS-sharp.Backend/Common/Sort/Bitonic.fs b/src/GraphBLAS-sharp.Backend/Common/Sort/Bitonic.fs
index bf3572b7..11b6b62e 100644
--- a/src/GraphBLAS-sharp.Backend/Common/Sort/Bitonic.fs
+++ b/src/GraphBLAS-sharp.Backend/Common/Sort/Bitonic.fs
@@ -10,7 +10,7 @@ module Bitonic =
         let localSize =
             Common.Utils.floorToPower2 (
                 int (clContext.ClDevice.LocalMemSize)
-                / (sizeof<int> + sizeof<'a>)
+                / (sizeof<uint64> + sizeof<'a>)
             )
             / 2