Commit

Merge pull request #367 from DrTimothyAldenDavis/dev2
9.4.2
DrTimothyAldenDavis authored Nov 20, 2024
2 parents e3f309c + 74a5370 commit bde76fb

Showing 3,244 changed files with 76,990 additions and 81,224 deletions.
The diff you're trying to view is too large. We only load the first 3000 changed files.
11 changes: 10 additions & 1 deletion CMakeLists.txt
@@ -42,7 +42,7 @@ endif ( )

# CUDA is under development for now, and not deployed in production:
set ( GRAPHBLAS_USE_CUDA OFF )
# set ( GRAPHBLAS_USE_CUDA ON ) # FIXME: use this for CUDA development
# set ( GRAPHBLAS_USE_CUDA ON ) # use this for CUDA development only

include ( SuiteSparsePolicy )

@@ -210,17 +210,25 @@ configure_file ( "Config/README.md.in"
include_directories ( ${PROJECT_SOURCE_DIR} Source Include Config
xxHash lz4 zstd zstd/zstd_subset JITpackage Demo/Include rmm_wrap
# include all Source/* folders that have include/ or template/ subfolders:
Source/add
Source/apply
Source/assign
Source/builder
Source/builtin
Source/callback
Source/concat
Source/convert
Source/cumsum
Source/emult
Source/ewise
Source/extract
Source/hyper
Source/ij
Source/jit_kernels
Source/kronecker
Source/mask
Source/math
Source/matrix
Source/memory
Source/monoid
Source/mxm
@@ -231,6 +239,7 @@ include_directories ( ${PROJECT_SOURCE_DIR} Source Include Config
Source/select
Source/split
Source/slice
Source/sort
Source/transpose
Source/type
Source/wait
18 changes: 18 additions & 0 deletions CONTRIBUTING.md
@@ -0,0 +1,18 @@
# Contributing to SuiteSparse:GraphBLAS

To add an issue for a bug report (gasp!) or a feature request,
you can use the issue tracker on github.com, at
[`https://github.com/DrTimothyAldenDavis/GraphBLAS/issues`](https://github.com/DrTimothyAldenDavis/GraphBLAS/issues).

To contribute code, you can submit a pull request. To do so,
you must first agree to the Contributor License
in the [`Contributor_License`](Contributor_License) folder.
Sign and date the PDF, and email it to me at
[email protected]. Pull requests will only be
merged into SuiteSparse:GraphBLAS after I receive your email with
the signed PDF.

Do not submit a pull request to the default branch.
Instead, use the dev2 branch.

177 changes: 0 additions & 177 deletions CONTRIBUTOR-LICENSE.txt

This file was deleted.

8 changes: 4 additions & 4 deletions CUDA/GB_cuda_AxB_dot3.cpp
@@ -52,7 +52,7 @@ GrB_Info GB_cuda_AxB_dot3 // C<M> = A'*B using dot product method

// FIXME: pass in a stream instead, or checkout a stream
CUDA_OK (cudaStreamCreate (&stream)) ;
GpuTimer kernel_timer;

//--------------------------------------------------------------------------
// check inputs
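
The FIXME above notes that this routine should receive a stream from its
caller (or check one out of a stream pool) rather than creating one on
every call.  A minimal sketch of that pattern, written as a hypothetical
caller and not as the library's actual code:

#include <cuda_runtime.h>

// Sketch only (hypothetical caller, not GraphBLAS code): the caller
// creates and owns the stream, passes it to the dot3 routine, and
// destroys it afterwards, instead of the routine creating its own.
static cudaError_t caller_owns_the_stream (void)
{
    cudaStream_t stream ;
    cudaError_t err = cudaStreamCreate (&stream) ;
    if (err != cudaSuccess) return (err) ;
    // ... pass `stream` into the dot3 routine and launch its kernels ...
    err = cudaStreamSynchronize (stream) ;
    cudaStreamDestroy (stream) ;
    return (err) ;
}
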
@@ -165,7 +165,7 @@ GrB_Info GB_cuda_AxB_dot3 // C<M> = A'*B using dot product method
GB_OK (GB_new_bix (&C, // sparse or hyper (from M), existing header
ctype, cvlen, cvdim, GB_Ap_malloc, true,
M_sparsity, false, M->hyper_switch, cnvec,
cnz+1, // add one to cnz for GB_cumsum of Cwork
cnz+1, // add one to cnz for cumsum of Cwork
true, C_iso)) ;

//--------------------------------------------------------------------------
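
The cnz+1 above follows the usual prefix-sum convention: an array of cnz
counts is scanned in place into cnz+1 offsets, with the extra final slot
holding the grand total.  An illustrative serial version of that
convention (GraphBLAS's own cumsum is parallel; the name cumsum_in_place
is chosen here for exposition only):

#include <stdint.h>

// Illustrative only: before the scan, Cwork [k] holds the count for
// entry k; afterwards, Cwork [k] is the starting offset of entry k and
// the extra last slot, Cwork [cnz], holds the grand total.
static void cumsum_in_place (int64_t *Cwork, int64_t cnz)
{
    int64_t total = 0 ;
    for (int64_t k = 0 ; k < cnz ; k++)
    {
        int64_t count = Cwork [k] ;
        Cwork [k] = total ;
        total += count ;
    }
    Cwork [cnz] = total ;
}
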
@@ -219,7 +219,7 @@ GrB_Info GB_cuda_AxB_dot3 // C<M> = A'*B using dot product method

// M might be very very sparse. A(:,i) is not needed if M(:,i) is empty.
// Likewise, B(:,j) is not needed if M(:,j) is empty. For now, try this
// heuristic: if M is hypersparse, then do not prefetch A->b or A->x.

int prefetch_b = (M_is_hyper) ? 0 : GB_PREFETCH_B ;
int prefetch_x = (M_is_hyper) ? 0 : GB_PREFETCH_X ;
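
One way to realize the heuristic described above (skip prefetching A->b
and A->x whenever the mask M is hypersparse) is to gate each
unified-memory prefetch on that flag.  A sketch with a hypothetical
helper, not the code this commit adds:

#include <cuda_runtime.h>
#include <stdbool.h>
#include <stddef.h>

// Sketch only (hypothetical helper, not GraphBLAS code): prefetch one
// unified-memory array to the target GPU, unless the mask is hypersparse,
// in which case the prefetch is skipped per the heuristic above.
static cudaError_t maybe_prefetch (const void *p, size_t nbytes,
    int device, cudaStream_t stream, bool mask_is_hyper)
{
    if (mask_is_hyper || p == NULL || nbytes == 0) return (cudaSuccess) ;
    return (cudaMemPrefetchAsync (p, nbytes, device, stream)) ;
}

A->b and A->x would each go through such a helper before the kernel
launch, with prefetch_b and prefetch_x deciding whether the call is made.
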
@@ -252,6 +252,6 @@ GrB_Info GB_cuda_AxB_dot3 // C<M> = A'*B using dot product method
//--------------------------------------------------------------------------

GB_FREE_WORKSPACE ;
return GrB_SUCCESS;
}

2 changes: 1 addition & 1 deletion CUDA/GB_cuda_AxB_dot3_jit.cpp
@@ -60,6 +60,6 @@ GrB_Info GB_cuda_AxB_dot3_jit

GB_jit_dl_function GB_jit_kernel = (GB_jit_dl_function) dl_function ;
return (GB_jit_kernel (C, M, A, B, stream, device, number_of_sms,
&GB_callback)) ;
&GB_callback, semiring->multiply->theta)) ;
}

5 changes: 3 additions & 2 deletions CUDA/GB_cuda_apply_bind1st_jit.cpp
@@ -28,8 +28,9 @@ GrB_Info GB_cuda_apply_bind1st_jit
GB_jit_encoding encoding ;
char *suffix ;
uint64_t hash = GB_encodify_ewise (&encoding, &suffix,
GB_JIT_CUDA_KERNEL_APPLYBIND1, false, false, false, GxB_FULL, ctype,
NULL, false, false, op, false, NULL, A) ;
GB_JIT_CUDA_KERNEL_APPLYBIND1, false,
false, false, GxB_FULL, ctype, NULL, false, false,
op, false, false, NULL, A) ;

//--------------------------------------------------------------------------
// get the kernel function pointer, loading or compiling it if needed
5 changes: 3 additions & 2 deletions CUDA/GB_cuda_apply_bind2nd_jit.cpp
@@ -28,8 +28,9 @@ GrB_Info GB_cuda_apply_bind2nd_jit
GB_jit_encoding encoding ;
char *suffix ;
uint64_t hash = GB_encodify_ewise (&encoding, &suffix,
GB_JIT_CUDA_KERNEL_APPLYBIND2, false, false, false, GxB_FULL, ctype,
NULL, false, false, op, false, A, NULL) ;
GB_JIT_CUDA_KERNEL_APPLYBIND2, false,
false, false, GxB_FULL, ctype, NULL, false, false,
op, false, false, A, NULL) ;

//--------------------------------------------------------------------------
// get the kernel function pointer, loading or compiling it if needed
5 changes: 3 additions & 2 deletions CUDA/GB_cuda_apply_unop_jit.cpp
@@ -29,7 +29,8 @@ GrB_Info GB_cuda_apply_unop_jit
GB_jit_encoding encoding ;
char *suffix ;
uint64_t hash = GB_encodify_apply (&encoding, &suffix,
GB_JIT_CUDA_KERNEL_APPLYUNOP, GxB_FULL, false, ctype, op, flipij, A) ;
GB_JIT_CUDA_KERNEL_APPLYUNOP, GxB_FULL, false, ctype, op, flipij,
GB_sparsity (A), true, A->type, A->iso, A->nzombies) ;

//--------------------------------------------------------------------------
// get the kernel function pointer, loading or compiling it if needed
@@ -48,4 +49,4 @@ GrB_Info GB_cuda_apply_unop_jit

GB_jit_dl_function GB_jit_kernel = (GB_jit_dl_function) dl_function ;
return (GB_jit_kernel (Cx, A, ythunk, stream, gridsz, blocksz)) ;
}
2 changes: 1 addition & 1 deletion CUDA/GB_cuda_colscale_jit.cpp
@@ -30,7 +30,7 @@ GrB_Info GB_cuda_colscale_jit
uint64_t hash = GB_encodify_ewise (&encoding, &suffix,
GB_JIT_CUDA_KERNEL_COLSCALE, false,
false, false, GB_sparsity (C), C->type, NULL, false, false,
binaryop, flipxy, A, D) ;
binaryop, false, flipxy, A, D) ;

//--------------------------------------------------------------------------
// get the kernel function pointer, loading or compiling it if needed
3 changes: 3 additions & 0 deletions CUDA/GB_cuda_get_device_count.cu
@@ -15,7 +15,10 @@ bool GB_cuda_get_device_count // true if OK, false if failure
int *gpu_count // return # of GPUs in the system
)
{
(*gpu_count) = 0 ;
cudaError_t err = cudaGetDeviceCount (gpu_count) ;
printf ("GB_cuda_get_device_count: %d, cudaError_t: %d\n",
*gpu_count, err) ;
return (err == cudaSuccess) ;
}
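
Initializing *gpu_count to zero before the CUDA query means that a caller
which ignores the boolean status still sees a well-defined count when
cudaGetDeviceCount fails.  A minimal caller sketch (hypothetical; the
helper gpus_available is not part of the library):

#include <stdbool.h>

bool GB_cuda_get_device_count (int *gpu_count) ;    // declared above

// Hypothetical caller: a failed query leaves ngpus at a well-defined 0.
static bool gpus_available (void)
{
    int ngpus = 0 ;
    return (GB_cuda_get_device_count (&ngpus) && ngpus > 0) ;
}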

10 changes: 8 additions & 2 deletions CUDA/GB_cuda_init.c
@@ -19,13 +19,18 @@ GrB_Info GB_cuda_init (void)
{

// get the GPU properties
if (!GB_Global_gpu_count_set (true)) return (GrB_PANIC) ;
if (!GB_Global_gpu_count_set (true))
{
printf ("GB_cuda_init line %d\n", __LINE__) ;
return (GrB_PANIC) ;
}
int gpu_count = GB_Global_gpu_count_get ( ) ;
for (int device = 0 ; device < 1 ; device++) // TODO for GPU: gpu_count
{
// query the GPU and then warm it up
if (!GB_Global_gpu_device_properties_get (device))
{
printf ("GB_cuda_init line %d\n", __LINE__) ;
return (GrB_PANIC) ;
}
}
@@ -41,14 +46,15 @@ GrB_Info GB_cuda_init (void)
// of the work. Alternatively, move GB_cuda_init here (if so,
// ensure that it doesn't depend on any other initializations
// below).
256 * 1000000L, 256 * 100000000L, 1) ;
256 * 1000000L, 1024 * 100000000L, 1) ; // FIXME: ask the GPU(s)
}

// warm up the GPUs
for (int device = 0 ; device < 1 ; device++) // TODO for GPU: gpu_count
{
if (!GB_cuda_warmup (device))
{
printf ("GB_cuda_init line %d\n", __LINE__) ;
return (GrB_PANIC) ;
}
}
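The enlarged pool limit (1024 * 100000000L) still carries a FIXME to ask
the GPU(s) rather than hard-coding the size.  One way to do that is
cudaMemGetInfo, which reports the free and total memory of the currently
selected device.  A sketch under that assumption (the helper gpu_pool_cap
and the one-half fraction are placeholders, not the committed code):

#include <cuda_runtime.h>
#include <stddef.h>

// Sketch only: derive a pool cap from the device's reported memory
// instead of a hard-coded constant.
static size_t gpu_pool_cap (int device)
{
    size_t free_bytes = 0, total_bytes = 0 ;
    if (cudaSetDevice (device) != cudaSuccess) return (0) ;
    if (cudaMemGetInfo (&free_bytes, &total_bytes) != cudaSuccess) return (0) ;
    return (total_bytes / 2) ;  // placeholder: cap at half of device memory
}
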
6 changes: 2 additions & 4 deletions CUDA/GB_cuda_reduce_to_scalar.cpp
@@ -76,13 +76,11 @@ GrB_Info GB_cuda_reduce_to_scalar
int gridsz = (int) raw_gridsz ;

// FIXME: GB_enumify_reduce is called twice: here (to get has_cheeseburger)
// and in GB_cuda_reduce_to_scalar_jit. Can we just call it once? One
// solution: The code from here to the call to GB_cuda_reduce_to_scalar_jit
// could be added to the GB_cuda_reduce_to_scalar_jit function itself.
// and in GB_cuda_reduce_to_scalar_jit. Can we just call it once?

uint64_t rcode ;
GB_enumify_reduce (&rcode, monoid, A) ;
bool has_cheeseburger = GB_RSHIFT (rcode, 27, 1) ;
bool has_cheeseburger = GB_RSHIFT (rcode, 16, 1) ;
GBURBLE ("has_cheeseburger %d\n", has_cheeseburger) ;

// determine the kind of reduction: partial (to &V), or complete
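The has_cheeseburger flag is a single bit of the code produced by
GB_enumify_reduce, and this commit moves the extraction from bit 27 to
bit 16 so that it matches the updated encoding.  Assuming GB_RSHIFT
(x,k,b) means "shift x right by k bits and keep the low b bits" (an
assumption; the macro itself is defined elsewhere in the GraphBLAS
sources), the extraction is equivalent to:

#include <stdbool.h>
#include <stdint.h>

// Assumed reading of GB_RSHIFT: extract the 1-bit field at bit 16 of the
// enumified reduce code.
static bool get_has_cheeseburger (uint64_t rcode)
{
    return (((rcode >> 16) & 1) != 0) ;
}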