diff --git a/cuda/pom.xml b/cuda/pom.xml
new file mode 100644
index 0000000000..f25b562419
--- /dev/null
+++ b/cuda/pom.xml
@@ -0,0 +1,251 @@
+
+
+
+
+
+ 4.0.0
+
+
+ org.apache.mahout
+ mahout
+ 0.13.0-SNAPSHOT
+ ../pom.xml
+
+
+
+ 0.8.0
+
+
+ mahout-native-cuda_${scala.compat.version}
+
+ Mahout Native CUDA Bindings
+ Native Structures and interfaces to be used from Mahout math-scala.
+
+
+ jar
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-jar-plugin
+
+
+
+ test-jar
+
+ package
+
+
+
+
+
+ maven-javadoc-plugin
+
+
+
+ maven-source-plugin
+
+
+
+ net.alchim31.maven
+ scala-maven-plugin
+
+
+ add-scala-sources
+ initialize
+
+ add-source
+
+
+
+ scala-compile
+ process-resources
+
+ compile
+
+
+
+ scala-test-compile
+ process-test-resources
+
+ testCompile
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ org.scalatest
+ scalatest-maven-plugin
+
+
+ test
+
+ test
+
+
+
+
+ -Xmx4g
+
+
+
+
+ org.apache.maven.plugins
+ maven-dependency-plugin
+ 2.3
+
+
+
+ properties
+
+
+
+
+
+
+
+ maven-antrun-plugin
+ 1.4
+
+
+ copy
+ package
+
+
+
+
+
+
+ run
+
+
+
+
+
+
+
+
+
+
+
+
+ ${project.groupId}
+ mahout-math-scala_${scala.compat.version}
+
+
+
+
+ log4j
+ log4j
+
+
+
+
+ org.scalatest
+ scalatest_${scala.compat.version}
+
+
+
+ org.bytedeco
+ javacpp
+ 1.2.4
+
+
+
+ org.jcuda
+ jcuda
+ ${jcuda.jcudaVersion}
+
+
+
+ org.jcuda
+ jcusparse
+ ${jcuda.jcudaVersion}
+
+
+
+
+
+
+
+ mahout-release
+
+
+
+ net.alchim31.maven
+ scala-maven-plugin
+
+
+ generate-scaladoc
+
+ doc
+
+
+
+ attach-scaladoc-jar
+
+ doc-jar
+
+
+
+
+
+
+
+
+ travis
+
+
+
+ org.apache.maven.plugins
+ maven-surefire-plugin
+
+
+ -Xmx4g
+
+
+
+
+ org.apache.maven.plugins
+ maven-failsafe-plugin
+
+
+ -Xmx4g
+
+
+
+
+
+
+
+
diff --git a/cuda/src/main/scala/org/apache/mahout/cuda/CompressedMatrix.scala b/cuda/src/main/scala/org/apache/mahout/cuda/CompressedMatrix.scala
new file mode 100644
index 0000000000..2aa27f1422
--- /dev/null
+++ b/cuda/src/main/scala/org/apache/mahout/cuda/CompressedMatrix.scala
@@ -0,0 +1,83 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+
+package org.apache.mahout.cuda
+
+import java.nio._
+
+import jcuda.jcusparse.cusparseIndexBase.CUSPARSE_INDEX_BASE_ZERO
+import jcuda.jcusparse.cusparseMatrixType.CUSPARSE_MATRIX_TYPE_GENERAL
+import jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_NON_TRANSPOSE
+import jcuda.jcusparse.JCusparse._
+import jcuda.jcusparse._
+import jcuda.runtime.JCuda._
+import jcuda.runtime.cudaMemcpyKind._
+
+final class CompressedMatrix {
+
+ var row_ptr = new jcuda.Pointer()
+ var col_ind = new jcuda.Pointer()
+ var vals = new jcuda.Pointer()
+
+ var trans = CUSPARSE_OPERATION_NON_TRANSPOSE
+ var descr = new cusparseMatDescr()
+
+ var nrows = 0
+ var ncols = 0
+ var nonz = 0
+
+ def this(ctx: Context, nrow: Int, ncol: Int, nonzeros: Int = 0) {
+ this()
+
+ nrows = nrow
+ ncols = ncol
+ cudaMalloc(row_ptr, (nrow+1)*jcuda.Sizeof.INT)
+
+ nonz = nonzeros
+ if (nonzeros > 0) {
+ cudaMalloc(col_ind, nonzeros*jcuda.Sizeof.INT)
+ cudaMalloc(vals, nonzeros*jcuda.Sizeof.DOUBLE)
+ }
+
+ // create and setup matrix descriptor
+ cusparseCreateMatDescr(descr)
+ cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)
+ cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)
+ }
+
+ def set(rowJumper: Array[Int],
+ colIndices: Array[Int],
+ elements: Array[Double],
+ nrow: Int,
+ ncol: Int,
+ nonzeros: Int) {
+ cudaMemcpy(row_ptr, jcuda.Pointer.to(rowJumper), (nrow+1)*jcuda.Sizeof.INT, cudaMemcpyHostToDevice)
+ cudaMemcpy(col_ind, jcuda.Pointer.to(colIndices), (nonzeros)*jcuda.Sizeof.INT, cudaMemcpyHostToDevice)
+ cudaMemcpy(vals, jcuda.Pointer.to(elements), (nonzeros)*jcuda.Sizeof.DOUBLE, cudaMemcpyHostToDevice)
+ }
+
+ def close() {
+ cudaFree(row_ptr)
+ if (nonz > 0) {
+ cudaFree(col_ind)
+ cudaFree(vals)
+ }
+ }
+}
+
diff --git a/cuda/src/main/scala/org/apache/mahout/cuda/Context.scala b/cuda/src/main/scala/org/apache/mahout/cuda/Context.scala
new file mode 100644
index 0000000000..67001bb2a8
--- /dev/null
+++ b/cuda/src/main/scala/org/apache/mahout/cuda/Context.scala
@@ -0,0 +1,32 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+
+package org.apache.mahout.cuda
+
+import jcuda.jcusparse.JCusparse._
+import jcuda.jcusparse._
+
+final class Context {
+
+ var handle: jcuda.jcusparse.cusparseHandle = new cusparseHandle()
+
+ // Initialize JCusparse library
+ cusparseCreate(handle)
+}
+
diff --git a/cuda/src/main/scala/org/apache/mahout/cuda/GPUMMul.scala b/cuda/src/main/scala/org/apache/mahout/cuda/GPUMMul.scala
new file mode 100644
index 0000000000..02d02ea8cd
--- /dev/null
+++ b/cuda/src/main/scala/org/apache/mahout/cuda/GPUMMul.scala
@@ -0,0 +1,416 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+
+package org.apache.mahout.cuda
+
+import org.apache.mahout.logging._
+import org.apache.mahout.math
+import org.apache.mahout.math._
+import org.apache.mahout.math.backend.incore.MMulSolver
+import org.apache.mahout.math.flavor.{BackEnum, TraversingStructureEnum}
+import org.apache.mahout.math.function.Functions
+import org.apache.mahout.math.scalabindings.RLikeOps._
+import org.apache.mahout.math.scalabindings._
+
+import scala.collection.JavaConversions._
+
+import jcuda._
+import jcuda.jcusparse._
+import jcuda.runtime.JCuda
+
+object GPUMMul extends MMBinaryFunc {
+
+ private implicit val log = getLog(GPUMMul.getClass)
+
+ override def apply(a: Matrix, b: Matrix, r: Option[Matrix]): Matrix = {
+
+ require(a.ncol == b.nrow, "Incompatible matrix sizes in matrix multiplication.")
+
+ val (af, bf) = (a.getFlavor, b.getFlavor)
+ val backs = (af.getBacking, bf.getBacking)
+ val sd = (af.getStructure, math.scalabindings.densityAnalysis(a), bf.getStructure, densityAnalysis(b))
+
+
+ try {
+
+ val alg: MMulAlg = backs match {
+
+ // Both operands are jvm memory backs.
+ case (BackEnum.JVMMEM, BackEnum.JVMMEM) ⇒
+
+ sd match {
+
+ // Multiplication cases by a diagonal matrix.
+ case (TraversingStructureEnum.VECTORBACKED, _, TraversingStructureEnum.COLWISE, _)
+ if a.isInstanceOf[DiagonalMatrix] ⇒ jvmDiagCW
+ case (TraversingStructureEnum.VECTORBACKED, _, TraversingStructureEnum.SPARSECOLWISE, _)
+ if a.isInstanceOf[DiagonalMatrix] ⇒ jvmDiagCW
+ case (TraversingStructureEnum.VECTORBACKED, _, TraversingStructureEnum.ROWWISE, _)
+ if a.isInstanceOf[DiagonalMatrix] ⇒ jvmDiagRW
+ case (TraversingStructureEnum.VECTORBACKED, _, TraversingStructureEnum.SPARSEROWWISE, _)
+ if a.isInstanceOf[DiagonalMatrix] ⇒ jvmDiagRW
+
+ case (TraversingStructureEnum.COLWISE, _, TraversingStructureEnum.VECTORBACKED, _)
+ if b.isInstanceOf[DiagonalMatrix] ⇒ jvmCWDiag
+ case (TraversingStructureEnum.SPARSECOLWISE, _, TraversingStructureEnum.VECTORBACKED, _)
+ if b.isInstanceOf[DiagonalMatrix] ⇒ jvmCWDiag
+ case (TraversingStructureEnum.ROWWISE, _, TraversingStructureEnum.VECTORBACKED, _)
+ if b.isInstanceOf[DiagonalMatrix] ⇒ jvmRWDiag
+ case (TraversingStructureEnum.SPARSEROWWISE, _, TraversingStructureEnum.VECTORBACKED, _)
+ if b.isInstanceOf[DiagonalMatrix] ⇒ jvmRWDiag
+
+ // Dense-dense cases
+ case (TraversingStructureEnum.ROWWISE, true, TraversingStructureEnum.COLWISE, true) if a eq b.t ⇒ gpuDRWAAt
+ case (TraversingStructureEnum.ROWWISE, true, TraversingStructureEnum.COLWISE, true) if a.t eq b ⇒ gpuDRWAAt
+ case (TraversingStructureEnum.ROWWISE, true, TraversingStructureEnum.COLWISE, true) ⇒ gpuRWCW
+ case (TraversingStructureEnum.ROWWISE, true, TraversingStructureEnum.ROWWISE, true) ⇒ jvmRWRW
+ case (TraversingStructureEnum.COLWISE, true, TraversingStructureEnum.COLWISE, true) ⇒ jvmCWCW
+ case (TraversingStructureEnum.COLWISE, true, TraversingStructureEnum.ROWWISE, true) if a eq b.t ⇒ jvmDCWAAt
+ case (TraversingStructureEnum.COLWISE, true, TraversingStructureEnum.ROWWISE, true) if a.t eq b ⇒ jvmDCWAAt
+ case (TraversingStructureEnum.COLWISE, true, TraversingStructureEnum.ROWWISE, true) ⇒ jvmCWRW
+
+ // Sparse row matrix x sparse row matrix (array of vectors)
+ case (TraversingStructureEnum.ROWWISE, false, TraversingStructureEnum.ROWWISE, false) ⇒ gpuSparseRWRW
+ case (TraversingStructureEnum.ROWWISE, false, TraversingStructureEnum.COLWISE, false) ⇒ jvmSparseRWCW
+ case (TraversingStructureEnum.COLWISE, false, TraversingStructureEnum.ROWWISE, false) ⇒ jvmSparseCWRW
+ case (TraversingStructureEnum.COLWISE, false, TraversingStructureEnum.COLWISE, false) ⇒ jvmSparseCWCW
+
+ // Sparse matrix x sparse matrix (hashtable of vectors)
+ case (TraversingStructureEnum.SPARSEROWWISE, false, TraversingStructureEnum.SPARSEROWWISE, false) ⇒
+ gpuSparseRowRWRW
+ case (TraversingStructureEnum.SPARSEROWWISE, false, TraversingStructureEnum.SPARSECOLWISE, false) ⇒
+ jvmSparseRowRWCW
+ case (TraversingStructureEnum.SPARSECOLWISE, false, TraversingStructureEnum.SPARSEROWWISE, false) ⇒
+ jvmSparseRowCWRW
+ case (TraversingStructureEnum.SPARSECOLWISE, false, TraversingStructureEnum.SPARSECOLWISE, false) ⇒
+ jvmSparseRowCWCW
+
+ // Sparse matrix x non-like
+ case (TraversingStructureEnum.SPARSEROWWISE, false, TraversingStructureEnum.ROWWISE, _) ⇒ gpuSparseRowRWRW
+ case (TraversingStructureEnum.SPARSEROWWISE, false, TraversingStructureEnum.COLWISE, _) ⇒ jvmSparseRowRWCW
+ case (TraversingStructureEnum.SPARSECOLWISE, false, TraversingStructureEnum.ROWWISE, _) ⇒ jvmSparseRowCWRW
+ case (TraversingStructureEnum.SPARSECOLWISE, false, TraversingStructureEnum.COLWISE, _) ⇒ jvmSparseCWCW
+ case (TraversingStructureEnum.ROWWISE, _, TraversingStructureEnum.SPARSEROWWISE, false) ⇒ gpuSparseRWRW
+ case (TraversingStructureEnum.ROWWISE, _, TraversingStructureEnum.SPARSECOLWISE, false) ⇒ jvmSparseRWCW
+ case (TraversingStructureEnum.COLWISE, _, TraversingStructureEnum.SPARSEROWWISE, false) ⇒ jvmSparseCWRW
+ case (TraversingStructureEnum.COLWISE, _, TraversingStructureEnum.SPARSECOLWISE, false) ⇒ jvmSparseRowCWCW
+
+ // Everything else including at least one sparse LHS or RHS argument
+ case (TraversingStructureEnum.ROWWISE, false, TraversingStructureEnum.ROWWISE, _) ⇒ gpuSparseRWRW
+ case (TraversingStructureEnum.ROWWISE, false, TraversingStructureEnum.COLWISE, _) ⇒ jvmSparseRWCW
+ case (TraversingStructureEnum.COLWISE, false, TraversingStructureEnum.ROWWISE, _) ⇒ jvmSparseCWRW
+ case (TraversingStructureEnum.COLWISE, false, TraversingStructureEnum.COLWISE, _) ⇒ jvmSparseCWCW2flips
+
+ // Sparse methods are only effective if the first argument is sparse, so we need to do a swap.
+ case (_, _, _, false) ⇒ (a, b, r) ⇒ apply(b.t, a.t, r.map {
+ _.t
+ }).t
+
+ // Default jvm-jvm case.
+ // for some reason a SrarseRowMatrix DRM %*% SrarseRowMatrix DRM was dumping off to here
+ case _ ⇒ gpuRWCW
+ }
+ }
+
+ alg(a, b, r)
+ } catch {
+ // TODO FASTHACK: Revert to JVM if there is an exception.
+ // E.g., java.lang.nullPointerException if more openCL contexts
+ // have been created than number of GPU cards.
+ // Better option wuold be to fall back to OpenCL first.
+ case ex: Exception =>
+ log.info(ex.getMessage + "falling back to JVM MMUL")
+ ex.printStackTrace
+ var res = MMul(a, b, r)
+ println("zsum = " + res.zSum().toString())
+ return res
+ }
+ }
+
+ type MMulAlg = MMBinaryFunc
+
+ @inline
+ private def gpuRWCW(a: Matrix, b: Matrix, r: Option[Matrix] = None): Matrix = {
+ log.info("Using gpuRWCW method")
+
+ val hasElementsA = a.zSum() > 0.0
+ val hasElementsB = b.zSum() > 0.0
+
+ // A has a sparse matrix structure of unknown size. We do not want to
+ // simply convert it to a Dense Matrix which may result in an OOM error.
+
+ // If it is empty use JVM MMul, since we can not convert it to a VCL CSR Matrix.
+ if (!hasElementsA) {
+ log.warn("Matrix a has zero elements can not convert to CSR")
+ return MMul(a, b, r)
+ }
+
+ // CSR matrices are efficient up to 50% non-zero
+ if(b.getFlavor.isDense) {
+ log.warn("Dense matrices are not supported in CUDA backend, using JVM instead")
+ return MMul(a, b, r)
+ } else {
+ // Fall back to JVM based MMul if either matrix is sparse and empty
+ if (!hasElementsA || !hasElementsB) {
+ log.warn("Matrix a or b has zero elements can not convert to CSR")
+ return MMul(a, b, r)
+ }
+
+ var ms = System.currentTimeMillis()
+
+ val ctx = new Context()
+ val cudaA = toCudaCmpMatrix(a, ctx)
+ val cudaB = toCudaCmpMatrix(b, ctx)
+ val cudaC = prod(cudaA, cudaB, ctx)
+ val mxC = fromCudaCmpMatrix(cudaC)
+
+ ms = System.currentTimeMillis() - ms
+ log.debug(s"CUDA multiplication time: $ms ms")
+ val gpuTrace = mxC.zSum()
+ log.debug(s"CUDA trace: $gpuTrace")
+
+ // uncomment code below to verify results against JVM
+/*
+ ms = System.currentTimeMillis()
+ val jvmTrace = MMul(a, b, r).zSum()
+ ms = System.currentTimeMillis() - ms
+ log.debug(s"JVM multiplication time: $ms ms")
+ log.debug(s"JVM trace: $jvmTrace")
+*/
+ cudaA.close()
+ cudaB.close()
+ cudaC.close()
+
+ mxC
+ }
+ }
+
+
+ @inline
+ private def jvmRWRW(a: Matrix, b: Matrix, r: Option[Matrix] = None): Matrix = {
+ log.info("Using jvmRWRW method")
+ // A bit hackish: currently, this relies a bit on the fact that like produces RW(?)
+ val bclone = b.like(b.ncol, b.nrow).t
+ for (brow ← b) bclone(brow.index(), ::) := brow
+
+ require(bclone.getFlavor.getStructure == TraversingStructureEnum.COLWISE || bclone.getFlavor.getStructure ==
+ TraversingStructureEnum.SPARSECOLWISE, "COL wise conversion assumption of RHS is wrong, do over this code.")
+
+ gpuRWCW(a, bclone, r)
+ }
+
+ private def jvmCWCW(a: Matrix, b: Matrix, r: Option[Matrix] = None): Matrix = {
+ log.info("Using jvmCWCW method")
+ jvmRWRW(b.t, a.t, r.map(_.t)).t
+ }
+
+ private def jvmCWRW(a: Matrix, b: Matrix, r: Option[Matrix] = None): Matrix = {
+ log.info("Using jvmCWRW method")
+ // This is a primary contender with Outer Prod sum algo.
+ // Here, we force-reorient both matrices and run RWCW.
+ // A bit hackish: currently, this relies a bit on the fact that clone always produces RW(?)
+ val aclone = a.cloned
+
+ require(aclone.getFlavor.getStructure == TraversingStructureEnum.ROWWISE || aclone.getFlavor.getStructure ==
+ TraversingStructureEnum.SPARSEROWWISE, "Row wise conversion assumption of RHS is wrong, do over this code.")
+
+ jvmRWRW(aclone, b, r)
+ }
+
+ // left is Sparse right is any
+ private def gpuSparseRWRW(a: Matrix, b: Matrix, r: Option[Matrix] = None): Matrix = {
+ log.info("Using gpuSparseRWRW method")
+ val mxR = r.getOrElse(b.like(a.nrow, b.ncol))
+
+
+ /* This is very close to the algorithm from SparseMatrix.times
+ for (arow ← a; ael ← arow.nonZeroes)
+ mxR(arow.index(), ::).assign(b(ael.index, ::), Functions.plusMult(ael))
+ mxR
+
+ Make sure that the matrix is not empty. VCL {{compressed_matrix}}s must
+ have nnz > 0
+ N.B. This method is horribly inefficent. However there is a difference between
+ getNumNonDefaultElements() and getNumNonZeroElements() which we do not always
+ have access to. We created MAHOUT-1882 for this.
+ */
+
+ val hasElementsA = a.zSum() > 0.0
+ val hasElementsB = b.zSum() > 0.0
+
+ // A has a sparse matrix structure of unknown size. We do not want to
+ // simply convert it to a Dense Matrix which may result in an OOM error.
+ // If it is empty use JVM MMul, since we can not convert it to a VCL CSR Matrix.
+ if (!hasElementsA) {
+ log.warn("Matrix a has zero elements can not convert to CSR")
+ return MMul(a, b, r)
+ }
+
+ // CSR matrices are efficient up to 50% non-zero
+ if(b.getFlavor.isDense) {
+ log.warn("Dense matrices are not supported in CUDA backend, using JVM instead")
+ return MMul(a, b, r)
+ } else {
+ // Fall back to JVM based MMul if either matrix is sparse and empty
+ if (!hasElementsA || !hasElementsB) {
+ log.warn("Matrix a or b has zero elements can not convert to CSR")
+ return MMul(a, b, r)
+ }
+
+ var ms = System.currentTimeMillis()
+
+ val ctx = new Context()
+ val cudaA = toCudaCmpMatrix(a, ctx)
+ val cudaB = toCudaCmpMatrix(b, ctx)
+ val cudaC = prod(cudaA, cudaB, ctx)
+ val mxC = fromCudaCmpMatrix(cudaC)
+
+ ms = System.currentTimeMillis() - ms
+ log.debug(s"CUDA multiplication time: $ms ms")
+ val gpuTrace = mxC.zSum()
+ log.debug(s"CUDA trace: $gpuTrace")
+
+ // uncomment code below to verify results against JVM
+/*
+ ms = System.currentTimeMillis()
+ val jvmTrace = MMul(a, b, r).zSum()
+ ms = System.currentTimeMillis() - ms
+ log.debug(s"JVM multiplication time: $ms ms")
+ log.debug(s"JVM trace: $jvmTrace")
+*/
+ cudaA.close()
+ cudaB.close()
+ cudaC.close()
+
+ mxC
+ }
+
+ }
+
+ // Sparse %*% dense
+ private def gpuSparseRowRWRW(a: Matrix, b: Matrix, r: Option[Matrix] = None): Matrix = {
+ log.info("Using gpuSparseRowRWRW method")
+ val hasElementsA = a.zSum() > 0
+
+ // A has a sparse matrix structure of unknown size. We do not want to
+ // simply convert it to a Dense Matrix which may result in an OOM error.
+ // If it is empty fall back to JVM MMul, since we can not convert it
+ // to a VCL CSR Matrix.
+ if (!hasElementsA) {
+ log.warn("Matrix a has zero elements can not convert to CSR")
+ return MMul(a, b, r)
+ }
+
+ log.warn("Dense matrices are not supported in CUDA backend")
+ return MMul(a, b, r)
+ }
+
+ private def jvmSparseRowCWCW(a: Matrix, b: Matrix, r: Option[Matrix] = None) =
+ gpuSparseRowRWRW(b.t, a.t, r.map(_.t)).t
+
+ private def jvmSparseRowCWCW2flips(a: Matrix, b: Matrix, r: Option[Matrix] = None) =
+ gpuSparseRowRWRW(a cloned, b cloned, r)
+
+ private def jvmSparseRowRWCW(a: Matrix, b: Matrix, r: Option[Matrix]) =
+ gpuSparseRowRWRW(a, b cloned, r)
+
+
+ private def jvmSparseRowCWRW(a: Matrix, b: Matrix, r: Option[Matrix]) =
+ gpuSparseRowRWRW(a cloned, b, r)
+
+ private def jvmSparseRWCW(a: Matrix, b: Matrix, r: Option[Matrix] = None) =
+ gpuSparseRWRW(a, b.cloned, r)
+
+ private def jvmSparseCWRW(a: Matrix, b: Matrix, r: Option[Matrix] = None) =
+ gpuSparseRWRW(a cloned, b, r)
+
+ private def jvmSparseCWCW(a: Matrix, b: Matrix, r: Option[Matrix] = None) =
+ gpuSparseRWRW(b.t, a.t, r.map(_.t)).t
+
+ private def jvmSparseCWCW2flips(a: Matrix, b: Matrix, r: Option[Matrix] = None) =
+ gpuSparseRWRW(a cloned, b cloned, r)
+
+ private def jvmDiagRW(diagm:Matrix, b:Matrix, r:Option[Matrix] = None):Matrix = {
+ log.info("Using jvmDiagRW method")
+ val mxR = r.getOrElse(b.like(diagm.nrow, b.ncol))
+
+ for (del ← diagm.diagv.nonZeroes())
+ mxR(del.index, ::).assign(b(del.index, ::), Functions.plusMult(del))
+
+ mxR
+ }
+
+ private def jvmDiagCW(diagm: Matrix, b: Matrix, r: Option[Matrix] = None): Matrix = {
+ log.info("Using jvmDiagCW method")
+ val mxR = r.getOrElse(b.like(diagm.nrow, b.ncol))
+ for (bcol ← b.t) mxR(::, bcol.index()) := bcol * diagm.diagv
+ mxR
+ }
+
+ private def jvmCWDiag(a: Matrix, diagm: Matrix, r: Option[Matrix] = None) =
+ jvmDiagRW(diagm, a.t, r.map {_.t}).t
+
+ private def jvmRWDiag(a: Matrix, diagm: Matrix, r: Option[Matrix] = None) =
+ jvmDiagCW(diagm, a.t, r.map {_.t}).t
+
+
+ /** Dense column-wise AA' */
+ private def jvmDCWAAt(a:Matrix, b:Matrix, r:Option[Matrix] = None) = {
+ // a.t must be equiv. to b. Cloning must rewrite to row-wise.
+ gpuDRWAAt(a.cloned,null,r)
+ }
+
+ /** Dense Row-wise AA' */
+ // we probably will not want to use this for the actual release unless A is cached already
+ // but adding for testing purposes.
+ private def gpuDRWAAt(a:Matrix, b:Matrix, r:Option[Matrix] = None) = {
+ log.warn("Dense matrices are not supported in CUDA backend, using JVM instead")
+ MMul(a, b, r)
+ }
+
+ private def jvmOuterProdSum(a: Matrix, b: Matrix, r: Option[Matrix] = None): Matrix = {
+ log.info("Using jvmOuterProdSum method")
+ // Need to check whether this is already laid out for outer product computation, which may be faster than
+ // reorienting both matrices.
+ val (m, n) = (a.nrow, b.ncol)
+
+ // Prefer col-wise result iff a is dense and b is sparse. In all other cases default to row-wise.
+ val preferColWiseR = a.getFlavor.isDense && !b.getFlavor.isDense
+
+ val mxR = r.getOrElse {
+ (a.getFlavor.isDense, preferColWiseR) match {
+ case (false, false) ⇒ b.like(m, n)
+ case (false, true) ⇒ b.like(n, m).t
+ case (true, false) ⇒ a.like(m, n)
+ case (true, true) ⇒ a.like(n, m).t
+ }
+ }
+
+ // Loop outer products
+ if (preferColWiseR) {
+ // B is sparse and A is not, so we need to iterate over b values and update R columns with +=
+ // one at a time.
+ for ((acol, brow) ← a.t.zip(b); bel ← brow.nonZeroes) mxR(::, bel.index()) += bel * acol
+ } else {
+ for ((acol, brow) ← a.t.zip(b); ael ← acol.nonZeroes()) mxR(ael.index(), ::) += ael * brow
+ }
+
+ mxR
+ }
+}
diff --git a/cuda/src/main/scala/org/apache/mahout/cuda/package.scala b/cuda/src/main/scala/org/apache/mahout/cuda/package.scala
new file mode 100644
index 0000000000..93a3b25af2
--- /dev/null
+++ b/cuda/src/main/scala/org/apache/mahout/cuda/package.scala
@@ -0,0 +1,211 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+
+package org.apache.mahout
+
+import java.nio._
+
+import org.apache.mahout.logging._
+import org.apache.mahout.math._
+import scalabindings._
+import RLikeOps._
+import org.apache.mahout.math.backend.incore._
+import scala.collection.JavaConversions._
+
+import jcuda.runtime.JCuda._
+import jcuda.runtime.cudaMemcpyKind._
+import jcuda.jcusparse.JCusparse._
+
+package object cuda {
+
+ private implicit val log = getLog(GPUMMul.getClass)
+
+ /**
+ *
+ * @param mxSrc
+ * @param ctx
+ * @return
+ */
+ def toCudaCmpMatrix(mxSrc: Matrix, ctx: Context): CompressedMatrix = {
+ val (jumpers, colIdcs, els) = repackCSR(mxSrc)
+ val compMx = new CompressedMatrix(ctx, mxSrc.nrow, mxSrc.ncol, els.length)
+ compMx.set(jumpers, colIdcs, els, mxSrc.nrow, mxSrc.ncol, els.length)
+ compMx
+ }
+
+ private def repackCSR(mx: Matrix): (Array[Int], Array[Int], Array[Double]) = {
+ val nzCnt = mx.map(_.getNumNonZeroElements).sum
+ val jumpers = new Array[Int](mx.nrow + 1)
+ val colIdcs = new Array[Int](nzCnt + 0)
+ val els = new Array[Double](nzCnt)
+ var posIdx = 0
+
+ var sortCols = false
+
+ // Row-wise loop. Rows may not necessarily come in order. But we have to have them in-order.
+ for (irow ← 0 until mx.nrow) {
+
+ val row = mx(irow, ::)
+ jumpers(irow) = posIdx
+
+ // Remember row start index in case we need to restart conversion of this row if out-of-order
+ // column index is detected
+ val posIdxStart = posIdx
+
+ // Retry loop: normally we are done in one pass thru it unless we need to re-run it because
+ // out-of-order column was detected.
+ var done = false
+ while (!done) {
+
+ // Is the sorting mode on?
+ if (sortCols) {
+
+ // Sorting of column indices is on. So do it.
+ row.nonZeroes()
+ // Need to convert to a strict collection out of iterator
+ .map(el ⇒ el.index → el.get)
+ // Sorting requires Sequence api
+ .toSeq
+ // Sort by column index
+ .sortBy(_._1)
+ // Flush to the CSR buffers.
+ .foreach { case (index, v) ⇒
+ colIdcs(posIdx) = index
+ els(posIdx) = v
+ posIdx += 1
+ }
+
+ // Never need to retry if we are already in the sorting mode.
+ done = true
+
+ } else {
+
+ // Try to run unsorted conversion here, switch lazily to sorted if out-of-order column is
+ // detected.
+ var lastCol = 0
+ val nzIter = row.nonZeroes().iterator()
+ var abortNonSorted = false
+
+ while (nzIter.hasNext && !abortNonSorted) {
+
+ val el = nzIter.next()
+ val index = el.index
+
+ if (index < lastCol) {
+
+ // Out of order detected: abort inner loop, reset posIdx and retry with sorting on.
+ abortNonSorted = true
+ sortCols = true
+ posIdx = posIdxStart
+
+ } else {
+
+ // Still in-order: save element and column, continue.
+ els(posIdx) = el
+ colIdcs(posIdx) = index
+ posIdx += 1
+
+ // Remember last column seen.
+ lastCol = index
+ }
+ } // inner non-sorted
+
+ // Do we need to re-run this row with sorting?
+ done = !abortNonSorted
+
+ } // if (sortCols)
+
+ } // while (!done) retry loop
+
+ } // row-wise loop
+
+ // Make sure Mahout matrix did not cheat on non-zero estimate.
+ assert(posIdx == nzCnt)
+
+ jumpers(mx.nrow) = nzCnt
+
+ (jumpers, colIdcs, els)
+ }
+
+ def prod(a: CompressedMatrix, b: CompressedMatrix, ctx: Context): CompressedMatrix = {
+ var m = a.nrows
+ var n = b.ncols
+ var k = b.nrows
+
+ var c: CompressedMatrix = new CompressedMatrix(ctx, m, n)
+
+ // step 1: compute nnz count
+ var nnzC = new Array[Int](1)
+ nnzC(0) = 0
+ cusparseXcsrgemmNnz(ctx.handle, a.trans, b.trans, m, n, k,
+ a.descr, a.nonz, a.row_ptr, a.col_ind,
+ b.descr, b.nonz, b.row_ptr, b.col_ind,
+ c.descr, c.row_ptr, jcuda.Pointer.to(nnzC))
+ c.nonz = nnzC(0)
+ if (c.nonz == 0) {
+ var baseC = new Array[Int](1)
+ cudaMemcpy(jcuda.Pointer.to(nnzC), c.row_ptr.withByteOffset(m * jcuda.Sizeof.INT), jcuda.Sizeof.INT, cudaMemcpyDeviceToHost)
+ cudaMemcpy(jcuda.Pointer.to(baseC), c.row_ptr, jcuda.Sizeof.INT, cudaMemcpyDeviceToHost)
+ c.nonz = nnzC(0) - baseC(0)
+ }
+
+ // step 2: allocate and compute matrix product
+ cudaMalloc(c.col_ind, jcuda.Sizeof.INT * c.nonz);
+ cudaMalloc(c.vals, jcuda.Sizeof.DOUBLE * c.nonz);
+ cusparseDcsrgemm(ctx.handle, a.trans, b.trans, m, n, k,
+ a.descr, a.nonz,
+ a.vals, a.row_ptr, a.col_ind,
+ b.descr, b.nonz,
+ b.vals, b.row_ptr, b.col_ind,
+ c.descr,
+ c.vals, c.row_ptr, c.col_ind);
+ c
+ }
+
+ def fromCudaCmpMatrix(src: CompressedMatrix): Matrix = {
+ val m = src.nrows
+ val n = src.ncols
+ val NNz = src.nonz
+
+ log.debug("m=" + m.toString() + ", n=" + n.toString() + ", nnz=" + NNz.toString())
+
+ val row_ptr = new Array[Int](m + 1)
+ val col_idx = new Array[Int](NNz)
+ val values = new Array[Double](NNz)
+
+ cudaMemcpy(jcuda.Pointer.to(row_ptr), src.row_ptr, (m+1)*jcuda.Sizeof.INT, cudaMemcpyDeviceToHost)
+ cudaMemcpy(jcuda.Pointer.to(col_idx), src.col_ind, (NNz)*jcuda.Sizeof.INT, cudaMemcpyDeviceToHost)
+ cudaMemcpy(jcuda.Pointer.to(values), src.vals, (NNz)*jcuda.Sizeof.DOUBLE, cudaMemcpyDeviceToHost)
+
+ val srMx = new SparseRowMatrix(m, n)
+
+ // read the values back into the matrix
+ var j = 0
+ // row wise, copy any non-zero elements from row(i-1,::)
+ for (i <- 1 to m) {
+ // for each nonzero element, set column col(idx(j) value to vals(j)
+ while (j < row_ptr(i)) {
+ srMx(i - 1, col_idx(j)) = values(j)
+ j += 1
+ }
+ }
+ srMx
+ }
+
+}
diff --git a/cuda/src/test/scala/org/apache/mahout/cuda/CUDATestSuite.scala b/cuda/src/test/scala/org/apache/mahout/cuda/CUDATestSuite.scala
new file mode 100644
index 0000000000..5222cc1106
--- /dev/null
+++ b/cuda/src/test/scala/org/apache/mahout/cuda/CUDATestSuite.scala
@@ -0,0 +1,77 @@
+package org.apache.mahout.cuda
+
+import org.scalatest.{FunSuite, Matchers}
+import org.apache.mahout.math._
+import scalabindings.RLikeOps._
+
+import scala.util.Random
+
+/**
+ * Created by andy on 3/29/17.
+ */
+class CUDATestSuite extends FunSuite with Matchers {
+
+
+ test("sparse mmul at geometry of 1000 x 1000 %*% 1000 x 1000 density = .2. 5 runs") {
+ CUDATestSuite.getAverageTime(1000, 1000, 1000, .20, 1234L, 3)
+ }
+ test("sparse mmul at geometry of 1000 x 1000 %*% 1000 x 1000 density = .02. 5 runs") {
+ CUDATestSuite.getAverageTime(1000, 1000, 1000, .02, 1234L, 3)
+ }
+ test("sparse mmul at geometry of 1000 x 1000 %*% 1000 x 1000 density = .002. 5 runs") {
+ CUDATestSuite.getAverageTime(1000, 1000, 1000, .002, 1234L, 3)
+ }
+}
+
+
+object CUDATestSuite {
+ def getAverageTime(m: Int = 1000,
+ s: Int = 1000,
+ n: Int = 1000,
+ density: Double = .2,
+ seed: Long = 1234L,
+ nruns: Int = 5): Long = {
+
+ val r = new Random(seed)
+ val cudaCtx = new Context()
+
+ // sparse row-wise
+ val mxA = new SparseRowMatrix(m, s, false)
+ val mxB = new SparseRowMatrix(s, n, true)
+
+ // add some sparse data with the given threshold
+ mxA := { (_, _, v) => if (r.nextDouble() < density) r.nextDouble() else v }
+ mxB := { (_, _, v) => if (r.nextDouble() < density) r.nextDouble() else v }
+
+ // run Mahout JVM - only math once
+ var mxC = mxA %*% mxB
+
+ // run Mahout JVM - only math another {{nruns}} times and take average
+ var ms = System.currentTimeMillis()
+ for (i: Int <- 1 to nruns) {
+ mxC = mxA %*% mxB
+ }
+ ms = (System.currentTimeMillis() - ms) / nruns
+ print(s"Mahout JVM Sparse multiplication time: $ms ms.\n")
+
+
+ // run Mahout JCuda math bindings once
+ val cudaA = toCudaCmpMatrix(mxA, cudaCtx)
+ val cudaB = toCudaCmpMatrix(mxB, cudaCtx)
+ var mxCuda = prod(cudaA, cudaB, cudaCtx)
+
+ // run Mahout JCuda another {{nruns}} times and take average
+ ms = System.currentTimeMillis()
+ for (i: Int <- 1 to nruns) {
+ mxCuda = prod(cudaA, cudaB, cudaCtx)
+ }
+
+ ms = (System.currentTimeMillis() - ms) / nruns
+ print(s"Mahout JCuda Sparse multiplication time: $ms ms.\n")
+
+ // TODO: Ensure that we've been working with the same matrices.
+ // (mxC - mxCuda).norm / mxC.nrow / mxC.ncol should be < 1e-16
+ ms
+ }
+
+}
diff --git a/cuda/src/test/scala/org/apache/mahout/cuda/UserSetCUDATestSuite.scala b/cuda/src/test/scala/org/apache/mahout/cuda/UserSetCUDATestSuite.scala
new file mode 100644
index 0000000000..c718986178
--- /dev/null
+++ b/cuda/src/test/scala/org/apache/mahout/cuda/UserSetCUDATestSuite.scala
@@ -0,0 +1,98 @@
+package org.apache.mahout.cuda
+
+import org.scalatest.{FunSuite, Matchers}
+import org.apache.mahout.math._
+import scalabindings.RLikeOps._
+import CUDATestSuite._
+import scala.util.Properties.envOrElse
+
+import scala.util.Random
+
+
+import scala.util.Random
+/**
+ * Created by andy on 3/29/17.
+ */
+
+// some quickfixes as well
+class UserSetCUDATestSuite extends FunSuite with Matchers {
+
+ // defaults
+ var m: Int = 1000
+ var s: Int = 1000
+ var n: Int = 1000
+ var density: Double = .2
+ var seed: Long = 1234L
+ var num_runs: Int = 5
+
+ // grab the environment variables if set.
+ m = envOrElse("SIZE_M","1000").toInt
+ s = envOrElse("SIZE_S","1000").toInt
+ n = envOrElse("SIZE_N","1000").toInt
+ density = envOrElse("DENSITY",".02").toDouble
+ seed = envOrElse("SEED","1234").toLong
+ num_runs = envOrElse("NUM_RUNS","3").toInt
+
+ test("User Defined sparse mmul at geometry of "
+ + m + " x " + s + " %*% " + s + " x " + n + " density = " + density + " " + num_runs + " runs \n") {
+
+ val ms = getAverageTime(m, n, s, density, seed, num_runs)
+
+ println("User Defined sparse mmul at geometry of "
+ + m + " x " + s + " %*% " + s + " x " + n + " density = " + density + " " + num_runs + " runs : "+ms +" ms")
+ }
+}
+
+
+object UserSetCUDATestSuite {
+ def getAverageTime(m: Int = 1000,
+ s: Int = 1000,
+ n: Int = 1000,
+ density: Double = .2,
+ seed: Long = 1234L,
+ nruns: Int = 5): Long = {
+
+ val r = new Random(seed)
+ val cudaCtx = new Context()
+
+ // sparse row-wise
+ val mxA = new SparseRowMatrix(m, s, false)
+ val mxB = new SparseRowMatrix(s, n, true)
+
+ // add some sparse data with the given threshold
+ mxA := { (_, _, v) => if (r.nextDouble() < density) r.nextDouble() else v }
+ mxB := { (_, _, v) => if (r.nextDouble() < density) r.nextDouble() else v }
+
+ // run Mahout JVM - only math once
+ var mxC = mxA %*% mxB
+
+ // run Mahout JVM - only math another {{nruns}} times and take average
+ var ms = System.currentTimeMillis()
+ for (i: Int <- 1 to nruns) {
+ mxC = mxA %*% mxB
+ }
+ ms = (System.currentTimeMillis() - ms) / nruns
+ print(s"Mahout JVM Sparse multiplication time: $ms ms.")
+
+
+ // run Mahout JCuda math bindings once
+ val cudaA = toCudaCmpMatrix(mxA, cudaCtx)
+ val cudaB = toCudaCmpMatrix(mxB, cudaCtx)
+ var mxCuda = prod(cudaA, cudaB, cudaCtx)
+
+ // run Mahout JCuda another {{nruns}} times and take average
+ ms = System.currentTimeMillis()
+ for (i: Int <- 1 to nruns) {
+ mxCuda = prod(cudaA, cudaB, cudaCtx)
+ }
+
+ ms = (System.currentTimeMillis() - ms) / nruns
+ print(s"Mahout JCuda Sparse multiplication time: $ms ms.")
+
+ // TODO: Ensure that we've been working with the same matrices.
+ // (mxC - mxCuda).norm / mxC.nrow / mxC.ncol should be < 1e-16
+ ms
+ }
+
+}
+
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/backend/RootSolverFactory.scala b/math-scala/src/main/scala/org/apache/mahout/math/backend/RootSolverFactory.scala
index 0904ea5c11..7865bfacf2 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/backend/RootSolverFactory.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/backend/RootSolverFactory.scala
@@ -56,6 +56,14 @@ final object RootSolverFactory extends SolverFactory {
def getOperator[C: ClassTag]: MMBinaryFunc = {
+ try {
+ logger.info("Creating scala.cuda.GPUMMul solver")
+ clazz = Class.forName("scala.cuda.GPUMMul$").getField("MODULE$").get(null).asInstanceOf[MMBinaryFunc]
+ logger.info("Successfully created scala.cuda.GPUMMul solver")
+
+ } catch {
+ case cudax: Exception =>
+ logger.info("Unable to create class GPUMMul with CUDA: attempting OpenCL version")
try {
logger.info("Creating org.apache.mahout.viennacl.opencl.GPUMMul solver")
clazz = Class.forName("org.apache.mahout.viennacl.opencl.GPUMMul$").getField("MODULE$").get(null).asInstanceOf[MMBinaryFunc]
@@ -79,6 +87,7 @@ final object RootSolverFactory extends SolverFactory {
clazz = MMul
}
}
+ }
clazz
}
}
diff --git a/pom.xml b/pom.xml
index 6e249a14e6..6657920947 100644
--- a/pom.xml
+++ b/pom.xml
@@ -244,6 +244,12 @@
${project.version}
+
+ mahout-native-cuda_${scala.compat.version}
+ ${project.groupId}
+ ${project.version}
+
+
mahout-native-viennacl_${scala.compat.version}
${project.groupId}
@@ -849,6 +855,13 @@
+
+ cuda
+
+ cuda
+
+
+
viennacl
diff --git a/viennacl-omp/pom.xml b/viennacl-omp/pom.xml
index b0ecd33e90..30674bb0cb 100644
--- a/viennacl-omp/pom.xml
+++ b/viennacl-omp/pom.xml
@@ -203,11 +203,8 @@
-
-
-