Sycl improvements (#44)
- Eigen version bump
- Extends the Cast and Cwise ops benchmarks to cover the Sycl device
- Extends device_lib_test.py to cover the Sycl device
- Registers int32, string and ResourceHandle to run on host for the
  Enter and RefEnter Sycl ops
- Enables the ReduceMax op for Sycl since the Eigen implementation is ready
- Registers the Less op for the Sycl device
Luke Iwanski authored and benoitsteiner committed Feb 13, 2017
1 parent 91a06a9 commit b847171
Showing 8 changed files with 105 additions and 5 deletions.
3 changes: 2 additions & 1 deletion tensorflow/core/BUILD
@@ -111,6 +111,7 @@ load(
"//tensorflow/core:platform/default/build_config_root.bzl",
"tf_cuda_tests_tags",
)
load("@local_config_sycl//sycl:build_defs.bzl", "if_sycl")

# -----------------------------------------------------------------------------
# Public targets
@@ -712,7 +713,7 @@ cc_library(
"//tensorflow/core/kernels:ops_testutil",
"//tensorflow/core/kernels:ops_util",
"//tensorflow/core/platform/default/build_config:gtest",
],
] + if_sycl([":sycl_runtime"]),
)

# This is a link-only library to provide a DirectSession
14 changes: 14 additions & 0 deletions tensorflow/core/kernels/cast_op_test.cc
@@ -105,7 +105,12 @@ static void BM_gpu_float_int64(int iters, int num) {
testing::BytesProcessed(static_cast<int64>(iters) * num *
(sizeof(float) + sizeof(int64)));
testing::UseRealTime();
#if GOOGLE_CUDA
test::Benchmark("gpu", Cast<float, int64>(num)).Run(iters);
#endif // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL
test::Benchmark("sycl", Cast<float, int64>(num)).Run(iters);
#endif // TENSORFLOW_USE_SYCL
}
BENCHMARK(BM_gpu_float_int64)->Arg(64 << 10)->Arg(32 << 20);

@@ -123,7 +128,12 @@ static void BM_gpu_bool_float(int iters, int num) {
testing::BytesProcessed(static_cast<int64>(iters) * num *
(sizeof(bool) + sizeof(float)));
testing::UseRealTime();
#if GOOGLE_CUDA
test::Benchmark("gpu", Cast<bool, float>(num)).Run(iters);
#endif // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL
test::Benchmark("sycl", Cast<bool, float>(num)).Run(iters);
#endif // TENSORFLOW_USE_SYCL
}
BENCHMARK(BM_gpu_bool_float)->Arg(64 << 10)->Arg(32 << 20);

@@ -168,7 +178,9 @@ static void BM_gpu_float_half(int iters, int num) {
testing::BytesProcessed(static_cast<int64>(iters) * num *
(sizeof(float) + sizeof(Eigen::half)));
testing::UseRealTime();
#if GOOGLE_CUDA
test::Benchmark("gpu", Cast<float, Eigen::half>(num)).Run(iters);
#endif // GOOGLE_CUDA
}
BENCHMARK(BM_gpu_float_half)->Arg(64 << 10)->Arg(32 << 20);

@@ -177,7 +189,9 @@ static void BM_gpu_half_float(int iters, int num) {
testing::BytesProcessed(static_cast<int64>(iters) * num *
(sizeof(float) + sizeof(Eigen::half)));
testing::UseRealTime();
#if GOOGLE_CUDA
test::Benchmark("gpu", Cast<Eigen::half, float>(num)).Run(iters);
#endif // GOOGLE_CUDA
}
BENCHMARK(BM_gpu_half_float)->Arg(64 << 10)->Arg(32 << 20);

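The hunks above all follow one pattern: each device-specific benchmark body is wrapped in a build guard so the file still compiles when CUDA or SYCL support is absent. A hypothetical int32-to-float variant (not part of this commit; Cast<> and the testing helpers are the ones already defined in this file) would take the same shape:

static void BM_gpu_int32_float(int iters, int num) {
  // Each iteration casts `num` elements, reading int32 and writing float.
  testing::BytesProcessed(static_cast<int64>(iters) * num *
                          (sizeof(int32) + sizeof(float)));
  testing::UseRealTime();
#if GOOGLE_CUDA
  test::Benchmark("gpu", Cast<int32, float>(num)).Run(iters);
#endif  // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL
  test::Benchmark("sycl", Cast<int32, float>(num)).Run(iters);
#endif  // TENSORFLOW_USE_SYCL
}
BENCHMARK(BM_gpu_int32_float)->Arg(64 << 10)->Arg(32 << 20);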
24 changes: 24 additions & 0 deletions tensorflow/core/kernels/control_flow_ops.cc
@@ -321,6 +321,30 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_KERNEL);

#undef REGISTER_SYCL_KERNEL
#undef REGISTER_SYCL_REF_KERNEL
#define REGISTER_SYCL_HOST_KERNEL(type) \
REGISTER_KERNEL_BUILDER(Name("Enter") \
.Device(DEVICE_SYCL) \
.HostMemory("data") \
.HostMemory("output") \
.TypeConstraint<type>("T"), \
EnterOp)

#define REGISTER_SYCL_HOST_REF_KERNEL(type) \
REGISTER_KERNEL_BUILDER(Name("RefEnter") \
.Device(DEVICE_SYCL) \
.HostMemory("data") \
.HostMemory("output") \
.TypeConstraint<type>("T"), \
EnterOp)

REGISTER_SYCL_HOST_KERNEL(int32);
REGISTER_SYCL_HOST_REF_KERNEL(int32);
REGISTER_SYCL_HOST_KERNEL(string);
REGISTER_SYCL_HOST_REF_KERNEL(string);
REGISTER_SYCL_HOST_KERNEL(ResourceHandle);

#undef REGISTER_SYCL_HOST_KERNEL
#undef REGISTER_SYCL_HOST_REF_KERNEL
#endif

// Special GPU kernels for int32 and string.
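Marking both "data" and "output" as HostMemory pins those tensors to host RAM even though the kernel is registered for DEVICE_SYCL: int32 loop counters, strings and resource handles are not types the SYCL device computes on, and Enter/RefEnter merely forward their input. This mirrors the special GPU kernels referenced just below; in the unchanged part of this file the GPU counterpart looks roughly like this (a sketch from the surrounding code, not part of this diff):

#define REGISTER_GPU_HOST_KERNEL(type)                    \
  REGISTER_KERNEL_BUILDER(Name("Enter")                   \
                              .Device(DEVICE_GPU)         \
                              .HostMemory("data")         \
                              .HostMemory("output")       \
                              .TypeConstraint<type>("T"), \
                          EnterOp)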
10 changes: 10 additions & 0 deletions tensorflow/core/kernels/cwise_op_less.cc
@@ -33,5 +33,15 @@ REGISTER_KERNEL_BUILDER(Name("Less")
.TypeConstraint<int32>("T"),
BinaryOp<CPUDevice, functor::less<int32>>);
#endif
#ifdef TENSORFLOW_USE_SYCL
REGISTER3(BinaryOp, SYCL, "Less", functor::less, float, double, int64);

REGISTER_KERNEL_BUILDER(Name("Less")
.Device(DEVICE_SYCL)
.HostMemory("x")
.HostMemory("y")
.HostMemory("z")
.TypeConstraint<int32>("T"),
BinaryOp<CPUDevice, functor::less<int32>>);
#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow
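REGISTER3 is the shorthand from cwise_ops_common.h; assuming its standard expansion, the float case of the registration above amounts to:

// Sketch of the assumed REGISTER3 expansion for the float case;
// double and int64 are registered the same way.
REGISTER_KERNEL_BUILDER(
    Name("Less").Device(DEVICE_SYCL).TypeConstraint<float>("T"),
    BinaryOp<SYCLDevice, functor::less<float>>);

The explicit int32 kernel above keeps x, y and z in host memory and runs the CPU functor, the same trick the GPU build uses so that small int32 comparisons never leave the host.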
51 changes: 51 additions & 0 deletions tensorflow/core/kernels/cwise_ops_test.cc
@@ -51,18 +51,38 @@ static int ColsFromArg(int arg) { return (arg % kRows); }
BENCHMARK(BM_##DEVICE##_##FUNC##_##TYPE)->Range(4 << 10, 1 << 20);

BM_UNARY(cpu, Floor, float, DT_FLOAT);
#if GOOGLE_CUDA
BM_UNARY(gpu, Floor, float, DT_FLOAT);
#endif // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL
BM_UNARY(sycl, Floor, float, DT_FLOAT);
#endif // TENSORFLOW_USE_SYCL

BM_UNARY(cpu, Floor, double, DT_DOUBLE);
#if GOOGLE_CUDA
BM_UNARY(gpu, Floor, double, DT_DOUBLE);
#endif // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL
BM_UNARY(sycl, Floor, double, DT_DOUBLE);
#endif // TENSORFLOW_USE_SYCL

BM_UNARY(cpu, Conj, std::complex<float>, DT_COMPLEX64);
#if GOOGLE_CUDA
BM_UNARY(gpu, Conj, std::complex<float>, DT_COMPLEX64);
#endif // GOOGLE_CUDA
BM_UNARY(cpu, Conj, std::complex<double>, DT_COMPLEX128);
#if GOOGLE_CUDA
BM_UNARY(gpu, Conj, std::complex<double>, DT_COMPLEX128);
#endif // GOOGLE_CUDA

BM_UNARY(cpu, Rint, double, DT_DOUBLE);
#if GOOGLE_CUDA
BM_UNARY(gpu, Rint, double, DT_DOUBLE);
#endif // GOOGLE_CUDA
BM_UNARY(cpu, Rint, float, DT_FLOAT);
#if GOOGLE_CUDA
BM_UNARY(gpu, Rint, float, DT_FLOAT);
#endif // GOOGLE_CUDA

// data func scalar.
static Graph* BinaryScalar(int num, const string& func) {
@@ -90,9 +110,20 @@ static Graph* BinaryScalar(int num, const string& func) {
->Arg(1048576);

BM_BINARY_SCALAR(cpu, Less);
#if GOOGLE_CUDA
BM_BINARY_SCALAR(gpu, Less);
#endif // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL
BM_BINARY_SCALAR(sycl, Less);
#endif // TENSORFLOW_USE_SYCL

BM_BINARY_SCALAR(cpu, Add);
#if GOOGLE_CUDA
BM_BINARY_SCALAR(gpu, Add);
#endif // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL
BM_BINARY_SCALAR(sycl, Add);
#endif // TENSORFLOW_USE_SYCL
#undef BM_BINARY_SCALAR

template <class T>
@@ -130,9 +161,13 @@ static Graph* BiasAdd(int rows, int cols, DataType type) {

using Eigen::half;
BM_BIAS_ADD_ALL(cpu, float, DT_FLOAT);
#if GOOGLE_CUDA
BM_BIAS_ADD_ALL(gpu, float, DT_FLOAT);
#endif // GOOGLE_CUDA
BM_BIAS_ADD_ALL(cpu, half, DT_HALF);
#if GOOGLE_CUDA
BM_BIAS_ADD_ALL(gpu, half, DT_HALF);
#endif // GOOGLE_CUDA
#undef BM_BIAS_ADD_ALL
#undef BM_BIAS_ADD

@@ -180,12 +215,18 @@ static Graph* BiasAddGrad(int rows, int cols, int channels, DataType type,
BM_BIAS_ADD_GRAD(DEVICE, FORMAT, C_TYPE, TF_TYPE, 4096, 4096, 1);

using Eigen::half;
#if GOOGLE_CUDA
BM_BIAS_ADD_GRAD_ALL(gpu, NCHW, float, DT_FLOAT);
BM_BIAS_ADD_GRAD_ALL(gpu, NCHW, half, DT_HALF);
#endif // GOOGLE_CUDA
BM_BIAS_ADD_GRAD_ALL(cpu, NHWC, float, DT_FLOAT);
#if GOOGLE_CUDA
BM_BIAS_ADD_GRAD_ALL(gpu, NHWC, float, DT_FLOAT);
#endif // GOOGLE_CUDA
BM_BIAS_ADD_GRAD_ALL(cpu, NHWC, half, DT_HALF);
#if GOOGLE_CUDA
BM_BIAS_ADD_GRAD_ALL(gpu, NHWC, half, DT_HALF);
#endif // GOOGLE_CUDA
#undef BM_BIAS_ADD_GRAD_ALL
#undef BM_BIAS_ADD_GRAD

@@ -223,7 +264,12 @@ static Graph* BcastAdd(int rows, int cols, int dim) {
BM_BCAST_ADD_ROW(DEVICE, 2048, 512); \
BM_BCAST_ADD_ROW(DEVICE, 4096, 512);
BM_BCAST_ADD_ROW_ALL(cpu);
#if GOOGLE_CUDA
BM_BCAST_ADD_ROW_ALL(gpu);
#endif // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL
BM_BCAST_ADD_ROW_ALL(sycl);
#endif // TENSORFLOW_USE_SYCL
#undef BM_BCAST_ADD_ROW_ALL
#undef BM_BCAST_ADD_ROW

@@ -244,7 +290,12 @@ BM_BCAST_ADD_ROW_ALL(gpu);
BM_BCAST_ADD_COL(DEVICE, 2048, 512); \
BM_BCAST_ADD_COL(DEVICE, 4096, 512);
BM_BCAST_ADD_COL_ALL(cpu);
#if GOOGLE_CUDA
BM_BCAST_ADD_COL_ALL(gpu);
#endif // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL
BM_BCAST_ADD_COL_ALL(sycl);
#endif // TENSORFLOW_USE_SYCL
#undef BM_BCAST_ADD_COL_ALL
#undef BM_BCAST_ADD_COL

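For orientation, here is a sketch (assuming the macro shape implied by the visible ->Arg(...) tail above; the real expansion may name things differently) of what BM_BINARY_SCALAR(sycl, Less) boils down to: the BinaryScalar graph is built once and replayed on whichever device the first argument names.

// Hypothetical standalone form of one expanded benchmark.
static void BM_sycl_Less_scalar(int iters, int num) {
  testing::ItemsProcessed(static_cast<int64>(iters) * num);
  testing::UseRealTime();
#ifdef TENSORFLOW_USE_SYCL
  test::Benchmark("sycl", BinaryScalar(num, "Less")).Run(iters);
#endif  // TENSORFLOW_USE_SYCL
}
BENCHMARK(BM_sycl_Less_scalar)->Arg(4096)->Arg(1048576);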
2 changes: 1 addition & 1 deletion tensorflow/core/kernels/reduction_ops_max.cc
@@ -66,7 +66,7 @@ REGISTER_KERNEL_BUILDER(
.TypeConstraint<int32>("Tidx") \
.HostMemory("reduction_indices"), \
ReductionOp<SYCLDevice, type, Eigen::internal::MaxReducer<type>>);
// REGISTER_SYCL_KERNELS(float);
REGISTER_SYCL_KERNELS(float);
#undef REGISTER_SYCL_KERNELS

REGISTER_KERNEL_BUILDER(
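This change only uncomments the registration that the macro above defines. Given the macro tail visible in the hunk, REGISTER_SYCL_KERNELS(float) presumably expands to something like:

// Assumed expansion: a Max reduction on the SYCL device, with the
// reduction indices kept on the host since they are small int32
// tensors that only shape the computation.
REGISTER_KERNEL_BUILDER(
    Name("Max")
        .Device(DEVICE_SYCL)
        .TypeConstraint<float>("T")
        .TypeConstraint<int32>("Tidx")
        .HostMemory("reduction_indices"),
    ReductionOp<SYCLDevice, float, Eigen::internal::MaxReducer<float>>);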
2 changes: 1 addition & 1 deletion tensorflow/python/client/device_lib_test.py
@@ -34,7 +34,7 @@ def testListLocalDevices(self):
# GPU test
if test.is_gpu_available():
self.assertGreater(len(devices), 1)
self.assertTrue("GPU" in [d.device_type for d in devices])
self.assertTrue("GPU" in [d.device_type for d in devices] or "SYCL" in [d.device_type for d in devices])


if __name__ == "__main__":
4 changes: 2 additions & 2 deletions tensorflow/workspace.bzl
@@ -66,10 +66,10 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
name = "eigen_archive",
urls = [
#"http://bazel-mirror.storage.googleapis.com/bitbucket.org/eigen/eigen/get/60578b474802.tar.gz",
"https://bitbucket.org/benoitsteiner/opencl/get/5c067614e3e1.tar.gz",
"https://bitbucket.org/benoitsteiner/opencl/get/796628790f36.tar.gz",
],
#sha256 = "7527cda827aff351981ebd910012e16be4d899c28a9ae7f143ae60e7f3f7b83d",
strip_prefix = "benoitsteiner-opencl-5c067614e3e1",
strip_prefix = "benoitsteiner-opencl-796628790f36",
build_file = str(Label("//third_party:eigen.BUILD")),
)

