From 0efb3eb450409dd3e0ef8b433b31d35a9887de57 Mon Sep 17 00:00:00 2001 From: Luke Iwanski Date: Thu, 22 Dec 2016 17:26:24 +0100 Subject: [PATCH 1/3] OpenCL improvements Added Tile, Transpose and Range Ops double support for SYCL device. Moved gpu_device_name() to test_util.py so now it can be used in force_gpu to pull either GPU or SYCL depending on what is available in the system. --- tensorflow/core/kernels/sequence_ops.cc | 9 ++++++++- tensorflow/core/kernels/tile_ops.cc | 7 +++++++ tensorflow/core/kernels/tile_ops_cpu_impl.h | 2 ++ tensorflow/core/kernels/transpose_functor_cpu.cc | 1 + tensorflow/python/framework/test_util.py | 11 +++++++++-- tensorflow/python/platform/test.py | 10 +--------- 6 files changed, 28 insertions(+), 12 deletions(-) diff --git a/tensorflow/core/kernels/sequence_ops.cc b/tensorflow/core/kernels/sequence_ops.cc index c24ecdf8b97..c8ea9230201 100644 --- a/tensorflow/core/kernels/sequence_ops.cc +++ b/tensorflow/core/kernels/sequence_ops.cc @@ -92,9 +92,11 @@ class RangeOp : public OpKernel { #ifdef TENSORFLOW_USE_SYCL #define REGISTER_SYCL_KERNEL(T) REGISTER_KERNEL(DEVICE_SYCL, T) TF_CALL_float(REGISTER_SYCL_KERNEL); +TF_CALL_double(REGISTER_SYCL_KERNEL); TF_CALL_int32(REGISTER_SYCL_KERNEL); TF_CALL_int64(REGISTER_SYCL_KERNEL); -#endif // TENSORFLOW_USE_SYCL +#undef REGISTER_SYCL_KERNEL +#endif // TENSORFLOW_USE_SYCL TF_CALL_float(REGISTER_CPU_KERNEL); TF_CALL_double(REGISTER_CPU_KERNEL); @@ -170,4 +172,9 @@ TF_CALL_double(REGISTER_CPU_KERNEL); TF_CALL_float(REGISTER_GPU_KERNEL); TF_CALL_double(REGISTER_GPU_KERNEL); +#ifdef TENSORFLOW_USE_SYCL +#define REGISTER_SYCL_KERNEL(T) REGISTER_KERNEL(DEVICE_SYCL, T) +TF_CALL_float(REGISTER_SYCL_KERNEL); +TF_CALL_double(REGISTER_SYCL_KERNEL); +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/tile_ops.cc b/tensorflow/core/kernels/tile_ops.cc index 36cabaaf7d3..88e05d97b28 100644 --- a/tensorflow/core/kernels/tile_ops.cc +++ 
b/tensorflow/core/kernels/tile_ops.cc @@ -258,6 +258,7 @@ TF_CALL_complex128(HANDLE_TYPE_NAME_GPU); #ifdef TENSORFLOW_USE_SYCL TF_CALL_float(HANDLE_TYPE_NAME_SYCL); +TF_CALL_double(HANDLE_TYPE_NAME_SYCL); #endif // TENSORFLOW_USE_SYCL #undef HANDLE_TYPE_NAME_CPU @@ -601,6 +602,12 @@ REGISTER_KERNEL_BUILDER(Name("Tile") .TypeConstraint("Tmultiples") .HostMemory("multiples"), TileOp); +REGISTER_KERNEL_BUILDER(Name("Tile") + .Device(DEVICE_SYCL) + .TypeConstraint("T") + .TypeConstraint("Tmultiples") + .HostMemory("multiples"), + TileOp); #endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/tile_ops_cpu_impl.h b/tensorflow/core/kernels/tile_ops_cpu_impl.h index 650c739ed59..7ad709154b3 100644 --- a/tensorflow/core/kernels/tile_ops_cpu_impl.h +++ b/tensorflow/core/kernels/tile_ops_cpu_impl.h @@ -70,6 +70,7 @@ typedef Eigen::SyclDevice SYCLDevice; #define DEFINE_TYPE(T) DEFINE_DIM(T, CPU_PROVIDED_IXDIM) TF_CALL_float(DEFINE_TYPE); +TF_CALL_double(DEFINE_TYPE); #undef DEFINE_DIM #undef DEFINE_TYPE @@ -81,6 +82,7 @@ TF_CALL_float(DEFINE_TYPE); #define DEFINE_TYPE(T) DEFINE_DIM(T, CPU_PROVIDED_IXDIM) TF_CALL_float(DEFINE_TYPE); +TF_CALL_double(DEFINE_TYPE); #undef DEFINE_DIM #undef DEFINE_TYPE diff --git a/tensorflow/core/kernels/transpose_functor_cpu.cc b/tensorflow/core/kernels/transpose_functor_cpu.cc index 30b82f18431..3681b9a1291 100644 --- a/tensorflow/core/kernels/transpose_functor_cpu.cc +++ b/tensorflow/core/kernels/transpose_functor_cpu.cc @@ -127,6 +127,7 @@ Status DoTranspose(const SYCLDevice& d, const Tensor& in, switch (in.dtype()) { case DT_FLOAT: + case DT_DOUBLE: case DT_INT32: internal::Transpose(d, in, perm, out); break; diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py index 33ca06f4668..a7f1b1cc4be 100644 --- a/tensorflow/python/framework/test_util.py +++ b/tensorflow/python/framework/test_util.py @@ -43,7 +43,14 @@ from tensorflow.python.platform import tf_logging as 
logging from tensorflow.python.util import compat from tensorflow.python.util.protobuf import compare +from tensorflow.python.client import device_lib +def gpu_device_name(): + """Returns the name of a GPU device if available or the empty string.""" + for x in device_lib.list_local_devices(): + if x.device_type == 'GPU' or x.device_type == 'SYCL': + return x.name + return '' def assert_ops_in_graph(expected_ops, graph): """Assert all expected operations are found. @@ -283,7 +290,7 @@ def prepare_config(config): sess = self._cached_session with sess.graph.as_default(), sess.as_default(): if force_gpu: - with sess.graph.device("/gpu:0"): + with sess.graph.device(gpu_device_name()): yield sess elif use_gpu: yield sess @@ -293,7 +300,7 @@ def prepare_config(config): else: with session.Session(graph=graph, config=prepare_config(config)) as sess: if force_gpu: - with sess.graph.device("/gpu:0"): + with sess.graph.device(gpu_device_name()): yield sess elif use_gpu: yield sess diff --git a/tensorflow/python/platform/test.py b/tensorflow/python/platform/test.py index b6b06f9eb91..047e4151d13 100644 --- a/tensorflow/python/platform/test.py +++ b/tensorflow/python/platform/test.py @@ -72,6 +72,7 @@ def testSquare(self): # pylint: disable=unused-import from tensorflow.python.framework.test_util import TensorFlowTestCase as TestCase from tensorflow.python.framework.test_util import assert_equal_graph_def +from tensorflow.python.framework.test_util import gpu_device_name from tensorflow.python.ops.gradient_checker import compute_gradient_error from tensorflow.python.ops.gradient_checker import compute_gradient @@ -137,15 +138,6 @@ def is_gpu_available(cuda_only=False): return any((x.device_type == 'GPU' or x.device_type == 'SYCL') for x in _device_lib.list_local_devices()) - -def gpu_device_name(): - """Returns the name of a GPU device if available or the empty string.""" - for x in _device_lib.list_local_devices(): - if x.device_type == 'GPU' or x.device_type == 'SYCL': - 
return x.name - return '' - - _allowed_symbols = [ # We piggy-back googletest documentation. 'Benchmark', From a9ae05ebb1b1af9273daf340ec79f0d5681fd14e Mon Sep 17 00:00:00 2001 From: Luke Iwanski Date: Sun, 1 Jan 2017 12:07:25 +0100 Subject: [PATCH 2/3] Improvements to the SYCL device support - Registration of Type Traits required for stride slice op - Registration of ConcatOffset, _ListToArray, _ArrayToList Pad, Reverse ( CPU ), ReverseV2 ( CPU ), Size, ExpandDims, Squeeze, StridedSlice, StridedSliceGrad, StridedSliceAssign, TileGrad, InvertPermutation, Transpose - Registration of Sycl kernels only for essential data types - Floor_div_real has been disabled for SYCL device - Device in control_flow_ops_py_test.py needed to be lower cased --- .../core/framework/register_types_traits.h | 19 +++++ tensorflow/core/kernels/concat_op.cc | 11 +++ tensorflow/core/kernels/constant_op.cc | 17 +++- tensorflow/core/kernels/cwise_op_acos.cc | 1 + tensorflow/core/kernels/cwise_op_add_1.cc | 5 +- tensorflow/core/kernels/cwise_op_asin.cc | 1 + tensorflow/core/kernels/cwise_op_atan.cc | 1 + tensorflow/core/kernels/cwise_op_ceil.cc | 1 + tensorflow/core/kernels/cwise_op_cos.cc | 1 + tensorflow/core/kernels/cwise_op_div.cc | 1 + tensorflow/core/kernels/cwise_op_floor.cc | 1 + tensorflow/core/kernels/cwise_op_floor_div.cc | 11 --- tensorflow/core/kernels/cwise_op_isfinite.cc | 1 + tensorflow/core/kernels/cwise_op_isinf.cc | 1 + tensorflow/core/kernels/cwise_op_isnan.cc | 1 + tensorflow/core/kernels/cwise_op_log.cc | 1 + tensorflow/core/kernels/cwise_op_log1p.cc | 1 + tensorflow/core/kernels/cwise_op_mul_1.cc | 8 ++ tensorflow/core/kernels/cwise_op_pow.cc | 1 + tensorflow/core/kernels/cwise_op_round.cc | 4 +- tensorflow/core/kernels/cwise_op_rsqrt.cc | 1 + tensorflow/core/kernels/cwise_op_sin.cc | 1 + tensorflow/core/kernels/cwise_op_sqrt.cc | 3 +- tensorflow/core/kernels/cwise_op_square.cc | 1 + tensorflow/core/kernels/cwise_op_tan.cc | 1 + tensorflow/core/kernels/cwise_op_tanh.cc | 1 
+ tensorflow/core/kernels/debug_ops.cc | 2 + tensorflow/core/kernels/dense_update_ops.cc | 1 + tensorflow/core/kernels/fill_functor.cc | 2 + tensorflow/core/kernels/function_ops.cc | 28 +++++++ tensorflow/core/kernels/pack_op.cc | 1 + tensorflow/core/kernels/pad_op.cc | 29 +++++++ tensorflow/core/kernels/reduction_ops_sum.cc | 1 - tensorflow/core/kernels/scatter_op.cc | 4 +- tensorflow/core/kernels/shape_ops.cc | 83 ++++++++++++++++++- tensorflow/core/kernels/strided_slice_op.cc | 67 +++++++++++++++ .../core/kernels/strided_slice_op_impl.h | 14 ++++ tensorflow/core/kernels/tile_ops.cc | 24 ++++++ tensorflow/core/kernels/tile_ops_cpu_impl.h | 2 + tensorflow/core/kernels/training_ops.cc | 2 + tensorflow/core/kernels/transpose_op.cc | 27 ++++++ tensorflow/core/kernels/transpose_op.h | 11 +++ tensorflow/core/kernels/unpack_op.cc | 1 + tensorflow/core/kernels/variable_ops.cc | 3 +- tensorflow/core/ops/math_grad_test.cc | 2 +- .../kernel_tests/control_flow_ops_py_test.py | 2 +- 46 files changed, 378 insertions(+), 24 deletions(-) diff --git a/tensorflow/core/framework/register_types_traits.h b/tensorflow/core/framework/register_types_traits.h index 8f8d9fd08e6..c1fe5517c69 100644 --- a/tensorflow/core/framework/register_types_traits.h +++ b/tensorflow/core/framework/register_types_traits.h @@ -21,6 +21,10 @@ limitations under the License. typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; +#ifdef TENSORFLOW_USE_SYCL +typedef Eigen::SyclDevice SYCLDevice; +#endif // TENSORFLOW_USE_SYCL + #include "tensorflow/core/framework/numeric_types.h" #include "tensorflow/core/platform/types.h" @@ -66,6 +70,17 @@ struct proxy_type_pod { typedef Eigen::half type; }; +#ifdef TENSORFLOW_USE_SYCL +template <> +struct proxy_type_pod { + typedef double type; +}; +template <> +struct proxy_type_pod { + typedef float type; +}; +#endif // TENSORFLOW_USE_SYCL + /// If POD we use proxy_type_pod, otherwise this maps to identiy. 
template struct proxy_type { @@ -81,6 +96,10 @@ struct proxy_type { TF_CALL_int8(m) TF_CALL_complex128(m) #define TF_CALL_GPU_PROXY_TYPES(m) \ TF_CALL_double(m) TF_CALL_float(m) TF_CALL_half(m) TF_CALL_int32(m) +#ifdef TENSORFLOW_USE_SYCL +#define TF_CALL_SYCL_PROXY_TYPES(m) \ + TF_CALL_double(m) TF_CALL_float(m) TF_CALL_int32(m) +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow #endif // TENSORFLOW_FRAMEWORK_REGISTER_TYPES_TRAITS_H_ diff --git a/tensorflow/core/kernels/concat_op.cc b/tensorflow/core/kernels/concat_op.cc index 5523362f17d..915d77c1c5a 100644 --- a/tensorflow/core/kernels/concat_op.cc +++ b/tensorflow/core/kernels/concat_op.cc @@ -35,6 +35,9 @@ typedef Eigen::ThreadPoolDevice CPUDevice; #if GOOGLE_CUDA typedef Eigen::GpuDevice GPUDevice; #endif // GOOGLE_CUDA +#ifdef TENSORFLOW_USE_SYCL +typedef Eigen::SyclDevice SYCLDevice; +#endif // TENSORFLOW_USE_SYCL enum AxisArgumentName { NAME_IS_AXIS, NAME_IS_CONCAT_DIM }; @@ -292,4 +295,12 @@ REGISTER_KERNEL_BUILDER(Name("ConcatOffset") .HostMemory("offset"), ConcatOffsetOp); +#ifdef TENSORFLOW_USE_SYCL +REGISTER_KERNEL_BUILDER(Name("ConcatOffset") + .Device(DEVICE_SYCL) + .HostMemory("concat_dim") + .HostMemory("shape") + .HostMemory("offset"), + ConcatOffsetOp); +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/constant_op.cc b/tensorflow/core/kernels/constant_op.cc index 3f8717f77f3..7d716a2bd75 100644 --- a/tensorflow/core/kernels/constant_op.cc +++ b/tensorflow/core/kernels/constant_op.cc @@ -57,7 +57,9 @@ REGISTER_KERNEL_BUILDER(Name("Const").Device(DEVICE_CPU), ConstantOp); REGISTER_KERNEL_BUILDER( \ Name("Const").Device(DEVICE_SYCL).TypeConstraint("dtype"), \ ConstantOp); -TF_CALL_NUMBER_TYPES(REGISTER_SYCL_KERNEL); +REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); +REGISTER_SYCL_KERNEL(bool); #undef REGISTER_SYCL_KERNEL #endif @@ -112,6 +114,17 @@ REGISTER_KERNEL_BUILDER(Name("Const") HostConstantOp); #endif +#ifdef TENSORFLOW_USE_SYCL 
+// A special SYCL kernel for int32. +// TODO(b/25387198): Also enable int32 in device memory. This kernel +// registration requires all int32 inputs and outputs to be in host memory. +REGISTER_KERNEL_BUILDER(Name("Const") + .Device(DEVICE_SYCL) + .HostMemory("output") + .TypeConstraint("dtype"), + HostConstantOp); +#endif // TENSORFLOW_USE_SYCL + typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL @@ -186,6 +199,7 @@ REGISTER_KERNEL(CPU, quint8); #ifdef TENSORFLOW_USE_SYCL REGISTER_KERNEL(SYCL, float) +REGISTER_KERNEL(SYCL, double) REGISTER_KERNEL_BUILDER(Name("Fill") .Device(DEVICE_SYCL) .TypeConstraint("T") @@ -245,6 +259,7 @@ TF_CALL_POD_STRING_TYPES(REGISTER_CPU); #ifdef TENSORFLOW_USE_SYCL REGISTER_KERNEL(float, SYCL); +REGISTER_KERNEL(bool, SYCL); REGISTER_KERNEL_BUILDER(Name("ZerosLike") .Device(DEVICE_SYCL) .TypeConstraint("T") diff --git a/tensorflow/core/kernels/cwise_op_acos.cc b/tensorflow/core/kernels/cwise_op_acos.cc index 1d2d815027f..65801da3c7c 100644 --- a/tensorflow/core/kernels/cwise_op_acos.cc +++ b/tensorflow/core/kernels/cwise_op_acos.cc @@ -26,6 +26,7 @@ REGISTER2(UnaryOp, CPU, "Acos", functor::acos, float, double); .TypeConstraint("T"), \ UnaryOp>); REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL #endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/kernels/cwise_op_add_1.cc b/tensorflow/core/kernels/cwise_op_add_1.cc index a6bff78694a..820ad684ec1 100644 --- a/tensorflow/core/kernels/cwise_op_add_1.cc +++ b/tensorflow/core/kernels/cwise_op_add_1.cc @@ -18,7 +18,7 @@ limitations under the License. 
namespace tensorflow { REGISTER5(BinaryOp, CPU, "Add", functor::add, float, Eigen::half, double, int32, int64); - + #if TENSORFLOW_USE_SYCL #define REGISTER_SYCL_KERNEL(TYPE) \ REGISTER_KERNEL_BUILDER( \ @@ -27,9 +27,10 @@ REGISTER5(BinaryOp, CPU, "Add", functor::add, float, Eigen::half, double, int32, .TypeConstraint("T"), \ BinaryOp>); REGISTER_SYCL_KERNEL(float); + REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL #endif // TENSORFLOW_USE_SYCL - + #if GOOGLE_CUDA REGISTER3(BinaryOp, GPU, "Add", functor::add, float, Eigen::half, double); diff --git a/tensorflow/core/kernels/cwise_op_asin.cc b/tensorflow/core/kernels/cwise_op_asin.cc index 92a22e90c4a..c9ebfe759b1 100644 --- a/tensorflow/core/kernels/cwise_op_asin.cc +++ b/tensorflow/core/kernels/cwise_op_asin.cc @@ -26,6 +26,7 @@ REGISTER2(UnaryOp, CPU, "Asin", functor::asin, float, double); .TypeConstraint("T"), \ UnaryOp>); REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL #endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/kernels/cwise_op_atan.cc b/tensorflow/core/kernels/cwise_op_atan.cc index 825e85283f4..72645b303fc 100644 --- a/tensorflow/core/kernels/cwise_op_atan.cc +++ b/tensorflow/core/kernels/cwise_op_atan.cc @@ -26,6 +26,7 @@ REGISTER2(UnaryOp, CPU, "Atan", functor::atan, float, double); .TypeConstraint("T"), \ UnaryOp>); REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL #endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/kernels/cwise_op_ceil.cc b/tensorflow/core/kernels/cwise_op_ceil.cc index c5a4aaf831f..c74e10576d5 100644 --- a/tensorflow/core/kernels/cwise_op_ceil.cc +++ b/tensorflow/core/kernels/cwise_op_ceil.cc @@ -26,6 +26,7 @@ REGISTER3(UnaryOp, CPU, "Ceil", functor::ceil, float, Eigen::half, double); .TypeConstraint("T"), \ UnaryOp>); REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL #endif // TENSORFLOW_USE_SYCL diff --git 
a/tensorflow/core/kernels/cwise_op_cos.cc b/tensorflow/core/kernels/cwise_op_cos.cc index a758da58421..634c90adc63 100644 --- a/tensorflow/core/kernels/cwise_op_cos.cc +++ b/tensorflow/core/kernels/cwise_op_cos.cc @@ -27,6 +27,7 @@ REGISTER5(UnaryOp, CPU, "Cos", functor::cos, float, Eigen::half, double, .TypeConstraint("T"), \ UnaryOp>); REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL #endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/kernels/cwise_op_div.cc b/tensorflow/core/kernels/cwise_op_div.cc index 74d8faedb5e..dcdddb918dc 100644 --- a/tensorflow/core/kernels/cwise_op_div.cc +++ b/tensorflow/core/kernels/cwise_op_div.cc @@ -37,6 +37,7 @@ REGISTER5(BinaryOp, CPU, "RealDiv", functor::div, float, Eigen::half, double, .TypeConstraint("T"), \ BinaryOp>); REGISTER_SYCL_KERNEL(float) +REGISTER_SYCL_KERNEL(double) REGISTER_SYCL_KERNEL(int32) #undef REGISTER_SYCL_KERNEL #endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/kernels/cwise_op_floor.cc b/tensorflow/core/kernels/cwise_op_floor.cc index 129d754b826..59e32d7f6f4 100644 --- a/tensorflow/core/kernels/cwise_op_floor.cc +++ b/tensorflow/core/kernels/cwise_op_floor.cc @@ -26,6 +26,7 @@ REGISTER3(UnaryOp, CPU, "Floor", functor::floor, float, Eigen::half, double); .TypeConstraint("T"), \ UnaryOp>); REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL #endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/kernels/cwise_op_floor_div.cc b/tensorflow/core/kernels/cwise_op_floor_div.cc index 8a600f8f95e..92e488e1a48 100644 --- a/tensorflow/core/kernels/cwise_op_floor_div.cc +++ b/tensorflow/core/kernels/cwise_op_floor_div.cc @@ -21,17 +21,6 @@ REGISTER5(BinaryOp, CPU, "FloorDiv", functor::safe_floor_div, uint8, uint16, REGISTER3(BinaryOp, CPU, "FloorDiv", functor::floor_div_real, float, Eigen::half, double); -#if TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(TYPE) \ - REGISTER_KERNEL_BUILDER( \ - Name("FloorDiv") \ - 
.Device(DEVICE_SYCL) \ - .TypeConstraint("T"), \ - BinaryOp>); -REGISTER_SYCL_KERNEL(float) -#undef REGISTER_SYCL_KERNEL -#endif // TENSORFLOW_USE_SYCL - #if GOOGLE_CUDA REGISTER4(BinaryOp, GPU, "FloorDiv", functor::floor_div, uint8, uint16, int16, int64); diff --git a/tensorflow/core/kernels/cwise_op_isfinite.cc b/tensorflow/core/kernels/cwise_op_isfinite.cc index 59976141c78..0faeffa95ca 100644 --- a/tensorflow/core/kernels/cwise_op_isfinite.cc +++ b/tensorflow/core/kernels/cwise_op_isfinite.cc @@ -27,6 +27,7 @@ REGISTER3(UnaryOp, CPU, "IsFinite", functor::isfinite, float, Eigen::half, .TypeConstraint("T"), \ UnaryOp>); REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL #endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/kernels/cwise_op_isinf.cc b/tensorflow/core/kernels/cwise_op_isinf.cc index 675cb95b955..df63006b3fd 100644 --- a/tensorflow/core/kernels/cwise_op_isinf.cc +++ b/tensorflow/core/kernels/cwise_op_isinf.cc @@ -26,6 +26,7 @@ REGISTER3(UnaryOp, CPU, "IsInf", functor::isinf, float, Eigen::half, double); .TypeConstraint("T"), \ UnaryOp>); REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL #endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/kernels/cwise_op_isnan.cc b/tensorflow/core/kernels/cwise_op_isnan.cc index c394087ed80..e1cf7a86375 100644 --- a/tensorflow/core/kernels/cwise_op_isnan.cc +++ b/tensorflow/core/kernels/cwise_op_isnan.cc @@ -26,6 +26,7 @@ REGISTER3(UnaryOp, CPU, "IsNan", functor::isnan, float, Eigen::half, double); .TypeConstraint("T"), \ UnaryOp>); REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL #endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/kernels/cwise_op_log.cc b/tensorflow/core/kernels/cwise_op_log.cc index 71c4588b3de..5e74e778c76 100644 --- a/tensorflow/core/kernels/cwise_op_log.cc +++ b/tensorflow/core/kernels/cwise_op_log.cc @@ -27,6 +27,7 @@ REGISTER5(UnaryOp, CPU, "Log", functor::log, 
float, Eigen::half, double, .TypeConstraint("T"), \ UnaryOp>); REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL #endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/kernels/cwise_op_log1p.cc b/tensorflow/core/kernels/cwise_op_log1p.cc index 03ea3a0a894..edb821318e8 100644 --- a/tensorflow/core/kernels/cwise_op_log1p.cc +++ b/tensorflow/core/kernels/cwise_op_log1p.cc @@ -27,6 +27,7 @@ REGISTER5(UnaryOp, CPU, "Log1p", functor::log1p, float, Eigen::half, double, .TypeConstraint("T"), \ UnaryOp>); REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL #endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/kernels/cwise_op_mul_1.cc b/tensorflow/core/kernels/cwise_op_mul_1.cc index e23fe6761d7..5273522626b 100644 --- a/tensorflow/core/kernels/cwise_op_mul_1.cc +++ b/tensorflow/core/kernels/cwise_op_mul_1.cc @@ -28,7 +28,15 @@ REGISTER5(BinaryOp, CPU, "Mul", functor::mul, float, Eigen::half, double, .TypeConstraint("T"), \ BinaryOp>); REGISTER_SYCL_KERNEL(float) +REGISTER_SYCL_KERNEL(double) #undef REGISTER_SYCL_KERNEL +REGISTER_KERNEL_BUILDER(Name("Mul") + .Device(DEVICE_SYCL) + .HostMemory("x") + .HostMemory("y") + .HostMemory("z") + .TypeConstraint("T"), + BinaryOp>); #endif // TENSORFLOW_USE_SYCL #if GOOGLE_CUDA REGISTER4(BinaryOp, GPU, "Mul", functor::mul, float, Eigen::half, double, diff --git a/tensorflow/core/kernels/cwise_op_pow.cc b/tensorflow/core/kernels/cwise_op_pow.cc index 8eeba6ab14f..f1780168e45 100644 --- a/tensorflow/core/kernels/cwise_op_pow.cc +++ b/tensorflow/core/kernels/cwise_op_pow.cc @@ -27,6 +27,7 @@ REGISTER7(BinaryOp, CPU, "Pow", functor::pow, float, Eigen::half, double, int32, .TypeConstraint("T"), \ BinaryOp>); REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL #endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/kernels/cwise_op_round.cc b/tensorflow/core/kernels/cwise_op_round.cc index 7a4482dbb2b..e192f89782d 100644 
--- a/tensorflow/core/kernels/cwise_op_round.cc +++ b/tensorflow/core/kernels/cwise_op_round.cc @@ -20,9 +20,9 @@ REGISTER5(UnaryOp, CPU, "Round", functor::round, Eigen::half, float, double, int32, int64); #ifdef TENSORFLOW_USE_SYCL -REGISTER(UnaryOp, SYCL, "Round", functor::round, float); +REGISTER2(UnaryOp, SYCL, "Round", functor::round, float, double); namespace functor { -DEFINE_UNARY1(round, float); +DEFINE_UNARY2(round, float, double); } // namespace functor #endif diff --git a/tensorflow/core/kernels/cwise_op_rsqrt.cc b/tensorflow/core/kernels/cwise_op_rsqrt.cc index 7dc96d47a60..f23725f48e3 100644 --- a/tensorflow/core/kernels/cwise_op_rsqrt.cc +++ b/tensorflow/core/kernels/cwise_op_rsqrt.cc @@ -27,6 +27,7 @@ REGISTER5(UnaryOp, CPU, "Rsqrt", functor::rsqrt, float, Eigen::half, double, .TypeConstraint("T"), \ UnaryOp>); REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL #endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/kernels/cwise_op_sin.cc b/tensorflow/core/kernels/cwise_op_sin.cc index 8d0c0959f74..ab54c61b56d 100644 --- a/tensorflow/core/kernels/cwise_op_sin.cc +++ b/tensorflow/core/kernels/cwise_op_sin.cc @@ -27,6 +27,7 @@ REGISTER5(UnaryOp, CPU, "Sin", functor::sin, float, Eigen::half, double, .TypeConstraint("T"), \ UnaryOp>); REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL #endif // TENSORFLOW_USE_SYC diff --git a/tensorflow/core/kernels/cwise_op_sqrt.cc b/tensorflow/core/kernels/cwise_op_sqrt.cc index 710001517b5..55acf648db0 100644 --- a/tensorflow/core/kernels/cwise_op_sqrt.cc +++ b/tensorflow/core/kernels/cwise_op_sqrt.cc @@ -27,8 +27,9 @@ REGISTER5(UnaryOp, CPU, "Sqrt", functor::sqrt, float, Eigen::half, double, .TypeConstraint("T"), \ UnaryOp>); REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL -#endif // TENSORFLOW_USE_SYC +#endif // TENSORFLOW_USE_SYCL #if GOOGLE_CUDA REGISTER3(UnaryOp, GPU, "Sqrt", functor::sqrt, 
float, Eigen::half, double); diff --git a/tensorflow/core/kernels/cwise_op_square.cc b/tensorflow/core/kernels/cwise_op_square.cc index f867f127a72..afcacfec1c7 100644 --- a/tensorflow/core/kernels/cwise_op_square.cc +++ b/tensorflow/core/kernels/cwise_op_square.cc @@ -27,6 +27,7 @@ REGISTER7(UnaryOp, CPU, "Square", functor::square, float, Eigen::half, double, .TypeConstraint("T"), \ UnaryOp>); REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL #endif // TENSORFLOW_USE_SYC diff --git a/tensorflow/core/kernels/cwise_op_tan.cc b/tensorflow/core/kernels/cwise_op_tan.cc index ac49cad88fd..9c850c94207 100644 --- a/tensorflow/core/kernels/cwise_op_tan.cc +++ b/tensorflow/core/kernels/cwise_op_tan.cc @@ -26,6 +26,7 @@ REGISTER2(UnaryOp, CPU, "Tan", functor::tan, float, double); .TypeConstraint("T"), \ UnaryOp>); REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL #endif // TENSORFLOW_USE_SYC diff --git a/tensorflow/core/kernels/cwise_op_tanh.cc b/tensorflow/core/kernels/cwise_op_tanh.cc index ae2c473e20b..1dbc13061ba 100644 --- a/tensorflow/core/kernels/cwise_op_tanh.cc +++ b/tensorflow/core/kernels/cwise_op_tanh.cc @@ -28,6 +28,7 @@ REGISTER5(UnaryOp, CPU, "Tanh", functor::tanh, float, Eigen::half, double, .TypeConstraint("T"), \ UnaryOp>); REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL #endif // TENSORFLOW_USE_SYC diff --git a/tensorflow/core/kernels/debug_ops.cc b/tensorflow/core/kernels/debug_ops.cc index 0706b72a895..d0f5db3bf2c 100644 --- a/tensorflow/core/kernels/debug_ops.cc +++ b/tensorflow/core/kernels/debug_ops.cc @@ -97,6 +97,7 @@ REGISTER_GPU_DEBUG_NAN_COUNT(double); .TypeConstraint("T"), \ DebugNanCountOp); REGISTER_GPU_DEBUG_NAN_COUNT(float); +REGISTER_GPU_DEBUG_NAN_COUNT(double); #endif // TENSORFLOW_USE_SYCL // Register debug numeric summary ops. 
@@ -129,6 +130,7 @@ REGISTER_GPU_DEBUG_NUMERIC_SUMMARY_COUNT(double); .TypeConstraint("T"), \ DebugNumericSummaryOp); REGISTER_GPU_DEBUG_NUMERIC_SUMMARY_COUNT(float); +REGISTER_GPU_DEBUG_NUMERIC_SUMMARY_COUNT(double); #endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/dense_update_ops.cc b/tensorflow/core/kernels/dense_update_ops.cc index b32d99e45c6..dc0c3b654dd 100644 --- a/tensorflow/core/kernels/dense_update_ops.cc +++ b/tensorflow/core/kernels/dense_update_ops.cc @@ -108,6 +108,7 @@ typedef Eigen::SyclDevice SYCLDevice; DenseUpdateOp); REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL #endif diff --git a/tensorflow/core/kernels/fill_functor.cc b/tensorflow/core/kernels/fill_functor.cc index 08ec4baff3e..0df8f9d3edf 100644 --- a/tensorflow/core/kernels/fill_functor.cc +++ b/tensorflow/core/kernels/fill_functor.cc @@ -62,6 +62,8 @@ void SetZeroFunctor::operator()( #define DEFINE_SETZERO_SYCL(T) \ template struct SetZeroFunctor; DEFINE_SETZERO_SYCL(float); +DEFINE_SETZERO_SYCL(bool); +DEFINE_SETZERO_SYCL(double); #undef DEFINE_SETZERO_SYCL #endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/kernels/function_ops.cc b/tensorflow/core/kernels/function_ops.cc index 9aa289c3c95..d08dec46d19 100644 --- a/tensorflow/core/kernels/function_ops.cc +++ b/tensorflow/core/kernels/function_ops.cc @@ -185,6 +185,34 @@ REGISTER_KERNEL_BUILDER(Name("_ArrayToList") .TypeConstraint("T"), PassOn); +#ifdef TENSORFLOW_USE_SYCL +#define REGISTER_SYCL_KERNELS(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("_ListToArray").Device(DEVICE_SYCL).TypeConstraint("T"),\ + PassOn); \ + REGISTER_KERNEL_BUILDER( \ + Name("_ArrayToList").Device(DEVICE_SYCL).TypeConstraint("T"),\ + PassOn); + +REGISTER_SYCL_KERNELS(float); +REGISTER_SYCL_KERNELS(double); + +#undef REGISTER_SYCL_KERNELS + +REGISTER_KERNEL_BUILDER(Name("_ListToArray") + .Device(DEVICE_SYCL) + .HostMemory("input") + .HostMemory("output") + 
.TypeConstraint("T"), + PassOn); +REGISTER_KERNEL_BUILDER(Name("_ArrayToList") + .Device(DEVICE_SYCL) + .HostMemory("input") + .HostMemory("output") + .TypeConstraint("T"), + PassOn); +#endif // TENSORFLOW_USE_SYCL + class SymbolicGradientOp : public AsyncOpKernel { public: SymbolicGradientOp(OpKernelConstruction* ctx) diff --git a/tensorflow/core/kernels/pack_op.cc b/tensorflow/core/kernels/pack_op.cc index d96bc376912..d3efd8bc1e0 100644 --- a/tensorflow/core/kernels/pack_op.cc +++ b/tensorflow/core/kernels/pack_op.cc @@ -167,6 +167,7 @@ REGISTER_KERNEL_BUILDER(Name("Pack") PackOp) REGISTER_SYCL(float); +REGISTER_SYCL(double); #undef REGISTER_SYCL // A special GPU kernel for int32. diff --git a/tensorflow/core/kernels/pad_op.cc b/tensorflow/core/kernels/pad_op.cc index bec2d02cb5a..91984319c60 100644 --- a/tensorflow/core/kernels/pad_op.cc +++ b/tensorflow/core/kernels/pad_op.cc @@ -38,6 +38,9 @@ namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; +#ifdef TENSORFLOW_USE_SYCL +typedef Eigen::SyclDevice SYCLDevice; +#endif // TENSORFLOW_USE_SYCL template class PadOp : public OpKernel { @@ -199,4 +202,30 @@ REGISTER_KERNEL_BUILDER(Name("Pad") PadOp); #endif +#ifdef TENSORFLOW_USE_SYCL +// Registration of the SYCL implementations. +#define REGISTER_SYCL_KERNEL(T) \ + REGISTER_KERNEL_BUILDER(Name("Pad") \ + .Device(DEVICE_SYCL) \ + .TypeConstraint("T") \ + .TypeConstraint("Tpaddings") \ + .HostMemory("paddings"), \ + PadOp) + +REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); + +// A special SYCL kernel for int32. +// TODO(b/25387198): Also enable int32 in device memory. This kernel +// registration requires all int32 inputs and outputs to be in host memory. 
+REGISTER_KERNEL_BUILDER(Name("Pad") + .Device(DEVICE_SYCL) + .TypeConstraint("T") + .TypeConstraint("Tpaddings") + .HostMemory("input") + .HostMemory("paddings") + .HostMemory("output"), + PadOp); +#endif // TENSORFLOW_USE_SYCL + } // end namespace tensorflow diff --git a/tensorflow/core/kernels/reduction_ops_sum.cc b/tensorflow/core/kernels/reduction_ops_sum.cc index 3aa38f418ee..938ca66a0cb 100644 --- a/tensorflow/core/kernels/reduction_ops_sum.cc +++ b/tensorflow/core/kernels/reduction_ops_sum.cc @@ -74,7 +74,6 @@ REGISTER_KERNEL_BUILDER( .HostMemory("reduction_indices"), \ ReductionOp>); REGISTER_SYCL_KERNELS(float); -REGISTER_SYCL_KERNELS(double); #undef REGISTER_SYCL_KERNELS // A special GPU kernel for int32. diff --git a/tensorflow/core/kernels/scatter_op.cc b/tensorflow/core/kernels/scatter_op.cc index 827eb7dbca7..51dad49cfec 100644 --- a/tensorflow/core/kernels/scatter_op.cc +++ b/tensorflow/core/kernels/scatter_op.cc @@ -180,8 +180,8 @@ TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_UPDATE_GPU); #define REGISTER_SCATTER_UPDATE_SYCL(type) REGISTER_SCATTER_UPDATE(type, SYCL); -REGISTER_SCATTER_ARITHEMTIC_SYCL(float); -REGISTER_SCATTER_UPDATE_SYCL(float); +TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ARITHEMTIC_SYCL); +TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_UPDATE_SYCL); #undef REGISTER_SCATTER_ARITHEMTIC_SYCL #undef REGISTER_SCATTER_UPDATE_SYCL diff --git a/tensorflow/core/kernels/shape_ops.cc b/tensorflow/core/kernels/shape_ops.cc index 496865de029..5e467d542e0 100644 --- a/tensorflow/core/kernels/shape_ops.cc +++ b/tensorflow/core/kernels/shape_ops.cc @@ -272,6 +272,7 @@ REGISTER_KERNEL_BUILDER(Name("Rank").Device(DEVICE_CPU).HostMemory("output"), .HostMemory("output"), \ RankOp); REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL // A special GPU kernel for int32 and bool. 
@@ -388,6 +389,43 @@ REGISTER_KERNEL_BUILDER(Name("Size") SizeOp); #endif +#ifdef TENSORFLOW_USE_SYCL +#define REGISTER_SYCL_KERNEL(type) \ + REGISTER_KERNEL_BUILDER(Name("Size") \ + .Device(DEVICE_SYCL) \ + .TypeConstraint("T") \ + .TypeConstraint("out_type") \ + .HostMemory("output"), \ + SizeOp); \ + REGISTER_KERNEL_BUILDER(Name("Size") \ + .Device(DEVICE_SYCL) \ + .TypeConstraint("T") \ + .TypeConstraint("out_type") \ + .HostMemory("output"), \ + SizeOp); +REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); +#undef REGISTER_SYCL_KERNEL + +// A special GPU kernel for int32. +// TODO(b/25387198): Also enable int32 in device memory. This kernel +// registration requires all int32 inputs and outputs to be in host memory. +REGISTER_KERNEL_BUILDER(Name("Size") + .Device(DEVICE_SYCL) + .TypeConstraint("T") + .TypeConstraint("out_type") + .HostMemory("input") + .HostMemory("output"), + SizeOp); +REGISTER_KERNEL_BUILDER(Name("Size") + .Device(DEVICE_SYCL) + .TypeConstraint("T") + .TypeConstraint("out_type") + .HostMemory("input") + .HostMemory("output"), + SizeOp); +#endif // TENSORFLOW_USE_SYCL + class ExpandDimsOp : public OpKernel { public: explicit ExpandDimsOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} @@ -458,7 +496,30 @@ REGISTER_KERNEL_BUILDER(Name("ExpandDims") .HostMemory("dim") .HostMemory("output"), ExpandDimsOp); -#endif +#endif // GOOGLE_CUDA + +#ifdef TENSORFLOW_USE_SYCL +#define REGISTER_SYCL_KERNEL(type) \ + REGISTER_KERNEL_BUILDER(Name("ExpandDims") \ + .Device(DEVICE_SYCL) \ + .TypeConstraint("T") \ + .TypeConstraint("Tdim") \ + .HostMemory("dim"), \ + ExpandDimsOp); +REGISTER_SYCL_KERNEL(float) +REGISTER_SYCL_KERNEL(double) + +#undef REGISTER_SYCL_KERNEL + +REGISTER_KERNEL_BUILDER(Name("ExpandDims") + .Device(DEVICE_SYCL) + .TypeConstraint("T") + .TypeConstraint("Tdim") + .HostMemory("input") + .HostMemory("dim") + .HostMemory("output"), + ExpandDimsOp); +#endif // TENSORFLOW_USE_SYCL class SqueezeOp : public OpKernel { public: @@ -553,4 
+614,24 @@ REGISTER_KERNEL_BUILDER(Name("Squeeze") SqueezeOp); #endif +#if TENSORFLOW_USE_SYCL +#define REGISTER_SYCL_KERNEL(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Squeeze").Device(DEVICE_SYCL).TypeConstraint("T"),\ + SqueezeOp); +REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); +#undef REGISTER_SYCL_KERNEL + +// A special GPU kernel for int32. +// TODO(b/25387198): Also enable int32 in device memory. This kernel +// registration requires all int32 inputs and outputs to be in host memory. +REGISTER_KERNEL_BUILDER(Name("Squeeze") + .Device(DEVICE_SYCL) + .TypeConstraint("T") + .HostMemory("input") + .HostMemory("output"), + SqueezeOp); +#endif // TENSORFLOW_USE_SYCL + } // namespace tensorflow diff --git a/tensorflow/core/kernels/strided_slice_op.cc b/tensorflow/core/kernels/strided_slice_op.cc index 43040d238e1..7b69e56852f 100644 --- a/tensorflow/core/kernels/strided_slice_op.cc +++ b/tensorflow/core/kernels/strided_slice_op.cc @@ -422,4 +422,71 @@ REGISTER_KERNEL_BUILDER(Name("StridedSliceAssign") #undef REGISTER_GPU #endif // GOOGLE_CUDA + +#ifdef TENSORFLOW_USE_SYCL +#define REGISTER_SYCL(type) \ + REGISTER_KERNEL_BUILDER(Name("StridedSlice") \ + .Device(DEVICE_SYCL) \ + .TypeConstraint("T") \ + .HostMemory("begin") \ + .HostMemory("end") \ + .HostMemory("strides") \ + .TypeConstraint("Index"), \ + StridedSliceOp) \ + REGISTER_KERNEL_BUILDER(Name("StridedSliceGrad") \ + .Device(DEVICE_SYCL) \ + .TypeConstraint("T") \ + .HostMemory("shape") \ + .HostMemory("begin") \ + .HostMemory("end") \ + .HostMemory("strides") \ + .TypeConstraint("Index"), \ + StridedSliceGradOp)\ + REGISTER_KERNEL_BUILDER(Name("StridedSliceAssign") \ + .Device(DEVICE_SYCL) \ + .TypeConstraint("T") \ + .HostMemory("begin") \ + .HostMemory("end") \ + .HostMemory("strides") \ + .TypeConstraint("Index"), \ + StridedSliceAssignOp) + +REGISTER_SYCL(float); +REGISTER_SYCL(double); + +// A special GPU kernel for int32. +// TODO(b/25387198): Also enable int32 in device memory. 
This kernel +// registration requires all int32 inputs and outputs to be in host memory. +REGISTER_KERNEL_BUILDER(Name("StridedSlice") + .Device(DEVICE_SYCL) + .TypeConstraint("T") + .TypeConstraint("Index") + .HostMemory("input") + .HostMemory("begin") + .HostMemory("end") + .HostMemory("strides") + .HostMemory("output"), + StridedSliceOp); +REGISTER_KERNEL_BUILDER(Name("StridedSliceGrad") + .Device(DEVICE_SYCL) + .TypeConstraint("T") + .TypeConstraint("Index") + .HostMemory("shape") + .HostMemory("begin") + .HostMemory("end") + .HostMemory("strides") + .HostMemory("dy") + .HostMemory("output"), + StridedSliceGradOp); +REGISTER_KERNEL_BUILDER(Name("StridedSliceAssign") + .Device(DEVICE_SYCL) + .TypeConstraint("T") + .TypeConstraint("Index") + .HostMemory("ref") + .HostMemory("begin") + .HostMemory("end") + .HostMemory("strides"), + StridedSliceAssignOp) +#undef REGISTER_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/strided_slice_op_impl.h b/tensorflow/core/kernels/strided_slice_op_impl.h index 31866d66773..907ab90063a 100644 --- a/tensorflow/core/kernels/strided_slice_op_impl.h +++ b/tensorflow/core/kernels/strided_slice_op_impl.h @@ -282,6 +282,20 @@ TF_CALL_GPU_NUMBER_TYPES(DECLARE_FOR_N_GPU); DECLARE_FOR_N_GPU(int32); #endif // END GOOGLE_CUDA +#ifdef TENSORFLOW_USE_SYCL +#define PREVENT_FOR_N_SYCL(T) \ + PREVENT_INSTANTIATE(T, STRIDED_SLICE_INSTANTIATE_DIM) + +#define DECLARE_FOR_N_SYCL(T) \ + INSTANTIATE(SYCLDevice, T, STRIDED_SLICE_INSTANTIATE_DIM) + +TF_CALL_SYCL_PROXY_TYPES(PREVENT_FOR_N_SYCL); +TF_CALL_GPU_NUMBER_TYPES(DECLARE_FOR_N_SYCL); +DECLARE_FOR_N_SYCL(int32); + +#undef DECLARE_FOR_N_SYCL +#endif // TENSORFLOW_USE_SYCL + TF_CALL_POD_STRING_TYPES(DECLARE_FOR_N_CPU); DECLARE_FOR_N_CPU(bfloat16); diff --git a/tensorflow/core/kernels/tile_ops.cc b/tensorflow/core/kernels/tile_ops.cc index 88e05d97b28..c5d9da0096d 100644 --- a/tensorflow/core/kernels/tile_ops.cc +++ 
b/tensorflow/core/kernels/tile_ops.cc @@ -259,6 +259,7 @@ TF_CALL_complex128(HANDLE_TYPE_NAME_GPU); #ifdef TENSORFLOW_USE_SYCL TF_CALL_float(HANDLE_TYPE_NAME_SYCL); TF_CALL_double(HANDLE_TYPE_NAME_SYCL); +TF_CALL_int32(HANDLE_TYPE_NAME_SYCL); #endif // TENSORFLOW_USE_SYCL #undef HANDLE_TYPE_NAME_CPU @@ -503,6 +504,16 @@ TF_CALL_complex64(HANDLE_TYPE_NAME_GPU); TF_CALL_complex128(HANDLE_TYPE_NAME_GPU); #endif // GOOGLE_CUDA +#if TENSORFLOW_USE_SYCL +#define HANDLE_TYPE_NAME_SYCL(T) \ + HANDLE_CASE_DIM(SYCLDevice, T, DataTypeToEnum::value); + +TF_CALL_float(HANDLE_TYPE_NAME_SYCL); +TF_CALL_double(HANDLE_TYPE_NAME_SYCL); +TF_CALL_int32(HANDLE_TYPE_NAME_SYCL); +#undef HANDLE_TYPE_NAME_SYCL +#endif // TENSORFLOW_USE_SYCL + #undef HANDLE_TYPE_NAME_CPU #undef HANDLE_TYPE_NAME_GPU #undef HANDLE_CASE_DIM @@ -608,6 +619,19 @@ REGISTER_KERNEL_BUILDER(Name("Tile") .TypeConstraint("Tmultiples") .HostMemory("multiples"), TileOp); + +REGISTER_KERNEL_BUILDER(Name("TileGrad") + .Device(DEVICE_SYCL) + .TypeConstraint("T") + .TypeConstraint("Tmultiples") + .HostMemory("multiples"), + TileGradientOp); +REGISTER_KERNEL_BUILDER(Name("TileGrad") + .Device(DEVICE_SYCL) + .TypeConstraint("T") + .TypeConstraint("Tmultiples") + .HostMemory("multiples"), + TileGradientOp); #endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/tile_ops_cpu_impl.h b/tensorflow/core/kernels/tile_ops_cpu_impl.h index 7ad709154b3..f06cc5514c8 100644 --- a/tensorflow/core/kernels/tile_ops_cpu_impl.h +++ b/tensorflow/core/kernels/tile_ops_cpu_impl.h @@ -71,6 +71,7 @@ typedef Eigen::SyclDevice SYCLDevice; TF_CALL_float(DEFINE_TYPE); TF_CALL_double(DEFINE_TYPE); +TF_CALL_int32(DEFINE_TYPE); #undef DEFINE_DIM #undef DEFINE_TYPE @@ -83,6 +84,7 @@ TF_CALL_double(DEFINE_TYPE); TF_CALL_float(DEFINE_TYPE); TF_CALL_double(DEFINE_TYPE); +TF_CALL_int32(DEFINE_TYPE); #undef DEFINE_DIM #undef DEFINE_TYPE diff --git a/tensorflow/core/kernels/training_ops.cc 
b/tensorflow/core/kernels/training_ops.cc index 2b39b750038..065953bc392 100644 --- a/tensorflow/core/kernels/training_ops.cc +++ b/tensorflow/core/kernels/training_ops.cc @@ -371,6 +371,7 @@ TF_CALL_double(REGISTER_CPU_KERNELS); #ifdef TENSORFLOW_USE_SYCL #define REGISTER_SYCL_KERNELS(T) REGISTER_KERNELS(SYCL, T); TF_CALL_float(REGISTER_SYCL_KERNELS); +TF_CALL_double(REGISTER_SYCL_KERNELS); #undef REGISTER_SYCL_KERNELS #endif @@ -2168,6 +2169,7 @@ TF_CALL_double(REGISTER_CPU_KERNELS); #define REGISTER_SYCL_KERNELS(T) REGISTER_KERNELS(SYCL, T); TF_CALL_float(REGISTER_SYCL_KERNELS); +TF_CALL_double(REGISTER_SYCL_KERNELS); #endif #if GOOGLE_CUDA diff --git a/tensorflow/core/kernels/transpose_op.cc b/tensorflow/core/kernels/transpose_op.cc index 67300c1e961..b9841bc56ed 100644 --- a/tensorflow/core/kernels/transpose_op.cc +++ b/tensorflow/core/kernels/transpose_op.cc @@ -82,6 +82,15 @@ REGISTER_KERNEL_BUILDER(Name("InvertPermutation") .HostMemory("y"), InvertPermutationOp); +#ifdef TENSORFLOW_USE_SYCL +REGISTER_KERNEL_BUILDER(Name("InvertPermutation") + .Device(DEVICE_SYCL) + .TypeConstraint("T") + .HostMemory("x") + .HostMemory("y"), + InvertPermutationOp); +#endif // TENSORFLOW_USE_SYCL + // output = TransposeOp(T input, T perm) takes a tensor // of type T and rank N, and a permutation of 0, 1, ..., N-1. It // shuffles the dimensions of the input tensor according to permutation. 
@@ -201,4 +210,22 @@ TF_CALL_POD_TYPES(REGISTER); #undef REGISTER #endif +#ifdef TENSORFLOW_USE_SYCL +Status TransposeSyclOp::DoTranspose(OpKernelContext* ctx, const Tensor& in, + gtl::ArraySlice perm, Tensor* out) { + typedef Eigen::SyclDevice SYCLDevice; + return ::tensorflow::DoTranspose(ctx->eigen_device(), in, perm, + out); +} +#define REGISTER(T) \ + REGISTER_KERNEL_BUILDER(Name("Transpose") \ + .Device(DEVICE_SYCL) \ + .TypeConstraint("T") \ + .TypeConstraint("Tperm") \ + .HostMemory("perm"), \ + TransposeSyclOp); +REGISTER(float); +#undef REGISTER +#endif + } // namespace tensorflow diff --git a/tensorflow/core/kernels/transpose_op.h b/tensorflow/core/kernels/transpose_op.h index 3b209c0ccc8..5f40bcecc18 100644 --- a/tensorflow/core/kernels/transpose_op.h +++ b/tensorflow/core/kernels/transpose_op.h @@ -50,6 +50,17 @@ class TransposeGpuOp : public TransposeOp { gtl::ArraySlice perm, Tensor* out) override; }; +#ifdef TENSORFLOW_USE_SYCL +class TransposeSyclOp : public TransposeOp { + public: + explicit TransposeSyclOp(OpKernelConstruction* ctx) : TransposeOp(ctx) {} + + protected: + Status DoTranspose(OpKernelContext* ctx, const Tensor& in, + gtl::ArraySlice perm, Tensor* out) override; +}; +#endif // TENSORFLOW_USE_SYCL + } // namespace tensorflow #endif // TENSORFLOW_KERNELS_TRANSPOSE_OP_H_ diff --git a/tensorflow/core/kernels/unpack_op.cc b/tensorflow/core/kernels/unpack_op.cc index 2a14fa32651..e4c79ae17bb 100644 --- a/tensorflow/core/kernels/unpack_op.cc +++ b/tensorflow/core/kernels/unpack_op.cc @@ -160,6 +160,7 @@ REGISTER_KERNEL_BUILDER(Name("Unpack") UnpackOp) REGISTER_SYCL(float); +REGISTER_SYCL(double); #undef REGISTER_SYCL // A special SYCL kernel for int32. 
diff --git a/tensorflow/core/kernels/variable_ops.cc b/tensorflow/core/kernels/variable_ops.cc index 34e227156d8..7a4d9dc6503 100644 --- a/tensorflow/core/kernels/variable_ops.cc +++ b/tensorflow/core/kernels/variable_ops.cc @@ -58,8 +58,9 @@ REGISTER_KERNEL_BUILDER(Name("IsVariableInitialized").Device(DEVICE_CPU), IsVariableInitializedOp); REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL -#endif +#endif // TENSORFLOW_USE_SYCL #if GOOGLE_CUDA // Only register 'Variable' on GPU for the subset of types also supported by diff --git a/tensorflow/core/ops/math_grad_test.cc b/tensorflow/core/ops/math_grad_test.cc index 3fed5c30289..1d17258b188 100644 --- a/tensorflow/core/ops/math_grad_test.cc +++ b/tensorflow/core/ops/math_grad_test.cc @@ -390,7 +390,7 @@ class TestOp : public OpKernel { REGISTER_KERNEL_BUILDER(Name("TestOpWithNoGrad").Device(DEVICE_CPU), TestOp); #ifdef TENSORFLOW_USE_SYCL REGISTER_KERNEL_BUILDER(Name("TestOpWithNoGrad").Device(DEVICE_SYCL), TestOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL TEST_F(MathGradTest, Error_Reporting) { auto x = test::AsTensor({-3.f}); diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py index 2891b915a48..7ac240bb68d 100644 --- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py +++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py @@ -1338,7 +1338,7 @@ def b1(i, x): def _testWhileGrad_ColocateGradients(self, colocate): gpu_dev_name = test.gpu_device_name() if test.is_gpu_available() else "/gpu:0" - gpu_short_name = gpu_dev_name.split('/')[-1] + gpu_short_name = gpu_dev_name.split('/')[-1].lower() with self.test_session(graph=ops.Graph()) as sess: v = constant_op.constant(2.0, name="v") From f55f6f1484e45900352c54978a2b57a49bc47253 Mon Sep 17 00:00:00 2001 From: Kristofer Krus Date: Thu, 19 Jan 2017 22:35:07 +0100 Subject: [PATCH 3/3] Correct path and update version 
for ComputeCpp This commit will: * Correct path to bin folder for ComputeCpp * Update ComputeCpp version from 0.1.1 to 0.1.2 --- tensorflow/g3doc/get_started/os_setup.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/g3doc/get_started/os_setup.md b/tensorflow/g3doc/get_started/os_setup.md index bc07c442180..9c02e4087bd 100644 --- a/tensorflow/g3doc/get_started/os_setup.md +++ b/tensorflow/g3doc/get_started/os_setup.md @@ -684,11 +684,11 @@ website](https://www.codeplay.com/products/computesuite/computecpp), uncompress and copy the files into e.g. `/usr/local/computecpp`: ```bash -tar -xvzf ComputeCpp-CE-0.1.1-Ubuntu.14.04-64bit.tar.gz +tar -xvzf ComputeCpp-CE-0.1.2-Ubuntu.14.04-64bit.tar.gz sudo mkdir /usr/local/computecpp -sudo cp -R ComputeCpp-CE-0.1.1-Linux /usr/local/computecpp +sudo cp -R ComputeCpp-CE-0.1.2-Linux /usr/local/computecpp sudo chmod -R a+r /usr/local/computecpp/ -sudo chmod -R a+x /usr/local/computecpp/bin +sudo chmod -R a+x /usr/local/computecpp/ComputeCpp-CE-0.1.2-Linux/bin ``` ### Prepare environment for Mac OS X