diff --git a/.travis.yml b/.travis.yml
index 74770551e..164f4bdbf 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -12,29 +12,20 @@ matrix:
       compiler: gcc
 
     - os: linux
-      env: CHANNEL=devel
+      env: CHANNEL=stable BACKEND=OpenCL
       compiler: gcc
 
-    # For faster testing we don't test clang on linux, only on macOS
-    # - os: linux
-    #   env: CHANNEL=stable
-    #   compiler: clang
-    #
-    # - os: linux
-    #   env: CHANNEL=devel
-    #   compiler: clang
+    - os: linux
+      env: CHANNEL=devel
+      compiler: gcc
 
     # On OSX we only test against clang (gcc is mapped to clang by default)
     # Note: for OpenMP, Homebrew will build flame/blis with GCC-5
+    # As BLIS is in OSX homebrew, this is an opportunity to test it as well
     - os: osx
       env: CHANNEL=stable BLIS=true
       compiler: clang
 
-    # For faster testing, we only test BLIS = true
-    # - os: osx
-    #   env: CHANNEL=stable BLIS=false
-    #   compiler: clang
-
   allow_failures:
     # Ignore failures when building against the devel Nim branch
     # Also ignore OSX, due to very long build time and Homebrew/curl SSLRead errors
@@ -52,6 +43,13 @@ before_install:
   # On MacOS flame/blis can be tested as it is an homebrew package
   - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update          ; fi
   - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install brewsci/science/blis; fi
+  # OpenCL job: `source` the SDK script so its exports persist into later phases; -y keeps apt-get non-interactive
+  - if [[ "$BACKEND" == "OpenCL" ]]; then
+      source ci/opencl_amd_sdk.sh;
+      wget https://launchpad.net/~cnugteren/+archive/ubuntu/clblast/+files/libclblast_1.3.0-1ubuntu2_amd64.deb -O libclblast.deb;
+      sudo dpkg -i libclblast.deb;
+      sudo apt-get -f -y install;
+    fi
 
 install:
   - export CHOOSENIM_NO_ANALYTICS=1
@@ -63,7 +61,11 @@ install:
 
 script:
     - nimble refresh
-    - nimble test
+    - if [[ "$BACKEND" == "OpenCL" ]]; then
+        nimble test_opencl;
+      else
+        nimble test;
+      fi
 
 branches:
   except:
diff --git a/ci/README.md b/ci/README.md
new file mode 100644
index 000000000..a64a1fd2a
--- /dev/null
+++ b/ci/README.md
@@ -0,0 +1,3 @@
+# Continuous Integration
+
+Scripts used by the continuous integration (Travis CI) builds of Arraymancer.
\ No newline at end of file
diff --git a/ci/opencl_amd_sdk.sh b/ci/opencl_amd_sdk.sh
new file mode 100644
index 000000000..ae6f49bb7
--- /dev/null
+++ b/ci/opencl_amd_sdk.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+# Downloads the AMD APP SDK, unpacks it into $HOME/AMDAPPSDK and registers its OpenCL ICD file.
+# Original script from https://github.com/gregvw/amd_sdk/
+# The exports near the end reach the calling shell only when this file is `source`d.
+
+# Page carrying the download form, and the license page that form is posted to
+URL="https://developer.amd.com/amd-accelerated-parallel-processing-app-sdk/"
+URLDOWN="https://developer.amd.com/amd-license-agreement-appsdk/"
+
+NONCE1_STRING='name="amd_developer_central_downloads_page_nonce"'
+FILE_STRING='name="f"'
+POSTID_STRING='name="post_id"'
+NONCE2_STRING='name="amd_developer_central_nonce"'
+
+# Keep the form between the "download-5" and "64-bit" markers ("download-2" would select the newest SDK)
+FORM=$(wget --no-check-certificate -qO - "$URL" | sed -n '/download-5/,/64-bit/p')
+
+# $FORM is echoed unquoted on purpose: word splitting joins the HTML into one
+# line, so awk sees a single record and $2 is the text following the marker.
+NONCE1=$(echo $FORM | awk -F "${NONCE1_STRING}" '{print $2}')
+NONCE1=$(echo $NONCE1 | awk -F'"' '{print $2}')
+echo "$NONCE1"
+
+POSTID=$(echo $FORM | awk -F "${POSTID_STRING}" '{print $2}')
+POSTID=$(echo $POSTID | awk -F'"' '{print $2}')
+echo "$POSTID"
+
+FILE=$(echo $FORM | awk -F "${FILE_STRING}" '{print $2}')
+FILE=$(echo $FILE | awk -F'"' '{print $2}')
+echo "$FILE"
+
+# Post the form to the license page; its response carries the second nonce
+FORM=$(wget --no-check-certificate -qO - "$URLDOWN" --post-data "amd_developer_central_downloads_page_nonce=${NONCE1}&f=${FILE}&post_id=${POSTID}")
+NONCE2=$(echo $FORM | awk -F "${NONCE2_STRING}" '{print $2}')
+NONCE2=$(echo $NONCE2 | awk -F'"' '{print $2}')
+echo "$NONCE2"
+
+wget --no-check-certificate --content-disposition --trust-server-names "$URLDOWN" --post-data "amd_developer_central_nonce=${NONCE2}&f=${FILE}" -O AMD-SDK.tar.bz2
+
+# Unpack the archive, then run the bundled AMD-APP-SDK*.sh with --tar to extract into $AMDAPPSDK
+tar -xjf AMD-SDK.tar.bz2
+AMDAPPSDK="${HOME}/AMDAPPSDK"
+export OPENCL_VENDOR_PATH="${AMDAPPSDK}/etc/OpenCL/vendors"
+mkdir -p "${OPENCL_VENDOR_PATH}"
+sh AMD-APP-SDK*.sh --tar -xf -C "${AMDAPPSDK}"
+echo libamdocl64.so > "${OPENCL_VENDOR_PATH}/amdocl64.icd"
+# Append the previous value only when it is non-empty: a trailing ':' would add the current directory
+export LD_LIBRARY_PATH="${AMDAPPSDK}/lib/x86_64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
+chmod +x "${AMDAPPSDK}/bin/x86_64/clinfo"
+# Checking OpenCL status
+"${AMDAPPSDK}/bin/x86_64/clinfo"
\ No newline at end of file
diff --git a/src/tensor/operators_blas_l1_opencl.nim b/src/tensor/operators_blas_l1_opencl.nim
index 9aca3e3bb..4d0416d91 100644
--- a/src/tensor/operators_blas_l1_opencl.nim
+++ b/src/tensor/operators_blas_l1_opencl.nim
@@ -27,7 +27,7 @@ import  ./backend/metadataArray,
 # ####################################################################
 # BLAS Level 1 (Vector dot product, Addition, Scalar to Vector/Matrix)
 
-template dotImpl(T: typedesc[SomeReal], clblast_proc: untyped): untyped =
+template dotImpl(T: typedesc, clblast_proc: untyped): untyped =
   proc dot*(a, b: ClTensor[T]): T =
     ## Vector to Vector dot (scalar) product
     when compileOption("boundChecks"):
diff --git a/src/tensor/operators_blas_l2l3_opencl.nim b/src/tensor/operators_blas_l2l3_opencl.nim
index 18f5aaa1e..e42fb485c 100644
--- a/src/tensor/operators_blas_l2l3_opencl.nim
+++ b/src/tensor/operators_blas_l2l3_opencl.nim
@@ -7,37 +7,36 @@ import  ./data_structure,
         ./private/[p_init_opencl, p_checks]
 
 
-template l1l2_blas_Impl(T: typedesc[SomeReal], clblast_gemv_proc: untyped): untyped =
-  proc openCL_MV_y_eq_aAx_p_by(
-    alpha: T, a, x: ClTensor[T],
-    beta: T, y: var ClTensor[T]) =
-    # Matrix-Vector: y = alpha A matvecmul x + beta y
-
-    # TODO: remove this contiguous layout constraint
-    if not a.isContiguous:
-      raise newException(ValueError, "NotImplemented: for now both tensors should be contiguous")
-
-    let
-      a_is_rowMajor = a.is_C_contiguous
-      layout =  if a_is_rowMajor: CLBlastLayoutRowMajor
-                else: CLBlastLayoutColMajor
-      lda = if a_is_rowMajor: a.strides[0]
-            else: a.strides[1]
-
-    check clblast_gemv_proc(layout, CLBlastTransposeNo, a.shape[0], a.shape[1],
-                alpha,
-                a.toClPointer, a.offset, lda,
-                x.toClpointer, x.offset, x.strides[0],
-                beta,
-                y.toClpointer, y.offset, y.strides[0],
-                unsafeAddr clQueue0, nil)
-
-l1l2_blas_Impl(float32, clblastSgemv)
-l1l2_blas_Impl(float64, clblastDgemv)
+
+proc openCL_MV_y_eq_aAx_p_by(
+  alpha: float32, a, x: ClTensor[float32],
+  beta: float32, y: var ClTensor[float32]) =
+  ## Matrix-Vector product via CLBlast SGEMV: y = alpha * (A matvecmul x) + beta * y, written into `y`.
+  ## Raises ValueError when `a` is not contiguous; `x` and `y` are passed with their own offset and first stride.
+  # TODO: remove this contiguous layout constraint
+  if not a.isContiguous:
+    raise newException(ValueError, "NotImplemented: for now both tensors should be contiguous")
+
+  let
+    a_is_rowMajor = a.is_C_contiguous
+    layout =  if a_is_rowMajor: CLBlastLayoutRowMajor
+              else: CLBlastLayoutColMajor
+    lda = if a_is_rowMajor: a.strides[0]
+          else: a.strides[1]
+  # `layout` and `lda` follow the detected memory order of `a`; the call is issued on clQueue0.
+  check clblastSgemv(layout, CLBlastTransposeNo, a.shape[0], a.shape[1],
+              alpha,
+              a.toClPointer, a.offset, lda,
+              x.toClpointer, x.offset, x.strides[0],
+              beta,
+              y.toClpointer, y.offset, y.strides[0],
+              unsafeAddr clQueue0, nil)
+
 
 proc `*`*[T: SomeReal](a, b: ClTensor[T]): ClTensor[T] =
   ## Matrix multiplication (Matrix-Matrix and Matrix-Vector) on CUDA
 
+  assert T is float32, "Only float32 is supported at the moment"
   assert b.rank == 1, "Only Matrix-Vector product is supported at the moment"
 
   if a.rank == 2 and b.rank == 1:
diff --git a/tests/tensor/test_operators_blas_opencl.nim b/tests/tensor/test_operators_blas_opencl.nim
index 78361398e..faaaf496f 100644
--- a/tests/tensor/test_operators_blas_opencl.nim
+++ b/tests/tensor/test_operators_blas_opencl.nim
@@ -17,27 +17,18 @@ import ../../src/arraymancer
 import unittest
 
 suite "OpenCL BLAS operations (Basic Linear Algebra Subprograms)":
-  test "GEMV - General Matrix to Vector Multiplication - float32":
+  test "GEMV - General Matrix to Vector Multiplication":
     ## TODO: test with slices
     ## TODO: support and test non-contiguous tensors
 
-    let d = [[float32 1,-1,2], [float32 0.0,-3,1]].toTensor().opencl()
-    let e = [float32 2, 1, 0].toTensor().opencl()
+    let d = @[@[1.0'f32,-1,2],@[0.0'f32,-3,1]].toTensor().opencl()
+    let e = @[2.0'f32, 1, 0].toTensor().opencl()
 
-    check: (d * e).cpu ==  [float32 1, -3].toTensor()
-
-  test "GEMV - General Matrix to Vector Multiplication - float64":
-    ## TODO: test with slices
-    ## TODO: support and test non-contiguous tensors
-
-    let d = [[float64 1,-1,2], [float64 0.0,-3,1]].toTensor().opencl()
-    let e = [float64 2, 1, 0].toTensor().opencl()
-
-    check: (d * e).cpu ==  [float64 1, -3].toTensor()
+    check: (d * e).cpu ==  [1.0'f32, -3].toTensor()
 
   test "Matrix and vector addition":
-    let u = @[float32 1, 3, -5].toTensor.opencl
-    let v = @[float32 1, 1, 1].toTensor.opencl
+    let u = @[1'f32, 3, -5].toTensor.opencl
+    let v = @[1'f32, 1, 1].toTensor.opencl
 
     check: (u + v).cpu == @[2'f32, 4, -4].toTensor()
 
@@ -53,10 +44,10 @@ suite "OpenCL BLAS operations (Basic Linear Algebra Subprograms)":
       discard a + b.cpu[0..1, 0..1].opencl
 
   test "Matrix and vector substraction":
-    let u = @[float32 1, 3, -5].toTensor.opencl
-    let v = @[float32 1, 1, 1].toTensor.opencl
+    let u = @[1'f32, 3, -5].toTensor.opencl
+    let v = @[1'f32, 1, 1].toTensor.opencl
 
-    check: (u - v).cpu == @[float32 0, 2, -6].toTensor()
+    check: (u - v).cpu == @[0'f32, 2, -6].toTensor()
 
     let a = @[7.0, 4.0, 3.0, 1.0, 8.0, 6.0, 8.0, 1.0, 6.0, 2.0].toTensor.reshape([5,2]).opencl
     let b = @[6.0, 6.0, 2.0, 0.0, 4.0, 3.0, 2.0, 0.0, 0.0, 3.0].toTensor.reshape([5,2]).opencl
@@ -77,8 +68,8 @@ suite "OpenCL BLAS operations (Basic Linear Algebra Subprograms)":
 
 
   test "Matrix and Vector in-place addition":
-    var u = @[float64 1, 3, -5].toTensor().opencl()
-    let v = @[float64 4, -2, -1].toTensor().opencl()
+    var u = @[1'f64, 3, -5].toTensor().opencl()
+    let v = @[4'f64, -2, -1].toTensor().opencl()
 
     u += v
 
@@ -116,8 +107,8 @@ suite "OpenCL BLAS operations (Basic Linear Algebra Subprograms)":
       z += t2.cpu[0..1,0..1].opencl
 
   test "Matrix and Vector in-place substraction":
-    var u = @[float32 1, 3, -5].toTensor.opencl
-    let v = @[float32 1, 1, 1].toTensor.opencl
+    var u = @[1'f32, 3, -5].toTensor.opencl
+    let v = @[1'f32, 1, 1].toTensor.opencl
 
     u -= v
 
@@ -142,8 +133,8 @@ suite "OpenCL BLAS operations (Basic Linear Algebra Subprograms)":
       a += b.cpu[0..1,0..1].opencl
 
   test "Matrix and vector addition":
-    let u = @[float32 1, 3, -5].toTensor.opencl
-    let v = @[float32 1, 1, 1].toTensor.opencl
+    let u = @[1'f32, 3, -5].toTensor.opencl
+    let v = @[1'f32, 1, 1].toTensor.opencl
 
     check: (u + v).cpu == @[2'f32, 4, -4].toTensor()
 
@@ -159,8 +150,8 @@ suite "OpenCL BLAS operations (Basic Linear Algebra Subprograms)":
       discard a + b.cpu[0..1, 0..1].opencl
 
   test "Matrix and vector substraction":
-    let u = @[float32 1, 3, -5].toTensor.opencl
-    let v = @[float32 1, 1, 1].toTensor.opencl
+    let u = @[1'f32, 3, -5].toTensor.opencl
+    let v = @[1'f32, 1, 1].toTensor.opencl
 
     check: (u - v).cpu == @[0'f32, 2, -6].toTensor()