diff --git a/src/kernels/pad.cpp b/src/kernels/pad.cpp
index cc7f30d..6b2f098 100644
--- a/src/kernels/pad.cpp
+++ b/src/kernels/pad.cpp
@@ -29,8 +29,8 @@ using namespace timer;
 namespace cu {
 
   /**
-   * Perform border padding with constant (0) or replication on src and store it in dest.
-   * Accepts 3-channel 32-bit float matrices.
+   * Perform border padding with constant (0) or replication on src.
+   * Accepts 3-channel 32-bit float matrices. Returns pointer to device dest.
    *
    * Params:
    *   src          input image.
@@ -42,13 +42,10 @@ namespace cu {
    *   right        right padding
    *   replicate    whether to replicate or constant
    */
-  void pad(
-      const cv::Mat& src, cv::Mat& dest, int top,
-      int bottom, int left, int right, bool replicate) {
-
-    if (src.type() != CV_32FC3) {
-      throw std::invalid_argument("pad: invalid input matrix type");
-    }
+  Npp32f* pad(
+      Npp32f* src,
+      int width, int height,
+      int top, int bottom, int left, int right, bool replicate) {
 
     // Compute time of relevant kernel
     double compute_time = 0.0;
@@ -57,17 +54,11 @@ namespace cu {
     int channels = 3;
     size_t elemSize = 3 * sizeof(float);
 
-    cv::Size sz = src.size();
-    int width   = sz.width;
-    int height  = sz.height;
     int destWidth = left + width + right;
     int destHeight = top + height + bottom;
 
     std::cout << "[start] pad: processing " << width << "x" << height << " image" << std::endl;
 
-    // pSrc pointer to image data
-    Npp32f* pHostSrc = (float*) src.data;
-
     // The width, in bytes, of the image, sometimes referred to as pitch
     unsigned int nSrcStep = width * elemSize;
     unsigned int nDstStep = destWidth * elemSize;
@@ -77,26 +68,15 @@ namespace cu {
     NppiSize oDstSizeROI = { destWidth, destHeight };
     const Npp32f padVal[3] = {0.0, 0.0, 0.0};
 
-    auto start_cuda_malloc = now();
-
     // Allocate device memory
-    Npp32f* pDeviceSrc, *pDeviceDst;
-
-    checkCudaErrors( cudaMalloc((void**) &pDeviceSrc, width * height * elemSize) );
+    auto start_cuda_malloc = now();
+    Npp32f* pDeviceDst;
     checkCudaErrors( cudaMalloc((void**) &pDeviceDst, destWidth * destHeight *  elemSize) );
-    checkCudaErrors( cudaMemset(pDeviceDst, 0, destWidth * destHeight * elemSize) );
-
+    if (!replicate)
+      checkCudaErrors( cudaMemset(pDeviceDst, 0, destWidth * destHeight * elemSize) );
     calc_print_elapsed("cudaMalloc", start_cuda_malloc);
 
-
-    auto start_memcpy_hd = now();
-
-    // Copy image to device
-    checkCudaErrors(
-        cudaMemcpy(pDeviceSrc, pHostSrc, width * height * elemSize, cudaMemcpyHostToDevice) );
-
-    calc_print_elapsed("cudaMemcpy H->D", start_memcpy_hd);
-
+    Npp32f* pDeviceSrc = src;
 
     auto start_pad = now();
 
@@ -112,22 +92,9 @@ namespace cu {
 
     compute_time += calc_print_elapsed("pad", start_pad);
 
-
-    auto start_memcpy_dh = now();
-
-    // Copy result to host
-    dest.create(destHeight, destWidth, CV_32FC3);
-
-    checkCudaErrors(
-        cudaMemcpy(dest.data, pDeviceDst,
-          destWidth * destHeight * elemSize, cudaMemcpyDeviceToHost) );
-
-    calc_print_elapsed("cudaMemcpy H<-D", start_memcpy_dh);
-
-    cudaFree((void*) pDeviceSrc);
-    cudaFree((void*) pDeviceDst);
-
     std::cout << "[done] pad: primary compute time: " << compute_time << " (ms)" << std::endl;
+
+    return pDeviceDst;
   }
 
 }
diff --git a/src/kernels/pad.h b/src/kernels/pad.h
index 662a30a..0c49e6c 100644
--- a/src/kernels/pad.h
+++ b/src/kernels/pad.h
@@ -30,8 +30,8 @@
 namespace cu {
 
   /**
-   * Perform border padding with constant (0) or replication on src and store it in dest.
-   * Accepts 3-channel 32-bit float matrices.
+   * Perform border padding with constant (0) or replication on src.
+   * Accepts 3-channel 32-bit float matrices. Returns pointer to dest
    *
    * Params:
    *   src        input image.
@@ -43,9 +43,10 @@ namespace cu {
    *   right      right padding
    *   replicate  whether to replicate or constant pad
    */
-  void pad(
-      const cv::Mat& src, cv::Mat& dest, int top,
-      int bottom, int left, int right, bool replicate);
+  Npp32f *pad(
+      Npp32f* src,
+      int width, int height,
+      int top, int bottom, int left, int right, bool replicate);
 
 }
 
diff --git a/src/kernels/pyramid.cpp b/src/kernels/pyramid.cpp
index 79d9a98..311935c 100644
--- a/src/kernels/pyramid.cpp
+++ b/src/kernels/pyramid.cpp
@@ -30,8 +30,8 @@ using namespace timer;
 namespace cu {
 
   void constructImgPyramids(
-      const cv::Mat& I,
-      cv::Mat* Is, cv::Mat* Ixs, cv::Mat* Iys,
+      Npp32f* src, float** Is, float** Ixs, float** Iys,
+      int width, int height,
       int padding, int nLevels) {
 
     // Timing
@@ -42,13 +42,6 @@ namespace cu {
     int channels = 3;
     size_t elemSize = channels * sizeof(float);
 
-    // Setup
-    Is[0] = I.clone();
-
-    cv::Size sz = I.size();
-    int width   = sz.width;
-    int height  = sz.height;
-
     unsigned int nSrcStep = width * elemSize;
 
     // Gradient params
@@ -77,13 +70,14 @@ namespace cu {
     std::cout << "[start] constructImgPyramids: processing "
       << width << "x" << height << " image" << std::endl;
 
+    Npp32f* pDeviceI = src;
+
     // Allocate device memory (to account for padding too
     auto start_cuda_malloc = now();
-    Npp32f *pDeviceI, *pDeviceIx, *pDeviceIy;
+    Npp32f *pDeviceIx, *pDeviceIy;
     Npp32f *pDevicePaddedI, *pDevicePaddedIx, *pDevicePaddedIy;
     Npp32f *pDeviceTmp, *pDeviceKernel;
 
-    checkCudaErrors( cudaMalloc((void**) &pDeviceI,  width * height * elemSize) );
     checkCudaErrors( cudaMalloc((void**) &pDeviceIx, width * height * elemSize) );
     checkCudaErrors( cudaMalloc((void**) &pDeviceIy, width * height * elemSize) );
 
@@ -99,12 +93,10 @@ namespace cu {
     // Copy over initial image and kernel
     auto start_memcpy_hd = now();
 
-    checkCudaErrors(
-        cudaMemcpy(pDeviceI, (float*) Is[0].data, width * height * elemSize, cudaMemcpyHostToDevice) );
     checkCudaErrors(
         cudaMemcpy(pDeviceKernel, pSrcKernel, nMaskSize * sizeof(Npp32f), cudaMemcpyHostToDevice) );
 
-    calc_print_elapsed("cudaMemcpy I[0] H->D", start_memcpy_hd);
+    calc_print_elapsed("cudaMemcpy Kernel H->D", start_memcpy_hd);
 
     ////////////////////////////////////////////////////////////////////////////////////////////////
     // Apply first gradients to Is[0]
@@ -153,6 +145,8 @@ namespace cu {
           pDevicePaddedI, nDstStep, oPadSize, padding, padding) );
 
     // Pad dx, dy
+    checkCudaErrors( cudaMemset(pDevicePaddedIx, 0, oPadSize.width * oPadSize.height * elemSize) );
+    checkCudaErrors( cudaMemset(pDevicePaddedIy, 0, oPadSize.width * oPadSize.height * elemSize) );
     NPP_CHECK_NPP(
         nppiCopyConstBorder_32f_C3R (
           pDeviceIx, nSrcStep, oSize,
@@ -167,23 +161,23 @@ namespace cu {
     ////////////////////////////////////////////////////////////////////////////////////////////////
 
     auto start_cp_Is0 = now();
-    Is[0].create(oPadSize.height, oPadSize.width, CV_32FC3);
+    Is[0] = new float[oPadSize.height * oPadSize.width * channels];
     checkCudaErrors(
-        cudaMemcpy(Is[0].data, pDevicePaddedI,
+        cudaMemcpy(Is[0], pDevicePaddedI,
           oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
     compute_time += calc_print_elapsed("Is[0] cudaMemcpy D->H", start_cp_Is0);
 
     auto start_cp_dx = now();
-    Ixs[0].create(oPadSize.height, oPadSize.width, CV_32FC3);
+    Ixs[0] = new float[oPadSize.height * oPadSize.width * channels];
     checkCudaErrors(
-        cudaMemcpy(Ixs[0].data, pDevicePaddedIx,
+        cudaMemcpy(Ixs[0], pDevicePaddedIx,
           oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
     compute_time += calc_print_elapsed("Ixs[0] cudaMemcpy D->H", start_cp_dx);
 
     auto start_cp_dy = now();
-    Iys[0].create(oPadSize.height, oPadSize.width, CV_32FC3);
+    Iys[0] = new float[oPadSize.height * oPadSize.width * channels];
     checkCudaErrors(
-        cudaMemcpy(Iys[0].data, pDevicePaddedIy,
+        cudaMemcpy(Iys[0], pDevicePaddedIy,
           oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
     compute_time += calc_print_elapsed("Iys[0] cudaMemcpy D->H", start_cp_dy);
 
@@ -266,6 +260,8 @@ namespace cu {
             pDevicePaddedI, nDstStep, oPadSize, padding, padding) );
 
       // Pad dx, dy
+      checkCudaErrors( cudaMemset(pDevicePaddedIx, 0, oPadSize.width * oPadSize.height * elemSize) );
+      checkCudaErrors( cudaMemset(pDevicePaddedIy, 0, oPadSize.width * oPadSize.height * elemSize) );
       NPP_CHECK_NPP(
           nppiCopyConstBorder_32f_C3R (
             pDeviceIx, nSrcStep, oSize,
@@ -277,28 +273,27 @@ namespace cu {
 
       // Allocate host destinations
       auto start_host_alloc = now();
-      Is[i].create(oPadSize.height, oPadSize.width, CV_32FC3);
-      Ixs[i].create(oPadSize.height, oPadSize.width, CV_32FC3);
-      Iys[i].create(oPadSize.height, oPadSize.width, CV_32FC3);
+      Is[i] = new float[oPadSize.width * oPadSize.height * channels];
+      Ixs[i] = new float[oPadSize.width * oPadSize.height * channels];
+      Iys[i] = new float[oPadSize.width * oPadSize.height * channels];
       compute_time += calc_print_elapsed("host alloc", start_host_alloc);
 
       // Copy over data
       auto start_cp = now();
       checkCudaErrors(
-          cudaMemcpy(Is[i].data, pDevicePaddedI,
+          cudaMemcpy(Is[i], pDevicePaddedI,
             oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
       checkCudaErrors(
-          cudaMemcpy(Ixs[i].data, pDevicePaddedIx,
+          cudaMemcpy(Ixs[i], pDevicePaddedIx,
             oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
       checkCudaErrors(
-          cudaMemcpy(Iys[i].data, pDevicePaddedIy,
+          cudaMemcpy(Iys[i], pDevicePaddedIy,
             oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
       compute_time += calc_print_elapsed("pyramid cudaMemcpy D->H", start_cp);
 
     }
 
     // Clean up
-    cudaFree(pDeviceI);
     cudaFree(pDeviceIx);
     cudaFree(pDeviceIy);
     cudaFree(pDevicePaddedI);
diff --git a/src/kernels/pyramid.h b/src/kernels/pyramid.h
index 3d529eb..c46d6ad 100644
--- a/src/kernels/pyramid.h
+++ b/src/kernels/pyramid.h
@@ -29,8 +29,8 @@
 namespace cu {
 
   void constructImgPyramids(
-      const cv::Mat& I,
-      cv::Mat* Is, cv::Mat* Ixs, cv::Mat* Iys,
+      Npp32f* I, float** Is, float** Ixs, float** Iys,
+      int width, int height,
       int padding, int nLevels);
 
 }
diff --git a/src/oflow.cpp b/src/oflow.cpp
index 9106a8a..5f37d5b 100644
--- a/src/oflow.cpp
+++ b/src/oflow.cpp
@@ -12,6 +12,8 @@
 #include <opencv2/highgui/highgui.hpp> // needed for verbosity >= 3, DISVISUAL
 #include <opencv2/imgproc/imgproc.hpp> // needed for verbosity >= 3, DISVISUAL
 
+#include <nppi.h>
+
 #include <sys/time.h>    // timeof day
 #include <stdio.h>
 
@@ -60,58 +62,21 @@ namespace OFC {
     I1xs = new float*[op.coarsest_scale+1];
     I1ys = new float*[op.coarsest_scale+1];
 
-    I0_mats = new cv::Mat[op.coarsest_scale+1];
-    I1_mats = new cv::Mat[op.coarsest_scale+1];
-    I0x_mats = new cv::Mat[op.coarsest_scale+1];
-    I0y_mats = new cv::Mat[op.coarsest_scale+1];
-    I1x_mats = new cv::Mat[op.coarsest_scale+1];
-    I1y_mats = new cv::Mat[op.coarsest_scale+1];
-
   }
 
 
 
-  void OFClass::ConstructImgPyramids() {
+  void OFClass::ConstructImgPyramids(img_params iparams) {
 
     // Timing structures
     struct timeval start_time, end_time;
     gettimeofday(&start_time, NULL);
 
     // Construct image and gradient pyramides
-    cu::constructImgPyramids(I0, I0_mats, I0x_mats, I0y_mats, op.patch_size, op.coarsest_scale + 1);
-    cu::constructImgPyramids(I1, I1_mats, I1x_mats, I1y_mats, op.patch_size, op.coarsest_scale + 1);
-
-    auto start_pad = now();
-
-    // Pad images
-    for (int i = 0; i <= op.coarsest_scale; ++i) {
-
-      // Replicate padding for images
-      // cu::pad(I0_mats[i], I0_mats[i], op.patch_size, op.patch_size,
-      //     op.patch_size, op.patch_size, true);
-      // cu::pad(I1_mats[i], I1_mats[i], op.patch_size, op.patch_size,
-      //     op.patch_size, op.patch_size, true);
-      I0s[i] = (float*) I0_mats[i].data;
-      I1s[i] = (float*) I1_mats[i].data;
-
-      // Zero pad for gradients
-      // cu::pad(I0x_mats[i], I0x_mats[i], op.patch_size, op.patch_size,
-      //     op.patch_size, op.patch_size, false);
-      // cu::pad(I0y_mats[i], I0y_mats[i], op.patch_size, op.patch_size,
-      //     op.patch_size, op.patch_size, false);
-      // cu::pad(I1x_mats[i], I1x_mats[i], op.patch_size, op.patch_size,
-      //     op.patch_size, op.patch_size, false);
-      // cu::pad(I1y_mats[i], I1y_mats[i], op.patch_size, op.patch_size,
-      //     op.patch_size, op.patch_size, false);
-
-      I0xs[i] = (float*) I0x_mats[i].data;
-      I0ys[i] = (float*) I0y_mats[i].data;
-      I1xs[i] = (float*) I1x_mats[i].data;
-      I1ys[i] = (float*) I1y_mats[i].data;
-
-    }
-
-    calc_print_elapsed("pad images", start_pad);
+    cu::constructImgPyramids(I0, I0s, I0xs, I0ys, iparams.width, iparams.height,
+        op.patch_size, op.coarsest_scale + 1);
+    cu::constructImgPyramids(I1, I1s, I1xs, I1ys, iparams.width, iparams.height,
+        op.patch_size, op.coarsest_scale + 1);
 
     // Timing, image gradients and pyramid
     if (op.verbosity > 1) {
@@ -126,16 +91,15 @@ namespace OFC {
 
 
 
-  void OFClass::calc(cv::Mat _I0, cv::Mat _I1, img_params _iparams, const float * initflow, float * outflow) {
+  void OFClass::calc(Npp32f* _I0, Npp32f* _I1, img_params _iparams, const float * initflow, float * outflow) {
 
     I0 = _I0;
     I1 = _I1;
 
-    std::cout << "I0 " << I0.size() << " channels: " << I0.channels()
-      << " type: " << I0.type() << std::endl;
+    std::cout << "I0 " << _iparams.height << "x" << _iparams.width << std::endl;
 
     printf("Constructing pyramids\n");
-    ConstructImgPyramids();
+    ConstructImgPyramids(_iparams);
 
     if (op.verbosity > 1) cout << ", cflow " << endl;
 
@@ -185,7 +149,6 @@ namespace OFC {
 
     }
 
-
     // Timing, Grid memory allocation
     if (op.verbosity>1) {
 
diff --git a/src/oflow.h b/src/oflow.h
index a05855c..0c01ddb 100644
--- a/src/oflow.h
+++ b/src/oflow.h
@@ -3,6 +3,8 @@
 #ifndef OFC_HEADER
 #define OFC_HEADER
 
+#include <nppi.h>
+
 #include <opencv2/core/core.hpp>
 #include <opencv2/highgui/highgui.hpp>
 #include <opencv2/imgproc/imgproc.hpp>
@@ -76,19 +78,16 @@ namespace OFC {
 
     public:
       OFClass(opt_params _op);
-      void calc(cv::Mat _I0, cv::Mat _I1, img_params _iparams, const float * initflow, float * outflow);
+      void calc(Npp32f* _I0, Npp32f* _I1, img_params _iparams, const float * initflow, float * outflow);
 
     private:
-      void ConstructImgPyramids();
+      void ConstructImgPyramids(img_params iparams);
 
-      cv::Mat I0, I1;
+      Npp32f* I0, * I1;
 
       float ** I0s, ** I0xs, ** I0ys;
       float ** I1s, ** I1xs, ** I1ys;
 
-      cv::Mat * I0_mats, * I0x_mats, * I0y_mats;
-      cv::Mat * I1_mats, * I1x_mats, * I1y_mats;
-
       opt_params op;                     // Struct for optimization parameters
       std::vector<img_params> iparams;    // Struct (for each scale) for image parameter
 
diff --git a/src/run_dense.cpp b/src/run_dense.cpp
index fece28f..24f3c82 100644
--- a/src/run_dense.cpp
+++ b/src/run_dense.cpp
@@ -9,12 +9,14 @@
 #include "oflow.h"
 #include "kernels/warmup.h"
 #include "kernels/pad.h"
+#include "common/timer.h"
 
 // CUDA
 #include <cuda_runtime.h>
 
 using namespace std;
 using namespace OFC;
+using namespace timer;
 
 // Save a Depth/OF/SF as .flo file
 void SaveFlowFile(cv::Mat& img, const char* filename) {
@@ -127,9 +129,32 @@ int main( int argc, char** argv ) {
   // Load images
   I0_mat = cv::imread(I0_file, CV_LOAD_IMAGE_COLOR);   // Read the file
   I1_mat = cv::imread(I1_file, CV_LOAD_IMAGE_COLOR);   // Read the file
+
   int width_org = I0_mat.size().width;   // unpadded original image size
   int height_org = I0_mat.size().height;  // unpadded original image size
 
+  // convert to float
+  I0_mat.convertTo(I0_fmat, CV_32F);
+  I1_mat.convertTo(I1_fmat, CV_32F);
+
+  int channels = 3;
+  int elemSize = channels * sizeof(Npp32f);
+
+  /* MEMCOPY to CUDA */
+  Npp32f* I0, *I1;
+  auto start_cuda_malloc = now();
+  checkCudaErrors( cudaMalloc((void**) &I0, width_org * height_org * elemSize) );
+  checkCudaErrors( cudaMalloc((void**) &I1, width_org * height_org * elemSize) );
+  calc_print_elapsed("I0, I1 cudaMalloc", start_cuda_malloc);
+
+  auto start_memcpy_hd = now();
+  checkCudaErrors(
+      cudaMemcpy(I0, (float*) I0_fmat.data, width_org * height_org * elemSize, cudaMemcpyHostToDevice) );
+  checkCudaErrors(
+      cudaMemcpy(I1, (float*) I1_fmat.data, width_org * height_org * elemSize, cudaMemcpyHostToDevice) );
+  calc_print_elapsed("cudaMemcpy I0, I1 H->D", start_memcpy_hd);
+
+
   // Parse rest of parameters
   opt_params op;
 
@@ -196,9 +221,6 @@ int main( int argc, char** argv ) {
   }
 
 
-  // convert to float
-  I0_mat.convertTo(I0_fmat, CV_32F);
-  I1_mat.convertTo(I1_fmat, CV_32F);
 
   // Pad image such that width and height are restless divisible on all scales (except last)
   int padw = 0, padh = 0;
@@ -209,20 +231,29 @@ int main( int argc, char** argv ) {
   if (div > 0) padh = max_scale - div;
 
   if (padh > 0 || padw > 0) {
-
-    cu::pad(I0_fmat, I0_fmat, floor((float) padh / 2.0f), ceil((float) padh / 2.0f),
+    Npp32f* I0Padded = cu::pad(
+        I0, width_org, height_org, floor((float) padh / 2.0f), ceil((float) padh / 2.0f),
         floor((float) padw / 2.0f), ceil((float) padw / 2.0f), true);
-    cu::pad(I1_fmat, I1_fmat, floor((float) padh / 2.0f), ceil((float) padh / 2.0f),
+
+    Npp32f* I1Padded = cu::pad(
+        I1, width_org, height_org, floor((float) padh / 2.0f), ceil((float) padh / 2.0f),
         floor((float) padw / 2.0f), ceil((float) padw / 2.0f), true);
 
+    cudaFree(I0);
+    cudaFree(I1);
+
+    I0 = I0Padded;
+    I1 = I1Padded;
   }
 
+
+
   // Create image paramaters
   img_params iparams;
 
   // padded image size, ensures divisibility by 2 on all scales (except last)
-  iparams.width = I0_fmat.size().width;
-  iparams.height = I0_fmat.size().height;
+  iparams.width = width_org + padw;
+  iparams.height = height_org + padh;
   iparams.padding = op.patch_size;
 
 
@@ -243,7 +274,7 @@ int main( int argc, char** argv ) {
   float scale_fact = pow(2, op.finest_scale);
   cv::Mat flow_mat(iparams.height / scale_fact , iparams.width / scale_fact, CV_32FC2); // Optical Flow
 
-  ofc.calc(I0_fmat, I1_fmat, iparams, nullptr, (float*) flow_mat.data);
+  ofc.calc(I0, I1, iparams, nullptr, (float*) flow_mat.data);
 
 
   if (op.verbosity > 1) gettimeofday(&start_time, NULL);
diff --git a/src/sandbox/process_pad.cpp b/src/sandbox/process_pad.cpp
index e1253e4..9f21a9d 100644
--- a/src/sandbox/process_pad.cpp
+++ b/src/sandbox/process_pad.cpp
@@ -53,7 +53,8 @@ void process(const char* input_file, const char* output_file) {
 
   auto start_resize = now();
 
-  cu::pad(I0_f, I1_f, 20, 50, 100, 200, true);
+  I1_f.data = (uchar*) cu::pad((float*) I0_f.data, I0_f.size().width, I0_f.size().height,
+      20, 50, 100, 200, true);
 
   calc_print_elapsed("resize", start_resize);