diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index d817b29..0e9a9f3 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -72,7 +72,7 @@ set(CODEFILES
   FDF1.0.1/solver.c)
 
 # RGB, Optical Flow
-cuda_add_executable(flow ${COMMON} ${CODEFILES} ${KERNELS})
+cuda_add_executable(flow ${COMMON} ${KERNELS} ${CODEFILES})
 set_target_properties (flow PROPERTIES COMPILE_DEFINITIONS "SELECTMODE=1")
 set_property(TARGET flow APPEND PROPERTY COMPILE_DEFINITIONS "SELECTCHANNEL=3") # use RGB image
 target_link_libraries(flow ${OpenCV_LIBS})
diff --git a/src/kernels/pyramid.cpp b/src/kernels/pyramid.cpp
index 03bf9b1..b0b1b2c 100644
--- a/src/kernels/pyramid.cpp
+++ b/src/kernels/pyramid.cpp
@@ -31,7 +31,8 @@ namespace cu {
 
   void constructImgPyramids(
       Npp32f* src, float** Is, float** Ixs, float** Iys,
-      int width, int height,
+      Npp32f* pDeviceIx, Npp32f* pDeviceIy, Npp32f* pDeviceTmp,
+      Npp32f* pDeviceWew, int width, int height,
       int padding, int nLevels) {
 
     // Timing
@@ -51,7 +52,6 @@ namespace cu {
     NppiSize  oROI    = { width, height };
 
     // Mask params
-    const Npp32f pSrcKernel[3] = { 1, 0, -1 };
     Npp32s nMaskSize = 3;
     Npp32s nAnchor   = 1;  // Kernel is centered over pixel
 
@@ -72,27 +72,6 @@ namespace cu {
 
     Npp32f* pDeviceI = src;
 
-    // Allocate device memory (to account for padding too
-    auto start_cuda_malloc = now();
-    Npp32f *pDeviceIx, *pDeviceIy;
-    Npp32f *pDeviceTmp, *pDeviceKernel;
-
-    checkCudaErrors( cudaMalloc((void**) &pDeviceIx, width * height * elemSize) );
-    checkCudaErrors( cudaMalloc((void**) &pDeviceIy, width * height * elemSize) );
-
-    checkCudaErrors( cudaMalloc((void**) &pDeviceTmp,    width * height * elemSize)  );
-    checkCudaErrors( cudaMalloc((void**) &pDeviceKernel, nMaskSize * sizeof(Npp32f)) );
-
-    calc_print_elapsed("cudaMalloc", start_cuda_malloc);
-
-    // Copy over initial image and kernel
-    auto start_memcpy_hd = now();
-
-    checkCudaErrors(
-        cudaMemcpy(pDeviceKernel, pSrcKernel, nMaskSize * sizeof(Npp32f), cudaMemcpyHostToDevice) );
-
-    calc_print_elapsed("cudaMemcpy Kernel H->D", start_memcpy_hd);
-
     ////////////////////////////////////////////////////////////////////////////////////////////////
     // Apply first gradients to Is[0]
     ////////////////////////////////////////////////////////////////////////////////////////////////
@@ -107,7 +86,7 @@ namespace cu {
         nppiFilterRowBorder_32f_C3R (
           pDeviceI, nSrcStep, oSize, oOffset,
           pDeviceIx, nSrcStep, oROI,
-          pDeviceKernel, nMaskSize, nAnchor, eBorderType)
+          pDeviceWew, nMaskSize, nAnchor, eBorderType)
         );
     compute_time += calc_print_elapsed("sobel: Ixs[0]", start_dx);
 
@@ -121,7 +100,7 @@ namespace cu {
         nppiFilterColumnBorder_32f_C3R (
           pDeviceI, nSrcStep, oSize, oOffset,
           pDeviceIy, nSrcStep, oROI,
-          pDeviceKernel, nMaskSize, nAnchor, eBorderType)
+          pDeviceWew, nMaskSize, nAnchor, eBorderType)
         );
     compute_time += calc_print_elapsed("sobel: Iys[0]", start_dy);
 
@@ -194,7 +173,7 @@ namespace cu {
           nppiFilterRowBorder_32f_C3R (
             pDeviceI, nSrcStep, oSize, oOffset,
             pDeviceIx, nSrcStep, oROI,
-            pDeviceKernel, nMaskSize, nAnchor, eBorderType)
+            pDeviceWew, nMaskSize, nAnchor, eBorderType)
           );
       compute_time += calc_print_elapsed("sobel: Ixs[i]", start_dx);
 
@@ -208,7 +187,7 @@ namespace cu {
           nppiFilterColumnBorder_32f_C3R (
             pDeviceI, nSrcStep, oSize, oOffset,
             pDeviceIy, nSrcStep, oROI,
-            pDeviceKernel, nMaskSize, nAnchor, eBorderType)
+            pDeviceWew, nMaskSize, nAnchor, eBorderType)
           );
       compute_time += calc_print_elapsed("sobel: Iys[i]", start_dy);
 
@@ -239,12 +218,6 @@ namespace cu {
 
     }
 
-    // Clean up
-    cudaFree(pDeviceIx);
-    cudaFree(pDeviceIy);
-    cudaFree(pDeviceTmp);
-    cudaFree(pDeviceKernel);
-
     calc_print_elapsed("total time", start_total);
     std::cout << "[done] constructImgPyramids: primmary compute time: " << compute_time  << std::endl;
   }
diff --git a/src/kernels/pyramid.h b/src/kernels/pyramid.h
index c46d6ad..407ef52 100644
--- a/src/kernels/pyramid.h
+++ b/src/kernels/pyramid.h
@@ -30,7 +30,8 @@ namespace cu {
 
   void constructImgPyramids(
       Npp32f* I, float** Is, float** Ixs, float** Iys,
-      int width, int height,
+      Npp32f* pDeviceIx, Npp32f* pDeviceIy, Npp32f* pDeviceTmp,
+      Npp32f* pDeviceWew, int width, int height,
       int padding, int nLevels);
 
 }
diff --git a/src/oflow.cpp b/src/oflow.cpp
index 7050848..b8ea919 100644
--- a/src/oflow.cpp
+++ b/src/oflow.cpp
@@ -18,15 +18,12 @@
 #include <stdio.h>
 
 #include "oflow.h"
-#include "patchgrid.h"
-#include "refine_variational.h"
 
 #include "kernels/resize.h"
 #include "kernels/pad.h"
 #include "kernels/resizeGrad.h"
 #include "kernels/sobel.h"
 #include "kernels/pyramid.h"
-#include "common/RgbMat.h"
 #include "common/timer.h"
 
 
@@ -76,6 +73,7 @@ namespace OFC {
     if (op.verbosity>1) gettimeofday(&tv_start_all, nullptr);
 
 
+    int elemSize = 3 * sizeof(float);
     grid.resize(op.n_scales);
     flow.resize(op.n_scales);
     iparams.resize(op.n_scales);
@@ -100,7 +98,6 @@ namespace OFC {
         grid[i]   = new OFC::PatGridClass(&(iparams[i]), &op);
       }
 
-      int elemSize = 3 * sizeof(float);
       int padWidth = _iparams.width * scale_fact + 2 * _iparams.padding;
       int padHeight = _iparams.height * scale_fact + 2 * _iparams.padding;
 
@@ -122,6 +119,18 @@ namespace OFC {
 
     }
 
+    const Npp32f pSrcKernel[3] = { 1, 0, -1 };
+    Npp32s nMaskSize = 3;
+
+    checkCudaErrors( cudaMalloc((void**) &pDeviceIx, _iparams.width * _iparams.height * elemSize) );
+    checkCudaErrors( cudaMalloc((void**) &pDeviceIy, _iparams.width * _iparams.height * elemSize) );
+
+    checkCudaErrors( cudaMalloc((void**) &pDeviceTmp, _iparams.width * _iparams.height * elemSize)  );
+    checkCudaErrors( cudaMalloc((void**) &pDeviceWew, nMaskSize * sizeof(Npp32f)) );
+
+    checkCudaErrors(
+        cudaMemcpy(pDeviceWew, pSrcKernel, nMaskSize * sizeof(Npp32f), cudaMemcpyHostToDevice) );
+
     // Timing, Setup
     if (op.verbosity>1) {
 
@@ -160,6 +169,10 @@ namespace OFC {
     delete I1xs;
     delete I1ys;
 
+    cudaFree(pDeviceIx);
+    cudaFree(pDeviceIy);
+    cudaFree(pDeviceTmp);
+    cudaFree(pDeviceWew);
   }
 
 
@@ -170,9 +183,13 @@ namespace OFC {
     gettimeofday(&start_time, NULL);
 
     // Construct image and gradient pyramides
-    cu::constructImgPyramids(I0, I0s, I0xs, I0ys, iparams.width, iparams.height,
+    cu::constructImgPyramids(I0, I0s, I0xs, I0ys,
+        pDeviceIx, pDeviceIy, pDeviceTmp, pDeviceWew,
+        iparams.width, iparams.height,
         op.patch_size, op.coarsest_scale + 1);
-    cu::constructImgPyramids(I1, I1s, I1xs, I1ys, iparams.width, iparams.height,
+    cu::constructImgPyramids(I1, I1s, I1xs, I1ys,
+        pDeviceIx, pDeviceIy, pDeviceTmp, pDeviceWew,
+        iparams.width, iparams.height,
         op.patch_size, op.coarsest_scale + 1);
 
     // Timing, image gradients and pyramid
diff --git a/src/oflow.h b/src/oflow.h
index d924f1a..c553c76 100644
--- a/src/oflow.h
+++ b/src/oflow.h
@@ -12,6 +12,7 @@
 #include <opencv2/imgproc/imgproc.hpp>
 #include "params.h"
 #include "patchgrid.h"
+#include "refine_variational.h"
 
 using std::cout;
 using std::endl;
@@ -39,6 +40,9 @@ namespace OFC {
 
       std::vector<PatGridClass*> grid;
       std::vector<float*> flow;
+
+      // Temp images to speedup pyramid generation
+      Npp32f* pDeviceIx, *pDeviceIy, *pDeviceTmp, *pDeviceWew;
   };
 
 }
diff --git a/src/params.h b/src/params.h
index cd51f9c..bee846c 100644
--- a/src/params.h
+++ b/src/params.h
@@ -1,7 +1,7 @@
 // Holds all of the paramaters structures
 
-#ifndef PARAMS_HEADER
-#define PARAMS_HEADER
+#ifndef OFC_PARAMS_HEADER
+#define OFC_PARAMS_HEADER
 
 namespace OFC {
 
@@ -68,4 +68,4 @@ namespace OFC {
 
 }
 
-#endif /* PARAMS_HEADER */
+#endif /* OFC_PARAMS_HEADER */
diff --git a/src/run_dense.cpp b/src/run_dense.cpp
index a6655d9..b307080 100644
--- a/src/run_dense.cpp
+++ b/src/run_dense.cpp
@@ -6,14 +6,16 @@
 #include <sys/time.h>
 #include <fstream>
 
+// CUDA
+#include <cuda_runtime.h>
+#include <cublas_v2.h>
+
+#include "params.h"
 #include "oflow.h"
 #include "kernels/warmup.h"
 #include "kernels/pad.h"
 #include "common/timer.h"
 
-// CUDA
-#include <cuda_runtime.h>
-#include <cublas_v2.h>
 
 using namespace std;
 using namespace OFC;