diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..813e4f9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +*.sw* +*.bak* +build +build/* diff --git a/ROIWarping.cu b/ROIWarping.cu new file mode 100644 index 0000000..23518ca --- /dev/null +++ b/ROIWarping.cu @@ -0,0 +1,595 @@ +// ------------------------------------------------------------------ +// Fast R-CNN +// Copyright (c) 2015 Microsoft +// Licensed under The MIT License [see fast-rcnn/LICENSE for details] +// Written by Ross Girshick +// ------------------------------------------------------------------ + +// Torch port: +// IMAGINE, Sergey Zagoruyko, Francisco Massa, 2015 + +#include "THC.h" +#include +#include +#include "assert.h" + +#include "common.h" + + +using std::max; +using std::min; + + +template +__global__ void ROIWarpForward(const int nthreads, const Dtype* bottom_data, + const Dtype spatial_scale, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const Dtype* bottom_rois, const Dtype* bottom_delta_rois, Dtype* top_data, Dtype* top_data_buffer) { + CUDA_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + bottom_rois += n * 5; + int roi_batch_ind = (bottom_rois[0] - 1); + //int roi_start_w = round((bottom_rois[1] - 1) * spatial_scale); + //int roi_start_h = round((bottom_rois[2] - 1)* spatial_scale); + //int roi_end_w = round((bottom_rois[3] - 1) * spatial_scale); + //int roi_end_h = round((bottom_rois[4] - 1) * spatial_scale); + + Dtype src_w = bottom_rois[3] - bottom_rois[1] + 1; + Dtype src_h = bottom_rois[4] - bottom_rois[2] + 1; + Dtype src_ctr_x = bottom_rois[1] + 0.5*(src_w-1.0); + Dtype src_ctr_y = bottom_rois[2] + 0.5*(src_h-1.0); + + Dtype dst_ctr_x = bottom_delta_rois[1]; // dx (in fast-rcnn notation) = cx (in here) + Dtype dst_ctr_y = bottom_delta_rois[2]; // dy (in fast-rcnn notation) = cy (in here) + Dtype dst_scl_x = bottom_delta_rois[3]; // dw (in fast-rcnn notation) = sx (in here) + Dtype dst_scl_y = bottom_delta_rois[4]; // dh (in fast-rcnn notation) = sy (in here) + + Dtype pred_ctr_x = dst_ctr_x * src_w + src_ctr_x; + Dtype pred_ctr_y = dst_ctr_y * src_h + src_ctr_y; + Dtype pred_w = exp(dst_scl_x) * src_w; + Dtype pred_h = exp(dst_scl_y) * src_h; + + Dtype roi_start_w = ( (pred_ctr_x - 0.5*(pred_w-1)) - 1 ) * spatial_scale; + Dtype roi_start_h = ( (pred_ctr_y - 0.5*(pred_h-1)) - 1 ) * spatial_scale; + Dtype roi_end_w = ( (pred_ctr_x + 0.5*(pred_w-1)) - 1 ) * spatial_scale; + Dtype roi_end_h = ( (pred_ctr_y + 0.5*(pred_h-1)) - 1 ) * spatial_scale; + assert(roi_end_w - roi_start_w >= 0); + assert(roi_end_h - roi_start_h >= 0); + + // Force malformed ROIs to be 1x1 + Dtype roi_width = roi_end_w - roi_start_w + 1; + Dtype roi_height = roi_end_h - roi_start_h + 1; + + Dtype bin_size_w = roi_width / static_cast(pooled_width); + Dtype bin_size_h = roi_height / static_cast(pooled_height); + Dtype wstart_ = static_cast(pw) * bin_size_w + roi_start_w; + Dtype hstart_ = static_cast(ph) * bin_size_h + roi_start_h; + Dtype wend_ = static_cast(pw+1) * bin_size_w + roi_start_w; + Dtype hend_ = static_cast(ph+1) * bin_size_h + roi_start_h; + + int wstart = static_cast(floor(wstart_)); + int hstart = static_cast(floor(hstart_)); + int wend = static_cast( ceil(wend_)); + int hend = static_cast( 
ceil(hend_)); + + Dtype wctr = (wend_ + wstart_) * 0.5; // dwctr / dwe = 0.5; dwctr / dws = 0.5 + Dtype hctr = (hend_ + hstart_) * 0.5; // dhctr / dhe = 0.5; dhctr / dhs = 0.5 + Dtype wdiff = (wend_ - wstart_) + 1; // dwdiff / dwe = 1; dwdiff / dws = -1 + Dtype hdiff = (hend_ - hstart_) + 1; // dhdiff / dhe = 1; dhdiff / dhs = -1 + + //top_data[index] = static_cast(hend-1-hstart)+1; + //top_data[index] = hend; //wend; + //top_data[index] = hstart+1; // wstart+1; + //top_data[index] = wdiff; + //top_data[index] = hctr+1; + //top_data[index] = wctr+1; + + // Add roi offsets and clip to input boundaries + hstart = min(max(hstart, 0), height); + hend = min(max(hend, 0), height); + wstart = min(max(wstart, 0), width); + wend = min(max(wend, 0), width); + + //top_data[index] = hstart+1; + //top_data[index] = wstart+1; + + // Auxilliary variables used in backprop + Dtype w_mask = 0, h_mask = 0; + Dtype dgx_final_dwctr_all = 0; + Dtype dgx_final_dwdiff_all = 0; + Dtype dgy_final_dhctr_all = 0; + Dtype dgy_final_dhdiff_all = 0; + // Define an empty pooling region to be zero + Dtype val = 0; Dtype gain = 0, gain_x = 0, gain_y = 0, gain_x_all = 0, gain_y_all = 0; + bottom_data += (roi_batch_ind * channels + c) * height * width; + for (int h = hstart; h < hend; ++h) { + Dtype h_ = h; + for (int w = wstart; w < wend; ++w) { + int bottom_index = h * width + w; + Dtype w_ = w; + //gain_x = max(0., 1 - abs( dst_ctr_x + static_cast(pw) / static_cast(pooled_width) * dst_scl_x - w )); -- in paper, but makes no sense + //gain_y = max(0., 1 - abs( dst_ctr_y + static_cast(ph) / static_cast(pooled_height) * dst_scl_y - h)); + gain_x = wdiff - abs((w_ - wctr)); + gain_y = hdiff - abs((h_ - hctr)); + gain = gain_x * gain_y; + + val = val + gain * bottom_data[bottom_index]; + //val = val + gain; + //val = val + 1; + + if (h == hstart) { + gain_x_all = gain_x_all + gain_x; + + // Update information used in backprop + w_mask = w_ >= wctr ? 1 : -1; + dgx_final_dwctr_all = dgx_final_dwctr_all + w_mask; + dgx_final_dwdiff_all = dgx_final_dwdiff_all + 1; + } + } + gain_y_all = gain_y_all + gain_y; + + h_mask = h >= hctr ? 
1 : -1; + dgy_final_dhctr_all = dgy_final_dhctr_all + h_mask; + dgy_final_dhdiff_all = dgy_final_dhdiff_all + 1; + } + if (gain_x_all > 1e-10) + val = val / gain_x_all; + if (gain_y_all > 1e-10) + val = val / gain_y_all; + top_data[index] = val; + + //top_data[index] = gain_x_all; + //top_data[index] = gain_y_all; + int buffer_index = n * (channels * pooled_height * pooled_width * 10) + c * (pooled_height * pooled_width * 10) + ph * (pooled_width * 10) + pw * 10; + top_data_buffer[buffer_index+0] = wctr; + top_data_buffer[buffer_index+1] = wdiff; + top_data_buffer[buffer_index+2] = hctr; + top_data_buffer[buffer_index+3] = hdiff; + top_data_buffer[buffer_index+4] = gain_x_all; + top_data_buffer[buffer_index+5] = gain_y_all; + top_data_buffer[buffer_index+6] = dgx_final_dwctr_all; + top_data_buffer[buffer_index+7] = dgy_final_dhctr_all; + top_data_buffer[buffer_index+8] = dgx_final_dwdiff_all; + top_data_buffer[buffer_index+9] = dgy_final_dhdiff_all; + } +} + +extern "C" +void inn_ROIWarping_updateOutput(THCState *state, + THCudaTensor *output, THCudaTensor *output_buffer, + THCudaTensor *data, THCudaTensor* rois, THCudaTensor* delta_rois, int W, int H, double spatial_scale) +{ + THAssert(THCudaTensor_nDimension(state, data) == 4); + THAssert(THCudaTensor_nDimension(state, rois) == 2 && rois->size[1] == 5); + THAssert(THCudaTensor_nDimension(state, delta_rois) == 2 && delta_rois->size[1] == 5); + THAssert(THCudaTensor_nDimension(state, rois) == THCudaTensor_nDimension(state, delta_rois) && + rois->size[0] == delta_rois->size[0] && + rois->size[1] == delta_rois->size[1]); + THAssert(THCudaTensor_isContiguous(state, data)); + THAssert(THCudaTensor_isContiguous(state, rois)); + THAssert(THCudaTensor_isContiguous(state, delta_rois)); + long num_rois = rois->size[0]; + long nInputPlane = data->size[1]; + THCudaTensor_resize4d(state, output, num_rois, nInputPlane, H, W); + THCudaTensor_resize5d(state, output_buffer, num_rois, nInputPlane, H, W, 10); + THCudaTensor_zero(state, output_buffer); + + long count = THCudaTensor_nElement(state, output); + + ROIWarpForward<<>>( + count, + THCudaTensor_data(state, data), + spatial_scale, nInputPlane, data->size[2], data->size[3], H, W, + THCudaTensor_data(state, rois), + THCudaTensor_data(state, delta_rois), + THCudaTensor_data(state, output), + THCudaTensor_data(state, output_buffer) + ); + + // check for errors + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("error in inn_ROIWarping_updateOutput: %s\n", cudaGetErrorString(err)); + THError("aborting"); + } +} + +template +__global__ void ROIWarpBackwardData(const int nthreads, const Dtype* top_data_buffer, + const Dtype spatial_scale, const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int nth_roi, + const Dtype* bottom_rois, const Dtype* bottom_delta_rois, + const Dtype* top_diff, + Dtype* bottom_diff_data) { + CUDA_KERNEL_LOOP(index, nthreads) { + + // (n, c, h, w) is an element in the input + int w = index % width; + int h = (index / width) % height; + int c = (index / width / height) % channels; + int n = index / width / height / channels; + + bottom_rois += nth_roi * 5; + int roi_batch_ind = (bottom_rois[0] - 1); + + if (roi_batch_ind == n) { + + Dtype src_w = bottom_rois[3] - bottom_rois[1] + 1; + Dtype src_h = bottom_rois[4] - bottom_rois[2] + 1; + Dtype src_ctr_x = bottom_rois[1] + 0.5*(src_w-1.0); + Dtype src_ctr_y = bottom_rois[2] + 0.5*(src_h-1.0); + + Dtype dst_ctr_x = bottom_delta_rois[1]; // dx (in 
fast-rcnn notation) = cx (in here) + Dtype dst_ctr_y = bottom_delta_rois[2]; // dy (in fast-rcnn notation) = cy (in here) + Dtype dst_scl_x = bottom_delta_rois[3]; // dw (in fast-rcnn notation) = sx (in here) + Dtype dst_scl_y = bottom_delta_rois[4]; // dh (in fast-rcnn notation) = sy (in here) + + Dtype pred_ctr_x = dst_ctr_x * src_w + src_ctr_x; // dpcx / dcx = src_w + Dtype pred_ctr_y = dst_ctr_y * src_h + src_ctr_y; // dpcy / dcy = src_h + Dtype pred_w = exp(dst_scl_x) * src_w; // dpw / dsx = src_w * exp(dsx) + Dtype pred_h = exp(dst_scl_y) * src_h; // dph / dsy = src_h * exp(dsy) + + Dtype roi_start_w = ( (pred_ctr_x - 0.5*(pred_w-1)) - 1 ) * spatial_scale; // drsw / dpcx = spatial_scale; drsw / dpw = -0.5 * spatial_scale + Dtype roi_start_h = ( (pred_ctr_y - 0.5*(pred_h-1)) - 1 ) * spatial_scale; // drsh / dpcy = spatial_scale; drsh / dph = -0.5 * spatial_scale + Dtype roi_end_w = ( (pred_ctr_x + 0.5*(pred_w-1)) - 1 ) * spatial_scale; // drew / dpcx = spatial_scale; drew / dpw = 0.5 * spatial_scale + Dtype roi_end_h = ( (pred_ctr_y + 0.5*(pred_h-1)) - 1 ) * spatial_scale; // dreh / dpcy = spatial_scale; dreh / dph = 0.5 * spatial_scale + assert(roi_end_w - roi_start_w >= 0); + assert(roi_end_h - roi_start_h >= 0); + + Dtype roi_width = roi_end_w - roi_start_w + 1; + Dtype roi_height = roi_end_h - roi_start_h + 1; + + Dtype bin_size_pw = static_cast(pooled_width) / roi_width; + Dtype bin_size_ph = static_cast(pooled_height) / roi_height; + + int pwstart = static_cast(floor(static_cast(-roi_start_w + w) * bin_size_pw)); + int phstart = static_cast(floor(static_cast(-roi_start_h + h) * bin_size_ph)); + int pwend = static_cast(ceil(static_cast(-roi_start_w + w+1) * bin_size_pw)); + int phend = static_cast(ceil(static_cast(-roi_start_h + h+1) * bin_size_ph)); + + //bottom_diff_data[index] = pwend; //phend; + //bottom_diff_data[index] = pwstart+1; //phend; + + // Clip to top boundaries + phstart = min(max(phstart, 0), pooled_height); + phend = min(max(phend, 0), pooled_height); + pwstart = min(max(pwstart, 0), pooled_width); + pwend = min(max(pwend, 0), pooled_width); + + Dtype w_ = w, h_ = h; + Dtype wctr = 0, wdiff = 0, hctr = 0, hdiff = 0; + Dtype gain = 0, gain_x = 0, gain_y = 0, gain_x_all = 0, gain_y_all = 0; + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + int top_index = nth_roi * (channels * pooled_height * pooled_width) + c * (pooled_height * pooled_width) + ph * pooled_width + pw; + int top_buffer_index = nth_roi * (channels * pooled_height * pooled_width * 10) + c * (pooled_height * pooled_width * 10) + ph * (pooled_width * 10) + pw * 10; + wctr = top_data_buffer[top_buffer_index+0]; + wdiff = top_data_buffer[top_buffer_index+1]; + hctr = top_data_buffer[top_buffer_index+2]; + hdiff = top_data_buffer[top_buffer_index+3]; + gain_x_all = top_data_buffer[top_buffer_index+4]; + gain_y_all = top_data_buffer[top_buffer_index+5]; + + gain_x = wdiff - abs((w_ - wctr)); // dgx / dwdiff = 1 + // dgx / dwctr = 1 ( if w >= wctr ) + // dgx / dwctr = - 1 ( else ) + gain_y = hdiff - abs((h_ - hctr)); // dgy / dhdiff = 1 + // dgy / dhctr = 1 ( if h >= hctr ) + // dgy / dhctr = - 1 ( else ) + if (gain_x_all > 1e-10) + gain_x = gain_x / gain_x_all; + if (gain_y_all > 1e-10) + gain_y = gain_y / gain_y_all; + + gain = gain_x * gain_y; + bottom_diff_data[index] = bottom_diff_data[index] + gain * top_diff[top_index]; //val = val + gain * bottom_data[bottom_index]; + } + } + } + } +} + +template +__global__ void ROIWarpBackwardDeltaROI(const int nthreads, 
const Dtype* top_data_buffer, + const Dtype spatial_scale, const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, + const Dtype* bottom_rois, const Dtype* bottom_delta_rois, + const Dtype* top_diff, + const Dtype* bottom_data, + Dtype* bottom_diff_delta_rois_buffer) { + CUDA_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + int buffer_index = n * (channels * pooled_height * pooled_width * 10) + c * (pooled_height * pooled_width * 10) + ph * (pooled_width * 10) + pw * 10; + Dtype wctr = top_data_buffer[buffer_index+0]; + Dtype wdiff = top_data_buffer[buffer_index+1]; + Dtype hctr = top_data_buffer[buffer_index+2]; + Dtype hdiff = top_data_buffer[buffer_index+3]; + Dtype gain_x_all = top_data_buffer[buffer_index+4]; + Dtype gain_y_all = top_data_buffer[buffer_index+5]; + Dtype dgx_final_dwctr_all = top_data_buffer[buffer_index+6]; + Dtype dgy_final_dhctr_all = top_data_buffer[buffer_index+7]; + Dtype dgx_final_dwdiff_all = top_data_buffer[buffer_index+8]; + Dtype dgy_final_dhdiff_all = top_data_buffer[buffer_index+9]; + + //if (gain_x_all > 1e-10 && gain_y_all > 1e-10) { + + bottom_rois += n * 5; + int roi_batch_ind = (bottom_rois[0] - 1); + + Dtype src_w = bottom_rois[3] - bottom_rois[1] + 1; + Dtype src_h = bottom_rois[4] - bottom_rois[2] + 1; + Dtype src_ctr_x = bottom_rois[1] + 0.5*(src_w-1.0); + Dtype src_ctr_y = bottom_rois[2] + 0.5*(src_h-1.0); + + Dtype dst_ctr_x = bottom_delta_rois[1]; // dx (in fast-rcnn notation) = cx (in here) + Dtype dst_ctr_y = bottom_delta_rois[2]; // dy (in fast-rcnn notation) = cy (in here) + Dtype dst_scl_x = bottom_delta_rois[3]; // dw (in fast-rcnn notation) = sx (in here) + Dtype dst_scl_y = bottom_delta_rois[4]; // dh (in fast-rcnn notation) = sy (in here) + + Dtype pred_ctr_x = dst_ctr_x * src_w + src_ctr_x; // dpcx / dcx = src_w + Dtype pred_ctr_y = dst_ctr_y * src_h + src_ctr_y; // dpcy / dcy = src_h + Dtype pred_w = exp(dst_scl_x) * src_w; // dpw / dsx = src_w * exp(dsx) + Dtype pred_h = exp(dst_scl_y) * src_h; // dph / dsy = src_h * exp(dsy) + + Dtype roi_start_w = ( (pred_ctr_x - 0.5*(pred_w-1)) - 1 ) * spatial_scale; // drsw / dpcx = spatial_scale + // drsw / dpw = -0.5 * spatial_scale + Dtype roi_start_h = ( (pred_ctr_y - 0.5*(pred_h-1)) - 1 ) * spatial_scale; // drsh / dpcy = spatial_scale + // drsh / dph = -0.5 * spatial_scale + Dtype roi_end_w = ( (pred_ctr_x + 0.5*(pred_w-1)) - 1 ) * spatial_scale; // drew / dpcx = spatial_scale + // drew / dpw = 0.5 * spatial_scale + Dtype roi_end_h = ( (pred_ctr_y + 0.5*(pred_h-1)) - 1 ) * spatial_scale; // dreh / dpcy = spatial_scale + // dreh / dph = 0.5 * spatial_scale + assert(roi_end_w - roi_start_w >= 0); + assert(roi_end_h - roi_start_h >= 0); + + // drsw / dcx = drsw / dpcx * dpcx / dcx = spatial_scale * src_w + // drew / dcx = drew / dpcx * dpcx / dcx = spatial_scale * src_w + + // drsh / dcy = drsh / dpcy * dpcy / dcy = spatial_scale * src_h + // dreh / dcy = dreh / dpcy * dpcy / dcy = spatial_scale * src_h + + // drsw / dsx = drsw / dpw * dpw / dsx = -0.5 * spatial_scale * src_w * exp(dsx) + // drew / dsx = drew / dpw * dpw / dsx = 0.5 * spatial_scale * src_w * exp(dsx) + + // drsh / dsy = drsh / dph * dph / dsy = -0.5 * spatial_scale * src_h * exp(dsy) + // dreh / dsy = dreh / dph * dph / dsy = 0.5 * 
spatial_scale * src_h * exp(dsy) + + // Force malformed ROIs to be 1x1 + Dtype roi_width = roi_end_w - roi_start_w + 1; // drw / drew = 1 + // drw / drsw = -1 + Dtype roi_height = roi_end_h - roi_start_h + 1; // drh / dreh = 1 + // drh / drsh = -1 + // drw / dcx = drw / drew * drew / dcx + drw / drsw * drsw / dcx = drew / dcx - drsw / dcx + // = spatial_scale * src_w - spatial_scale * src_w = 0 + // drh / dcy = drh / dreh * dreh / dcy + drh / drsh * drsh / dcy = dreh / dcy - drsh / dcy = spatial_scale * src_h - spatial_scale * src_h = 0 + // drw / dsx = drw / drew * drew / dsx + drw / drsw * drsw / dsx = drew / dsx - drsw / dsx = 0.5 * spatial_scale * src_w * exp(dsx) - (-0.5 * spatial_scale * src_w * exp(dsx)) = spatial_scale * src_w * exp(dsx) + // drh / dsy = drh / dreh * dreh / dsy + drh / drsh * drsh / dsy = dreh / dsy - drsh / dsy = 0.5 * spatial_scale * src_h * exp(dsy) - (-0.5 * spatial_scale * src_h * exp(dsy)) = spatial_scale * src_h * exp(dsy) + + Dtype bin_size_w = roi_width / static_cast(pooled_width); // dbw / drw = 1 / pooled_width + Dtype bin_size_h = roi_height / static_cast(pooled_height); // dbh / drh = 1 / pooled_height + // dbw / dcx = dbw / drw * drw / dcx = 0 + // dbh / dcy = dbh / drh * drh / dcy = 0 + // dbw / dsx = dbw / drw * drw / dsx = 1 / pooled_width * spatial_scale * src_w * exp(dsx) + // dbh / dsy = dbh / drh * drh / dsy = 1 / pooled_height * spatial_scale * src_h * exp(dsy) + + Dtype wstart_ = static_cast(pw) * bin_size_w + roi_start_w; // ws = f(rsw, rew) + Dtype hstart_ = static_cast(ph) * bin_size_h + roi_start_h; // hw = f(rsh, reh) + Dtype wend_ = static_cast(pw+1) * bin_size_w + roi_start_w; // we = f(rsw, rew) + Dtype hend_ = static_cast(ph+1) * bin_size_h + roi_start_h; // he = f(rsh, reh) + // dws / dbw = pw + // dhs / dbh = ph + // dwe / dbw = (pw+1) + // dhe / dbh = (ph+1) + + int wstart = static_cast(floor(wstart_)); + int hstart = static_cast(floor(hstart_)); + int wend = static_cast( ceil(wend_)); + int hend = static_cast( ceil(hend_)); + + // dws / dcx = dws / drsw * drsw / dcx + dws / drew * drew / dcx + // = (dws / dbw * dbw / drsw + 1) * drsw / dcx + (dws / dbw * dbw / drew) * drew / dcx + // = (pw * 1 / pooled_width * (-1) + 1) * spatial_scale * src_w + // + (pw * 1 / pooled_width * ( 1) ) * spatial_scale * src_w + // = spatial_scale * src_w + // dwe / dcx = dwe / drsw * drsw / dcx + dwe / drew * drew / dcx + // = (dwe / dbw * dbw / drsw + 1) * drsw / dcx + (dwe / dbw * dbw / drew) * drew / dcx + // = ((pw+1) * 1 / pooled_width * (-1) + 1) * spatial_scale * src_w + // + ((pw+1) * 1 / pooled_width * ( 1) ) * spatial_scale * src_w + // = spatial_scale * src_w + + // dws / dsx = dws / drsw * drsw / dsx + dws / drew * drew / dsx + // = (dws / dbw * dbw / drsw + 1) * drsw / dsx + (dws / dbw * dbw / drew) * drew / dsx + // = (pw * 1 / pooled_width * (-1) + 1) * (-0.5 * spatial_scale * src_w * exp(dsx)) + // + (pw * 1 / pooled_width * ( 1) ) * ( 0.5 * spatial_scale * src_w * exp(dsx)) + // = (pw * 1 / pooled_width - 0.5) * spatial_scale * src_w * exp(dsx) + // dwe / dsx = dwe / drsw * drsw / dsx + dwe / drew * drew / dsx + // = (dwe / dbw * dbw / drsw + 1) * drsw / dsx + (dwe / dbw * dbw / drew) * drew / dsx + // = ((pw+1) * 1 / pooled_width * (-1) + 1) * (-0.5 * spatial_scale * src_w * exp(dsx)) + // + ((pw+1) * 1 / pooled_width * ( 1) ) * ( 0.5 * spatial_scale * src_w * exp(dsx)) + // = ((pw+1) * 1 / pooled_width - 0.5) * spatial_scale * src_w * exp(dsx) + + // dws / dcy = spatial_scale * src_h + // dwe / dcy = spatial_scale * src_h + // dws 
/ dsy = ( ph * 1 / pooled_height - 1) * spatial_scale * src_h * exp(dsy) + // dwe / dsy = ((ph+1) * 1 / pooled_height - 1) * spatial_scale * src_h * exp(dsy) + /* + Dtype wctr = (wend_ + wstart_) * 0.5; // dwctr / dwe = 0.5; dwctr / dws = 0.5 + Dtype hctr = (hend_ + hstart_) * 0.5; // dhctr / dhe = 0.5; dhctr / dhs = 0.5 + Dtype wdiff = (wend_ - wstart_) + 1; // dwdiff / dwe = 1; dwdiff / dws = -1 + Dtype hdiff = (hend_ - hstart_) + 1; // dhdiff / dhe = 1; dhdiff / dhs = -1 + + // dwctr / dcx = dwctr / dwe * dwe / dcx + dwctr / dws * dws / dcx = 0.5 * spatial_scale * src_w + 0.5 * spatial_scale * src_w = spatial_scale * src_w + // dwdiff / dcx = dwdiff / dwe * dwe / dcx + dwdiff / dws * dws / dcx = 1 * spatial_scale * src_w - 1 * spatial_scale * src_w = 0 + // dhctr / dcy = spatial_scale * src_h + // dhdiff / dcy = 0 + + // dwctr / dsx = dwctr / dwe * dwe / dsx + dwctr / dws * dws / dsx + // = 0.5 * ((pw+1)/pooled_width - 0.5) * spatial_scale * src_w * exp(dsx) + // + 0.5 * ( pw /pooled_width - 0.5) * spatial_scale * src_w * exp(dsx) + // = 0.5 * ((2*pw+1)/pooled_width - 1) * spatial_scale * src_w * exp(dsx) + // = ((pw + 0.5) / pooled_width - 0.5) * spatial_scale * src_w * exp(dsx) + // dwdiff / dsx = dwdiff / dwe * dwe / dsx + dwdiff / dws * dws / dsx + // = 1 * ((pw+1)/pooled_width - 0.5) * spatial_scale * src_w * exp(dsx) + // + (-1) * ( pw /pooled_width - 0.5) * spatial_scale * src_w * exp(dsx) + // = (1 / pooled_width) * spatial_scale * src_w * exp(dsx) + // dhctr / dsy = ((ph + 0.5) / pooled_height - 0.5) * spatial_scale * src_h * exp(dsy) + // dhdiff / dsy = (1 / pooled_height) * spatial_scale * src_h * exp(dsy) + + + // dgx / dwctr = (w >= wctr ? 1 : -1) + // dgx / dwdiff = 1 + // dgy / dhctr = (h >= hctr ? 1 : -1) + // dgy / dhdiff = 1 + + // gx_final = gx / gx_all + // dgx_final / dwctr = ( dgx/dwctr * gx_all - gx * dgx_all/dwctr ) / (gx_all)^2 = ( (w >= wctr ? 1 : -1) * gx_all - gx * sum_for_w{ (w >= wctr ? 1 : -1) } ) / gx_all^2 + // dgx_final / dwdiff = ( dgx/dwdiff * gx_all - gx * dgx_all/dwdiff ) / (gx_all)^2 = ( 1 * gx_all - gx * sum_for_w{ 1 } ) / gx_all^2 + // gy_final = gy / gy_all + // dgy_final / dhctr = ... + // dgy_final / dhdiff = ... + + // dgx_final / dcx = dgx_final / dwctr * dwctr / dcx + dgx_final / dwdiff * dwdiff / dcx + // = ( (w >= wctr ? 1 : -1) * gx_all - gx * sum_for_w{ (w >= wctr ? 1 : -1) } ) / gx_all^2 * spatial_scale * src_w + (...) * 0 + // = ( (w >= wctr ? 1 : -1) * gx_all - gx * sum_for_w{ (w >= wctr ? 1 : -1) } ) / gx_all^2 * spatial_scale * src_w + // dgy_final / dcy = ( (h >= hctr ? 1 : -1) * gy_all - gy * sum_for_h{ (h >= hctr ? 1 : -1) } ) / gx_all^2 * spatial_scale * src_h + // dgx_final / dsx = ( (w >= wctr ? 1 : -1) * gx_all - gx * sum_for_w{ (w >= wctr ? 1 : -1) } ) / gx_all^2 * ((pw + 0.5) - 0.5 * pooled_width) / pooled_width * spatial_scale * src_w * exp(dsx) + + // ( 1 * gx_all - gx * sum_for_w{ 1 } ) / gx_all^2 * 1 / pooled_width * spatial_scale * src_w * exp(dsx) + // dgy_final / dsy = ( (h >= hctr ? 1 : -1) * gy_all - gy * sum_for_h{ (h >= hctr ? 1 : -1) } ) / gy_all^2 * ((ph + 0.5) - 0.5 * pooled_height) / pooled_height * spatial_scale * src_h * exp(dsy) + + // ( 1 * gy_all - gy * sum_for_h{ 1 } ) / gy_all^2 * 1 / pooled_height * spatial_scale * src_h * exp(dsy) + + // dg / dcx = dg / dgx_final * dgx_final / dcx + dg / dgy_final * dgy_final / dcx + // = gy_final * dgx_final / dcx + gx_final * 0 + // = gy_final * dgx_final / dcx + // ... 
+ */ + // Add roi offsets and clip to input boundaries + hstart = min(max(hstart, 0), height); + hend = min(max(hend, 0), height); + wstart = min(max(wstart, 0), width); + wend = min(max(wend, 0), width); + + // Define an empty pooling region to be zero + Dtype val_cx = 0, val_cy = 0, val_sx = 0, val_sy = 0; + Dtype gain_x = 0, gain_y = 0; + Dtype pw_ = static_cast(pw); + Dtype ph_ = static_cast(ph); + Dtype pooled_width_ = static_cast(pooled_width); + Dtype pooled_height_ = static_cast(pooled_height); + bottom_data += (roi_batch_ind * channels + c) * height * width; + Dtype w_mask = 0, h_mask = 0, coeff_x = 0, coeff_y = 0; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int bottom_index = h * width + w; + Dtype w_ = w, h_ = h; + gain_x = wdiff - abs((w_ - wctr)); + gain_y = hdiff - abs((h_ - hctr)); + + w_mask = w_ >= wctr ? 1 : -1; + h_mask = h_ >= hctr ? 1 : -1; + + //val_cx = val_cx + gain_y / gain_y_all * (w_mask * gain_x_all - gain_x * dgx_final_dwctr_all ) / (gain_x_all*gain_x_all) * spatial_scale * src_w * top_diff[index]; + //val_cy = val_cy + gain_x / gain_x_all * (h_mask * gain_y_all - gain_y * dgy_final_dhctr_all ) / (gain_y_all*gain_y_all) * spatial_scale * src_h * top_diff[index]; + //val_sx = val_sx + gain_y / gain_y_all *(( gain_x_all - gain_x * dgx_final_dwdiff_all) / (gain_x_all*gain_x_all) * (pw_+0.5-0.5*pooled_width) / pooled_width * spatial_scale * src_w * exp(dsx) + + // (w_mask * gain_x_all - gain_x * dgx_final_dwctr_all ) / (gain_x_all*gain_x_all) * 1 / pooled_width * spatial_scale * src_w * exp(dsx) ) * top_diff[index]; + //val_sy = val_sy + gain_x / gain_x_all *(( gain_y_all - gain_y * dgy_final_dhdiff_all) / (gain_y_all*gain_y_all) * (ph_+0.5-0.5*pooled_height)/ pooled_hidth * spatial_scale * src_h * eyp(dsy) + + // (h_mask * gain_y_all - gain_y * dgy_final_dhctr_all ) / (gain_y_all*gain_y_all) * 1 / pooled_hidth * spatial_scale * src_h * eyp(dsy) ) * top_diff[index]; + + //if (gain_x > 1e-10 && gain_y > 1e-10) { + coeff_x = bottom_data[bottom_index] * gain_y * spatial_scale * src_w * top_diff[index]; + if (gain_x_all > 1e-10) {coeff_x = coeff_x / (gain_x_all*gain_x_all);} + if (gain_y_all > 1e-10) {coeff_x = coeff_x / gain_y_all;} + val_cx = val_cx + (w_mask * gain_x_all - gain_x * dgx_final_dwctr_all ) * coeff_x; + val_sx = val_sx + ((w_mask * gain_x_all - gain_x * dgx_final_dwctr_all ) * (pw_+0.5-0.5*pooled_width_) + + ( gain_x_all - gain_x * dgx_final_dwdiff_all)) / pooled_width_ * coeff_x * exp(dst_scl_x); + + coeff_y = bottom_data[bottom_index] * gain_x * spatial_scale * src_h * top_diff[index]; + if (gain_y_all > 1e-10) {coeff_y = coeff_y / (gain_y_all*gain_y_all);} + if (gain_x_all > 1e-10) {coeff_y = coeff_y / gain_x_all;} + val_cy = val_cy + (h_mask * gain_y_all - gain_y * dgy_final_dhctr_all ) * coeff_y; + val_sy = val_sy + ((h_mask * gain_y_all - gain_y * dgy_final_dhctr_all ) * (ph_+0.5-0.5*pooled_height_) + + ( gain_y_all - gain_y * dgy_final_dhdiff_all)) / pooled_height_ * coeff_y * exp(dst_scl_y); + //} + } + } + /*int*/ buffer_index = n * (channels * pooled_height * pooled_width * 4) + c * (pooled_height * pooled_width * 4) + ph * (pooled_width * 4) + pw * 4; + bottom_diff_delta_rois_buffer[buffer_index+0] = val_cx; + bottom_diff_delta_rois_buffer[buffer_index+1] = val_cy; + bottom_diff_delta_rois_buffer[buffer_index+2] = val_sx; + bottom_diff_delta_rois_buffer[buffer_index+3] = val_sy; + //} + } +} + + +extern "C" +void inn_ROIWarping_updateGradInputAtomic(THCState *state, + THCudaTensor *gradInput_data, 
THCudaTensor *data, + THCudaTensor *gradInput_delta_rois, THCudaTensor *delta_rois, + THCudaTensor *gradInput_delta_rois_buffer, + THCudaTensor *gradOutput, THCudaTensor *top_data_buffer, + THCudaTensor* rois, int W, int H, double spatial_scale) +{ + THAssert(THCudaTensor_nDimension(state, data) == 4); + THAssert(THCudaTensor_nDimension(state, top_data_buffer) == 5); + THAssert(THCudaTensor_nDimension(state, rois) == 2 && rois->size[1] == 5); + THAssert(THCudaTensor_nDimension(state, delta_rois) == 2 && delta_rois->size[1] == 5); + THAssert(THCudaTensor_nDimension(state, rois) == THCudaTensor_nDimension(state, delta_rois) && + rois->size[0] == delta_rois->size[0] && + rois->size[1] == delta_rois->size[1]); + THAssert(THCudaTensor_isContiguous(state, data)); + THAssert(THCudaTensor_isContiguous(state, top_data_buffer)); + THAssert(THCudaTensor_isContiguous(state, rois)); + THAssert(THCudaTensor_isContiguous(state, delta_rois)); + long num_rois = rois->size[0]; + long nInputPlane = data->size[1]; + THCudaTensor_resizeAs(state, gradInput_data, data); + THCudaTensor_zero(state, gradInput_data); + THCudaTensor_resizeAs(state, gradInput_delta_rois, delta_rois); + THCudaTensor_zero(state, gradInput_delta_rois); + THCudaTensor_resize5d(state, gradInput_delta_rois_buffer, num_rois, nInputPlane, H, W, 4); + THCudaTensor_zero(state, gradInput_delta_rois_buffer); + + //Backpropagation for data + long count = THCudaTensor_nElement(state, gradInput_data); + for (int nth_roi = 0; nth_roi < num_rois; ++nth_roi) { + ROIWarpBackwardData<<>>( + count, + THCudaTensor_data(state, top_data_buffer), + spatial_scale, nInputPlane, data->size[2], data->size[3], H, W, nth_roi, + THCudaTensor_data(state, rois), + THCudaTensor_data(state, delta_rois), + THCudaTensor_data(state, gradOutput), + THCudaTensor_data(state, gradInput_data) + ); + } + + //Backpropagation for delta_roi + count = THCudaTensor_nElement(state, gradOutput); + ROIWarpBackwardDeltaROI<<>>( + count, + THCudaTensor_data(state, top_data_buffer), + spatial_scale, nInputPlane, data->size[2], data->size[3], H, W, + THCudaTensor_data(state, rois), + THCudaTensor_data(state, delta_rois), + THCudaTensor_data(state, gradOutput), + THCudaTensor_data(state, data), + THCudaTensor_data(state, gradInput_delta_rois_buffer) + ); + + // check for errors + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("error in inn_ROIWarping_updateGradInputAtomic: %s\n", cudaGetErrorString(err)); + THError("aborting"); + } +} diff --git a/ROIWarping.lua b/ROIWarping.lua new file mode 100644 index 0000000..352a175 --- /dev/null +++ b/ROIWarping.lua @@ -0,0 +1,74 @@ +local ROIWarping,parent = torch.class('inn.ROIWarping', 'nn.Module') +local C = inn.C + +--function ROIWarping:__init(W,H,spatial_scale) +function ROIWarping:__init(H,W) + parent.__init(self) + assert(W and H, 'W and H have to be provided') + self.W = W + self.H = H + --self.spatial_scale = spatial_scale or 1 + + self.grid_gen = inn.ROIWarpingGridGenerator(self.H, self.W) + self.sample = inn.ROIWarpingBilinearSample(self.H, self.W) + + self.gradInput = {} +end + +--function ROIWarping:setSpatialScale(scale) +-- self.spatial_scale = scale +-- return self +--end + +function ROIWarping:updateOutput(input) + assert(#input == 2 or #input == 3) + local data = input[1] + local rois = input[2] + local delta_rois + if #input == 3 then + delta_rois = input[3] + else -- #input == 2 + self.delta_rois = self.delta_rois or rois.new() + self.delta_rois:resizeAs(rois):zero() + self.delta_rois[{{}, 1}] = 
rois[{{}, 1}] + delta_rois = self.delta_rois + end + + if torch.type(data) == 'torch.CudaTensor' then + self.grid_gen:cuda() + self.sample:cuda() + end + + self.grid_gen:updateOutput({rois, delta_rois}) + self.sample:updateOutput({data, self.grid_gen.output_tmp[1], self.grid_gen.output_tmp[2], self.grid_gen.output_tmp[3]}) + + self.output = self.sample.output + + return self.output +end + +function ROIWarping:updateGradInput(input,gradOutput) + local data = input[1] + local rois = input[2] + local delta_rois + if #input == 3 then + delta_rois = input[3] + else -- #input == 2 + self.delta_rois = self.delta_rois or data.new() + self.delta_rois:resizeAs(rois):zero() + self.delta_rois[{{}, 1}] = rois[{{}, 1}] + delta_rois = self.delta_rois + end + + if torch.type(data) == 'torch.CudaTensor' then + self.grid_gen:cuda() + self.sample:cuda() + end + + self.sample:updateGradInput({data, self.grid_gen.output_tmp[1], self.grid_gen.output_tmp[2], self.grid_gen.output_tmp[3]}, gradOutput) + self.grid_gen:updateGradInput({rois, delta_rois}, {self.sample.gradInput[2], self.sample.gradInput[3]}) + + self.gradInput = {self.sample.gradInput[1], self.grid_gen.gradInput[1], self.grid_gen.gradInput[2]} + + return self.gradInput +end diff --git a/ROIWarpingBilinearSample.cu b/ROIWarpingBilinearSample.cu new file mode 100644 index 0000000..ae890b4 --- /dev/null +++ b/ROIWarpingBilinearSample.cu @@ -0,0 +1,644 @@ +// ------------------------------------------------------------------ +// Fast R-CNN +// Copyright (c) 2015 Microsoft +// Licensed under The MIT License [see fast-rcnn/LICENSE for details] +// Written by Ross Girshick +// ------------------------------------------------------------------ + +// Torch port: +// IMAGINE, Sergey Zagoruyko, Francisco Massa, 2015 + +#include "THC.h" +#include +#include +#include "assert.h" + +#include "common.h" + +#define NUM_BUFFERS 6 +#define PRECISION_LIMIT 1e-10 +#define MIN_BIN_SIZE 2.0f + +using std::max; +using std::min; + +template +__global__ void ROIWarpBilinearSampleForward( + const int nthreads, const Dtype* bottom_data, + const int channels, + const int height, const int width, const int pooled_height, const int pooled_width, + const Dtype* bottom_grid_ctrs, const Dtype* bottom_bin_sizes, const Dtype* bottom_roi_batch_inds, + Dtype* top_data, + Dtype* top_data_buffer) { + CUDA_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + int roi_batch_ind = bottom_roi_batch_inds[n] - 1; + int grid_ctr_ind = n * (pooled_height * pooled_width * 2) + ph * (pooled_width * 2) + pw * 2; + int bin_size_ind = n * 2; + + Dtype wctr = bottom_grid_ctrs[grid_ctr_ind+0] - 1; + Dtype hctr = bottom_grid_ctrs[grid_ctr_ind+1] - 1; + + Dtype bin_size_w = max(bottom_bin_sizes[bin_size_ind+0], MIN_BIN_SIZE); + Dtype bin_size_h = max(bottom_bin_sizes[bin_size_ind+1], MIN_BIN_SIZE); + + Dtype wstart_ = wctr - bin_size_w / 2.0; + Dtype hstart_ = hctr - bin_size_h / 2.0; + Dtype wend_ = wctr + bin_size_w / 2.0; + Dtype hend_ = hctr + bin_size_h / 2.0; + + int wstart = static_cast(floor(wstart_)); + int hstart = static_cast(floor(hstart_)); + int wend = static_cast( ceil(wend_));// + 1; + int hend = static_cast( ceil(hend_));// + 1; + + //top_data[index] = hend+1; + //top_data[index] = wend+1; + //top_data[index] = hstart+1; + //top_data[index] = wstart+1; + 
//top_data[index] = wstart_+1; + //top_data[index] = wend_+1; + //top_data[index] = hctr+1; + //top_data[index] = wctr+1; + //top_data[index] = bin_size_w; + //top_data[index] = bin_size_h; + //top_data[index] = roi_batch_ind + 1; + + //// Add roi offsets and clip to input boundaries + //hstart = min(max(hstart, 0), height); + //hend = min(max(hend, 0), height); + //wstart = min(max(wstart, 0), width ); + //wend = min(max(wend, 0), width ); + + //top_data[index] = hstart+1; + //top_data[index] = wstart+1; + //top_data[index] = hend+1; + //top_data[index] = wend+1; + + // Auxilliary variables used in backprop + Dtype w_mask = 0, h_mask = 0; + Dtype dgx_final_dwctr_all = 0; + Dtype dgx_final_dwdiff_all = 0; + Dtype dgy_final_dhctr_all = 0; + Dtype dgy_final_dhdiff_all = 0; + + // Define an empty pooling region to be zero + Dtype val = 0; Dtype gain = 0, gain_x = 0, gain_y = 0, gain_x_all = 0, gain_y_all = 0; + bottom_data += (roi_batch_ind * channels + c) * height * width; + for (int h = hstart; h <= hend; ++h) { + Dtype h_ = h; + h_mask = ((hstart_ <= h_ && h_ <= hend_) ? 1.0 : 0); + for (int w = wstart; w <= wend; ++w) { + int bottom_index = h * width + w; + Dtype w_ = w; + w_mask = ((wstart_ <= w_ && w_ <= wend_) ? 1.0 : 0); + + //gain_x = (bin_size_w+1) - abs(w_ - wctr); + //gain_y = (bin_size_h+1) - abs(h_ - hctr); + gain_x = w_mask * (bin_size_w - abs(w_ - wctr)); + gain_y = h_mask * (bin_size_h - abs(h_ - hctr)); + + gain = gain_x * gain_y; + + if (0 <= h && h < height && 0 <= w && w < width) { + val = val + gain * bottom_data[bottom_index]; + //val = val + gain; // for debug + } + //val = val + gain; // for debug + + if (h == hstart) { + gain_x_all = gain_x_all + gain_x; + + // Update information used in backprop + dgx_final_dwctr_all = dgx_final_dwctr_all + w_mask * (w_ >= wctr ? 1 : -1); + dgx_final_dwdiff_all = dgx_final_dwdiff_all + w_mask; + } + } + gain_y_all = gain_y_all + gain_y; + + dgy_final_dhctr_all = dgy_final_dhctr_all + h_mask * (h >= hctr ? 
1 : -1); + dgy_final_dhdiff_all = dgy_final_dhdiff_all + h_mask; + } + if (gain_x_all > PRECISION_LIMIT) + val = val / gain_x_all; + if (gain_y_all > PRECISION_LIMIT) + val = val / gain_y_all; + top_data[index] = val; + + //top_data[index] = gain_y_all; // for debug + + if (c == 0) { + int buffer_index = n * (pooled_height * pooled_width * NUM_BUFFERS) + ph * (pooled_width * NUM_BUFFERS) + pw * NUM_BUFFERS; + top_data_buffer[buffer_index+0] = gain_x_all; + top_data_buffer[buffer_index+1] = gain_y_all; + top_data_buffer[buffer_index+2] = dgx_final_dwctr_all; + top_data_buffer[buffer_index+3] = dgy_final_dhctr_all; + top_data_buffer[buffer_index+4] = dgx_final_dwdiff_all; + top_data_buffer[buffer_index+5] = dgy_final_dhdiff_all; + } + } +} + +extern "C" +void inn_ROIWarpingBilinearSample_updateOutput(THCState *state, + THCudaTensor *output, THCudaTensor *output_buffer, + THCudaTensor *data, THCudaTensor* grid_ctrs, THCudaTensor* bin_sizes, THCudaTensor* roi_batch_inds, + int width, int height) +{ + THAssert(THCudaTensor_nDimension(state, data) == 4); + THAssert(THCudaTensor_nDimension(state, grid_ctrs) == 4 && grid_ctrs->size[3] == 2); + THAssert(THCudaTensor_nDimension(state, bin_sizes) == 2 && bin_sizes->size[1] == 2); + THAssert(THCudaTensor_nDimension(state, roi_batch_inds) == 2 + && roi_batch_inds->size[0] == grid_ctrs->size[0] + && roi_batch_inds->size[0] == bin_sizes->size[0]); + THAssert(THCudaTensor_isContiguous(state, data)); + THAssert(THCudaTensor_isContiguous(state, grid_ctrs)); + THAssert(THCudaTensor_isContiguous(state, bin_sizes)); + + long nInputPlane = data->size[1]; + + // update output + long count = THCudaTensor_nElement(state, output); + ROIWarpBilinearSampleForward<<>>( + count, + THCudaTensor_data(state, data), + nInputPlane, data->size[2], data->size[3], height, width, + THCudaTensor_data(state, grid_ctrs), + THCudaTensor_data(state, bin_sizes), + THCudaTensor_data(state, roi_batch_inds), + THCudaTensor_data(state, output), + THCudaTensor_data(state, output_buffer) + ); + + // check for errors + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("error in inn_ROIWarpingBilinearSample_updateOutput: %s\n", cudaGetErrorString(err)); + THError("aborting"); + } +} + +template +__global__ void ROIWarpBilinearBackwardData( + const int nthreads, + const int channels, const int height, const int width, const int pooled_height, const int pooled_width, + const int nth_roi, + const Dtype* bottom_grid_ctrs, + const Dtype* bottom_bin_sizes, + const Dtype* bottom_roi_batch_inds, + const Dtype* top_data_buffer, + const Dtype* top_diff, + Dtype* bottom_diff_data) { + CUDA_KERNEL_LOOP(index, nthreads) { + // (n, c, h, w) is an element in the input + int w = index % width; + int h = (index / width) % height; + int c = (index / width / height) % channels; + int n = index / width / height / channels; + + int roi_batch_ind = bottom_roi_batch_inds[nth_roi] - 1; + + if (roi_batch_ind == n) { + int bin_size_ind = nth_roi * 2; + Dtype bin_size_w = max(bottom_bin_sizes[bin_size_ind+0], MIN_BIN_SIZE); + Dtype bin_size_h = max(bottom_bin_sizes[bin_size_ind+1], MIN_BIN_SIZE); + + ///** for debug **/ + //int top_buffer_index = nth_roi * (pooled_height * pooled_width * NUM_BUFFERS) + h * (pooled_width * NUM_BUFFERS) + w * NUM_BUFFERS; + ////gain_x_all = top_data_buffer[top_buffer_index+0]; + ////gain_y_all = top_data_buffer[top_buffer_index+1]; + ////bottom_diff_data[index] = top_data_buffer[top_buffer_index+1]; + ///** til here **// + + int grid_ctr_ind = nth_roi * 
(pooled_height * pooled_width * 2) + 0 * (pooled_width * 2) + 0 * 2; + //Dtype roi_start_w = bottom_grid_ctrs[grid_ctr_ind+0] - bin_size_w / 2.0; + //Dtype roi_start_h = bottom_grid_ctrs[grid_ctr_ind+1] - bin_size_w / 2.0; + + // wstart = floor(roi_start_w + pw * bin_size_w + bin_size_w / 2) + // -> wstart - roi_start_w - bin_size_w / 2 = bin_size_w * pw + // hstart = floor(roi_start_h + ph * bin_size_h + bin_size_h / 2) + // -> hstart - roi_start_h - bin_size_h / 2 = bin_size_h * pw + + int pwstart = pooled_width, pwend = -1; + int wstart = 0, wend = 0; + Dtype wctr, wstart_, wend_; + for (int pw = 0; pw < pooled_width; pw++) { + //wctr = roi_start_w + pw * bin_size_w + bin_size_w / 2.0; + /*int*/ grid_ctr_ind = nth_roi * (pooled_height * pooled_width * 2) + 0 * (pooled_width * 2) + pw * 2; + wctr = bottom_grid_ctrs[grid_ctr_ind+0] - 1; + wstart_ = wctr - bin_size_w / 2.0; + wend_ = wctr + bin_size_w / 2.0; + + wstart = static_cast(floor(wstart_)); + wend = static_cast( ceil(wend_)); + + //wstart = min(max(wstart, 0), width -1); + //wend = min(max(wend, 0), width -1); + + if ((wstart <= w) && (w <= wend)) { + if (pw < pwstart) { + pwstart = pw; + } + if (pw > pwend) { + pwend = pw; + } + } + } + + int phstart = pooled_height, phend = -1; + int hstart = 0, hend = 0; + Dtype hctr, hstart_, hend_; + for (int ph = 0; ph < pooled_height; ++ph) { + /*int*/ grid_ctr_ind = nth_roi * (pooled_height * pooled_width * 2) + ph * (pooled_width * 2) + 0 * 2; + + hctr = bottom_grid_ctrs[grid_ctr_ind+1] - 1; + hstart_ = hctr - bin_size_h / 2.0; + hend_ = hctr + bin_size_h / 2.0; + + hstart = static_cast(floor(hstart_)); + hend = static_cast( ceil(hend_)); + + //hstart = min(max(hstart, 0), height-1); + //hend = min(max(hend, 0), height-1); + + if (hstart <= h && h <= hend) { + if (ph < phstart) { + phstart = ph; + } + if (ph > phend) { + phend = ph; + } + } + } + + //bottom_diff_data[index] = bottom_grid_ctrs[grid_ctr_ind+0] - 1; + //bottom_diff_data[index] = bottom_grid_ctrs[nth_roi * (pooled_height * pooled_width * 2) + 0 * (pooled_width * 2) + 2 * 2 + 0] - bin_size_w / 2.0; + //bottom_diff_data[index] = (static_cast(w) - roi_start_w - bin_size_w / 2.0) * bin_size_pw + 1; + //bottom_diff_data[index] = (static_cast(w+1) - roi_start_w - bin_size_w / 2.0) * bin_size_pw + 1; + //bottom_diff_data[index] = phend+1; //pwend+1; + //bottom_diff_data[index] = phstart+1; //pwstart+1; + //bottom_diff_data[index] = roi_start_w + 1; + //bottom_diff_data[index] = roi_start_h + 1; + + // Clip to top boundaries + phstart = min(max(phstart, 0), pooled_height-1); + phend = min(max(phend, 0), pooled_height-1); + pwstart = min(max(pwstart, 0), pooled_width -1); + pwend = min(max(pwend, 0), pooled_width -1); + + Dtype w_ = w, h_ = h; + //Dtype wctr = 0, hctr = 0; + Dtype gain = 0, gain_x = 0, gain_y = 0, gain_x_all = 0, gain_y_all = 0; + for (int ph = phstart; ph <= phend; ++ph) { + for (int pw = pwstart; pw <= pwend; ++pw) { + int top_index = nth_roi * (channels * pooled_height * pooled_width) + c * (pooled_height * pooled_width) + ph * pooled_width + pw; + int top_buffer_index = nth_roi * (pooled_height * pooled_width * NUM_BUFFERS) + ph * (pooled_width * NUM_BUFFERS) + pw * NUM_BUFFERS; + /*int*/ grid_ctr_ind = nth_roi * (pooled_height * pooled_width * 2) + ph * (pooled_width * 2) + pw * 2; + + wctr = bottom_grid_ctrs[grid_ctr_ind+0] - 1; + hctr = bottom_grid_ctrs[grid_ctr_ind+1] - 1; + gain_x_all = top_data_buffer[top_buffer_index+0]; + gain_y_all = top_data_buffer[top_buffer_index+1]; + + wstart_ = wctr - bin_size_w / 
2.0; + wend_ = wctr + bin_size_w / 2.0; + hstart_ = hctr - bin_size_h / 2.0; + hend_ = hctr + bin_size_h / 2.0; + + //gain_x = (bin_size_w+1) - abs(w_ - wctr); + //gain_y = (bin_size_h+1) - abs(h_ - hctr); + gain_x = ((wstart_ <= w_ && w_ <= wend_) ? 1.0 : 0) * (bin_size_w - abs(w_ - wctr)); + gain_y = ((hstart_ <= h_ && h_ <= hend_) ? 1.0 : 0) * (bin_size_h - abs(h_ - hctr)); + + if (gain_x_all > PRECISION_LIMIT) + gain_x = gain_x / gain_x_all; + if (gain_y_all > PRECISION_LIMIT) + gain_y = gain_y / gain_y_all; + + gain = gain_x * gain_y; + bottom_diff_data[index] = bottom_diff_data[index] + gain * top_diff[top_index]; + } + } + + } + } +} + +template +__global__ void ROIWarpBilinearBackwardGridCtrs( + const int nthreads, + const int channels, const int height, const int width, const int pooled_height, const int pooled_width, + //const int c, + const Dtype* bottom_data, + const Dtype* bottom_grid_ctrs, + const Dtype* bottom_bin_sizes, + const Dtype* bottom_roi_batch_inds, + const Dtype* top_data_buffer, + const Dtype* top_diff, + Dtype* bottom_diff_grid_ctrs_buffer) { + CUDA_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw, d) is an element in the grid_ctrs_buffer + int d = index % 2; + int pw = (index / 2) % pooled_width; + int ph = (index / 2 / pooled_width) % pooled_height; + int c = (index / 2 / pooled_width / pooled_height) % channels; + int n = index / 2 / pooled_width / pooled_height / channels; + + // get top buffer index and top buffers + int top_buffer_index = n * (pooled_height * pooled_width * NUM_BUFFERS) + ph * (pooled_width * NUM_BUFFERS) + pw * NUM_BUFFERS; + Dtype gain_x_all = top_data_buffer[top_buffer_index+0]; + Dtype gain_y_all = top_data_buffer[top_buffer_index+1]; + Dtype dgx_final_dwctr_all = top_data_buffer[top_buffer_index+2]; + Dtype dgy_final_dhctr_all = top_data_buffer[top_buffer_index+3]; + + // get top index + int top_index = n * (channels * pooled_height * pooled_width) + c * (pooled_height * pooled_width) + ph * pooled_width + pw; + + // estimate grad + int roi_batch_ind = bottom_roi_batch_inds[n] - 1; + int grid_ctr_ind = n * (pooled_height * pooled_width * 2) + ph * (pooled_width * 2) + pw * 2; + int bin_size_ind = n * 2; + + Dtype wctr = bottom_grid_ctrs[grid_ctr_ind+0] - 1; + Dtype hctr = bottom_grid_ctrs[grid_ctr_ind+1] - 1; + + Dtype bin_size_w = max(bottom_bin_sizes[bin_size_ind+0], MIN_BIN_SIZE); + Dtype bin_size_h = max(bottom_bin_sizes[bin_size_ind+1], MIN_BIN_SIZE); + + Dtype wstart_ = wctr - bin_size_w / 2.0; + Dtype hstart_ = hctr - bin_size_h / 2.0; + Dtype wend_ = wctr + bin_size_w / 2.0; + Dtype hend_ = hctr + bin_size_h / 2.0; + + int wstart = static_cast(floor(wstart_)); + int hstart = static_cast(floor(hstart_)); + int wend = static_cast( ceil(wend_));// + 1; + int hend = static_cast( ceil(hend_));// + 1; + + //// Add roi offsets and clip to input boundaries + //hstart = min(max(hstart, 0), height); + //hend = min(max(hend, 0), height); + //wstart = min(max(wstart, 0), width ); + //wend = min(max(wend, 0), width ); + + // Auxilliary variables used in backprop + Dtype w_mask = 0, h_mask = 0; + //Dtype dgx_final_dwctr_all = 0; + //Dtype dgy_final_dhctr_all = 0; + + // output = g * input + // do / dwctr = input * dg / dwctr + // g = gx_final * gy_final + // gx_final = gx / gx_all + // dg / dwctr = dg / dgx_final * dgx_final / dwctr + // = gy_final * ( dgx/dwctr * gx_all - gx * dgx_all/dwctr ) / (gx_all)^2 + // = gy_final * ( (w >= wctr ? 1 : -1) * gx_all - gx * sum_for_w{ (w >= wctr ? 
1 : -1) } ) / gx_all^2 + + // Define an empty pooling region to be zero + Dtype val = 0; + Dtype gain = 0, gain_x = 0, gain_y = 0; + Dtype coeff_x = 0, coeff_y = 0; + bottom_data += (roi_batch_ind * channels + c) * height * width; + for (int h = hstart; h <= hend; ++h) { + Dtype h_ = h; + h_mask = ((hstart_ <= h_ && h_ <= hend_) ? 1.0 : 0); + for (int w = wstart; w <= wend; ++w) { + int bottom_index = h * width + w; + Dtype w_ = w; + w_mask = ((wstart_ <= w_ && w_ <= wend_) ? 1.0 : 0); + + if (0 <= h && h < height && 0 <= w && w < width) { + //gain_x = (bin_size_w+1) - abs(w_ - wctr); + //gain_y = (bin_size_h+1) - abs(h_ - hctr); + gain_x = w_mask * (bin_size_w - abs(w_ - wctr)); + gain_y = h_mask * (bin_size_h - abs(h_ - hctr)); + + if (d == 0) { + coeff_x = gain_y * bottom_data[bottom_index]; + if (gain_x_all > PRECISION_LIMIT) {coeff_x = coeff_x / (gain_x_all*gain_x_all);} + if (gain_y_all > PRECISION_LIMIT) {coeff_x = coeff_x / gain_y_all;} + val = val + ((w_ >= wctr ? 1 : -1) * gain_x_all - gain_x * dgx_final_dwctr_all ) * coeff_x; + } + else if (d == 1) { + coeff_y = gain_x * bottom_data[bottom_index]; + if (gain_y_all > PRECISION_LIMIT) {coeff_y = coeff_y / (gain_y_all*gain_y_all);} + if (gain_x_all > PRECISION_LIMIT) {coeff_y = coeff_y / gain_x_all;} + val = val + ((h >= hctr ? 1 : -1) * gain_y_all - gain_y * dgy_final_dhctr_all ) * coeff_y; + } + /** for debug **/ + //gain = gain_x * gain_y; + //if (gain_x_all > PRECISION_LIMIT) { gain = gain / gain_x_all; } + //if (gain_y_all > PRECISION_LIMIT) { gain = gain / gain_y_all; } + //val = val + gain * bottom_data[bottom_index]; + ////val = val + gain; + /** til here **/ + } + } + } + bottom_diff_grid_ctrs_buffer[index] = val * top_diff[top_index]; + + /** for debug **/ + //bottom_diff_grid_ctrs_buffer[index] = top_diff[top_index]; + //bottom_diff_grid_ctrs_buffer[index] = dgx_final_dwctr_all; + //bottom_diff_grid_ctrs_buffer[index] = dgy_final_dhctr_all; + //bottom_diff_grid_ctrs_buffer[index] = gain_x_all; + //bottom_diff_grid_ctrs_buffer[index] = gain_y_all; + //bottom_diff_grid_ctrs_buffer[index] = val; + //bottom_diff_grid_ctrs_buffer[index] = d+1; + //bottom_diff_grid_ctrs_buffer[index] = pw+1; + //bottom_diff_grid_ctrs_buffer[index] = ph+1; + /** til here **/ + } +} + +template +__global__ void ROIWarpBilinearBackwardBinSizes( + const int nthreads, + const int channels, const int height, const int width, const int pooled_height, const int pooled_width, + const Dtype* bottom_data, + const Dtype* bottom_grid_ctrs, + const Dtype* bottom_bin_sizes, + const Dtype* bottom_roi_batch_inds, + const Dtype* top_data_buffer, + const Dtype* top_diff, + Dtype* bottom_diff_bin_sizes_buffer) { + CUDA_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw, d) is an element in the grid_ctrs_buffer + int d = index % 2; + int pw = (index / 2) % pooled_width; + int ph = (index / 2 / pooled_width) % pooled_height; + int c = (index / 2 / pooled_width / pooled_height) % channels; + int n = index / 2 / pooled_width / pooled_height / channels; + + // get top buffer index and top buffers + int top_buffer_index = n * (pooled_height * pooled_width * NUM_BUFFERS) + ph * (pooled_width * NUM_BUFFERS) + pw * NUM_BUFFERS; + Dtype gain_x_all = top_data_buffer[top_buffer_index+0]; + Dtype gain_y_all = top_data_buffer[top_buffer_index+1]; + Dtype dgx_final_dwdiff_all = top_data_buffer[top_buffer_index+4]; + Dtype dgy_final_dhdiff_all = top_data_buffer[top_buffer_index+5]; + + // get top index + int top_index = n * (channels * pooled_height * pooled_width) + c * 
(pooled_height * pooled_width) + ph * pooled_width + pw; + + // estimate grad + int roi_batch_ind = bottom_roi_batch_inds[n] - 1; + int grid_ctr_ind = n * (pooled_height * pooled_width * 2) + ph * (pooled_width * 2) + pw * 2; + int bin_size_ind = n * 2; + + Dtype wctr = bottom_grid_ctrs[grid_ctr_ind+0] - 1; + Dtype hctr = bottom_grid_ctrs[grid_ctr_ind+1] - 1; + + Dtype bin_size_w = bottom_bin_sizes[bin_size_ind+0]; + Dtype bin_size_h = bottom_bin_sizes[bin_size_ind+1]; + + Dtype wstart_ = wctr - bin_size_w / 2.0; + Dtype hstart_ = hctr - bin_size_h / 2.0; + Dtype wend_ = wctr + bin_size_w / 2.0; + Dtype hend_ = hctr + bin_size_h / 2.0; + + int wstart = static_cast(floor(wstart_)); + int hstart = static_cast(floor(hstart_)); + int wend = static_cast( ceil(wend_)); + int hend = static_cast( ceil(hend_)); + + // Add roi offsets and clip to input boundaries + hstart = min(max(hstart, 0), height-1); + hend = min(max(hend, 0), height-1); + wstart = min(max(wstart, 0), width -1); + wend = min(max(wend, 0), width -1); + + // Define an empty pooling region to be zero + Dtype val = 0; + Dtype gain = 0, gain_x = 0, gain_y = 0; + Dtype coeff_x = 0, coeff_y = 0; + bottom_data += (roi_batch_ind * channels + c) * height * width; + for (int h = hstart; h <= hend; ++h) { + Dtype h_ = h; + for (int w = wstart; w <= wend; ++w) { + int bottom_index = h * width + w; + Dtype w_ = w; + + gain_x = (bin_size_w+1) - abs(w_ - wctr); + gain_y = (bin_size_h+1) - abs(h_ - hctr); + + if (d == 0) { + coeff_x = gain_y * bottom_data[bottom_index]; + if (gain_x_all > PRECISION_LIMIT) {coeff_x = coeff_x / (gain_x_all*gain_x_all);} + if (gain_y_all > PRECISION_LIMIT) {coeff_x = coeff_x / gain_y_all;} + val = val + (1 * gain_x_all - gain_x * dgx_final_dwdiff_all) * coeff_x; + } + else if (d == 1) { + coeff_y = gain_x * bottom_data[bottom_index]; + if (gain_y_all > PRECISION_LIMIT) {coeff_y = coeff_y / (gain_y_all*gain_y_all);} + if (gain_x_all > PRECISION_LIMIT) {coeff_y = coeff_y / gain_x_all;} + val = val + (1 * gain_y_all - gain_y * dgy_final_dhdiff_all) * coeff_y; + } + /** for debug **/ + //gain = gain_x * gain_y; + //if (gain_x_all > PRECISION_LIMIT) { gain = gain / gain_x_all; } + //if (gain_y_all > PRECISION_LIMIT) { gain = gain / gain_y_all; } + //val = val + gain * bottom_data[bottom_index]; + ////val = val + gain; + /** til here **/ + } + } + bottom_diff_bin_sizes_buffer[index] = val * top_diff[top_index]; + + /** for debug **/ + //bottom_diff_grid_ctrs_buffer[index] = top_diff[top_index]; + //bottom_diff_grid_ctrs_buffer[index] = dgx_final_dwctr_all; + //bottom_diff_grid_ctrs_buffer[index] = dgy_final_dhctr_all; + //bottom_diff_grid_ctrs_buffer[index] = gain_x_all; + //bottom_diff_grid_ctrs_buffer[index] = val; + //bottom_diff_grid_ctrs_buffer[index] = d+1; + //bottom_diff_grid_ctrs_buffer[index] = pw+1; + //bottom_diff_grid_ctrs_buffer[index] = ph+1; + /** til here **/ + } +} + +extern "C" +void inn_ROIWarpingBilinearSample_updateGradInput(THCState *state, + THCudaTensor *gradInput_data, THCudaTensor *data, + THCudaTensor *gradInput_grid_ctrs, THCudaTensor *grid_ctrs, THCudaTensor *gradInput_grid_ctrs_buffer, + THCudaTensor *gradInput_bin_sizes, THCudaTensor *bin_sizes, THCudaTensor *gradInput_bin_sizes_buffer, + THCudaTensor *roi_batch_inds, + THCudaTensor *output_buffer, + THCudaTensor *gradOutput, + int pooled_height, int pooled_width) +{ + THAssert(THCudaTensor_nDimension(state, data) == 4); + THAssert(THCudaTensor_nDimension(state, grid_ctrs) == 4 && grid_ctrs->size[3] == 2); + 
THAssert(THCudaTensor_nDimension(state, bin_sizes) == 2 && bin_sizes->size[1] == 2); + THAssert(THCudaTensor_nDimension(state, roi_batch_inds) == 2 + && roi_batch_inds->size[0] == grid_ctrs->size[0] + && roi_batch_inds->size[0] == bin_sizes->size[0]); + THAssert(THCudaTensor_isContiguous(state, data)); + THAssert(THCudaTensor_isContiguous(state, grid_ctrs)); + THAssert(THCudaTensor_isContiguous(state, bin_sizes)); + + long num_rois = grid_ctrs->size[0]; + long nInputPlane = data->size[1]; + + long count = 0; + + // backpropagation for data + for (int nth_roi = 0; nth_roi < num_rois; ++nth_roi) { + count = THCudaTensor_nElement(state, gradInput_data); + ROIWarpBilinearBackwardData<<>>( + count, + nInputPlane, data->size[2], data->size[3], pooled_height, pooled_width, + nth_roi, + THCudaTensor_data(state, grid_ctrs), + THCudaTensor_data(state, bin_sizes), + THCudaTensor_data(state, roi_batch_inds), + THCudaTensor_data(state, output_buffer), + THCudaTensor_data(state, gradOutput), + THCudaTensor_data(state, gradInput_data) + ); + } + + // backpropagation for grid_ctrs + count = THCudaTensor_nElement(state, gradInput_grid_ctrs_buffer); + ROIWarpBilinearBackwardGridCtrs<<>>( + count, + nInputPlane, data->size[2], data->size[3], pooled_height, pooled_width, + THCudaTensor_data(state, data), + THCudaTensor_data(state, grid_ctrs), + THCudaTensor_data(state, bin_sizes), + THCudaTensor_data(state, roi_batch_inds), + THCudaTensor_data(state, output_buffer), + THCudaTensor_data(state, gradOutput), + THCudaTensor_data(state, gradInput_grid_ctrs_buffer) + ); + + // backpropagation for bin_sizes + count = THCudaTensor_nElement(state, gradInput_bin_sizes_buffer); + ROIWarpBilinearBackwardBinSizes<<>>( + count, + nInputPlane, data->size[2], data->size[3], pooled_height, pooled_width, + THCudaTensor_data(state, data), + THCudaTensor_data(state, grid_ctrs), + THCudaTensor_data(state, bin_sizes), + THCudaTensor_data(state, roi_batch_inds), + THCudaTensor_data(state, output_buffer), + THCudaTensor_data(state, gradOutput), + THCudaTensor_data(state, gradInput_bin_sizes_buffer) + ); + + // check for errors + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("error in inn_ROIWarpingBilinearSample_updateGradInput: %s\n", cudaGetErrorString(err)); + THError("aborting"); + } +} + +#undef NUM_BUFFERS +#undef PRECISION_LIMIT +#undef MIN_BIN_SIZE diff --git a/ROIWarpingBilinearSample.lua b/ROIWarpingBilinearSample.lua new file mode 100644 index 0000000..bff068e --- /dev/null +++ b/ROIWarpingBilinearSample.lua @@ -0,0 +1,122 @@ +local ROIWarpingBilinearSample,parent = torch.class('inn.ROIWarpingBilinearSample', 'nn.Module') +local C = inn.C + +local buffer_numbers = 6 + +--function ROIWarpingBilinearSample:__init(height, width, spatial_scale) +function ROIWarpingBilinearSample:__init(height, width) + parent.__init(self) + assert(width and height, 'height and width have to be provided') + self.width = width + self.height = height + --self.spatial_scale = spatial_scale or 1 + self.gradInput = {} +end + +--function ROIWarpingBilinearSample:setSpatialScale(scale) +-- self.spatial_scale = scale +-- return self +--end + +function ROIWarpingBilinearSample:updateOutput(input) + assert(#input == 4) + local data = input[1] + local grid_ctrs = input[2] + local bin_sizes = input[3] + local roi_batch_inds = input[4] + + local num_rois = roi_batch_inds:size(1) + local nchannels = data:size(2) + + assert(grid_ctrs:size(1) == num_rois and + grid_ctrs:size(2) == self.height and + grid_ctrs:size(3) == self.width 
and + grid_ctrs:size(4) == 2) + assert(bin_sizes:size(1) == num_rois and + bin_sizes:size(2) == 2) + + self.output = self.output or data.new() + self.output:resize(num_rois, nchannels, self.height, self.width):fill(0) + self.output_buffer = self.output_buffer or data.new() + self.output_buffer:resize(num_rois, self.height, self.width, buffer_numbers):fill(0) + + C.inn_ROIWarpingBilinearSample_updateOutput(cutorch.getState(), + self.output:cdata(), self.output_buffer:cdata(), + data:cdata(), grid_ctrs:cdata(), bin_sizes:cdata(), roi_batch_inds:cdata(), + self.width, self.height + )--, self.spatial_scale) + + return self.output +end + +function ROIWarpingBilinearSample:updateGradInput(input,gradOutput) + assert(#input == 4) + local data = input[1] + local grid_ctrs = input[2] + local bin_sizes = input[3] + local roi_batch_inds = input[4] + + local batch_size = data:size(1) + local num_rois = roi_batch_inds:size(1) + local nchannels = data:size(2) + + assert(self.output_buffer) + assert(self.output_buffer:size(1) == num_rois and + self.output_buffer:size(2) == self.height and + self.output_buffer:size(3) == self.width and + self.output_buffer:size(4) == buffer_numbers) + + self.gradInput_data = self.gradInput_data or data.new() -- b x c x h x w + self.gradInput_grid_ctrs = self.gradInput_grid_ctrs or grid_ctrs.new() -- n x h x w x 2 + self.gradInput_grid_ctrs_buffer = self.gradInput_grid_ctrs_buffer or grid_ctrs.new() -- n x c x h x w x 2 + self.gradInput_bin_sizes = self.gradInput_bin_sizes or bin_sizes.new() -- n x 2 + self.gradInput_bin_sizes_buffer = self.gradInput_bin_sizes_buffer or bin_sizes.new() -- n x c x h x w x 2 + self.gradInput_roi_batch_inds = self.gradInput_roi_batch_inds or roi_batch_inds.new() -- n x 2 + + self.gradInput_data:resizeAs(data):fill(0) + self.gradInput_grid_ctrs:resizeAs(grid_ctrs):fill(0) + self.gradInput_grid_ctrs_buffer:resize(num_rois, nchannels, self.height, self.width, 2):fill(0) + self.gradInput_bin_sizes:resizeAs(bin_sizes):fill(0) + self.gradInput_bin_sizes_buffer:resize(num_rois, nchannels, self.height, self.width, 2):fill(0) + self.gradInput_roi_batch_inds:resize(num_rois, 2):fill(0) + + C.inn_ROIWarpingBilinearSample_updateGradInput(cutorch.getState(), + self.gradInput_data:cdata(), data:cdata(), + self.gradInput_grid_ctrs:cdata(), grid_ctrs:cdata(), self.gradInput_grid_ctrs_buffer:cdata(), + self.gradInput_bin_sizes:cdata(), bin_sizes:cdata(), self.gradInput_bin_sizes_buffer:cdata(), + roi_batch_inds:cdata(), + self.output_buffer:cdata(), + gradOutput:cdata(), + self.height, self.width + ) --, self.spatial_scale) + + -- reduce the per-channel buffers into the final gradients + self.gradInput_grid_ctrs:copy(self.gradInput_grid_ctrs_buffer:sum(2):view(num_rois, self.height, self.width, 2)) + self.gradInput_bin_sizes:copy(self.gradInput_bin_sizes_buffer:sum(2):sum(3):sum(4):view(num_rois, 2)) + + self.gradInput[1] = self.gradInput_data + self.gradInput[2] = self.gradInput_grid_ctrs + self.gradInput[3] = self.gradInput_bin_sizes + self.gradInput[4] = self.gradInput_roi_batch_inds + + return self.gradInput +end + +function ROIWarpingBilinearSample:clearState() + nn.utils.clear(self, 'gradInput_data', 'gradInput_grid_ctrs', 'gradInput_grid_ctrs_buffer', 'gradInput_bin_sizes', 'gradInput_bin_sizes_buffer', 'gradInput_roi_batch_inds') + return parent.clearState(self) +end
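-- Usage sketch (shapes and values are hypothetical; assumes a CUDA device and
-- that inn and cutorch are loaded): the sampler consumes the grid produced by
-- the inn.ROIWarpingGridGenerator module defined below.
local data = torch.rand(1, 3, 8, 8):cuda()
local rois = torch.CudaTensor{{1, 1, 1, 8, 8}, {1, 2, 2, 5, 5}} -- {batch_ind, x1, y1, x2, y2}
local gridgen = inn.ROIWarpingGridGenerator(4, 4):cuda()
local sampler = inn.ROIWarpingBilinearSample(4, 4):cuda()
local g = gridgen:forward({rois}) -- one-element input table => zero delta_rois
local warped = sampler:forward({data, g[1], g[2], g[3]}) -- 2 x 3 x 4 x 4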
diff --git a/ROIWarpingGridGenerator.lua b/ROIWarpingGridGenerator.lua new file mode 100644 index 0000000..e76a4f3 --- /dev/null +++ b/ROIWarpingGridGenerator.lua @@ -0,0 +1,236 @@ +--[[ + This code is borrowed from AffineGridGeneratorBHWD.lua in https://github.com/qassemoquab/stnbhwd +]] + +local RWGG, parent = torch.class('inn.ROIWarpingGridGenerator', 'nn.Module') + +local function fast_rcnn_bbox_transform_inv(rois, delta_rois) +-- rois : N by 4 torch.Tensor; for each row, rois[{n, {}}] == x_start, y_start, x_end, y_end (in image coordinates) +-- delta_rois : N by 4 torch.Tensor; for each row, delta_rois[{n, {}}] == dx, dy, dw, dh (in fast-rcnn notation) + + local src_w = rois[{{},3}] - rois[{{},1}] + 1; + local src_h = rois[{{},4}] - rois[{{},2}] + 1; + local src_ctr_x = rois[{{},1}] + 0.5*(src_w-1.0); + local src_ctr_y = rois[{{},2}] + 0.5*(src_h-1.0); + + local dst_ctr_x = delta_rois[{{},1}]; -- dx (in fast-rcnn notation) + local dst_ctr_y = delta_rois[{{},2}]; -- dy (in fast-rcnn notation) + local dst_scl_x = delta_rois[{{},3}]; -- dw (in fast-rcnn notation) + local dst_scl_y = delta_rois[{{},4}]; -- dh (in fast-rcnn notation) + + local pred_ctr_x = torch.cmul(dst_ctr_x, src_w) + src_ctr_x; + local pred_ctr_y = torch.cmul(dst_ctr_y, src_h) + src_ctr_y; + local pred_w = torch.cmul(torch.exp(dst_scl_x), src_w); + local pred_h = torch.cmul(torch.exp(dst_scl_y), src_h); + + local roi_start_w = pred_ctr_x - 0.5*(pred_w-1); + local roi_start_h = pred_ctr_y - 0.5*(pred_h-1); + local roi_end_w = pred_ctr_x + 0.5*(pred_w-1); + local roi_end_h = pred_ctr_y + 0.5*(pred_h-1); + + return torch.cat({roi_start_w, roi_start_h, roi_end_w, roi_end_h}, 2) +end
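-- Sanity-check sketch (assumes the helper above is in scope): with zero deltas,
-- pred_ctr == src_ctr and exp(0) == 1, so the input boxes come back unchanged.
local boxes = torch.Tensor{{2, 3, 7, 9}} -- x_start, y_start, x_end, y_end
local deltas = torch.zeros(1, 4)         -- dx, dy, dw, dh
print(fast_rcnn_bbox_transform_inv(boxes, deltas)) -- 2 3 7 9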
+ +--function RWGG:__init(height, width, spatial_scale) +function RWGG:__init(height, width) + parent.__init(self) + assert(height > 1) + assert(width > 1) + self.height = height + self.width = width + + self.output_tmp = {} + self.gradInput = {} + + --self.spatial_scale = spatial_scale or 1 + + self.baseGrid = torch.Tensor(self.height, self.width, 2) -- Grid for input image + for i=1,self.width do + self.baseGrid:select(3,1):select(2,i):fill(i-1) + end + for j=1,self.height do + self.baseGrid:select(3,2):select(1,j):fill(j-1) + end + self.batchGrid = torch.Tensor(1, height, width, 2):copy(self.baseGrid) +end + +function RWGG:updateOutput(input) --(_transformMatrix) + assert(#input == 1 or #input == 2) + local rois = input[1] + local delta_rois + if #input == 2 then + delta_rois = input[2] + else -- #input == 1 + self.delta_rois = self.delta_rois or rois.new() + self.delta_rois:resizeAs(rois):zero() + self.delta_rois[{{}, 1}] = rois[{{}, 1}] + delta_rois = self.delta_rois + end + assert(rois:dim() == 2 and delta_rois:dim() == 2) + assert(rois:size(2) == 5 and delta_rois:size(2) == 5) + + local batch_size = rois:size(1) + + if self.batchGrid:size(1) ~= batch_size then + self.batchGrid:resize(batch_size, self.height, self.width, 2) + for i=1,batch_size do + self.batchGrid:select(1,i):copy(self.baseGrid) + end + end + + -- allocate output + self.output_tmp[1] = self.output_tmp[1] or rois.new() + self.output_tmp[2] = self.output_tmp[2] or rois.new() + self.output_tmp[3] = self.output_tmp[3] or rois.new() + local grid_ctrs = self.output_tmp[1] + local bin_sizes = self.output_tmp[2] + local roi_batch_inds = self.output_tmp[3] + + -- prepare misc + local pred_rois = fast_rcnn_bbox_transform_inv(rois[{{}, {2, 5}}], delta_rois[{{}, {2, 5}}]) + + local rois_width = pred_rois[{{}, 3}] - pred_rois[{{}, 1}] + local rois_height = pred_rois[{{}, 4}] - pred_rois[{{}, 2}] + local rois_start_width = pred_rois[{{}, 1}] + local rois_start_height = pred_rois[{{}, 2}] + + local bin_size_w = rois_width / self.width + local bin_size_h = rois_height / self.height + + grid_ctrs:resize(batch_size, self.height, self.width, 2):fill(0) -- b x h x w x 2 (x, y == width, height) + bin_sizes:resize(batch_size, 2):fill(0) -- b x 2 (x, y == width, height) + roi_batch_inds:resize(batch_size, 1):fill(0) + + -- update roi_batch_inds + roi_batch_inds:copy(rois[{{},{1}}]) + + -- update bin_sizes + bin_sizes:select(2,1):copy(bin_size_w:reshape(batch_size, 1)) -- width + bin_sizes:select(2,2):copy(bin_size_h:reshape(batch_size, 1)) -- height + + -- update grid_ctrs + local grid_ctrs_w = grid_ctrs[{{}, {}, {}, {1}}] -- view sharing storage with grid_ctrs + grid_ctrs_w:copy(self.batchGrid[{{}, {}, {}, {1}}]) + :cmul(bin_size_w:reshape(batch_size, 1, 1, 1) + :expand(batch_size, self.height, self.width, 1)) + :add(bin_size_w:reshape(batch_size, 1, 1, 1) + :expand(batch_size, self.height, self.width, 1) / 2) + :add(rois_start_width:reshape(batch_size, 1, 1, 1) + :expand(batch_size, self.height, self.width, 1)) + local grid_ctrs_h = grid_ctrs[{{}, {}, {}, {2}}] -- view sharing storage with grid_ctrs + grid_ctrs_h:copy(self.batchGrid[{{}, {}, {}, {2}}]) + :cmul(bin_size_h:reshape(batch_size, 1, 1, 1) + :expand(batch_size, self.height, self.width, 1)) + :add(bin_size_h:reshape(batch_size, 1, 1, 1) + :expand(batch_size, self.height, self.width, 1) / 2) + :add(rois_start_height:reshape(batch_size, 1, 1, 1) + :expand(batch_size, self.height, self.width, 1)) + + return self.output_tmp +end + +function RWGG:updateGradInput(input, gradOutput) --(_transformMatrix, _gradGrid) + assert(#input == 1 or #input == 2) + local rois = input[1] + local delta_rois + if #input == 2 then + delta_rois = input[2] + else -- #input == 1 + self.delta_rois = self.delta_rois or rois.new() + self.delta_rois:resizeAs(rois):zero() + self.delta_rois[{{}, 1}] = rois[{{}, 1}] + delta_rois = self.delta_rois + end + assert(rois:dim() == 2 and delta_rois:dim() == 2) + assert(rois:size(2) == 5 and delta_rois:size(2) == 5) + + local batch_size = rois:size(1) + + self.batchGrid = self.batchGrid:typeAs(rois) + self.baseGrid = self.baseGrid:typeAs(rois) + + if self.batchGrid:size(1) ~= batch_size then + self.batchGrid:resize(batch_size, self.height, self.width, 2) + for i=1,batch_size do + self.batchGrid:select(1,i):copy(self.baseGrid) + end + end + + -- init output buffer + self.gradInput_rois = self.gradInput_rois or rois.new() + self.gradInput_delta_rois = self.gradInput_delta_rois or delta_rois.new() + self.gradInput_rois:resizeAs(rois):zero() + self.gradInput_delta_rois:resizeAs(delta_rois):zero() + + local src_width = rois[{{}, {4}}] - rois[{{}, {2}}] + 1; src_width = src_width:reshape(batch_size, 1, 1) + local src_height = rois[{{}, {5}}] - rois[{{}, {3}}] + 1; src_height = src_height:reshape(batch_size, 1, 1)
+ + local flattenedBatchGrid = self.batchGrid:view(batch_size, self.width*self.height, 2) + + -- grad from grid_ctrs + + -- drsw / dcx = drsw / dpcx * dpcx / dcx = spatial_scale * src_w + -- drew / dcx = drew / dpcx * dpcx / dcx = spatial_scale * src_w + -- drsh / dcy = drsh / dpcy * dpcy / dcy = spatial_scale * src_h + -- dreh / dcy = dreh / dpcy * dpcy / dcy = spatial_scale * src_h + + -- drsw / dsx = drsw / dpw * dpw / dsx = -0.5 * spatial_scale * src_w * exp(dsx) + -- drew / dsx = drew / dpw * dpw / dsx = 0.5 * spatial_scale * src_w * exp(dsx) + -- drsh / dsy = drsh / dph * dph / dsy = -0.5 * spatial_scale * src_h * exp(dsy) + -- dreh / dsy = dreh / dph * dph / dsy = 0.5 * spatial_scale * src_h * exp(dsy) + + -- grid_ctr_w = rsw + bin_size_w / 2 + pw * bin_size_w + -- = rsw + (0.5 + pw) * bin_size_w + -- = rsw + (0.5 + pw) * (rew - rsw) / self.width + -- = f(rsw, rew) + -- dwctr / dcx = dwctr / drsw * drsw / dcx + dwctr / drew * drew / dcx + -- = (1 + (0.5 + pw) / self.width * (-1)) * src_w + ((0.5 + pw) / self.width * 1) * src_w + -- = spatial_scale * src_w + -- dhctr / dcy = spatial_scale * src_h + -- dwctr / dsx = dwctr / drsw * drsw / dsx + dwctr / drew * drew / dsx + -- = (1 + (0.5 + pw) / self.width * (-1)) * (-0.5 * spatial_scale * src_w * exp(dsx)) + -- + ( (0.5 + pw) / self.width * 1 ) * ( 0.5 * spatial_scale * src_w * exp(dsx)) + -- = (-1 + (0.5 + pw) / self.width * 2) * 0.5 * spatial_scale * src_w * exp(dsx) + -- = 0.5 * spatial_scale * src_w * exp(dsx) * (-1 + 2 * (0.5 + pw) / self.width) + -- = ((pw + 0.5) / self.width - 0.5) * spatial_scale * src_w * exp(dsx) + -- dhctr / dsy = ((ph + 0.5) / self.height - 0.5) * spatial_scale * src_h * exp(dsy) + + -- grad from bin_sizes + + -- dbw / dcx = dbw / drw * drw / dcx = 0 + -- dbh / dcy = dbh / drh * drh / dcy = 0 + -- dbw / dsx = dbw / drw * drw / dsx = 1 / self.width * spatial_scale * src_w * exp(dsx) + -- dbh / dsy = dbh / drh * drh / dsy = 1 / self.height * spatial_scale * src_h * exp(dsy) + + local flattened_grid_ctrs = gradOutput[1]:view(batch_size, self.height*self.width, 2) -- incoming gradient w.r.t. grid_ctrs (b x ph x pw x 2) + local flattened_bin_sizes = gradOutput[2] -- incoming gradient w.r.t. bin_sizes (b x 2) + local flattened_gradInput_delta_rois = self.gradInput_delta_rois[{{},{2, 5}}] -- b x 4 + + flattened_gradInput_delta_rois[{{}, {1}}]:copy(torch.sum(torch.cmul( src_width:expand(batch_size, self.height * self.width, 1), flattened_grid_ctrs[{{}, {}, {1}}]), 2):reshape(batch_size, 1)) + flattened_gradInput_delta_rois[{{}, {2}}]:copy(torch.sum(torch.cmul(src_height:expand(batch_size, self.height * self.width, 1), flattened_grid_ctrs[{{}, {}, {2}}]), 2):reshape(batch_size, 1)) + flattened_gradInput_delta_rois[{{}, {3}}]:copy(torch.sum(torch.cmul( ((flattenedBatchGrid[{{}, {}, {1}}] + 0.5) / self.width - 0.5), + flattened_grid_ctrs[{{}, {}, {1}}]), 2):reshape(batch_size, 1)) + flattened_gradInput_delta_rois[{{}, {3}}]:add(torch.sum(torch.mul(flattened_bin_sizes[{{}, {1}}], 1/self.width), 2)) + :cmul(torch.exp(delta_rois[{{}, {4}}])) + :cmul(src_width) + flattened_gradInput_delta_rois[{{}, {4}}]:copy(torch.sum(torch.cmul( ((flattenedBatchGrid[{{}, {}, {2}}] + 0.5) / self.height - 0.5), + flattened_grid_ctrs[{{}, {}, {2}}]), 2):reshape(batch_size, 1)) + flattened_gradInput_delta_rois[{{}, {4}}]:add(torch.sum(torch.mul(flattened_bin_sizes[{{}, {2}}], 1/self.height), 2)) + :cmul(torch.exp(delta_rois[{{}, {5}}])) + :cmul(src_height) + + -- update output + self.gradInput[1] = self.gradInput_rois + self.gradInput[2] = self.gradInput_delta_rois + + return self.gradInput +end
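-- Finite-difference spot check for the chain rule above (a sketch; CPU tensors,
-- names hypothetical). Perturb dx centrally and compare against the analytic
-- gradient returned by updateGradInput.
local eps = 1e-3
local rwgg = inn.ROIWarpingGridGenerator(4, 4)
local rois = torch.Tensor{{1, 2, 2, 6, 7}}            -- {batch_ind, x1, y1, x2, y2}
local deltas = torch.Tensor{{1, 0.1, 0.2, 0.05, 0.1}} -- {batch_ind, dx, dy, dw, dh}
local function grid(d) return rwgg:forward({rois, d})[1]:clone() end
local dp, dm = deltas:clone(), deltas:clone()
dp[1][2] = dp[1][2] + eps; dm[1][2] = dm[1][2] - eps
local numeric = (grid(dp) - grid(dm)):div(2 * eps):sum() -- summed d(grid_ctrs)/d(dx)
local analytic = rwgg:backward({rois, deltas}, {torch.ones(1, 4, 4, 2), torch.zeros(1, 2)})[2][1][2]
print(numeric, analytic) -- both should equal src_w * height * width = 5 * 16 = 80 here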
diff --git a/ffi.lua index 398af4f..476b1b5 100644 --- a/ffi.lua +++ b/ffi.lua @@ -20,6 +20,29 @@ void inn_ROIPooling_updateOutputV2(THCState *state, void inn_ROIPooling_updateGradInputAtomic(THCState *state, THCudaTensor *gradInput, THCudaTensor *indices, THCudaTensor *data, THCudaTensor *gradOutput, THCudaTensor* rois, int W, int H, double spatial_scale); + +void inn_ROIWarping_updateOutput(THCState *state, + THCudaTensor *output, THCudaTensor *output_buffer, + THCudaTensor *data, THCudaTensor* rois, THCudaTensor* delta_rois, int W, int H, double spatial_scale); +void inn_ROIWarping_updateGradInputAtomic(THCState *state, + THCudaTensor *gradInput_data, THCudaTensor *data, + THCudaTensor *gradInput_delta_rois, THCudaTensor *delta_rois, + THCudaTensor *gradInput_delta_rois_buffer, + THCudaTensor *gradOutput, THCudaTensor *top_data_buffer, + THCudaTensor* rois, int W, int H, double spatial_scale); + +void inn_ROIWarpingBilinearSample_updateOutput(THCState *state, + THCudaTensor *output, THCudaTensor *output_buffer, + THCudaTensor *data, THCudaTensor* grid_ctrs, THCudaTensor* bin_sizes, THCudaTensor* roi_batch_inds, + int width, int height); +void inn_ROIWarpingBilinearSample_updateGradInput(THCState *state, + THCudaTensor *gradInput_data, THCudaTensor *data, + THCudaTensor *gradInput_grid_ctrs, THCudaTensor *grid_ctrs, THCudaTensor *gradInput_grid_ctrs_buffer, + THCudaTensor *gradInput_bin_sizes, THCudaTensor *bin_sizes, THCudaTensor *gradInput_bin_sizes_buffer, + THCudaTensor *roi_batch_inds, + THCudaTensor *output_buffer, + THCudaTensor *gradOutput, + int pooled_height, int pooled_width); ]] return ffi.load(libpath) diff --git a/init.lua index 3122974..c4796ba 100644 --- a/init.lua +++ b/init.lua @@ -9,4 +9,7 @@ require 'inn.MeanSubtraction' require 'inn.SpatialPyramidPooling' require 'inn.SpatialSameResponseNormalization' require 'inn.ROIPooling' +require 'inn.ROIWarping' +require 'inn.ROIWarpingGridGenerator' +require 'inn.ROIWarpingBilinearSample' return inn diff --git a/test/test_jacobian.lua index 1ef9281..0b5aa79 100644 --- a/test/test_jacobian.lua +++ b/test/test_jacobian.lua @@ -7,7 +7,31 @@ local precision = 1e-3 local inntest = torch.TestSuite() +local function delta_rois_to_rois(rois, delta_rois) + local src_w = rois[{{},3}] - rois[{{},1}] + 1; + local src_h = rois[{{},4}] - rois[{{},2}] + 1; + local src_ctr_x = rois[{{},1}] + 0.5*(src_w-1.0); + local src_ctr_y = rois[{{},2}] + 0.5*(src_h-1.0); + local dst_ctr_x = delta_rois[{{},1}]; -- dx (in fast-rcnn notation) = cx (in here) + local dst_ctr_y = delta_rois[{{},2}]; -- dy (in fast-rcnn notation) = cy (in here) + local dst_scl_x = delta_rois[{{},3}]; -- dw (in fast-rcnn notation) = sx (in here) + local dst_scl_y = delta_rois[{{},4}]; -- dh (in fast-rcnn notation) = sy (in here) + + local pred_ctr_x = torch.cmul(dst_ctr_x, src_w) + src_ctr_x; + local pred_ctr_y = torch.cmul(dst_ctr_y, src_h) + src_ctr_y; + local pred_w = torch.cmul(torch.exp(dst_scl_x), src_w); + local pred_h = torch.cmul(torch.exp(dst_scl_y), src_h); + + local roi_start_w = pred_ctr_x - 0.5*(pred_w-1) + local roi_start_h = pred_ctr_y - 0.5*(pred_h-1) + local roi_end_w = pred_ctr_x + 0.5*(pred_w-1) + local roi_end_h = pred_ctr_y + 0.5*(pred_h-1) + + return torch.cat({roi_start_w, roi_start_h, roi_end_w, roi_end_h}, 2) +end + +--[[ function inntest.SpatialStochasticPooling() local from = math.random(1,5) local ki = math.random(1,4) @@ -98,7 +122,7 @@ function
inntest.SpatialSameResponseNormalization() local err = jac.testJacobian(module, input, nil, nil, 1e-3) mytester:assertlt(err, precision, 'error on state (Batch) ') end - +]] function randROI(sz, n) assert(sz:size()==4, "need 4d size") local roi=torch.Tensor(n,5) @@ -127,7 +151,7 @@ function testJacobianWithRandomROI(cls, v2) for i=1,numRepeat do local input = torch.rand(batchSize, 1, H, W); local roi = randROI(input:size(), numRoi) - local module = cls.new(h, w, 1, roi) + local module = cls.new(w, h, 1, roi) module.v2 = v2 local err = jac.testJacobian(module, input, nil, nil, 1e-3) mytester:assertlt(err, precision, 'error on ROIPooling '..(v2 and 'v2' or 'v1')) @@ -152,6 +176,416 @@ function inntest.ROIPooling() testJacobianWithRandomROI(FixedROIPooling, true) end +function testJacobianWithRandomROIForROIWarpingData(cls) + --pooling grid size + local w=4; + local h=4; + --input size + local W=w*2; + local H=h*2; + + local batchSize = 3 + local numRoi = batchSize + local numRepeat = 3 + + torch.manualSeed(0) + for i=1,numRepeat do + local input = torch.rand(batchSize, 1, H, W); + local roi = randROI(input:size(), numRoi) + local delta_roi = roi:clone() + delta_roi[{{}, {2, 5}}] = torch.rand(numRoi, 4) + local module = cls.new(w, h, 1, roi, delta_roi) + + local orig = input:clone() + local err = jac.testJacobian(module, input, nil, nil, 1e-3) + mytester:assertlt(err, precision, 'error on ROIWarping ') + end +end + +function inntest.ROIWarpingData() + local FixedROIWarping, parent = torch.class('FixedROIWarping', 'inn.ROIWarping') + function FixedROIWarping:__init(W, H, s, roi, delta_roi) + self.roi = roi + self.delta_roi = delta_roi + parent.__init(self, W, H, s) + self:cuda() + end + + function FixedROIWarping:updateOutput(input) + return parent.updateOutput(self,{input:cuda(), self.roi, self.delta_roi}) + end + function FixedROIWarping:updateGradInput(input, gradOutput) + return parent.updateGradInput(self,{input:cuda(), self.roi, self.delta_roi}, gradOutput)[1] + end + + testJacobianWithRandomROIForROIWarpingData(FixedROIWarping) +end +--[[ +---------------------------------------------------------------------- +function testJacobianWithRandomROIForROIWarpingDeltaROI(cls) + --pooling grid size + local w=4; + local h=4; + --img size + local W=w*2; + local H=h*2; + + + local batchSize = 3 + local numRoi = batchSize + local numRepeat = 3 + + torch.manualSeed(0) + for i=1,numRepeat do + local img = torch.rand(batchSize, 3, H, W); + --local roi = torch.Tensor{1, 1, 1, W, H}:reshape(1, 5) + local roi = randROI(img:size(), numRoi) + local input = torch.rand(numRoi, 4) + local module = cls.new(w, h, 1, roi, img) + + print('---0000000000000000000000000000') + --print(img) + print(roi) + print(input) + print(delta_rois_to_rois(roi[{{}, {2,5}}], input)) + + local perturbation = 1e-3 + local jac_fprop = jac.forward(module, input, input, 1e-3) + --module:forward(input) + local jac_bprop = jac.backward(module, input) + + --print('---1111111111111111111111111111') + print(jac_fprop) + print('---2222222222222222222222222222') + print(jac_bprop) + + local err = jac.testJacobian(module, input, -1, 1, 1e-3) + mytester:assertlt(err, precision, 'error on ROIWarping ') + end +end + +function inntest.ROIWarpingDeltaROI() + local FixedROIWarpingDeltaROI, parent = torch.class('FixedROIWarpingDeltaROI', 'inn.ROIWarping') + function FixedROIWarpingDeltaROI:__init(W, H, s, roi, img) + self.img = img + self.roi = roi + self.delta_roi = self.roi:clone() + parent.__init(self, W, H, s) + self:cuda() + end + + function 
FixedROIWarpingDeltaROI:updateOutput(input) + self.delta_roi[{{},{2,5}}] = input:typeAs(self.delta_roi) + return parent.updateOutput(self,{self.img:cuda(), self.roi:cuda(), self.delta_roi:cuda()}) + end + function FixedROIWarpingDeltaROI:updateGradInput(input, gradOutput) + self.delta_roi[{{},{2,5}}] = input:typeAs(self.delta_roi) + return parent.updateGradInput(self,{self.img:cuda(), self.roi:cuda(), self.delta_roi:cuda()}, gradOutput)[3][{{}, {2, 5}}] + end + + testJacobianWithRandomROIForROIWarpingDeltaROI(FixedROIWarpingDeltaROI) +end +]] +---------------------------------------------------------------------- +function testJacobianWithRandomROIForROIWarpingGridGenerator(cls) + --pooling grid size + local w=4; + local h=4; + --img size + local W=w*2; + local H=h*2; + + local batchSize = 3 + local numRoi = batchSize + local numRepeat = 3 + + torch.manualSeed(0) + for i=1,numRepeat do + local img = torch.rand(batchSize, 3, H, W); + --local roi = torch.Tensor{1, 1, 1, W, H}:reshape(1, 5) + local roi = randROI(img:size(), numRoi) + local input = torch.rand(numRoi, 4) + local module = cls.new(h, w, roi) + + local perturbation = 1e-3 + local jac_fprop = jac.forward(module, input, input, 1e-3) + local jac_bprop = jac.backward(module, input) + + local err = jac.testJacobian(module, input, -1, 1, 1e-3) + mytester:assertlt(err, precision, 'error on ROIWarping ') + end +end + +function inntest.ROIWarpingGridGeneratorGridCtrs() + local FixedROIWarpingGridGeneratorGridCtrs, parent = torch.class('FixedROIWarpingGridGeneratorGridCtrs', 'inn.ROIWarpingGridGenerator') + function FixedROIWarpingGridGeneratorGridCtrs:__init(H, W, roi) + parent.__init(self, H, W) + self.roi = roi + self.delta_roi = self.roi:clone() + self.grad_bin_sizes = torch.zeros(roi:size(1), 2) + self:cuda() + end + + function FixedROIWarpingGridGeneratorGridCtrs:updateOutput(input) + self.delta_roi[{{},{2,5}}] = input:typeAs(self.delta_roi) + local tmp = parent.updateOutput(self, {self.roi:cuda(), self.delta_roi:cuda()}) + self.output = self.output or input:cuda().new() + self.output:resizeAs(tmp[1]):copy(tmp[1]) + return self.output + end + function FixedROIWarpingGridGeneratorGridCtrs:updateGradInput(input, gradOutput) + self.delta_roi[{{},{2,5}}] = input:typeAs(self.delta_roi) + self.gradInput = parent.updateGradInput(self,{self.roi:cuda(), self.delta_roi:cuda()}, {gradOutput, self.grad_bin_sizes:cuda()}) + return self.gradInput[2][{{}, {2, 5}}] + end + + testJacobianWithRandomROIForROIWarpingGridGenerator(FixedROIWarpingGridGeneratorGridCtrs) +end + +function inntest.ROIWarpingGridGeneratorBinSizes() + local FixedROIWarpingGridGeneratorBinSizes, parent = torch.class('FixedROIWarpingGridGeneratorBinSizes', 'inn.ROIWarpingGridGenerator') + function FixedROIWarpingGridGeneratorBinSizes:__init(H, W, roi) + parent.__init(self, H, W) + self.roi = roi + self.delta_roi = self.roi:clone() + self.grad_grid_ctrs = torch.zeros(roi:size(1), H, W, 2) + self:cuda() + end + + function FixedROIWarpingGridGeneratorBinSizes:updateOutput(input) + self.delta_roi[{{},{2,5}}] = input:typeAs(self.delta_roi) + local tmp = parent.updateOutput(self, {self.roi:cuda(), self.delta_roi:cuda()}) + self.output = self.output or input:cuda().new() + self.output:resizeAs(tmp[2]):copy(tmp[2]) + return self.output + end + function FixedROIWarpingGridGeneratorBinSizes:updateGradInput(input, gradOutput) + self.delta_roi[{{},{2,5}}] = input:typeAs(self.delta_roi) + self.gradInput = parent.updateGradInput(self,{self.roi:cuda(), self.delta_roi:cuda()}, 
{self.grad_grid_ctrs:cuda(), gradOutput}) + return self.gradInput[2][{{}, {2, 5}}] + end + + testJacobianWithRandomROIForROIWarpingGridGenerator(FixedROIWarpingGridGeneratorBinSizes) +end + +function testJacobianWithRandomROIForROIWarpingBilinearSampleData(cls) + --pooling grid size + local w=4; + local h=4; + --input size + local W=w*2; + local H=h*2; + + local batchSize = 3 + local numRoi = batchSize + local numRepeat = 3 + + torch.manualSeed(0) + for i=1,numRepeat do + local input = torch.rand(batchSize, 1, H, W); + local roi = randROI(input:size(), numRoi) + local delta_roi = roi:clone() + delta_roi[{{}, {2, 5}}] = torch.rand(numRoi, 4) + local pred_rois = delta_rois_to_rois(roi[{{}, {2, 5}}], delta_roi[{{}, {2, 5}}]) + + local preprocess = inn.ROIWarpingGridGenerator(h, w); preprocess:cuda() + local output = preprocess:forward({roi:cuda(), delta_roi:cuda()}) + local grid_ctrs = output[1]:clone() + local bin_sizes = output[2]:clone() + local roi_batch_inds = output[3]:clone() + + local module = cls.new(h, w, grid_ctrs, bin_sizes, roi_batch_inds) + local err = jac.testJacobian(module, input, nil, nil, 1e-3) + mytester:assertlt(err, precision, 'error on ROIWarpingBilinearSampleData ') + end +end + +function inntest.ROIWarpingBilinearSampleData() + local FixedROIWarpingBilinearSampleData, parent = torch.class('FixedROIWarpingBilinearSampleData', 'inn.ROIWarpingBilinearSample') + function FixedROIWarpingBilinearSampleData:__init(H, W, grid_ctrs, bin_sizes, roi_batch_inds) + self.grid_ctrs = grid_ctrs:clone() + self.bin_sizes = bin_sizes:clone() + self.roi_batch_inds = roi_batch_inds:clone() + parent.__init(self, H, W) + self:cuda() + end + + function FixedROIWarpingBilinearSampleData:updateOutput(input) + return parent.updateOutput(self, {input:cuda(), self.grid_ctrs:cuda(), self.bin_sizes:cuda(), self.roi_batch_inds:cuda()}) + end + function FixedROIWarpingBilinearSampleData:updateGradInput(input, gradOutput) + return parent.updateGradInput(self, {input:cuda(), self.grid_ctrs:cuda(), self.bin_sizes:cuda(), self.roi_batch_inds:cuda()}, gradOutput:cuda())[1] + end + + testJacobianWithRandomROIForROIWarpingBilinearSampleData(FixedROIWarpingBilinearSampleData) +end + +function testJacobianWithRandomROIForROIWarpingBilinearSampleGridCtrs(cls) + --pooling grid size + local w=4; + local h=4; + --input size + local W=w*2; + local H=h*2; + + local batchSize = 3 + local numRoi = batchSize + local numRepeat = 3 + + torch.manualSeed(0) + for i=1,numRepeat do + local img = torch.rand(batchSize, 1, H, W); + local roi = randROI(img:size(), numRoi) + local delta_roi = roi:clone() + delta_roi[{{}, {2, 5}}] = torch.rand(numRoi, 4) + local pred_rois = delta_rois_to_rois(roi[{{}, {2, 5}}], delta_roi[{{}, {2, 5}}]) + + local preprocess = inn.ROIWarpingGridGenerator(h, w); preprocess:cuda() + local output = preprocess:forward({roi:cuda(), delta_roi:cuda()}) + local input = output[1]:clone() -- the grid centers become the tested input + local bin_sizes = output[2]:clone() + local roi_batch_inds = output[3]:clone() + + local module = cls.new(h, w, img, bin_sizes, roi_batch_inds) + + local err = jac.testJacobian(module, input, nil, nil, 1e-3)
mytester:assertlt(err, precision, 'error on ROIWarpingBilinearSample ') + end +end + +function inntest.ROIWarpingBilinearSampleGridCtrs() + local FixedROIWarpingBilinearSampleGridCtrs, parent = torch.class('FixedROIWarpingBilinearSampleGridCtrs', 'inn.ROIWarpingBilinearSample') + function FixedROIWarpingBilinearSampleGridCtrs:__init(H, W, img, bin_sizes, roi_batch_inds) + self.img = img:clone() + self.bin_sizes = bin_sizes:clone() + self.roi_batch_inds = roi_batch_inds:clone() + parent.__init(self, H, W) + self:cuda() + end + + function FixedROIWarpingBilinearSampleGridCtrs:updateOutput(input) + return parent.updateOutput(self, {self.img:cuda(), input:cuda(), self.bin_sizes:cuda(), self.roi_batch_inds:cuda()}) + end + function FixedROIWarpingBilinearSampleGridCtrs:updateGradInput(input, gradOutput) + return parent.updateGradInput(self, {self.img:cuda(), input:cuda(), self.bin_sizes:cuda(), self.roi_batch_inds:cuda()}, gradOutput:cuda())[2] + end + + testJacobianWithRandomROIForROIWarpingBilinearSampleGridCtrs(FixedROIWarpingBilinearSampleGridCtrs) +end +--[[ +function testJacobianWithRandomROIForROIWarpingBilinearSampleBinSizes(cls) + + --pooling grid size + local w=4; + local h=4; + --input size + local W=w*2; + local H=h*2; + + local batchSize = 3 + local numRoi = batchSize + local numRepeat = 3 + + torch.manualSeed(0) + for i=1,numRepeat do + local img = torch.rand(batchSize, 1, H, W); + local roi = randROI(img:size(), numRoi) + local delta_roi = roi:clone() + delta_roi[{{}, {2, 5}}] = torch.rand(numRoi, 4) + local pred_rois = delta_rois_to_rois(roi[{{}, {2, 5}}], delta_roi[{{}, {2, 5}}]) + + local preprocess = inn.ROIWarpingGridGenerator(h, w); preprocess:cuda() + local output = preprocess:forward({roi:cuda(), delta_roi:cuda()}) + local grid_ctrs = output[1]:clone() + local bin_sizes = output[2]:clone() + local roi_batch_inds = output[3]:clone() + + local input = bin_sizes:clone() + + local module = cls.new(h, w, img, grid_ctrs, roi_batch_inds) + + local err = jac.testJacobian(module, input, nil, nil, 1e-3) + -- for debug + if err > precision then + local jac_fprop = jac.forward(module, input, nil, 1e-3) + local jac_bprop = jac.backward(module, input) + print(jac_fprop) + print(jac_bprop) + local err = jac_fprop-jac_bprop + local val, index = torch.max(err:view(-1):abs(), 1) + print(val) + print(index) + print(input:numel()) + print(input:size()) + print(pred_rois) + print(input) + print(bin_sizes) + end + -- til here + mytester:assertlt(err, precision, 'error on ROIWarpingBilinearSample ') + end +end + +function inntest.ROIWarpingBilinearSampleBinSizes() + local FixedROIWarpingBilinearSampleBinSizes, parent = torch.class('FixedROIWarpingBilinearSampleBinSizes', 'inn.ROIWarpingBilinearSample') + function FixedROIWarpingBilinearSampleBinSizes:__init(H, W, img, grid_ctrs, roi_batch_inds) + self.img = img:clone() + self.grid_ctrs = grid_ctrs:clone() + self.roi_batch_inds =
roi_batch_inds:clone() + parent.__init(self, H, W) + self:cuda() + end + + function FixedROIWarpingBilinearSampleBinSizes:updateOutput(input) + return parent.updateOutput(self, {self.img:cuda(), self.grid_ctrs:cuda(), input:cuda(), self.roi_batch_inds:cuda()}) + end + function FixedROIWarpingBilinearSampleBinSizes:updateGradInput(input, gradOutput) + return parent.updateGradInput(self, {self.img:cuda(), self.grid_ctrs:cuda(), input:cuda(), self.roi_batch_inds:cuda()}, gradOutput:cuda())[3] + end + + testJacobianWithRandomROIForROIWarpingBilinearSampleBinSizes(FixedROIWarpingBilinearSampleBinSizes) +end +]] + jac = nn.Jacobian mytester:add(inntest) mytester:run() diff --git a/test/test_roiwarping.lua b/test/test_roiwarping.lua new file mode 100644 index 0000000..ab4d3d1 --- /dev/null +++ b/test/test_roiwarping.lua @@ -0,0 +1,297 @@ +local function delta_rois_to_rois(rois, delta_rois) + local src_w = rois[{{},3}] - rois[{{},1}] + 1; + local src_h = rois[{{},4}] - rois[{{},2}] + 1; + local src_ctr_x = rois[{{},1}] + 0.5*(src_w-1.0); + local src_ctr_y = rois[{{},2}] + 0.5*(src_h-1.0); + + local dst_ctr_x = delta_rois[{{},1}]; -- dx (in fast-rcnn notation) = cx (in here) + local dst_ctr_y = delta_rois[{{},2}]; -- dy (in fast-rcnn notation) = cy (in here) + local dst_scl_x = delta_rois[{{},3}]; -- dw (in fast-rcnn notation) = sx (in here) + local dst_scl_y = delta_rois[{{},4}]; -- dh (in fast-rcnn notation) = sy (in here) + + local pred_ctr_x = torch.cmul(dst_ctr_x, src_w) + src_ctr_x; + local pred_ctr_y = torch.cmul(dst_ctr_y, src_h) + src_ctr_y; + local pred_w = torch.cmul(torch.exp(dst_scl_x), src_w); + local pred_h = torch.cmul(torch.exp(dst_scl_y), src_h); + + local roi_start_w = pred_ctr_x - 0.5*(pred_w-1) + local roi_start_h = pred_ctr_y - 0.5*(pred_h-1) + local roi_end_w = pred_ctr_x + 0.5*(pred_w-1) + local roi_end_h = pred_ctr_y + 0.5*(pred_h-1) + + return torch.cat({roi_start_w, roi_start_h, roi_end_w, roi_end_h}, 2) +end + +local inn = require 'inn' +local nn = require 'nn' + +torch.manualSeed(3) + +local n_images = 1 -- 2 +local channels = 3 +local H = 4 +local W = 3 +local height = 3 +local width = 6 + +local sz = torch.Tensor{channels, height, width} +local input_image = torch.Tensor(n_images, sz[1], sz[2], sz[3]):copy(torch.linspace(1, n_images * sz[1] * sz[2] * sz[3], n_images * sz[1] * sz[2] * sz[3]):reshape(n_images, sz[1], sz[2], sz[3])) + +print(input_image) + +local n_rois = 1 --3 --10 +local rois=torch.Tensor(n_rois,5) +for i=1,n_rois do + idx=torch.randperm(n_images)[1] + y=torch.randperm(sz[2])[{{1,2}}]:sort() + x=torch.randperm(sz[3])[{{1,2}}]:sort() + rois[{i,{}}] = torch.Tensor({idx,x[1],y[1],x[2],y[2]}) + --rois[{i,{}}] = torch.Tensor({idx,1,1,sz[3],sz[2]}) + --rois[{i,{}}] = torch.Tensor({idx,1,1,H/2,W/2}) + --rois[{i,{}}] = torch.Tensor({idx,1,1,1,1}) +end +--rois[{1,{}}] = torch.Tensor({1,2,2,3,5}) +--rois[{2,{}}] = torch.Tensor({1,1,5,3,6}) +--rois[{1,{}}] = torch.Tensor({1,1,5,3,6}) +--rois[{{}, {}}] = torch.Tensor({{1, 3, 1, 6, 5}, +-- {3, 4, 4, 5, 8}, +-- {2, 1, 1, 3, 5}}) +--print(rois) + +local model = inn.ROIPooling(W,H) +model.v2 = false +model:cuda() + +local output = model:forward({input_image:cuda(), rois:cuda()}) +--print(output) + +local model = inn.ROIWarping(W,H) +model:cuda() +--local output = model:forward({input_image:cuda(), rois:cuda()}) +--print(output) + +--------------- +print('-------------------------') +local delta_rois = rois:clone() +--delta_rois[{{}, {2,5}}] = 0 +--delta_rois[{{}, {2,5}}] = torch.ones(n_rois, 4) * 0.1 
+--delta_rois[{{}, {2,5}}] = 0.1 * torch.rand(n_rois, 4) +delta_rois[{{}, {2,5}}] = torch.rand(n_rois, 4) +--delta_rois[{{}, {2,5}}] = torch.Tensor{0.7887, 0.4103, 0.7086, 0.7714}:reshape(1,4) +--delta_rois[{{}, {2,5}}] = torch.Tensor{0.4694, 0.1311, 0.8265, 0.1495, 0.9336, 0.4434, 0.5211, 0.1230}:reshape(2,4) +--delta_rois[{{}, {2,5}}] = torch.Tensor{0.4694, 0.1311, 0.8265, 0.1495}:reshape(1,4) +--delta_rois[{{}, {2,5}}] = torch.Tensor{0.9336, 0.4434, 0.5211, 0.1230}:reshape(1,4) +--delta_rois[{{}, {}}] = torch.Tensor({{1.0000, 0.7253, 0.6597, 0.5013, 0.5332}, +-- {3.0000, 0.9561, 0.2305, 0.6440, 0.3949}, +-- {2.0000, 0.4239, 0.6188, 0.6064, 0.4749}}) +-- +print(rois) +print(delta_rois) +local pred_rois = delta_rois_to_rois(rois[{{}, {2,5}}], delta_rois[{{}, {2,5}}]) +print(pred_rois) +print(torch.round(pred_rois)) +--[[ +local output = model:forward({input_image:cuda(), rois:cuda(), delta_rois:cuda()}) +--local output = model:forward({input_image:clone():fill(1):cuda(), rois:cuda(), delta_rois:cuda()}) +print(output) +print(output:sum()) + +print('-------------------------') +local gradOutput = torch.ones(n_rois, channels, H, W):cuda() --torch.rand(n_rois, channels, H, W):cuda() --torch.Tensor(n_rois, channels, 3, 3):fill(1) +local gradInput = model:backward({input_image:cuda(), rois:cuda(), delta_rois:cuda()}, gradOutput) +--local gradInput = model:backward({input_image:clone():fill(1):cuda(), rois:cuda(), delta_rois:cuda()}, gradOutput) +print(gradInput[1]) +print(gradInput[1]:sum()) +print(gradInput[2]) +print(gradInput[3]) +print(gradInput[3]:sum()) +]] + +--print('------------------------------------------------------------') +local model = inn.ROIWarpingGridGenerator(H, W) +model:cuda() +local output = model:forward({rois:cuda(), delta_rois:cuda()}) +--print(output[1]:select(4,1)) +--print(output[1]:select(4,2)) +--print(output[2]) +--print(output[3]) +local grid_ctrs = output[1]:clone() +local bin_sizes = output[2]:clone() +local roi_batch_inds = output[3]:clone() + +local gradOutput = {torch.ones(n_rois, H, W, 2):cuda(), --torch.rand(n_rois, channels, H, W, 2):cuda() + torch.ones(n_rois, 2):cuda()} +local gradInput = model:backward({rois:cuda(), delta_rois:cuda()}, gradOutput) +--print(gradInput[1]) +--print(gradInput[2]) + +print('------------------------------------------------------------') +--local input_image = 10 * torch.rand(input_image:size()) --torch.ones(input_image:size()) +local input_image = torch.ones(input_image:size()) +local model = inn.ROIWarpingBilinearSample(H, W) +model:cuda() +local output = model:forward({input_image:cuda(), grid_ctrs:cuda(), bin_sizes:cuda(), roi_batch_inds:cuda()}) +--print(output) +--print(output:sum()) + +--print('hi0000000000000000000000') +local gradOutput = torch.ones(n_rois, channels, H, W):cuda() +local gradInput = model:backward({input_image:cuda(), grid_ctrs:cuda(), bin_sizes:cuda(), roi_batch_inds:cuda()}, gradOutput) +--print(gradInput) +--print(gradInput[1]) +--print(gradInput[1]:sum()) +--print(gradInput[2]:select(4,1)/3) +--print(gradInput[2]:select(4,2)/3) +--print(gradInput[2]:sum()) +--print(gradInput[3]) +--print(gradInput[4]) + +print('------------------------------------------------------------') +local FixedROIWarpingBilinearSampleGridCtrs, parent = torch.class('FixedROIWarpingBilinearSampleGridCtrs', 'inn.ROIWarpingBilinearSample') +function FixedROIWarpingBilinearSampleGridCtrs:__init(H, W, img, bin_sizes, roi_batch_inds) + self.img = img:clone() + self.bin_sizes = bin_sizes:clone() + self.roi_batch_inds = 
roi_batch_inds:clone() + parent.__init(self, H, W) + self:cuda() +end + +function FixedROIWarpingBilinearSampleGridCtrs:updateOutput(input) + return parent.updateOutput(self, {self.img:cuda(), input:cuda(), self.bin_sizes:cuda(), self.roi_batch_inds:cuda()}) +end +function FixedROIWarpingBilinearSampleGridCtrs:updateGradInput(input, gradOutput) + return parent.updateGradInput(self, {self.img:cuda(), input:cuda(), self.bin_sizes:cuda(), self.roi_batch_inds:cuda()}, gradOutput:cuda())[2] +end + +local jac = nn.Jacobian + +local module = FixedROIWarpingBilinearSampleGridCtrs.new(H, W, input_image, bin_sizes, roi_batch_inds) +local input = grid_ctrs:clone() + +local err = jac.testJacobian(module, input, nil, nil, 1e-3) + +local ind = 24 + + + + + + + + + + +local function jacforward(module, input, param, perturbation) + param = param or input + -- perturbation amount + perturbation = perturbation or 1e-6 + -- 1D view of input + --local tst = param:storage() + local sin = param.new(param):resize(param:nElement())--param.new(tst,1,tst:size()) + -- jacobian matrix to calculate + local jacobian = torch.Tensor():resize(param:nElement(),module:forward(input):nElement()) + + print('hi11') + print('input: ') + print(input) + print('param: ') + print(param) + print('sin: ') + print(sin) + + local outa = torch.Tensor(jacobian:size(2)) + local outb = torch.Tensor(jacobian:size(2)) + + for i=1,sin:nElement() do + local orig = sin[i] + if i == ind then + print(orig) + end + + sin[i] = orig - perturbation + outa:copy(module:forward(input)) + if i == ind then + print(sin[i]) + end + + sin[i] = orig + perturbation + outb:copy(module:forward(input)) + if i == ind then + print(sin[i]) + end + + sin[i] = orig + + if i == ind then + print(outa) + print(outb) + print(outb-outa) + end + + outb:add(-1,outa):div(2*perturbation) + + if i == 5 then + print(outb) + end + + jacobian:select(1,i):copy(outb) + end + + return jacobian +end + + +--local input = grid_ctrs:clone() +print(input) +print(grid_ctrs) +if err > 0.001 then + print('error!!!!!!!!!!!!!!!') + print(err) + --local jac_fprop = jac.forward(module, input, nil, 1e-3) + local jac_fprop = jacforward(module, input, nil, 1e-3) + local jac_bprop = jac.backward(module, input) -- input:numel() x output:numel() + print(jac_fprop) + print(jac_bprop) + local err = jac_fprop-jac_bprop + local val, index = torch.max(err:view(-1):abs(), 1) + print(val) + print(index) + --print(input:numel()) --print(input:nElement()) + print('input img: ') + print(input_image:size()) + print(input_image) + print('pred_rois: ') + print(pred_rois) + print(bin_sizes) + --print(input) + --print(input:select(4,1)) + --print(input:select(4,2)) + --print(grid_ctrs:select(4,1)) + --print(grid_ctrs:select(4,2)) + --print(grid_ctrs:view(-1)) + --print(grid_ctrs:view(-1):select(1,4)) + + print('test error!!!!!!!!!!!!!!!!!!!!!!!!!!') + local tmp = input:clone() --grid_ctrs:clone() + local tmp2 = tmp:view(-1) + local orig = tmp2[ind] + --local ind = 5 + + tmp2[ind] = orig + 1e-3 + local output1 = model:forward({input_image:cuda(), tmp:cuda(), bin_sizes:cuda(), roi_batch_inds:cuda()}):clone() + print('grid_ctrs1: ') + print(tmp:select(4,1)) + print(tmp:select(4,2)) + print('output1: ') + print(output1) + + tmp2[ind] = orig - 1e-3 + local output2 = model:forward({input_image:cuda(), tmp:cuda(), bin_sizes:cuda(), roi_batch_inds:cuda()}) + print('grid_ctrs2: ') + print(tmp:select(4,1)) + print(tmp:select(4,2)) + print('output2: ') + print(output2) + print(torch.sqrt(torch.pow(output1:view(-1) - 
output2:view(-1),2))) + print(torch.sum(torch.sqrt(torch.pow(output1:view(-1) - output2:view(-1),2)))) +end
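-- The ad-hoc instrumentation above boils down to comparing numeric and analytic
-- Jacobians; a condensed sketch of the same check (uses the module and input
-- defined above, and the nn.Jacobian calls already used by this script):
local jac = nn.Jacobian
local numeric = jac.forward(module, input, input, 1e-3) -- central differences
local analytic = jac.backward(module, input)            -- via updateGradInput
print((numeric - analytic):abs():max())                 -- should stay below ~1e-3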