diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..813e4f9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +*.sw* +*.bak* +build +build/* diff --git a/ROIWarping.cu b/ROIWarping.cu new file mode 100644 index 0000000..23518ca --- /dev/null +++ b/ROIWarping.cu @@ -0,0 +1,595 @@ +// ------------------------------------------------------------------ +// Fast R-CNN +// Copyright (c) 2015 Microsoft +// Licensed under The MIT License [see fast-rcnn/LICENSE for details] +// Written by Ross Girshick +// ------------------------------------------------------------------ + +// Torch port: +// IMAGINE, Sergey Zagoruyko, Francisco Massa, 2015 + +#include "THC.h" +#include +#include +#include "assert.h" + +#include "common.h" + + +using std::max; +using std::min; + + +template +__global__ void ROIWarpForward(const int nthreads, const Dtype* bottom_data, + const Dtype spatial_scale, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const Dtype* bottom_rois, const Dtype* bottom_delta_rois, Dtype* top_data, Dtype* top_data_buffer) { + CUDA_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + bottom_rois += n * 5; + int roi_batch_ind = (bottom_rois[0] - 1); + //int roi_start_w = round((bottom_rois[1] - 1) * spatial_scale); + //int roi_start_h = round((bottom_rois[2] - 1)* spatial_scale); + //int roi_end_w = round((bottom_rois[3] - 1) * spatial_scale); + //int roi_end_h = round((bottom_rois[4] - 1) * spatial_scale); + + Dtype src_w = bottom_rois[3] - bottom_rois[1] + 1; + Dtype src_h = bottom_rois[4] - bottom_rois[2] + 1; + Dtype src_ctr_x = bottom_rois[1] + 0.5*(src_w-1.0); + Dtype src_ctr_y = bottom_rois[2] + 0.5*(src_h-1.0); + + Dtype dst_ctr_x = bottom_delta_rois[1]; // dx (in fast-rcnn notation) = cx (in here) + Dtype dst_ctr_y = bottom_delta_rois[2]; // dy (in fast-rcnn notation) = cy (in here) + Dtype dst_scl_x = bottom_delta_rois[3]; // dw (in fast-rcnn notation) = sx (in here) + Dtype dst_scl_y = bottom_delta_rois[4]; // dh (in fast-rcnn notation) = sy (in here) + + Dtype pred_ctr_x = dst_ctr_x * src_w + src_ctr_x; + Dtype pred_ctr_y = dst_ctr_y * src_h + src_ctr_y; + Dtype pred_w = exp(dst_scl_x) * src_w; + Dtype pred_h = exp(dst_scl_y) * src_h; + + Dtype roi_start_w = ( (pred_ctr_x - 0.5*(pred_w-1)) - 1 ) * spatial_scale; + Dtype roi_start_h = ( (pred_ctr_y - 0.5*(pred_h-1)) - 1 ) * spatial_scale; + Dtype roi_end_w = ( (pred_ctr_x + 0.5*(pred_w-1)) - 1 ) * spatial_scale; + Dtype roi_end_h = ( (pred_ctr_y + 0.5*(pred_h-1)) - 1 ) * spatial_scale; + assert(roi_end_w - roi_start_w >= 0); + assert(roi_end_h - roi_start_h >= 0); + + // Force malformed ROIs to be 1x1 + Dtype roi_width = roi_end_w - roi_start_w + 1; + Dtype roi_height = roi_end_h - roi_start_h + 1; + + Dtype bin_size_w = roi_width / static_cast(pooled_width); + Dtype bin_size_h = roi_height / static_cast(pooled_height); + Dtype wstart_ = static_cast(pw) * bin_size_w + roi_start_w; + Dtype hstart_ = static_cast(ph) * bin_size_h + roi_start_h; + Dtype wend_ = static_cast(pw+1) * bin_size_w + roi_start_w; + Dtype hend_ = static_cast(ph+1) * bin_size_h + roi_start_h; + + int wstart = static_cast(floor(wstart_)); + int hstart = static_cast(floor(hstart_)); + int wend = static_cast( ceil(wend_)); + int hend = static_cast( 
ceil(hend_)); + + Dtype wctr = (wend_ + wstart_) * 0.5; // dwctr / dwe = 0.5; dwctr / dws = 0.5 + Dtype hctr = (hend_ + hstart_) * 0.5; // dhctr / dhe = 0.5; dhctr / dhs = 0.5 + Dtype wdiff = (wend_ - wstart_) + 1; // dwdiff / dwe = 1; dwdiff / dws = -1 + Dtype hdiff = (hend_ - hstart_) + 1; // dhdiff / dhe = 1; dhdiff / dhs = -1 + + //top_data[index] = static_cast(hend-1-hstart)+1; + //top_data[index] = hend; //wend; + //top_data[index] = hstart+1; // wstart+1; + //top_data[index] = wdiff; + //top_data[index] = hctr+1; + //top_data[index] = wctr+1; + + // Add roi offsets and clip to input boundaries + hstart = min(max(hstart, 0), height); + hend = min(max(hend, 0), height); + wstart = min(max(wstart, 0), width); + wend = min(max(wend, 0), width); + + //top_data[index] = hstart+1; + //top_data[index] = wstart+1; + + // Auxilliary variables used in backprop + Dtype w_mask = 0, h_mask = 0; + Dtype dgx_final_dwctr_all = 0; + Dtype dgx_final_dwdiff_all = 0; + Dtype dgy_final_dhctr_all = 0; + Dtype dgy_final_dhdiff_all = 0; + // Define an empty pooling region to be zero + Dtype val = 0; Dtype gain = 0, gain_x = 0, gain_y = 0, gain_x_all = 0, gain_y_all = 0; + bottom_data += (roi_batch_ind * channels + c) * height * width; + for (int h = hstart; h < hend; ++h) { + Dtype h_ = h; + for (int w = wstart; w < wend; ++w) { + int bottom_index = h * width + w; + Dtype w_ = w; + //gain_x = max(0., 1 - abs( dst_ctr_x + static_cast(pw) / static_cast(pooled_width) * dst_scl_x - w )); -- in paper, but makes no sense + //gain_y = max(0., 1 - abs( dst_ctr_y + static_cast(ph) / static_cast(pooled_height) * dst_scl_y - h)); + gain_x = wdiff - abs((w_ - wctr)); + gain_y = hdiff - abs((h_ - hctr)); + gain = gain_x * gain_y; + + val = val + gain * bottom_data[bottom_index]; + //val = val + gain; + //val = val + 1; + + if (h == hstart) { + gain_x_all = gain_x_all + gain_x; + + // Update information used in backprop + w_mask = w_ >= wctr ? 1 : -1; + dgx_final_dwctr_all = dgx_final_dwctr_all + w_mask; + dgx_final_dwdiff_all = dgx_final_dwdiff_all + 1; + } + } + gain_y_all = gain_y_all + gain_y; + + h_mask = h >= hctr ? 
1 : -1; + dgy_final_dhctr_all = dgy_final_dhctr_all + h_mask; + dgy_final_dhdiff_all = dgy_final_dhdiff_all + 1; + } + if (gain_x_all > 1e-10) + val = val / gain_x_all; + if (gain_y_all > 1e-10) + val = val / gain_y_all; + top_data[index] = val; + + //top_data[index] = gain_x_all; + //top_data[index] = gain_y_all; + int buffer_index = n * (channels * pooled_height * pooled_width * 10) + c * (pooled_height * pooled_width * 10) + ph * (pooled_width * 10) + pw * 10; + top_data_buffer[buffer_index+0] = wctr; + top_data_buffer[buffer_index+1] = wdiff; + top_data_buffer[buffer_index+2] = hctr; + top_data_buffer[buffer_index+3] = hdiff; + top_data_buffer[buffer_index+4] = gain_x_all; + top_data_buffer[buffer_index+5] = gain_y_all; + top_data_buffer[buffer_index+6] = dgx_final_dwctr_all; + top_data_buffer[buffer_index+7] = dgy_final_dhctr_all; + top_data_buffer[buffer_index+8] = dgx_final_dwdiff_all; + top_data_buffer[buffer_index+9] = dgy_final_dhdiff_all; + } +} + +extern "C" +void inn_ROIWarping_updateOutput(THCState *state, + THCudaTensor *output, THCudaTensor *output_buffer, + THCudaTensor *data, THCudaTensor* rois, THCudaTensor* delta_rois, int W, int H, double spatial_scale) +{ + THAssert(THCudaTensor_nDimension(state, data) == 4); + THAssert(THCudaTensor_nDimension(state, rois) == 2 && rois->size[1] == 5); + THAssert(THCudaTensor_nDimension(state, delta_rois) == 2 && delta_rois->size[1] == 5); + THAssert(THCudaTensor_nDimension(state, rois) == THCudaTensor_nDimension(state, delta_rois) && + rois->size[0] == delta_rois->size[0] && + rois->size[1] == delta_rois->size[1]); + THAssert(THCudaTensor_isContiguous(state, data)); + THAssert(THCudaTensor_isContiguous(state, rois)); + THAssert(THCudaTensor_isContiguous(state, delta_rois)); + long num_rois = rois->size[0]; + long nInputPlane = data->size[1]; + THCudaTensor_resize4d(state, output, num_rois, nInputPlane, H, W); + THCudaTensor_resize5d(state, output_buffer, num_rois, nInputPlane, H, W, 10); + THCudaTensor_zero(state, output_buffer); + + long count = THCudaTensor_nElement(state, output); + + ROIWarpForward<<>>( + count, + THCudaTensor_data(state, data), + spatial_scale, nInputPlane, data->size[2], data->size[3], H, W, + THCudaTensor_data(state, rois), + THCudaTensor_data(state, delta_rois), + THCudaTensor_data(state, output), + THCudaTensor_data(state, output_buffer) + ); + + // check for errors + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("error in inn_ROIWarping_updateOutput: %s\n", cudaGetErrorString(err)); + THError("aborting"); + } +} + +template +__global__ void ROIWarpBackwardData(const int nthreads, const Dtype* top_data_buffer, + const Dtype spatial_scale, const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int nth_roi, + const Dtype* bottom_rois, const Dtype* bottom_delta_rois, + const Dtype* top_diff, + Dtype* bottom_diff_data) { + CUDA_KERNEL_LOOP(index, nthreads) { + + // (n, c, h, w) is an element in the input + int w = index % width; + int h = (index / width) % height; + int c = (index / width / height) % channels; + int n = index / width / height / channels; + + bottom_rois += nth_roi * 5; + int roi_batch_ind = (bottom_rois[0] - 1); + + if (roi_batch_ind == n) { + + Dtype src_w = bottom_rois[3] - bottom_rois[1] + 1; + Dtype src_h = bottom_rois[4] - bottom_rois[2] + 1; + Dtype src_ctr_x = bottom_rois[1] + 0.5*(src_w-1.0); + Dtype src_ctr_y = bottom_rois[2] + 0.5*(src_h-1.0); + + Dtype dst_ctr_x = bottom_delta_rois[1]; // dx (in 
fast-rcnn notation) = cx (in here) + Dtype dst_ctr_y = bottom_delta_rois[2]; // dy (in fast-rcnn notation) = cy (in here) + Dtype dst_scl_x = bottom_delta_rois[3]; // dw (in fast-rcnn notation) = sx (in here) + Dtype dst_scl_y = bottom_delta_rois[4]; // dh (in fast-rcnn notation) = sy (in here) + + Dtype pred_ctr_x = dst_ctr_x * src_w + src_ctr_x; // dpcx / dcx = src_w + Dtype pred_ctr_y = dst_ctr_y * src_h + src_ctr_y; // dpcy / dcy = src_h + Dtype pred_w = exp(dst_scl_x) * src_w; // dpw / dsx = src_w * exp(dsx) + Dtype pred_h = exp(dst_scl_y) * src_h; // dph / dsy = src_h * exp(dsy) + + Dtype roi_start_w = ( (pred_ctr_x - 0.5*(pred_w-1)) - 1 ) * spatial_scale; // drsw / dpcx = spatial_scale; drsw / dpw = -0.5 * spatial_scale + Dtype roi_start_h = ( (pred_ctr_y - 0.5*(pred_h-1)) - 1 ) * spatial_scale; // drsh / dpcy = spatial_scale; drsh / dph = -0.5 * spatial_scale + Dtype roi_end_w = ( (pred_ctr_x + 0.5*(pred_w-1)) - 1 ) * spatial_scale; // drew / dpcx = spatial_scale; drew / dpw = 0.5 * spatial_scale + Dtype roi_end_h = ( (pred_ctr_y + 0.5*(pred_h-1)) - 1 ) * spatial_scale; // dreh / dpcy = spatial_scale; dreh / dph = 0.5 * spatial_scale + assert(roi_end_w - roi_start_w >= 0); + assert(roi_end_h - roi_start_h >= 0); + + Dtype roi_width = roi_end_w - roi_start_w + 1; + Dtype roi_height = roi_end_h - roi_start_h + 1; + + Dtype bin_size_pw = static_cast(pooled_width) / roi_width; + Dtype bin_size_ph = static_cast(pooled_height) / roi_height; + + int pwstart = static_cast(floor(static_cast(-roi_start_w + w) * bin_size_pw)); + int phstart = static_cast(floor(static_cast(-roi_start_h + h) * bin_size_ph)); + int pwend = static_cast(ceil(static_cast(-roi_start_w + w+1) * bin_size_pw)); + int phend = static_cast(ceil(static_cast(-roi_start_h + h+1) * bin_size_ph)); + + //bottom_diff_data[index] = pwend; //phend; + //bottom_diff_data[index] = pwstart+1; //phend; + + // Clip to top boundaries + phstart = min(max(phstart, 0), pooled_height); + phend = min(max(phend, 0), pooled_height); + pwstart = min(max(pwstart, 0), pooled_width); + pwend = min(max(pwend, 0), pooled_width); + + Dtype w_ = w, h_ = h; + Dtype wctr = 0, wdiff = 0, hctr = 0, hdiff = 0; + Dtype gain = 0, gain_x = 0, gain_y = 0, gain_x_all = 0, gain_y_all = 0; + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + int top_index = nth_roi * (channels * pooled_height * pooled_width) + c * (pooled_height * pooled_width) + ph * pooled_width + pw; + int top_buffer_index = nth_roi * (channels * pooled_height * pooled_width * 10) + c * (pooled_height * pooled_width * 10) + ph * (pooled_width * 10) + pw * 10; + wctr = top_data_buffer[top_buffer_index+0]; + wdiff = top_data_buffer[top_buffer_index+1]; + hctr = top_data_buffer[top_buffer_index+2]; + hdiff = top_data_buffer[top_buffer_index+3]; + gain_x_all = top_data_buffer[top_buffer_index+4]; + gain_y_all = top_data_buffer[top_buffer_index+5]; + + gain_x = wdiff - abs((w_ - wctr)); // dgx / dwdiff = 1 + // dgx / dwctr = 1 ( if w >= wctr ) + // dgx / dwctr = - 1 ( else ) + gain_y = hdiff - abs((h_ - hctr)); // dgy / dhdiff = 1 + // dgy / dhctr = 1 ( if h >= hctr ) + // dgy / dhctr = - 1 ( else ) + if (gain_x_all > 1e-10) + gain_x = gain_x / gain_x_all; + if (gain_y_all > 1e-10) + gain_y = gain_y / gain_y_all; + + gain = gain_x * gain_y; + bottom_diff_data[index] = bottom_diff_data[index] + gain * top_diff[top_index]; //val = val + gain * bottom_data[bottom_index]; + } + } + } + } +} + +template +__global__ void ROIWarpBackwardDeltaROI(const int nthreads, 
const Dtype* top_data_buffer, + const Dtype spatial_scale, const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, + const Dtype* bottom_rois, const Dtype* bottom_delta_rois, + const Dtype* top_diff, + const Dtype* bottom_data, + Dtype* bottom_diff_delta_rois_buffer) { + CUDA_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + int buffer_index = n * (channels * pooled_height * pooled_width * 10) + c * (pooled_height * pooled_width * 10) + ph * (pooled_width * 10) + pw * 10; + Dtype wctr = top_data_buffer[buffer_index+0]; + Dtype wdiff = top_data_buffer[buffer_index+1]; + Dtype hctr = top_data_buffer[buffer_index+2]; + Dtype hdiff = top_data_buffer[buffer_index+3]; + Dtype gain_x_all = top_data_buffer[buffer_index+4]; + Dtype gain_y_all = top_data_buffer[buffer_index+5]; + Dtype dgx_final_dwctr_all = top_data_buffer[buffer_index+6]; + Dtype dgy_final_dhctr_all = top_data_buffer[buffer_index+7]; + Dtype dgx_final_dwdiff_all = top_data_buffer[buffer_index+8]; + Dtype dgy_final_dhdiff_all = top_data_buffer[buffer_index+9]; + + //if (gain_x_all > 1e-10 && gain_y_all > 1e-10) { + + bottom_rois += n * 5; + int roi_batch_ind = (bottom_rois[0] - 1); + + Dtype src_w = bottom_rois[3] - bottom_rois[1] + 1; + Dtype src_h = bottom_rois[4] - bottom_rois[2] + 1; + Dtype src_ctr_x = bottom_rois[1] + 0.5*(src_w-1.0); + Dtype src_ctr_y = bottom_rois[2] + 0.5*(src_h-1.0); + + Dtype dst_ctr_x = bottom_delta_rois[1]; // dx (in fast-rcnn notation) = cx (in here) + Dtype dst_ctr_y = bottom_delta_rois[2]; // dy (in fast-rcnn notation) = cy (in here) + Dtype dst_scl_x = bottom_delta_rois[3]; // dw (in fast-rcnn notation) = sx (in here) + Dtype dst_scl_y = bottom_delta_rois[4]; // dh (in fast-rcnn notation) = sy (in here) + + Dtype pred_ctr_x = dst_ctr_x * src_w + src_ctr_x; // dpcx / dcx = src_w + Dtype pred_ctr_y = dst_ctr_y * src_h + src_ctr_y; // dpcy / dcy = src_h + Dtype pred_w = exp(dst_scl_x) * src_w; // dpw / dsx = src_w * exp(dsx) + Dtype pred_h = exp(dst_scl_y) * src_h; // dph / dsy = src_h * exp(dsy) + + Dtype roi_start_w = ( (pred_ctr_x - 0.5*(pred_w-1)) - 1 ) * spatial_scale; // drsw / dpcx = spatial_scale + // drsw / dpw = -0.5 * spatial_scale + Dtype roi_start_h = ( (pred_ctr_y - 0.5*(pred_h-1)) - 1 ) * spatial_scale; // drsh / dpcy = spatial_scale + // drsh / dph = -0.5 * spatial_scale + Dtype roi_end_w = ( (pred_ctr_x + 0.5*(pred_w-1)) - 1 ) * spatial_scale; // drew / dpcx = spatial_scale + // drew / dpw = 0.5 * spatial_scale + Dtype roi_end_h = ( (pred_ctr_y + 0.5*(pred_h-1)) - 1 ) * spatial_scale; // dreh / dpcy = spatial_scale + // dreh / dph = 0.5 * spatial_scale + assert(roi_end_w - roi_start_w >= 0); + assert(roi_end_h - roi_start_h >= 0); + + // drsw / dcx = drsw / dpcx * dpcx / dcx = spatial_scale * src_w + // drew / dcx = drew / dpcx * dpcx / dcx = spatial_scale * src_w + + // drsh / dcy = drsh / dpcy * dpcy / dcy = spatial_scale * src_h + // dreh / dcy = dreh / dpcy * dpcy / dcy = spatial_scale * src_h + + // drsw / dsx = drsw / dpw * dpw / dsx = -0.5 * spatial_scale * src_w * exp(dsx) + // drew / dsx = drew / dpw * dpw / dsx = 0.5 * spatial_scale * src_w * exp(dsx) + + // drsh / dsy = drsh / dph * dph / dsy = -0.5 * spatial_scale * src_h * exp(dsy) + // dreh / dsy = dreh / dph * dph / dsy = 0.5 * 
spatial_scale * src_h * exp(dsy) + + // Force malformed ROIs to be 1x1 + Dtype roi_width = roi_end_w - roi_start_w + 1; // drw / drew = 1 + // drw / drsw = -1 + Dtype roi_height = roi_end_h - roi_start_h + 1; // drh / dreh = 1 + // drh / drsh = -1 + // drw / dcx = drw / drew * drew / dcx + drw / drsw * drsw / dcx = drew / dcx - drsw / dcx + // = spatial_scale * src_w - spatial_scale * src_w = 0 + // drh / dcy = drh / dreh * dreh / dcy + drh / drsh * drsh / dcy = dreh / dcy - drsh / dcy = spatial_scale * src_h - spatial_scale * src_h = 0 + // drw / dsx = drw / drew * drew / dsx + drw / drsw * drsw / dsx = drew / dsx - drsw / dsx = 0.5 * spatial_scale * src_w * exp(dsx) - (-0.5 * spatial_scale * src_w * exp(dsx)) = spatial_scale * src_w * exp(dsx) + // drh / dsy = drh / dreh * dreh / dsy + drh / drsh * drsh / dsy = dreh / dsy - drsh / dsy = 0.5 * spatial_scale * src_h * exp(dsy) - (-0.5 * spatial_scale * src_h * exp(dsy)) = spatial_scale * src_h * exp(dsy) + + Dtype bin_size_w = roi_width / static_cast(pooled_width); // dbw / drw = 1 / pooled_width + Dtype bin_size_h = roi_height / static_cast(pooled_height); // dbh / drh = 1 / pooled_height + // dbw / dcx = dbw / drw * drw / dcx = 0 + // dbh / dcy = dbh / drh * drh / dcy = 0 + // dbw / dsx = dbw / drw * drw / dsx = 1 / pooled_width * spatial_scale * src_w * exp(dsx) + // dbh / dsy = dbh / drh * drh / dsy = 1 / pooled_height * spatial_scale * src_h * exp(dsy) + + Dtype wstart_ = static_cast(pw) * bin_size_w + roi_start_w; // ws = f(rsw, rew) + Dtype hstart_ = static_cast(ph) * bin_size_h + roi_start_h; // hw = f(rsh, reh) + Dtype wend_ = static_cast(pw+1) * bin_size_w + roi_start_w; // we = f(rsw, rew) + Dtype hend_ = static_cast(ph+1) * bin_size_h + roi_start_h; // he = f(rsh, reh) + // dws / dbw = pw + // dhs / dbh = ph + // dwe / dbw = (pw+1) + // dhe / dbh = (ph+1) + + int wstart = static_cast(floor(wstart_)); + int hstart = static_cast(floor(hstart_)); + int wend = static_cast( ceil(wend_)); + int hend = static_cast( ceil(hend_)); + + // dws / dcx = dws / drsw * drsw / dcx + dws / drew * drew / dcx + // = (dws / dbw * dbw / drsw + 1) * drsw / dcx + (dws / dbw * dbw / drew) * drew / dcx + // = (pw * 1 / pooled_width * (-1) + 1) * spatial_scale * src_w + // + (pw * 1 / pooled_width * ( 1) ) * spatial_scale * src_w + // = spatial_scale * src_w + // dwe / dcx = dwe / drsw * drsw / dcx + dwe / drew * drew / dcx + // = (dwe / dbw * dbw / drsw + 1) * drsw / dcx + (dwe / dbw * dbw / drew) * drew / dcx + // = ((pw+1) * 1 / pooled_width * (-1) + 1) * spatial_scale * src_w + // + ((pw+1) * 1 / pooled_width * ( 1) ) * spatial_scale * src_w + // = spatial_scale * src_w + + // dws / dsx = dws / drsw * drsw / dsx + dws / drew * drew / dsx + // = (dws / dbw * dbw / drsw + 1) * drsw / dsx + (dws / dbw * dbw / drew) * drew / dsx + // = (pw * 1 / pooled_width * (-1) + 1) * (-0.5 * spatial_scale * src_w * exp(dsx)) + // + (pw * 1 / pooled_width * ( 1) ) * ( 0.5 * spatial_scale * src_w * exp(dsx)) + // = (pw * 1 / pooled_width - 0.5) * spatial_scale * src_w * exp(dsx) + // dwe / dsx = dwe / drsw * drsw / dsx + dwe / drew * drew / dsx + // = (dwe / dbw * dbw / drsw + 1) * drsw / dsx + (dwe / dbw * dbw / drew) * drew / dsx + // = ((pw+1) * 1 / pooled_width * (-1) + 1) * (-0.5 * spatial_scale * src_w * exp(dsx)) + // + ((pw+1) * 1 / pooled_width * ( 1) ) * ( 0.5 * spatial_scale * src_w * exp(dsx)) + // = ((pw+1) * 1 / pooled_width - 0.5) * spatial_scale * src_w * exp(dsx) + + // dws / dcy = spatial_scale * src_h + // dwe / dcy = spatial_scale * src_h + // dws 
/ dsy = ( ph * 1 / pooled_height - 1) * spatial_scale * src_h * exp(dsy) + // dwe / dsy = ((ph+1) * 1 / pooled_height - 1) * spatial_scale * src_h * exp(dsy) + /* + Dtype wctr = (wend_ + wstart_) * 0.5; // dwctr / dwe = 0.5; dwctr / dws = 0.5 + Dtype hctr = (hend_ + hstart_) * 0.5; // dhctr / dhe = 0.5; dhctr / dhs = 0.5 + Dtype wdiff = (wend_ - wstart_) + 1; // dwdiff / dwe = 1; dwdiff / dws = -1 + Dtype hdiff = (hend_ - hstart_) + 1; // dhdiff / dhe = 1; dhdiff / dhs = -1 + + // dwctr / dcx = dwctr / dwe * dwe / dcx + dwctr / dws * dws / dcx = 0.5 * spatial_scale * src_w + 0.5 * spatial_scale * src_w = spatial_scale * src_w + // dwdiff / dcx = dwdiff / dwe * dwe / dcx + dwdiff / dws * dws / dcx = 1 * spatial_scale * src_w - 1 * spatial_scale * src_w = 0 + // dhctr / dcy = spatial_scale * src_h + // dhdiff / dcy = 0 + + // dwctr / dsx = dwctr / dwe * dwe / dsx + dwctr / dws * dws / dsx + // = 0.5 * ((pw+1)/pooled_width - 0.5) * spatial_scale * src_w * exp(dsx) + // + 0.5 * ( pw /pooled_width - 0.5) * spatial_scale * src_w * exp(dsx) + // = 0.5 * ((2*pw+1)/pooled_width - 1) * spatial_scale * src_w * exp(dsx) + // = ((pw + 0.5) / pooled_width - 0.5) * spatial_scale * src_w * exp(dsx) + // dwdiff / dsx = dwdiff / dwe * dwe / dsx + dwdiff / dws * dws / dsx + // = 1 * ((pw+1)/pooled_width - 0.5) * spatial_scale * src_w * exp(dsx) + // + (-1) * ( pw /pooled_width - 0.5) * spatial_scale * src_w * exp(dsx) + // = (1 / pooled_width) * spatial_scale * src_w * exp(dsx) + // dhctr / dsy = ((ph + 0.5) / pooled_height - 0.5) * spatial_scale * src_h * exp(dsy) + // dhdiff / dsy = (1 / pooled_height) * spatial_scale * src_h * exp(dsy) + + + // dgx / dwctr = (w >= wctr ? 1 : -1) + // dgx / dwdiff = 1 + // dgy / dhctr = (h >= hctr ? 1 : -1) + // dgy / dhdiff = 1 + + // gx_final = gx / gx_all + // dgx_final / dwctr = ( dgx/dwctr * gx_all - gx * dgx_all/dwctr ) / (gx_all)^2 = ( (w >= wctr ? 1 : -1) * gx_all - gx * sum_for_w{ (w >= wctr ? 1 : -1) } ) / gx_all^2 + // dgx_final / dwdiff = ( dgx/dwdiff * gx_all - gx * dgx_all/dwdiff ) / (gx_all)^2 = ( 1 * gx_all - gx * sum_for_w{ 1 } ) / gx_all^2 + // gy_final = gy / gy_all + // dgy_final / dhctr = ... + // dgy_final / dhdiff = ... + + // dgx_final / dcx = dgx_final / dwctr * dwctr / dcx + dgx_final / dwdiff * dwdiff / dcx + // = ( (w >= wctr ? 1 : -1) * gx_all - gx * sum_for_w{ (w >= wctr ? 1 : -1) } ) / gx_all^2 * spatial_scale * src_w + (...) * 0 + // = ( (w >= wctr ? 1 : -1) * gx_all - gx * sum_for_w{ (w >= wctr ? 1 : -1) } ) / gx_all^2 * spatial_scale * src_w + // dgy_final / dcy = ( (h >= hctr ? 1 : -1) * gy_all - gy * sum_for_h{ (h >= hctr ? 1 : -1) } ) / gx_all^2 * spatial_scale * src_h + // dgx_final / dsx = ( (w >= wctr ? 1 : -1) * gx_all - gx * sum_for_w{ (w >= wctr ? 1 : -1) } ) / gx_all^2 * ((pw + 0.5) - 0.5 * pooled_width) / pooled_width * spatial_scale * src_w * exp(dsx) + + // ( 1 * gx_all - gx * sum_for_w{ 1 } ) / gx_all^2 * 1 / pooled_width * spatial_scale * src_w * exp(dsx) + // dgy_final / dsy = ( (h >= hctr ? 1 : -1) * gy_all - gy * sum_for_h{ (h >= hctr ? 1 : -1) } ) / gy_all^2 * ((ph + 0.5) - 0.5 * pooled_height) / pooled_height * spatial_scale * src_h * exp(dsy) + + // ( 1 * gy_all - gy * sum_for_h{ 1 } ) / gy_all^2 * 1 / pooled_height * spatial_scale * src_h * exp(dsy) + + // dg / dcx = dg / dgx_final * dgx_final / dcx + dg / dgy_final * dgy_final / dcx + // = gy_final * dgx_final / dcx + gx_final * 0 + // = gy_final * dgx_final / dcx + // ... 
+ */ + // Add roi offsets and clip to input boundaries + hstart = min(max(hstart, 0), height); + hend = min(max(hend, 0), height); + wstart = min(max(wstart, 0), width); + wend = min(max(wend, 0), width); + + // Define an empty pooling region to be zero + Dtype val_cx = 0, val_cy = 0, val_sx = 0, val_sy = 0; + Dtype gain_x = 0, gain_y = 0; + Dtype pw_ = static_cast(pw); + Dtype ph_ = static_cast(ph); + Dtype pooled_width_ = static_cast(pooled_width); + Dtype pooled_height_ = static_cast(pooled_height); + bottom_data += (roi_batch_ind * channels + c) * height * width; + Dtype w_mask = 0, h_mask = 0, coeff_x = 0, coeff_y = 0; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int bottom_index = h * width + w; + Dtype w_ = w, h_ = h; + gain_x = wdiff - abs((w_ - wctr)); + gain_y = hdiff - abs((h_ - hctr)); + + w_mask = w_ >= wctr ? 1 : -1; + h_mask = h_ >= hctr ? 1 : -1; + + //val_cx = val_cx + gain_y / gain_y_all * (w_mask * gain_x_all - gain_x * dgx_final_dwctr_all ) / (gain_x_all*gain_x_all) * spatial_scale * src_w * top_diff[index]; + //val_cy = val_cy + gain_x / gain_x_all * (h_mask * gain_y_all - gain_y * dgy_final_dhctr_all ) / (gain_y_all*gain_y_all) * spatial_scale * src_h * top_diff[index]; + //val_sx = val_sx + gain_y / gain_y_all *(( gain_x_all - gain_x * dgx_final_dwdiff_all) / (gain_x_all*gain_x_all) * (pw_+0.5-0.5*pooled_width) / pooled_width * spatial_scale * src_w * exp(dsx) + + // (w_mask * gain_x_all - gain_x * dgx_final_dwctr_all ) / (gain_x_all*gain_x_all) * 1 / pooled_width * spatial_scale * src_w * exp(dsx) ) * top_diff[index]; + //val_sy = val_sy + gain_x / gain_x_all *(( gain_y_all - gain_y * dgy_final_dhdiff_all) / (gain_y_all*gain_y_all) * (ph_+0.5-0.5*pooled_height)/ pooled_hidth * spatial_scale * src_h * eyp(dsy) + + // (h_mask * gain_y_all - gain_y * dgy_final_dhctr_all ) / (gain_y_all*gain_y_all) * 1 / pooled_hidth * spatial_scale * src_h * eyp(dsy) ) * top_diff[index]; + + //if (gain_x > 1e-10 && gain_y > 1e-10) { + coeff_x = bottom_data[bottom_index] * gain_y * spatial_scale * src_w * top_diff[index]; + if (gain_x_all > 1e-10) {coeff_x = coeff_x / (gain_x_all*gain_x_all);} + if (gain_y_all > 1e-10) {coeff_x = coeff_x / gain_y_all;} + val_cx = val_cx + (w_mask * gain_x_all - gain_x * dgx_final_dwctr_all ) * coeff_x; + val_sx = val_sx + ((w_mask * gain_x_all - gain_x * dgx_final_dwctr_all ) * (pw_+0.5-0.5*pooled_width_) + + ( gain_x_all - gain_x * dgx_final_dwdiff_all)) / pooled_width_ * coeff_x * exp(dst_scl_x); + + coeff_y = bottom_data[bottom_index] * gain_x * spatial_scale * src_h * top_diff[index]; + if (gain_y_all > 1e-10) {coeff_y = coeff_y / (gain_y_all*gain_y_all);} + if (gain_x_all > 1e-10) {coeff_y = coeff_y / gain_x_all;} + val_cy = val_cy + (h_mask * gain_y_all - gain_y * dgy_final_dhctr_all ) * coeff_y; + val_sy = val_sy + ((h_mask * gain_y_all - gain_y * dgy_final_dhctr_all ) * (ph_+0.5-0.5*pooled_height_) + + ( gain_y_all - gain_y * dgy_final_dhdiff_all)) / pooled_height_ * coeff_y * exp(dst_scl_y); + //} + } + } + /*int*/ buffer_index = n * (channels * pooled_height * pooled_width * 4) + c * (pooled_height * pooled_width * 4) + ph * (pooled_width * 4) + pw * 4; + bottom_diff_delta_rois_buffer[buffer_index+0] = val_cx; + bottom_diff_delta_rois_buffer[buffer_index+1] = val_cy; + bottom_diff_delta_rois_buffer[buffer_index+2] = val_sx; + bottom_diff_delta_rois_buffer[buffer_index+3] = val_sy; + //} + } +} + + +extern "C" +void inn_ROIWarping_updateGradInputAtomic(THCState *state, + THCudaTensor *gradInput_data, 
THCudaTensor *data, + THCudaTensor *gradInput_delta_rois, THCudaTensor *delta_rois, + THCudaTensor *gradInput_delta_rois_buffer, + THCudaTensor *gradOutput, THCudaTensor *top_data_buffer, + THCudaTensor* rois, int W, int H, double spatial_scale) +{ + THAssert(THCudaTensor_nDimension(state, data) == 4); + THAssert(THCudaTensor_nDimension(state, top_data_buffer) == 5); + THAssert(THCudaTensor_nDimension(state, rois) == 2 && rois->size[1] == 5); + THAssert(THCudaTensor_nDimension(state, delta_rois) == 2 && delta_rois->size[1] == 5); + THAssert(THCudaTensor_nDimension(state, rois) == THCudaTensor_nDimension(state, delta_rois) && + rois->size[0] == delta_rois->size[0] && + rois->size[1] == delta_rois->size[1]); + THAssert(THCudaTensor_isContiguous(state, data)); + THAssert(THCudaTensor_isContiguous(state, top_data_buffer)); + THAssert(THCudaTensor_isContiguous(state, rois)); + THAssert(THCudaTensor_isContiguous(state, delta_rois)); + long num_rois = rois->size[0]; + long nInputPlane = data->size[1]; + THCudaTensor_resizeAs(state, gradInput_data, data); + THCudaTensor_zero(state, gradInput_data); + THCudaTensor_resizeAs(state, gradInput_delta_rois, delta_rois); + THCudaTensor_zero(state, gradInput_delta_rois); + THCudaTensor_resize5d(state, gradInput_delta_rois_buffer, num_rois, nInputPlane, H, W, 4); + THCudaTensor_zero(state, gradInput_delta_rois_buffer); + + //Backpropagation for data + long count = THCudaTensor_nElement(state, gradInput_data); + for (int nth_roi = 0; nth_roi < num_rois; ++nth_roi) { + ROIWarpBackwardData<<>>( + count, + THCudaTensor_data(state, top_data_buffer), + spatial_scale, nInputPlane, data->size[2], data->size[3], H, W, nth_roi, + THCudaTensor_data(state, rois), + THCudaTensor_data(state, delta_rois), + THCudaTensor_data(state, gradOutput), + THCudaTensor_data(state, gradInput_data) + ); + } + + //Backpropagation for delta_roi + count = THCudaTensor_nElement(state, gradOutput); + ROIWarpBackwardDeltaROI<<>>( + count, + THCudaTensor_data(state, top_data_buffer), + spatial_scale, nInputPlane, data->size[2], data->size[3], H, W, + THCudaTensor_data(state, rois), + THCudaTensor_data(state, delta_rois), + THCudaTensor_data(state, gradOutput), + THCudaTensor_data(state, data), + THCudaTensor_data(state, gradInput_delta_rois_buffer) + ); + + // check for errors + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("error in inn_ROIWarping_updateGradInputAtomic: %s\n", cudaGetErrorString(err)); + THError("aborting"); + } +} diff --git a/ROIWarping.lua b/ROIWarping.lua new file mode 100644 index 0000000..352a175 --- /dev/null +++ b/ROIWarping.lua @@ -0,0 +1,74 @@ +local ROIWarping,parent = torch.class('inn.ROIWarping', 'nn.Module') +local C = inn.C + +--function ROIWarping:__init(W,H,spatial_scale) +function ROIWarping:__init(H,W) + parent.__init(self) + assert(W and H, 'W and H have to be provided') + self.W = W + self.H = H + --self.spatial_scale = spatial_scale or 1 + + self.grid_gen = inn.ROIWarpingGridGenerator(self.H, self.W) + self.sample = inn.ROIWarpingBilinearSample(self.H, self.W) + + self.gradInput = {} +end + +--function ROIWarping:setSpatialScale(scale) +-- self.spatial_scale = scale +-- return self +--end + +function ROIWarping:updateOutput(input) + assert(#input == 2 or #input == 3) + local data = input[1] + local rois = input[2] + local delta_rois + if #input == 3 then + delta_rois = input[3] + else -- #input == 2 + self.delta_rois = self.delta_rois or rois.new() + self.delta_rois:resizeAs(rois):zero() + self.delta_rois[{{}, 1}] = 
rois[{{}, 1}] + delta_rois = self.delta_rois + end + + if torch.type(data) == 'torch.CudaTensor' then + self.grid_gen:cuda() + self.sample:cuda() + end + + self.grid_gen:updateOutput({rois, delta_rois}) + self.sample:updateOutput({data, self.grid_gen.output_tmp[1], self.grid_gen.output_tmp[2], self.grid_gen.output_tmp[3]}) + + self.output = self.sample.output + + return self.output +end + +function ROIWarping:updateGradInput(input,gradOutput) + local data = input[1] + local rois = input[2] + local delta_rois + if #input == 3 then + delta_rois = input[3] + else -- #input == 2 + self.delta_rois = self.delta_rois or data.new() + self.delta_rois:resizeAs(rois):zero() + self.delta_rois[{{}, 1}] = rois[{{}, 1}] + delta_rois = self.delta_rois + end + + if torch.type(data) == 'torch.CudaTensor' then + self.grid_gen:cuda() + self.sample:cuda() + end + + self.sample:updateGradInput({data, self.grid_gen.output_tmp[1], self.grid_gen.output_tmp[2], self.grid_gen.output_tmp[3]}, gradOutput) + self.grid_gen:updateGradInput({rois, delta_rois}, {self.sample.gradInput[2], self.sample.gradInput[3]}) + + self.gradInput = {self.sample.gradInput[1], self.grid_gen.gradInput[1], self.grid_gen.gradInput[2]} + + return self.gradInput +end diff --git a/ROIWarpingBilinearSample.cu b/ROIWarpingBilinearSample.cu new file mode 100644 index 0000000..ae890b4 --- /dev/null +++ b/ROIWarpingBilinearSample.cu @@ -0,0 +1,644 @@ +// ------------------------------------------------------------------ +// Fast R-CNN +// Copyright (c) 2015 Microsoft +// Licensed under The MIT License [see fast-rcnn/LICENSE for details] +// Written by Ross Girshick +// ------------------------------------------------------------------ + +// Torch port: +// IMAGINE, Sergey Zagoruyko, Francisco Massa, 2015 + +#include "THC.h" +#include +#include +#include "assert.h" + +#include "common.h" + +#define NUM_BUFFERS 6 +#define PRECISION_LIMIT 1e-10 +#define MIN_BIN_SIZE 2.0f + +using std::max; +using std::min; + +template +__global__ void ROIWarpBilinearSampleForward( + const int nthreads, const Dtype* bottom_data, + const int channels, + const int height, const int width, const int pooled_height, const int pooled_width, + const Dtype* bottom_grid_ctrs, const Dtype* bottom_bin_sizes, const Dtype* bottom_roi_batch_inds, + Dtype* top_data, + Dtype* top_data_buffer) { + CUDA_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + int roi_batch_ind = bottom_roi_batch_inds[n] - 1; + int grid_ctr_ind = n * (pooled_height * pooled_width * 2) + ph * (pooled_width * 2) + pw * 2; + int bin_size_ind = n * 2; + + Dtype wctr = bottom_grid_ctrs[grid_ctr_ind+0] - 1; + Dtype hctr = bottom_grid_ctrs[grid_ctr_ind+1] - 1; + + Dtype bin_size_w = max(bottom_bin_sizes[bin_size_ind+0], MIN_BIN_SIZE); + Dtype bin_size_h = max(bottom_bin_sizes[bin_size_ind+1], MIN_BIN_SIZE); + + Dtype wstart_ = wctr - bin_size_w / 2.0; + Dtype hstart_ = hctr - bin_size_h / 2.0; + Dtype wend_ = wctr + bin_size_w / 2.0; + Dtype hend_ = hctr + bin_size_h / 2.0; + + int wstart = static_cast(floor(wstart_)); + int hstart = static_cast(floor(hstart_)); + int wend = static_cast( ceil(wend_));// + 1; + int hend = static_cast( ceil(hend_));// + 1; + + //top_data[index] = hend+1; + //top_data[index] = wend+1; + //top_data[index] = hstart+1; + //top_data[index] = wstart+1; + 
//top_data[index] = wstart_+1; + //top_data[index] = wend_+1; + //top_data[index] = hctr+1; + //top_data[index] = wctr+1; + //top_data[index] = bin_size_w; + //top_data[index] = bin_size_h; + //top_data[index] = roi_batch_ind + 1; + + //// Add roi offsets and clip to input boundaries + //hstart = min(max(hstart, 0), height); + //hend = min(max(hend, 0), height); + //wstart = min(max(wstart, 0), width ); + //wend = min(max(wend, 0), width ); + + //top_data[index] = hstart+1; + //top_data[index] = wstart+1; + //top_data[index] = hend+1; + //top_data[index] = wend+1; + + // Auxilliary variables used in backprop + Dtype w_mask = 0, h_mask = 0; + Dtype dgx_final_dwctr_all = 0; + Dtype dgx_final_dwdiff_all = 0; + Dtype dgy_final_dhctr_all = 0; + Dtype dgy_final_dhdiff_all = 0; + + // Define an empty pooling region to be zero + Dtype val = 0; Dtype gain = 0, gain_x = 0, gain_y = 0, gain_x_all = 0, gain_y_all = 0; + bottom_data += (roi_batch_ind * channels + c) * height * width; + for (int h = hstart; h <= hend; ++h) { + Dtype h_ = h; + h_mask = ((hstart_ <= h_ && h_ <= hend_) ? 1.0 : 0); + for (int w = wstart; w <= wend; ++w) { + int bottom_index = h * width + w; + Dtype w_ = w; + w_mask = ((wstart_ <= w_ && w_ <= wend_) ? 1.0 : 0); + + //gain_x = (bin_size_w+1) - abs(w_ - wctr); + //gain_y = (bin_size_h+1) - abs(h_ - hctr); + gain_x = w_mask * (bin_size_w - abs(w_ - wctr)); + gain_y = h_mask * (bin_size_h - abs(h_ - hctr)); + + gain = gain_x * gain_y; + + if (0 <= h && h < height && 0 <= w && w < width) { + val = val + gain * bottom_data[bottom_index]; + //val = val + gain; // for debug + } + //val = val + gain; // for debug + + if (h == hstart) { + gain_x_all = gain_x_all + gain_x; + + // Update information used in backprop + dgx_final_dwctr_all = dgx_final_dwctr_all + w_mask * (w_ >= wctr ? 1 : -1); + dgx_final_dwdiff_all = dgx_final_dwdiff_all + w_mask; + } + } + gain_y_all = gain_y_all + gain_y; + + dgy_final_dhctr_all = dgy_final_dhctr_all + h_mask * (h >= hctr ? 
1 : -1); + dgy_final_dhdiff_all = dgy_final_dhdiff_all + h_mask; + } + if (gain_x_all > PRECISION_LIMIT) + val = val / gain_x_all; + if (gain_y_all > PRECISION_LIMIT) + val = val / gain_y_all; + top_data[index] = val; + + //top_data[index] = gain_y_all; // for debug + + if (c == 0) { + int buffer_index = n * (pooled_height * pooled_width * NUM_BUFFERS) + ph * (pooled_width * NUM_BUFFERS) + pw * NUM_BUFFERS; + top_data_buffer[buffer_index+0] = gain_x_all; + top_data_buffer[buffer_index+1] = gain_y_all; + top_data_buffer[buffer_index+2] = dgx_final_dwctr_all; + top_data_buffer[buffer_index+3] = dgy_final_dhctr_all; + top_data_buffer[buffer_index+4] = dgx_final_dwdiff_all; + top_data_buffer[buffer_index+5] = dgy_final_dhdiff_all; + } + } +} + +extern "C" +void inn_ROIWarpingBilinearSample_updateOutput(THCState *state, + THCudaTensor *output, THCudaTensor *output_buffer, + THCudaTensor *data, THCudaTensor* grid_ctrs, THCudaTensor* bin_sizes, THCudaTensor* roi_batch_inds, + int width, int height) +{ + THAssert(THCudaTensor_nDimension(state, data) == 4); + THAssert(THCudaTensor_nDimension(state, grid_ctrs) == 4 && grid_ctrs->size[3] == 2); + THAssert(THCudaTensor_nDimension(state, bin_sizes) == 2 && bin_sizes->size[1] == 2); + THAssert(THCudaTensor_nDimension(state, roi_batch_inds) == 2 + && roi_batch_inds->size[0] == grid_ctrs->size[0] + && roi_batch_inds->size[0] == bin_sizes->size[0]); + THAssert(THCudaTensor_isContiguous(state, data)); + THAssert(THCudaTensor_isContiguous(state, grid_ctrs)); + THAssert(THCudaTensor_isContiguous(state, bin_sizes)); + + long nInputPlane = data->size[1]; + + // update output + long count = THCudaTensor_nElement(state, output); + ROIWarpBilinearSampleForward<<>>( + count, + THCudaTensor_data(state, data), + nInputPlane, data->size[2], data->size[3], height, width, + THCudaTensor_data(state, grid_ctrs), + THCudaTensor_data(state, bin_sizes), + THCudaTensor_data(state, roi_batch_inds), + THCudaTensor_data(state, output), + THCudaTensor_data(state, output_buffer) + ); + + // check for errors + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("error in inn_ROIWarpingBilinearSample_updateOutput: %s\n", cudaGetErrorString(err)); + THError("aborting"); + } +} + +template +__global__ void ROIWarpBilinearBackwardData( + const int nthreads, + const int channels, const int height, const int width, const int pooled_height, const int pooled_width, + const int nth_roi, + const Dtype* bottom_grid_ctrs, + const Dtype* bottom_bin_sizes, + const Dtype* bottom_roi_batch_inds, + const Dtype* top_data_buffer, + const Dtype* top_diff, + Dtype* bottom_diff_data) { + CUDA_KERNEL_LOOP(index, nthreads) { + // (n, c, h, w) is an element in the input + int w = index % width; + int h = (index / width) % height; + int c = (index / width / height) % channels; + int n = index / width / height / channels; + + int roi_batch_ind = bottom_roi_batch_inds[nth_roi] - 1; + + if (roi_batch_ind == n) { + int bin_size_ind = nth_roi * 2; + Dtype bin_size_w = max(bottom_bin_sizes[bin_size_ind+0], MIN_BIN_SIZE); + Dtype bin_size_h = max(bottom_bin_sizes[bin_size_ind+1], MIN_BIN_SIZE); + + ///** for debug **/ + //int top_buffer_index = nth_roi * (pooled_height * pooled_width * NUM_BUFFERS) + h * (pooled_width * NUM_BUFFERS) + w * NUM_BUFFERS; + ////gain_x_all = top_data_buffer[top_buffer_index+0]; + ////gain_y_all = top_data_buffer[top_buffer_index+1]; + ////bottom_diff_data[index] = top_data_buffer[top_buffer_index+1]; + ///** til here **// + + int grid_ctr_ind = nth_roi * 
(pooled_height * pooled_width * 2) + 0 * (pooled_width * 2) + 0 * 2; + //Dtype roi_start_w = bottom_grid_ctrs[grid_ctr_ind+0] - bin_size_w / 2.0; + //Dtype roi_start_h = bottom_grid_ctrs[grid_ctr_ind+1] - bin_size_w / 2.0; + + // wstart = floor(roi_start_w + pw * bin_size_w + bin_size_w / 2) + // -> wstart - roi_start_w - bin_size_w / 2 = bin_size_w * pw + // hstart = floor(roi_start_h + ph * bin_size_h + bin_size_h / 2) + // -> hstart - roi_start_h - bin_size_h / 2 = bin_size_h * pw + + int pwstart = pooled_width, pwend = -1; + int wstart = 0, wend = 0; + Dtype wctr, wstart_, wend_; + for (int pw = 0; pw < pooled_width; pw++) { + //wctr = roi_start_w + pw * bin_size_w + bin_size_w / 2.0; + /*int*/ grid_ctr_ind = nth_roi * (pooled_height * pooled_width * 2) + 0 * (pooled_width * 2) + pw * 2; + wctr = bottom_grid_ctrs[grid_ctr_ind+0] - 1; + wstart_ = wctr - bin_size_w / 2.0; + wend_ = wctr + bin_size_w / 2.0; + + wstart = static_cast(floor(wstart_)); + wend = static_cast( ceil(wend_)); + + //wstart = min(max(wstart, 0), width -1); + //wend = min(max(wend, 0), width -1); + + if ((wstart <= w) && (w <= wend)) { + if (pw < pwstart) { + pwstart = pw; + } + if (pw > pwend) { + pwend = pw; + } + } + } + + int phstart = pooled_height, phend = -1; + int hstart = 0, hend = 0; + Dtype hctr, hstart_, hend_; + for (int ph = 0; ph < pooled_height; ++ph) { + /*int*/ grid_ctr_ind = nth_roi * (pooled_height * pooled_width * 2) + ph * (pooled_width * 2) + 0 * 2; + + hctr = bottom_grid_ctrs[grid_ctr_ind+1] - 1; + hstart_ = hctr - bin_size_h / 2.0; + hend_ = hctr + bin_size_h / 2.0; + + hstart = static_cast(floor(hstart_)); + hend = static_cast( ceil(hend_)); + + //hstart = min(max(hstart, 0), height-1); + //hend = min(max(hend, 0), height-1); + + if (hstart <= h && h <= hend) { + if (ph < phstart) { + phstart = ph; + } + if (ph > phend) { + phend = ph; + } + } + } + + //bottom_diff_data[index] = bottom_grid_ctrs[grid_ctr_ind+0] - 1; + //bottom_diff_data[index] = bottom_grid_ctrs[nth_roi * (pooled_height * pooled_width * 2) + 0 * (pooled_width * 2) + 2 * 2 + 0] - bin_size_w / 2.0; + //bottom_diff_data[index] = (static_cast(w) - roi_start_w - bin_size_w / 2.0) * bin_size_pw + 1; + //bottom_diff_data[index] = (static_cast(w+1) - roi_start_w - bin_size_w / 2.0) * bin_size_pw + 1; + //bottom_diff_data[index] = phend+1; //pwend+1; + //bottom_diff_data[index] = phstart+1; //pwstart+1; + //bottom_diff_data[index] = roi_start_w + 1; + //bottom_diff_data[index] = roi_start_h + 1; + + // Clip to top boundaries + phstart = min(max(phstart, 0), pooled_height-1); + phend = min(max(phend, 0), pooled_height-1); + pwstart = min(max(pwstart, 0), pooled_width -1); + pwend = min(max(pwend, 0), pooled_width -1); + + Dtype w_ = w, h_ = h; + //Dtype wctr = 0, hctr = 0; + Dtype gain = 0, gain_x = 0, gain_y = 0, gain_x_all = 0, gain_y_all = 0; + for (int ph = phstart; ph <= phend; ++ph) { + for (int pw = pwstart; pw <= pwend; ++pw) { + int top_index = nth_roi * (channels * pooled_height * pooled_width) + c * (pooled_height * pooled_width) + ph * pooled_width + pw; + int top_buffer_index = nth_roi * (pooled_height * pooled_width * NUM_BUFFERS) + ph * (pooled_width * NUM_BUFFERS) + pw * NUM_BUFFERS; + /*int*/ grid_ctr_ind = nth_roi * (pooled_height * pooled_width * 2) + ph * (pooled_width * 2) + pw * 2; + + wctr = bottom_grid_ctrs[grid_ctr_ind+0] - 1; + hctr = bottom_grid_ctrs[grid_ctr_ind+1] - 1; + gain_x_all = top_data_buffer[top_buffer_index+0]; + gain_y_all = top_data_buffer[top_buffer_index+1]; + + wstart_ = wctr - bin_size_w / 
2.0; + wend_ = wctr + bin_size_w / 2.0; + hstart_ = hctr - bin_size_h / 2.0; + hend_ = hctr + bin_size_h / 2.0; + + //gain_x = (bin_size_w+1) - abs(w_ - wctr); + //gain_y = (bin_size_h+1) - abs(h_ - hctr); + gain_x = ((wstart_ <= w_ && w_ <= wend_) ? 1.0 : 0) * (bin_size_w - abs(w_ - wctr)); + gain_y = ((hstart_ <= h_ && h_ <= hend_) ? 1.0 : 0) * (bin_size_h - abs(h_ - hctr)); + + if (gain_x_all > PRECISION_LIMIT) + gain_x = gain_x / gain_x_all; + if (gain_y_all > PRECISION_LIMIT) + gain_y = gain_y / gain_y_all; + + gain = gain_x * gain_y; + bottom_diff_data[index] = bottom_diff_data[index] + gain * top_diff[top_index]; + } + } + + } + } +} + +template +__global__ void ROIWarpBilinearBackwardGridCtrs( + const int nthreads, + const int channels, const int height, const int width, const int pooled_height, const int pooled_width, + //const int c, + const Dtype* bottom_data, + const Dtype* bottom_grid_ctrs, + const Dtype* bottom_bin_sizes, + const Dtype* bottom_roi_batch_inds, + const Dtype* top_data_buffer, + const Dtype* top_diff, + Dtype* bottom_diff_grid_ctrs_buffer) { + CUDA_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw, d) is an element in the grid_ctrs_buffer + int d = index % 2; + int pw = (index / 2) % pooled_width; + int ph = (index / 2 / pooled_width) % pooled_height; + int c = (index / 2 / pooled_width / pooled_height) % channels; + int n = index / 2 / pooled_width / pooled_height / channels; + + // get top buffer index and top buffers + int top_buffer_index = n * (pooled_height * pooled_width * NUM_BUFFERS) + ph * (pooled_width * NUM_BUFFERS) + pw * NUM_BUFFERS; + Dtype gain_x_all = top_data_buffer[top_buffer_index+0]; + Dtype gain_y_all = top_data_buffer[top_buffer_index+1]; + Dtype dgx_final_dwctr_all = top_data_buffer[top_buffer_index+2]; + Dtype dgy_final_dhctr_all = top_data_buffer[top_buffer_index+3]; + + // get top index + int top_index = n * (channels * pooled_height * pooled_width) + c * (pooled_height * pooled_width) + ph * pooled_width + pw; + + // estimate grad + int roi_batch_ind = bottom_roi_batch_inds[n] - 1; + int grid_ctr_ind = n * (pooled_height * pooled_width * 2) + ph * (pooled_width * 2) + pw * 2; + int bin_size_ind = n * 2; + + Dtype wctr = bottom_grid_ctrs[grid_ctr_ind+0] - 1; + Dtype hctr = bottom_grid_ctrs[grid_ctr_ind+1] - 1; + + Dtype bin_size_w = max(bottom_bin_sizes[bin_size_ind+0], MIN_BIN_SIZE); + Dtype bin_size_h = max(bottom_bin_sizes[bin_size_ind+1], MIN_BIN_SIZE); + + Dtype wstart_ = wctr - bin_size_w / 2.0; + Dtype hstart_ = hctr - bin_size_h / 2.0; + Dtype wend_ = wctr + bin_size_w / 2.0; + Dtype hend_ = hctr + bin_size_h / 2.0; + + int wstart = static_cast(floor(wstart_)); + int hstart = static_cast(floor(hstart_)); + int wend = static_cast( ceil(wend_));// + 1; + int hend = static_cast( ceil(hend_));// + 1; + + //// Add roi offsets and clip to input boundaries + //hstart = min(max(hstart, 0), height); + //hend = min(max(hend, 0), height); + //wstart = min(max(wstart, 0), width ); + //wend = min(max(wend, 0), width ); + + // Auxilliary variables used in backprop + Dtype w_mask = 0, h_mask = 0; + //Dtype dgx_final_dwctr_all = 0; + //Dtype dgy_final_dhctr_all = 0; + + // output = g * input + // do / dwctr = input * dg / dwctr + // g = gx_final * gy_final + // gx_final = gx / gx_all + // dg / dwctr = dg / dgx_final * dgx_final / dwctr + // = gy_final * ( dgx/dwctr * gx_all - gx * dgx_all/dwctr ) / (gx_all)^2 + // = gy_final * ( (w >= wctr ? 1 : -1) * gx_all - gx * sum_for_w{ (w >= wctr ? 
1 : -1) } ) / gx_all^2 + + // Define an empty pooling region to be zero + Dtype val = 0; + Dtype gain = 0, gain_x = 0, gain_y = 0; + Dtype coeff_x = 0, coeff_y = 0; + bottom_data += (roi_batch_ind * channels + c) * height * width; + for (int h = hstart; h <= hend; ++h) { + Dtype h_ = h; + h_mask = ((hstart_ <= h_ && h_ <= hend_) ? 1.0 : 0); + for (int w = wstart; w <= wend; ++w) { + int bottom_index = h * width + w; + Dtype w_ = w; + w_mask = ((wstart_ <= w_ && w_ <= wend_) ? 1.0 : 0); + + if (0 <= h && h < height && 0 <= w && w < width) { + //gain_x = (bin_size_w+1) - abs(w_ - wctr); + //gain_y = (bin_size_h+1) - abs(h_ - hctr); + gain_x = w_mask * (bin_size_w - abs(w_ - wctr)); + gain_y = h_mask * (bin_size_h - abs(h_ - hctr)); + + if (d == 0) { + coeff_x = gain_y * bottom_data[bottom_index]; + if (gain_x_all > PRECISION_LIMIT) {coeff_x = coeff_x / (gain_x_all*gain_x_all);} + if (gain_y_all > PRECISION_LIMIT) {coeff_x = coeff_x / gain_y_all;} + val = val + ((w_ >= wctr ? 1 : -1) * gain_x_all - gain_x * dgx_final_dwctr_all ) * coeff_x; + } + else if (d == 1) { + coeff_y = gain_x * bottom_data[bottom_index]; + if (gain_y_all > PRECISION_LIMIT) {coeff_y = coeff_y / (gain_y_all*gain_y_all);} + if (gain_x_all > PRECISION_LIMIT) {coeff_y = coeff_y / gain_x_all;} + val = val + ((h >= hctr ? 1 : -1) * gain_y_all - gain_y * dgy_final_dhctr_all ) * coeff_y; + } + /** for debug **/ + //gain = gain_x * gain_y; + //if (gain_x_all > PRECISION_LIMIT) { gain = gain / gain_x_all; } + //if (gain_y_all > PRECISION_LIMIT) { gain = gain / gain_y_all; } + //val = val + gain * bottom_data[bottom_index]; + ////val = val + gain; + /** til here **/ + } + } + } + bottom_diff_grid_ctrs_buffer[index] = val * top_diff[top_index]; + + /** for debug **/ + //bottom_diff_grid_ctrs_buffer[index] = top_diff[top_index]; + //bottom_diff_grid_ctrs_buffer[index] = dgx_final_dwctr_all; + //bottom_diff_grid_ctrs_buffer[index] = dgy_final_dhctr_all; + //bottom_diff_grid_ctrs_buffer[index] = gain_x_all; + //bottom_diff_grid_ctrs_buffer[index] = gain_y_all; + //bottom_diff_grid_ctrs_buffer[index] = val; + //bottom_diff_grid_ctrs_buffer[index] = d+1; + //bottom_diff_grid_ctrs_buffer[index] = pw+1; + //bottom_diff_grid_ctrs_buffer[index] = ph+1; + /** til here **/ + } +} + +template +__global__ void ROIWarpBilinearBackwardBinSizes( + const int nthreads, + const int channels, const int height, const int width, const int pooled_height, const int pooled_width, + const Dtype* bottom_data, + const Dtype* bottom_grid_ctrs, + const Dtype* bottom_bin_sizes, + const Dtype* bottom_roi_batch_inds, + const Dtype* top_data_buffer, + const Dtype* top_diff, + Dtype* bottom_diff_bin_sizes_buffer) { + CUDA_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw, d) is an element in the grid_ctrs_buffer + int d = index % 2; + int pw = (index / 2) % pooled_width; + int ph = (index / 2 / pooled_width) % pooled_height; + int c = (index / 2 / pooled_width / pooled_height) % channels; + int n = index / 2 / pooled_width / pooled_height / channels; + + // get top buffer index and top buffers + int top_buffer_index = n * (pooled_height * pooled_width * NUM_BUFFERS) + ph * (pooled_width * NUM_BUFFERS) + pw * NUM_BUFFERS; + Dtype gain_x_all = top_data_buffer[top_buffer_index+0]; + Dtype gain_y_all = top_data_buffer[top_buffer_index+1]; + Dtype dgx_final_dwdiff_all = top_data_buffer[top_buffer_index+4]; + Dtype dgy_final_dhdiff_all = top_data_buffer[top_buffer_index+5]; + + // get top index + int top_index = n * (channels * pooled_height * pooled_width) + c * 
(pooled_height * pooled_width) + ph * pooled_width + pw; + + // estimate grad + int roi_batch_ind = bottom_roi_batch_inds[n] - 1; + int grid_ctr_ind = n * (pooled_height * pooled_width * 2) + ph * (pooled_width * 2) + pw * 2; + int bin_size_ind = n * 2; + + Dtype wctr = bottom_grid_ctrs[grid_ctr_ind+0] - 1; + Dtype hctr = bottom_grid_ctrs[grid_ctr_ind+1] - 1; + + Dtype bin_size_w = bottom_bin_sizes[bin_size_ind+0]; + Dtype bin_size_h = bottom_bin_sizes[bin_size_ind+1]; + + Dtype wstart_ = wctr - bin_size_w / 2.0; + Dtype hstart_ = hctr - bin_size_h / 2.0; + Dtype wend_ = wctr + bin_size_w / 2.0; + Dtype hend_ = hctr + bin_size_h / 2.0; + + int wstart = static_cast(floor(wstart_)); + int hstart = static_cast(floor(hstart_)); + int wend = static_cast( ceil(wend_)); + int hend = static_cast( ceil(hend_)); + + // Add roi offsets and clip to input boundaries + hstart = min(max(hstart, 0), height-1); + hend = min(max(hend, 0), height-1); + wstart = min(max(wstart, 0), width -1); + wend = min(max(wend, 0), width -1); + + // Define an empty pooling region to be zero + Dtype val = 0; + Dtype gain = 0, gain_x = 0, gain_y = 0; + Dtype coeff_x = 0, coeff_y = 0; + bottom_data += (roi_batch_ind * channels + c) * height * width; + for (int h = hstart; h <= hend; ++h) { + Dtype h_ = h; + for (int w = wstart; w <= wend; ++w) { + int bottom_index = h * width + w; + Dtype w_ = w; + + gain_x = (bin_size_w+1) - abs(w_ - wctr); + gain_y = (bin_size_h+1) - abs(h_ - hctr); + + if (d == 0) { + coeff_x = gain_y * bottom_data[bottom_index]; + if (gain_x_all > PRECISION_LIMIT) {coeff_x = coeff_x / (gain_x_all*gain_x_all);} + if (gain_y_all > PRECISION_LIMIT) {coeff_x = coeff_x / gain_y_all;} + val = val + (1 * gain_x_all - gain_x * dgx_final_dwdiff_all) * coeff_x; + } + else if (d == 1) { + coeff_y = gain_x * bottom_data[bottom_index]; + if (gain_y_all > PRECISION_LIMIT) {coeff_y = coeff_y / (gain_y_all*gain_y_all);} + if (gain_x_all > PRECISION_LIMIT) {coeff_y = coeff_y / gain_x_all;} + val = val + (1 * gain_y_all - gain_y * dgy_final_dhdiff_all) * coeff_y; + } + /** for debug **/ + //gain = gain_x * gain_y; + //if (gain_x_all > PRECISION_LIMIT) { gain = gain / gain_x_all; } + //if (gain_y_all > PRECISION_LIMIT) { gain = gain / gain_y_all; } + //val = val + gain * bottom_data[bottom_index]; + ////val = val + gain; + /** til here **/ + } + } + bottom_diff_bin_sizes_buffer[index] = val * top_diff[top_index]; + + /** for debug **/ + //bottom_diff_grid_ctrs_buffer[index] = top_diff[top_index]; + //bottom_diff_grid_ctrs_buffer[index] = dgx_final_dwctr_all; + //bottom_diff_grid_ctrs_buffer[index] = dgy_final_dhctr_all; + //bottom_diff_grid_ctrs_buffer[index] = gain_x_all; + //bottom_diff_grid_ctrs_buffer[index] = val; + //bottom_diff_grid_ctrs_buffer[index] = d+1; + //bottom_diff_grid_ctrs_buffer[index] = pw+1; + //bottom_diff_grid_ctrs_buffer[index] = ph+1; + /** til here **/ + } +} + +extern "C" +void inn_ROIWarpingBilinearSample_updateGradInput(THCState *state, + THCudaTensor *gradInput_data, THCudaTensor *data, + THCudaTensor *gradInput_grid_ctrs, THCudaTensor *grid_ctrs, THCudaTensor *gradInput_grid_ctrs_buffer, + THCudaTensor *gradInput_bin_sizes, THCudaTensor *bin_sizes, THCudaTensor *gradInput_bin_sizes_buffer, + THCudaTensor *roi_batch_inds, + THCudaTensor *output_buffer, + THCudaTensor *gradOutput, + int pooled_height, int pooled_width) +{ + THAssert(THCudaTensor_nDimension(state, data) == 4); + THAssert(THCudaTensor_nDimension(state, grid_ctrs) == 4 && grid_ctrs->size[3] == 2); + 
THAssert(THCudaTensor_nDimension(state, bin_sizes) == 2 && bin_sizes->size[1] == 2); + THAssert(THCudaTensor_nDimension(state, roi_batch_inds) == 2 + && roi_batch_inds->size[0] == grid_ctrs->size[0] + && roi_batch_inds->size[0] == bin_sizes->size[0]); + THAssert(THCudaTensor_isContiguous(state, data)); + THAssert(THCudaTensor_isContiguous(state, grid_ctrs)); + THAssert(THCudaTensor_isContiguous(state, bin_sizes)); + + long num_rois = grid_ctrs->size[0]; + long nInputPlane = data->size[1]; + + long count = 0; + + // backpropagation for data + for (int nth_roi = 0; nth_roi < num_rois; ++nth_roi) { + count = THCudaTensor_nElement(state, gradInput_data); + ROIWarpBilinearBackwardData<<>>( + count, + nInputPlane, data->size[2], data->size[3], pooled_height, pooled_width, + nth_roi, + THCudaTensor_data(state, grid_ctrs), + THCudaTensor_data(state, bin_sizes), + THCudaTensor_data(state, roi_batch_inds), + THCudaTensor_data(state, output_buffer), + THCudaTensor_data(state, gradOutput), + THCudaTensor_data(state, gradInput_data) + ); + } + + // backpropagation for grid_ctrs + count = THCudaTensor_nElement(state, gradInput_grid_ctrs_buffer); + ROIWarpBilinearBackwardGridCtrs<<>>( + count, + nInputPlane, data->size[2], data->size[3], pooled_height, pooled_width, + THCudaTensor_data(state, data), + THCudaTensor_data(state, grid_ctrs), + THCudaTensor_data(state, bin_sizes), + THCudaTensor_data(state, roi_batch_inds), + THCudaTensor_data(state, output_buffer), + THCudaTensor_data(state, gradOutput), + THCudaTensor_data(state, gradInput_grid_ctrs_buffer) + ); + + // backpropagation for bin_sizes + count = THCudaTensor_nElement(state, gradInput_bin_sizes_buffer); + ROIWarpBilinearBackwardBinSizes<<>>( + count, + nInputPlane, data->size[2], data->size[3], pooled_height, pooled_width, + THCudaTensor_data(state, data), + THCudaTensor_data(state, grid_ctrs), + THCudaTensor_data(state, bin_sizes), + THCudaTensor_data(state, roi_batch_inds), + THCudaTensor_data(state, output_buffer), + THCudaTensor_data(state, gradOutput), + THCudaTensor_data(state, gradInput_bin_sizes_buffer) + ); + + // check for errors + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("error in inn_ROIWarpingBilinearSample_updateGradInput: %s\n", cudaGetErrorString(err)); + THError("aborting"); + } +} + +#undef NUM_BUFFERS +#undef PRECISION_LIMIT +#undef MIN_BIN_SIZE diff --git a/ROIWarpingBilinearSample.lua b/ROIWarpingBilinearSample.lua new file mode 100644 index 0000000..bff068e --- /dev/null +++ b/ROIWarpingBilinearSample.lua @@ -0,0 +1,122 @@ +local ROIWarpingBilinearSample,parent = torch.class('inn.ROIWarpingBilinearSample', 'nn.Module') +local C = inn.C + +local buffer_numbers = 6 + +--function ROIWarpingBilinearSample:__init(height, width, spatial_scale) +function ROIWarpingBilinearSample:__init(height, width) + parent.__init(self) + assert(width and height, 'height and width have to be provided') + self.width = width + self.height = height + --self.spatial_scale = spatial_scale or 1 + self.gradInput = {} +end + +--function ROIWarpingBilinearSample:setSpatialScale(scale) +-- self.spatial_scale = scale +-- return self +--end + +function ROIWarpingBilinearSample:updateOutput(input) + assert(#input == 4) + local data = input[1] + local grid_ctrs = input[2] + local bin_sizes = input[3] + local roi_batch_inds = input[4] + + local num_rois = roi_batch_inds:size(1) + local nchannels = data:size(2) + + assert(grid_ctrs:size(1) == num_rois and + grid_ctrs:size(2) == self.height and + grid_ctrs:size(3) == self.width 
and + grid_ctrs:size(4) == 2) + assert(bin_sizes:size(1) == num_rois and + bin_sizes:size(2) == 2) + + self.output = self.output or data.new() + self.output:resize(num_rois, nchannels, self.height, self.width):fill(0) + self.output_buffer = self.output_buffer or data.new() + self.output_buffer:resize(num_rois, self.height, self.width, buffer_numbers):fill(0) + + C.inn_ROIWarpingBilinearSample_updateOutput(cutorch.getState(), + self.output:cdata(), self.output_buffer:cdata(), + data:cdata(), grid_ctrs:cdata(), bin_sizes:cdata(), roi_batch_inds:cdata(), + self.width, self.height + )--, self.spatial_scale) + + return self.output +end + +function ROIWarpingBilinearSample:updateGradInput(input,gradOutput) + assert(#input == 4) + local data = input[1] + local grid_ctrs = input[2] + local bin_sizes = input[3] + local roi_batch_inds = input[4] + + local batch_size = data:size(1) + local num_rois = roi_batch_inds:size(1) + local nchannels = data:size(2) + + assert(self.output_buffer) + assert(self.output_buffer:size(1) == num_rois and + self.output_buffer:size(2) == self.height and + self.output_buffer:size(3) == self.width and + self.output_buffer:size(4) == buffer_numbers) + + self.gradInput_data = self.gradInput_data or data.new() -- b x c x h x w + self.gradInput_grid_ctrs = self.gradInput_grid_ctrs or grid_ctrs.new() -- n x h x w x 2 + self.gradInput_grid_ctrs_buffer = self.gradInput_grid_ctrs_buffer or grid_ctrs.new() -- n x c x h x w x 2 + self.gradInput_bin_sizes = self.gradInput_bin_sizes or bin_sizes.new() -- n x 2 + self.gradInput_bin_sizes_buffer = self.gradInput_bin_sizes_buffer or bin_sizes.new() -- n x c x h x w x 2 + self.gradInput_roi_batch_inds = self.gradInput_roi_batch_inds or roi_batch_inds.new() -- n x 2 + + self.gradInput_data:resizeAs(data):fill(0) + self.gradInput_grid_ctrs:resizeAs(grid_ctrs):fill(0) + self.gradInput_grid_ctrs_buffer:resize(num_rois, nchannels, self.height, self.width, 2):fill(0) + self.gradInput_bin_sizes:resizeAs(bin_sizes):fill(0) + self.gradInput_bin_sizes_buffer:resize(num_rois, nchannels, self.height, self.width, 2):fill(0) + self.gradInput_roi_batch_inds:resize(num_rois, 2):fill(0) + + C.inn_ROIWarpingBilinearSample_updateGradInput(cutorch.getState(), + self.gradInput_data:cdata(), data:cdata(), + self.gradInput_grid_ctrs:cdata(), grid_ctrs:cdata(), self.gradInput_grid_ctrs_buffer:cdata(), + self.gradInput_bin_sizes:cdata(), bin_sizes:cdata(), self.gradInput_bin_sizes_buffer:cdata(), + roi_batch_inds:cdata(), + self.output_buffer:cdata(), + gradOutput:cdata(), + self.height, self.width + ) --, self.spatial_scale) + + -- reduce the per-channel buffers into the final gradients + self.gradInput_grid_ctrs:copy(self.gradInput_grid_ctrs_buffer:sum(2):view(num_rois, self.height, self.width, 2)) + self.gradInput_bin_sizes:copy(self.gradInput_bin_sizes_buffer:sum(2):sum(3):sum(4):view(num_rois, 2)) + + self.gradInput[1] = self.gradInput_data + self.gradInput[2] = self.gradInput_grid_ctrs + self.gradInput[3] = self.gradInput_bin_sizes + self.gradInput[4] = self.gradInput_roi_batch_inds + + return self.gradInput +end + +function ROIWarpingBilinearSample:clearState() + nn.utils.clear(self, 'gradInput_data', 'gradInput_grid_ctrs', 'gradInput_grid_ctrs_buffer', 'gradInput_bin_sizes', 'gradInput_bin_sizes_buffer', 'gradInput_roi_batch_inds') + return parent.clearState(self) +end
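-- Usage sketch (shapes and values are hypothetical; assumes a CUDA device and
-- that inn and cutorch are loaded): the sampler consumes the grid produced by
-- the inn.ROIWarpingGridGenerator module defined below.
local data = torch.rand(1, 3, 8, 8):cuda()
local rois = torch.CudaTensor{{1, 1, 1, 8, 8}, {1, 2, 2, 5, 5}} -- {batch_ind, x1, y1, x2, y2}
local gridgen = inn.ROIWarpingGridGenerator(4, 4):cuda()
local sampler = inn.ROIWarpingBilinearSample(4, 4):cuda()
local g = gridgen:forward({rois}) -- one-element input table => zero delta_rois
local warped = sampler:forward({data, g[1], g[2], g[3]}) -- 2 x 3 x 4 x 4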
diff --git a/ROIWarpingGridGenerator.lua b/ROIWarpingGridGenerator.lua new file mode 100644 index 0000000..e76a4f3 --- /dev/null +++ b/ROIWarpingGridGenerator.lua @@ -0,0 +1,236 @@ +--[[ + This code is borrowed from AffineGridGeneratorBHWD.lua in https://github.com/qassemoquab/stnbhwd +]] + +local RWGG, parent = torch.class('inn.ROIWarpingGridGenerator', 'nn.Module') + +local function fast_rcnn_bbox_transform_inv(rois, delta_rois) +-- rois : N by 4 torch.Tensor; for each row, rois[{n, {}}] == x_start, y_start, x_end, y_end (in image coordinates) +-- delta_rois : N by 4 torch.Tensor; for each row, delta_rois[{n, {}}] == dx, dy, dw, dh (in fast-rcnn notation) + + local src_w = rois[{{},3}] - rois[{{},1}] + 1; + local src_h = rois[{{},4}] - rois[{{},2}] + 1; + local src_ctr_x = rois[{{},1}] + 0.5*(src_w-1.0); + local src_ctr_y = rois[{{},2}] + 0.5*(src_h-1.0); + + local dst_ctr_x = delta_rois[{{},1}]; -- dx (in fast-rcnn notation) + local dst_ctr_y = delta_rois[{{},2}]; -- dy (in fast-rcnn notation) + local dst_scl_x = delta_rois[{{},3}]; -- dw (in fast-rcnn notation) + local dst_scl_y = delta_rois[{{},4}]; -- dh (in fast-rcnn notation) + + local pred_ctr_x = torch.cmul(dst_ctr_x, src_w) + src_ctr_x; + local pred_ctr_y = torch.cmul(dst_ctr_y, src_h) + src_ctr_y; + local pred_w = torch.cmul(torch.exp(dst_scl_x), src_w); + local pred_h = torch.cmul(torch.exp(dst_scl_y), src_h); + + local roi_start_w = pred_ctr_x - 0.5*(pred_w-1); + local roi_start_h = pred_ctr_y - 0.5*(pred_h-1); + local roi_end_w = pred_ctr_x + 0.5*(pred_w-1); + local roi_end_h = pred_ctr_y + 0.5*(pred_h-1); + + return torch.cat({roi_start_w, roi_start_h, roi_end_w, roi_end_h}, 2) +end
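-- Sanity-check sketch (assumes the helper above is in scope): with zero deltas,
-- pred_ctr == src_ctr and exp(0) == 1, so the input boxes come back unchanged.
local boxes = torch.Tensor{{2, 3, 7, 9}} -- x_start, y_start, x_end, y_end
local deltas = torch.zeros(1, 4)         -- dx, dy, dw, dh
print(fast_rcnn_bbox_transform_inv(boxes, deltas)) -- 2 3 7 9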
+ +--function RWGG:__init(height, width, spatial_scale) +function RWGG:__init(height, width) + parent.__init(self) + assert(height > 1) + assert(width > 1) + self.height = height + self.width = width + + self.output_tmp = {} + self.gradInput = {} + + --self.spatial_scale = spatial_scale or 1 + + self.baseGrid = torch.Tensor(self.height, self.width, 2) -- Grid for input image + for i=1,self.width do + self.baseGrid:select(3,1):select(2,i):fill(i-1) + end + for j=1,self.height do + self.baseGrid:select(3,2):select(1,j):fill(j-1) + end + self.batchGrid = torch.Tensor(1, height, width, 2):copy(self.baseGrid) +end + +function RWGG:updateOutput(input) --(_transformMatrix) + assert(#input == 1 or #input == 2) + local rois = input[1] + local delta_rois + if #input == 2 then + delta_rois = input[2] + else -- #input == 1 + self.delta_rois = self.delta_rois or rois.new() + self.delta_rois:resizeAs(rois):zero() + self.delta_rois[{{}, 1}] = rois[{{}, 1}] + delta_rois = self.delta_rois + end + assert(rois:dim() == 2 and delta_rois:dim() == 2) + assert(rois:size(2) == 5 and delta_rois:size(2) == 5) + + local batch_size = rois:size(1) + + if self.batchGrid:size(1) ~= batch_size then + self.batchGrid:resize(batch_size, self.height, self.width, 2) + for i=1,batch_size do + self.batchGrid:select(1,i):copy(self.baseGrid) + end + end + + -- allocate output + self.output_tmp[1] = self.output_tmp[1] or rois.new() + self.output_tmp[2] = self.output_tmp[2] or rois.new() + self.output_tmp[3] = self.output_tmp[3] or rois.new() + local grid_ctrs = self.output_tmp[1] + local bin_sizes = self.output_tmp[2] + local roi_batch_inds = self.output_tmp[3] + + -- prepare misc + local pred_rois = fast_rcnn_bbox_transform_inv(rois[{{}, {2, 5}}], delta_rois[{{}, {2, 5}}]) + + local rois_width = pred_rois[{{}, 3}] - pred_rois[{{}, 1}] + local rois_height = pred_rois[{{}, 4}] - pred_rois[{{}, 2}] + local rois_start_width = pred_rois[{{}, 1}] + local rois_start_height = pred_rois[{{}, 2}] + + local bin_size_w = rois_width / self.width + local bin_size_h = rois_height / self.height + + grid_ctrs:resize(batch_size, self.height, self.width, 2):fill(0) -- b x h x w x 2 (x, y == width, height) + bin_sizes:resize(batch_size, 2):fill(0) -- b x 2 (x, y == width, height) + roi_batch_inds:resize(batch_size, 1):fill(0) + + -- update roi_batch_inds + roi_batch_inds:copy(rois[{{},{1}}]) + + -- update bin_sizes + bin_sizes:select(2,1):copy(bin_size_w:reshape(batch_size, 1)) -- width + bin_sizes:select(2,2):copy(bin_size_h:reshape(batch_size, 1)) -- height + + -- update grid_ctrs + local grid_ctrs_w = grid_ctrs[{{}, {}, {}, {1}}] -- view sharing storage with grid_ctrs + grid_ctrs_w:copy(self.batchGrid[{{}, {}, {}, {1}}]) + :cmul(bin_size_w:reshape(batch_size, 1, 1, 1) + :expand(batch_size, self.height, self.width, 1)) + :add(bin_size_w:reshape(batch_size, 1, 1, 1) + :expand(batch_size, self.height, self.width, 1) / 2) + :add(rois_start_width:reshape(batch_size, 1, 1, 1) + :expand(batch_size, self.height, self.width, 1)) + local grid_ctrs_h = grid_ctrs[{{}, {}, {}, {2}}] -- view sharing storage with grid_ctrs + grid_ctrs_h:copy(self.batchGrid[{{}, {}, {}, {2}}]) + :cmul(bin_size_h:reshape(batch_size, 1, 1, 1) + :expand(batch_size, self.height, self.width, 1)) + :add(bin_size_h:reshape(batch_size, 1, 1, 1) + :expand(batch_size, self.height, self.width, 1) / 2) + :add(rois_start_height:reshape(batch_size, 1, 1, 1) + :expand(batch_size, self.height, self.width, 1)) + + return self.output_tmp +end + +function RWGG:updateGradInput(input, gradOutput) --(_transformMatrix, _gradGrid) + assert(#input == 1 or #input == 2) + local rois = input[1] + local delta_rois + if #input == 2 then + delta_rois = input[2] + else -- #input == 1 + self.delta_rois = self.delta_rois or rois.new() + self.delta_rois:resizeAs(rois):zero() + self.delta_rois[{{}, 1}] = rois[{{}, 1}] + delta_rois = self.delta_rois + end + assert(rois:dim() == 2 and delta_rois:dim() == 2) + assert(rois:size(2) == 5 and delta_rois:size(2) == 5) + + local batch_size = rois:size(1) + + self.batchGrid = self.batchGrid:typeAs(rois) + self.baseGrid = self.baseGrid:typeAs(rois) + + if self.batchGrid:size(1) ~= batch_size then + self.batchGrid:resize(batch_size, self.height, self.width, 2) + for i=1,batch_size do + self.batchGrid:select(1,i):copy(self.baseGrid) + end + end + + -- init output buffer + self.gradInput_rois = self.gradInput_rois or rois.new() + self.gradInput_delta_rois = self.gradInput_delta_rois or delta_rois.new() + self.gradInput_rois:resizeAs(rois):zero() + self.gradInput_delta_rois:resizeAs(delta_rois):zero() + + local src_width = rois[{{}, {4}}] - rois[{{}, {2}}] + 1; src_width = src_width:reshape(batch_size, 1, 1) + local src_height = rois[{{}, {5}}] - rois[{{}, {3}}] + 1; src_height = src_height:reshape(batch_size, 1, 1)
+ + local flattenedBatchGrid = self.batchGrid:view(batch_size, self.width*self.height, 2) + + -- grad from grid_ctrs + + -- drsw / dcx = drsw / dpcx * dpcx / dcx = spatial_scale * src_w + -- drew / dcx = drew / dpcx * dpcx / dcx = spatial_scale * src_w + -- drsh / dcy = drsh / dpcy * dpcy / dcy = spatial_scale * src_h + -- dreh / dcy = dreh / dpcy * dpcy / dcy = spatial_scale * src_h + + -- drsw / dsx = drsw / dpw * dpw / dsx = -0.5 * spatial_scale * src_w * exp(dsx) + -- drew / dsx = drew / dpw * dpw / dsx = 0.5 * spatial_scale * src_w * exp(dsx) + -- drsh / dsy = drsh / dph * dph / dsy = -0.5 * spatial_scale * src_h * exp(dsy) + -- dreh / dsy = dreh / dph * dph / dsy = 0.5 * spatial_scale * src_h * exp(dsy) + + -- grid_ctr_w = rsw + bin_size_w / 2 + pw * bin_size_w + -- = rsw + (0.5 + pw) * bin_size_w + -- = rsw + (0.5 + pw) * (rew - rsw) / self.width + -- = f(rsw, rew) + -- dwctr / dcx = dwctr / drsw * drsw / dcx + dwctr / drew * drew / dcx + -- = (1 + (0.5 + pw) / self.width * (-1)) * src_w + ((0.5 + pw) / self.width * 1) * src_w + -- = spatial_scale * src_w + -- dhctr / dcy = spatial_scale * src_h + -- dwctr / dsx = dwctr / drsw * drsw / dsx + dwctr / drew * drew / dsx + -- = (1 + (0.5 + pw) / self.width * (-1)) * (-0.5 * spatial_scale * src_w * exp(dsx)) + -- + ( (0.5 + pw) / self.width * 1 ) * ( 0.5 * spatial_scale * src_w * exp(dsx)) + -- = (-1 + (0.5 + pw) / self.width * 2) * 0.5 * spatial_scale * src_w * exp(dsx) + -- = 0.5 * spatial_scale * src_w * exp(dsx) * (-1 + 2 * (0.5 + pw) / self.width) + -- = ((pw + 0.5) / self.width - 0.5) * spatial_scale * src_w * exp(dsx) + -- dhctr / dsy = ((ph + 0.5) / self.height - 0.5) * spatial_scale * src_h * exp(dsy) + + -- grad from bin_sizes + + -- dbw / dcx = dbw / drw * drw / dcx = 0 + -- dbh / dcy = dbh / drh * drh / dcy = 0 + -- dbw / dsx = dbw / drw * drw / dsx = 1 / self.width * spatial_scale * src_w * exp(dsx) + -- dbh / dsy = dbh / drh * drh / dsy = 1 / self.height * spatial_scale * src_h * exp(dsy) + + local flattened_grid_ctrs = gradOutput[1]:view(batch_size, self.height*self.width, 2) -- incoming gradient w.r.t. grid_ctrs (b x ph x pw x 2) + local flattened_bin_sizes = gradOutput[2] -- incoming gradient w.r.t. bin_sizes (b x 2) + local flattened_gradInput_delta_rois = self.gradInput_delta_rois[{{},{2, 5}}] -- b x 4 + + flattened_gradInput_delta_rois[{{}, {1}}]:copy(torch.sum(torch.cmul( src_width:expand(batch_size, self.height * self.width, 1), flattened_grid_ctrs[{{}, {}, {1}}]), 2):reshape(batch_size, 1)) + flattened_gradInput_delta_rois[{{}, {2}}]:copy(torch.sum(torch.cmul(src_height:expand(batch_size, self.height * self.width, 1), flattened_grid_ctrs[{{}, {}, {2}}]), 2):reshape(batch_size, 1)) + flattened_gradInput_delta_rois[{{}, {3}}]:copy(torch.sum(torch.cmul( ((flattenedBatchGrid[{{}, {}, {1}}] + 0.5) / self.width - 0.5), + flattened_grid_ctrs[{{}, {}, {1}}]), 2):reshape(batch_size, 1)) + flattened_gradInput_delta_rois[{{}, {3}}]:add(torch.sum(torch.mul(flattened_bin_sizes[{{}, {1}}], 1/self.width), 2)) + :cmul(torch.exp(delta_rois[{{}, {4}}])) + :cmul(src_width) + flattened_gradInput_delta_rois[{{}, {4}}]:copy(torch.sum(torch.cmul( ((flattenedBatchGrid[{{}, {}, {2}}] + 0.5) / self.height - 0.5), + flattened_grid_ctrs[{{}, {}, {2}}]), 2):reshape(batch_size, 1)) + flattened_gradInput_delta_rois[{{}, {4}}]:add(torch.sum(torch.mul(flattened_bin_sizes[{{}, {2}}], 1/self.height), 2)) + :cmul(torch.exp(delta_rois[{{}, {5}}])) + :cmul(src_height) + + -- update output + self.gradInput[1] = self.gradInput_rois + self.gradInput[2] = self.gradInput_delta_rois + + return self.gradInput +end
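-- Finite-difference spot check for the chain rule above (a sketch; CPU tensors,
-- names hypothetical). Perturb dx centrally and compare against the analytic
-- gradient returned by updateGradInput.
local eps = 1e-3
local rwgg = inn.ROIWarpingGridGenerator(4, 4)
local rois = torch.Tensor{{1, 2, 2, 6, 7}}            -- {batch_ind, x1, y1, x2, y2}
local deltas = torch.Tensor{{1, 0.1, 0.2, 0.05, 0.1}} -- {batch_ind, dx, dy, dw, dh}
local function grid(d) return rwgg:forward({rois, d})[1]:clone() end
local dp, dm = deltas:clone(), deltas:clone()
dp[1][2] = dp[1][2] + eps; dm[1][2] = dm[1][2] - eps
local numeric = (grid(dp) - grid(dm)):div(2 * eps):sum() -- summed d(grid_ctrs)/d(dx)
local analytic = rwgg:backward({rois, deltas}, {torch.ones(1, 4, 4, 2), torch.zeros(1, 2)})[2][1][2]
print(numeric, analytic) -- both should equal src_w * height * width = 5 * 16 = 80 here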
diff --git a/ffi.lua index 398af4f..476b1b5 100644 --- a/ffi.lua +++ b/ffi.lua @@ -20,6 +20,29 @@ void inn_ROIPooling_updateOutputV2(THCState *state, void inn_ROIPooling_updateGradInputAtomic(THCState *state, THCudaTensor *gradInput, THCudaTensor *indices, THCudaTensor *data, THCudaTensor *gradOutput, THCudaTensor* rois, int W, int H, double spatial_scale); + +void inn_ROIWarping_updateOutput(THCState *state, + THCudaTensor *output, THCudaTensor *output_buffer, + THCudaTensor *data, THCudaTensor* rois, THCudaTensor* delta_rois, int W, int H, double spatial_scale); +void inn_ROIWarping_updateGradInputAtomic(THCState *state, + THCudaTensor *gradInput_data, THCudaTensor *data, + THCudaTensor *gradInput_delta_rois, THCudaTensor *delta_rois, + THCudaTensor *gradInput_delta_rois_buffer, + THCudaTensor *gradOutput, THCudaTensor *top_data_buffer, + THCudaTensor* rois, int W, int H, double spatial_scale); + +void inn_ROIWarpingBilinearSample_updateOutput(THCState *state, + THCudaTensor *output, THCudaTensor *output_buffer, + THCudaTensor *data, THCudaTensor* grid_ctrs, THCudaTensor* bin_sizes, THCudaTensor* roi_batch_inds, + int width, int height); +void inn_ROIWarpingBilinearSample_updateGradInput(THCState *state, + THCudaTensor *gradInput_data, THCudaTensor *data, + THCudaTensor *gradInput_grid_ctrs, THCudaTensor *grid_ctrs, THCudaTensor *gradInput_grid_ctrs_buffer, + THCudaTensor *gradInput_bin_sizes, THCudaTensor *bin_sizes, THCudaTensor *gradInput_bin_sizes_buffer, + THCudaTensor *roi_batch_inds, + THCudaTensor *output_buffer, + THCudaTensor *gradOutput, + int pooled_height, int pooled_width); ]] return ffi.load(libpath) diff --git a/init.lua index 3122974..c4796ba 100644 --- a/init.lua +++ b/init.lua @@ -9,4 +9,7 @@ require 'inn.MeanSubtraction' require 'inn.SpatialPyramidPooling' require 'inn.SpatialSameResponseNormalization' require 'inn.ROIPooling' +require 'inn.ROIWarping' +require 'inn.ROIWarpingGridGenerator' +require 'inn.ROIWarpingBilinearSample' return inn diff --git a/test/test_jacobian.lua index 1ef9281..0b5aa79 100644 --- a/test/test_jacobian.lua +++ b/test/test_jacobian.lua @@ -7,7 +7,31 @@ local precision = 1e-3 local inntest = torch.TestSuite() +local function delta_rois_to_rois(rois, delta_rois) + local src_w = rois[{{},3}] - rois[{{},1}] + 1; + local src_h = rois[{{},4}] - rois[{{},2}] + 1; + local src_ctr_x = rois[{{},1}] + 0.5*(src_w-1.0); + local src_ctr_y = rois[{{},2}] + 0.5*(src_h-1.0); + local dst_ctr_x = delta_rois[{{},1}]; -- dx (in fast-rcnn notation) = cx (in here) + local dst_ctr_y = delta_rois[{{},2}]; -- dy (in fast-rcnn notation) = cy (in here) + local dst_scl_x = delta_rois[{{},3}]; -- dw (in fast-rcnn notation) = sx (in here) + local dst_scl_y = delta_rois[{{},4}]; -- dh (in fast-rcnn notation) = sy (in here) + + local pred_ctr_x = torch.cmul(dst_ctr_x, src_w) + src_ctr_x; + local pred_ctr_y = torch.cmul(dst_ctr_y, src_h) + src_ctr_y; + local pred_w = torch.cmul(torch.exp(dst_scl_x), src_w); + local pred_h = torch.cmul(torch.exp(dst_scl_y), src_h); + + local roi_start_w = pred_ctr_x - 0.5*(pred_w-1) + local roi_start_h = pred_ctr_y - 0.5*(pred_h-1) + local roi_end_w = pred_ctr_x + 0.5*(pred_w-1) + local roi_end_h = pred_ctr_y + 0.5*(pred_h-1) + + return torch.cat({roi_start_w, roi_start_h, roi_end_w, roi_end_h}, 2) +end + +--[[ function inntest.SpatialStochasticPooling() local from = math.random(1,5) local ki = math.random(1,4) @@ -98,7 +122,7 @@ function
inntest.SpatialSameResponseNormalization() local err = jac.testJacobian(module, input, nil, nil, 1e-3) mytester:assertlt(err, precision, 'error on state (Batch) ') end - +]] function randROI(sz, n) assert(sz:size()==4, "need 4d size") local roi=torch.Tensor(n,5) @@ -127,7 +151,7 @@ function testJacobianWithRandomROI(cls, v2) for i=1,numRepeat do local input = torch.rand(batchSize, 1, H, W); local roi = randROI(input:size(), numRoi) - local module = cls.new(h, w, 1, roi) + local module = cls.new(w, h, 1, roi) module.v2 = v2 local err = jac.testJacobian(module, input, nil, nil, 1e-3) mytester:assertlt(err, precision, 'error on ROIPooling '..(v2 and 'v2' or 'v1')) @@ -152,6 +176,416 @@ function inntest.ROIPooling() testJacobianWithRandomROI(FixedROIPooling, true) end +function testJacobianWithRandomROIForROIWarpingData(cls) + --pooling grid size + local w=4; + local h=4; + --input size + local W=w*2; + local H=h*2; + + local batchSize = 3 + local numRoi = batchSize + local numRepeat = 3 + + torch.manualSeed(0) + for i=1,numRepeat do + local input = torch.rand(batchSize, 1, H, W); + local roi = randROI(input:size(), numRoi) + local delta_roi = roi:clone() + delta_roi[{{}, {2, 5}}] = torch.rand(numRoi, 4) + local module = cls.new(w, h, 1, roi, delta_roi) + + local orig = input:clone() + local err = jac.testJacobian(module, input, nil, nil, 1e-3) + mytester:assertlt(err, precision, 'error on ROIWarping ') + end +end + +function inntest.ROIWarpingData() + local FixedROIWarping, parent = torch.class('FixedROIWarping', 'inn.ROIWarping') + function FixedROIWarping:__init(W, H, s, roi, delta_roi) + self.roi = roi + self.delta_roi = delta_roi + parent.__init(self, W, H, s) + self:cuda() + end + + function FixedROIWarping:updateOutput(input) + return parent.updateOutput(self,{input:cuda(), self.roi, self.delta_roi}) + end + function FixedROIWarping:updateGradInput(input, gradOutput) + return parent.updateGradInput(self,{input:cuda(), self.roi, self.delta_roi}, gradOutput)[1] + end + + testJacobianWithRandomROIForROIWarpingData(FixedROIWarping) +end +--[[ +---------------------------------------------------------------------- +function testJacobianWithRandomROIForROIWarpingDeltaROI(cls) + --pooling grid size + local w=4; + local h=4; + --img size + local W=w*2; + local H=h*2; + + + local batchSize = 3 + local numRoi = batchSize + local numRepeat = 3 + + torch.manualSeed(0) + for i=1,numRepeat do + local img = torch.rand(batchSize, 3, H, W); + --local roi = torch.Tensor{1, 1, 1, W, H}:reshape(1, 5) + local roi = randROI(img:size(), numRoi) + local input = torch.rand(numRoi, 4) + local module = cls.new(w, h, 1, roi, img) + + print('---0000000000000000000000000000') + --print(img) + print(roi) + print(input) + print(delta_rois_to_rois(roi[{{}, {2,5}}], input)) + + local perturbation = 1e-3 + local jac_fprop = jac.forward(module, input, input, 1e-3) + --module:forward(input) + local jac_bprop = jac.backward(module, input) + + --print('---1111111111111111111111111111') + print(jac_fprop) + print('---2222222222222222222222222222') + print(jac_bprop) + + local err = jac.testJacobian(module, input, -1, 1, 1e-3) + mytester:assertlt(err, precision, 'error on ROIWarping ') + end +end + +function inntest.ROIWarpingDeltaROI() + local FixedROIWarpingDeltaROI, parent = torch.class('FixedROIWarpingDeltaROI', 'inn.ROIWarping') + function FixedROIWarpingDeltaROI:__init(W, H, s, roi, img) + self.img = img + self.roi = roi + self.delta_roi = self.roi:clone() + parent.__init(self, W, H, s) + self:cuda() + end + + function 
FixedROIWarpingDeltaROI:updateOutput(input) + self.delta_roi[{{},{2,5}}] = input:typeAs(self.delta_roi) + return parent.updateOutput(self,{self.img:cuda(), self.roi:cuda(), self.delta_roi:cuda()}) + end + function FixedROIWarpingDeltaROI:updateGradInput(input, gradOutput) + self.delta_roi[{{},{2,5}}] = input:typeAs(self.delta_roi) + return parent.updateGradInput(self,{self.img:cuda(), self.roi:cuda(), self.delta_roi:cuda()}, gradOutput)[3][{{}, {2, 5}}] + end + + testJacobianWithRandomROIForROIWarpingDeltaROI(FixedROIWarpingDeltaROI) +end +]] +---------------------------------------------------------------------- +function testJacobianWithRandomROIForROIWarpingGridGenerator(cls) + --pooling grid size + local w=4; + local h=4; + --img size + local W=w*2; + local H=h*2; + + local batchSize = 3 + local numRoi = batchSize + local numRepeat = 3 + + torch.manualSeed(0) + for i=1,numRepeat do + local img = torch.rand(batchSize, 3, H, W); + --local roi = torch.Tensor{1, 1, 1, W, H}:reshape(1, 5) + local roi = randROI(img:size(), numRoi) + local input = torch.rand(numRoi, 4) + local module = cls.new(h, w, roi) + + local perturbation = 1e-3 + local jac_fprop = jac.forward(module, input, input, 1e-3) + local jac_bprop = jac.backward(module, input) + + local err = jac.testJacobian(module, input, -1, 1, 1e-3) + mytester:assertlt(err, precision, 'error on ROIWarping ') + end +end + +function inntest.ROIWarpingGridGeneratorGridCtrs() + local FixedROIWarpingGridGeneratorGridCtrs, parent = torch.class('FixedROIWarpingGridGeneratorGridCtrs', 'inn.ROIWarpingGridGenerator') + function FixedROIWarpingGridGeneratorGridCtrs:__init(H, W, roi) + parent.__init(self, H, W) + self.roi = roi + self.delta_roi = self.roi:clone() + self.grad_bin_sizes = torch.zeros(roi:size(1), 2) + self:cuda() + end + + function FixedROIWarpingGridGeneratorGridCtrs:updateOutput(input) + self.delta_roi[{{},{2,5}}] = input:typeAs(self.delta_roi) + local tmp = parent.updateOutput(self, {self.roi:cuda(), self.delta_roi:cuda()}) + self.output = self.output or input:cuda().new() + self.output:resizeAs(tmp[1]):copy(tmp[1]) + return self.output + end + function FixedROIWarpingGridGeneratorGridCtrs:updateGradInput(input, gradOutput) + self.delta_roi[{{},{2,5}}] = input:typeAs(self.delta_roi) + self.gradInput = parent.updateGradInput(self,{self.roi:cuda(), self.delta_roi:cuda()}, {gradOutput, self.grad_bin_sizes:cuda()}) + return self.gradInput[2][{{}, {2, 5}}] + end + + testJacobianWithRandomROIForROIWarpingGridGenerator(FixedROIWarpingGridGeneratorGridCtrs) +end + +function inntest.ROIWarpingGridGeneratorBinSizes() + local FixedROIWarpingGridGeneratorBinSizes, parent = torch.class('FixedROIWarpingGridGeneratorBinSizes', 'inn.ROIWarpingGridGenerator') + function FixedROIWarpingGridGeneratorBinSizes:__init(H, W, roi) + parent.__init(self, H, W) + self.roi = roi + self.delta_roi = self.roi:clone() + self.grad_grid_ctrs = torch.zeros(roi:size(1), H, W, 2) + self:cuda() + end + + function FixedROIWarpingGridGeneratorBinSizes:updateOutput(input) + self.delta_roi[{{},{2,5}}] = input:typeAs(self.delta_roi) + local tmp = parent.updateOutput(self, {self.roi:cuda(), self.delta_roi:cuda()}) + self.output = self.output or input:cuda().new() + self.output:resizeAs(tmp[2]):copy(tmp[2]) + return self.output + end + function FixedROIWarpingGridGeneratorBinSizes:updateGradInput(input, gradOutput) + self.delta_roi[{{},{2,5}}] = input:typeAs(self.delta_roi) + self.gradInput = parent.updateGradInput(self,{self.roi:cuda(), self.delta_roi:cuda()}, 
{self.grad_grid_ctrs:cuda(), gradOutput}) + return self.gradInput[2][{{}, {2, 5}}] + end + + testJacobianWithRandomROIForROIWarpingGridGenerator(FixedROIWarpingGridGeneratorBinSizes) +end + +function testJacobianWithRandomROIForROIWarpingBilinearSampleData(cls) + --pooling grid size + local w=4; + local h=4; + --input size + local W=w*2; + local H=h*2; + + local batchSize = 3 + local numRoi = batchSize + local numRepeat = 3 + + torch.manualSeed(0) + for i=1,numRepeat do + local input = torch.rand(batchSize, 1, H, W); + local roi = randROI(input:size(), numRoi) + local delta_roi = roi:clone() + delta_roi[{{}, {2, 5}}] = torch.rand(numRoi, 4) + local pred_rois = delta_rois_to_rois(roi[{{}, {2, 5}}], delta_roi[{{}, {2, 5}}]) + + local preprocess = inn.ROIWarpingGridGenerator(h, w); preprocess:cuda() + local output = preprocess:forward({roi:cuda(), delta_roi:cuda()}) + local grid_ctrs = output[1]:clone() + local bin_sizes = output[2]:clone() + local roi_batch_inds = output[3]:clone() + + local module = cls.new(h, w, grid_ctrs, bin_sizes, roi_batch_inds) + local err = jac.testJacobian(module, input, nil, nil, 1e-3) + mytester:assertlt(err, precision, 'error on ROIWarpingBilinearSampleData ') + end +end + +function inntest.ROIWarpingBilinearSampleData() + local FixedROIWarpingBilinearSampleData, parent = torch.class('FixedROIWarpingBilinearSampleData', 'inn.ROIWarpingBilinearSample') + function FixedROIWarpingBilinearSampleData:__init(H, W, grid_ctrs, bin_sizes, roi_batch_inds) + self.grid_ctrs = grid_ctrs:clone() + self.bin_sizes = bin_sizes:clone() + self.roi_batch_inds = roi_batch_inds:clone() + parent.__init(self, H, W) + self:cuda() + end + + function FixedROIWarpingBilinearSampleData:updateOutput(input) + return parent.updateOutput(self, {input:cuda(), self.grid_ctrs:cuda(), self.bin_sizes:cuda(), self.roi_batch_inds:cuda()}) + end + function FixedROIWarpingBilinearSampleData:updateGradInput(input, gradOutput) + return parent.updateGradInput(self, {input:cuda(), self.grid_ctrs:cuda(), self.bin_sizes:cuda(), self.roi_batch_inds:cuda()}, gradOutput:cuda())[1] + end + + testJacobianWithRandomROIForROIWarpingBilinearSampleData(FixedROIWarpingBilinearSampleData) +end + +function testJacobianWithRandomROIForROIWarpingBilinearSampleGridCtrs(cls) + --pooling grid size + local w=4; + local h=4; + --input size + local W=w*2; + local H=h*2; + + local batchSize = 3 + local numRoi = batchSize + local numRepeat = 3 + + torch.manualSeed(0) + for i=1,numRepeat do + local img = torch.rand(batchSize, 1, H, W); + local roi = randROI(img:size(), numRoi) + local delta_roi = roi:clone() + delta_roi[{{}, {2, 5}}] = torch.rand(numRoi, 4) + local pred_rois = delta_rois_to_rois(roi[{{}, {2, 5}}], delta_roi[{{}, {2, 5}}]) + + local preprocess = inn.ROIWarpingGridGenerator(h, w); preprocess:cuda() + local output = preprocess:forward({roi:cuda(), delta_roi:cuda()}) + local input = output[1]:clone() -- the grid centers become the tested input + local bin_sizes = output[2]:clone() + local roi_batch_inds = output[3]:clone() + + local module = cls.new(h, w, img, bin_sizes, roi_batch_inds) + + local err = jac.testJacobian(module, input, nil, nil, 1e-3)
mytester:assertlt(err, precision, 'error on ROIWarpingBilinearSample ') + end +end + +function inntest.ROIWarpingBilinearSampleGridCtrs() + local FixedROIWarpingBilinearSampleGridCtrs, parent = torch.class('FixedROIWarpingBilinearSampleGridCtrs', 'inn.ROIWarpingBilinearSample') + function FixedROIWarpingBilinearSampleGridCtrs:__init(H, W, img, bin_sizes, roi_batch_inds) + self.img = img:clone() + self.bin_sizes = bin_sizes:clone() + self.roi_batch_inds = roi_batch_inds:clone() + parent.__init(self, H, W) + self:cuda() + end + + function FixedROIWarpingBilinearSampleGridCtrs:updateOutput(input) + return parent.updateOutput(self, {self.img:cuda(), input:cuda(), self.bin_sizes:cuda(), self.roi_batch_inds:cuda()}) + end + function FixedROIWarpingBilinearSampleGridCtrs:updateGradInput(input, gradOutput) + return parent.updateGradInput(self, {self.img:cuda(), input:cuda(), self.bin_sizes:cuda(), self.roi_batch_inds:cuda()}, gradOutput:cuda())[2] + end + + testJacobianWithRandomROIForROIWarpingBilinearSampleGridCtrs(FixedROIWarpingBilinearSampleGridCtrs) +end +--[[ +function testJacobianWithRandomROIForROIWarpingBilinearSampleBinSizes(cls) + + --pooling grid size + local w=4; + local h=4; + --input size + local W=w*2; + local H=h*2; + + local batchSize = 3 + local numRoi = batchSize + local numRepeat = 3 + + torch.manualSeed(0) + for i=1,numRepeat do + local img = torch.rand(batchSize, 1, H, W); + local roi = randROI(img:size(), numRoi) + local delta_roi = roi:clone() + delta_roi[{{}, {2, 5}}] = torch.rand(numRoi, 4) + local pred_rois = delta_rois_to_rois(roi[{{}, {2, 5}}], delta_roi[{{}, {2, 5}}]) + + local preprocess = inn.ROIWarpingGridGenerator(h, w); preprocess:cuda() + local output = preprocess:forward({roi:cuda(), delta_roi:cuda()}) + local grid_ctrs = output[1]:clone() + local bin_sizes = output[2]:clone() + local roi_batch_inds = output[3]:clone() + + local input = bin_sizes:clone() + + local module = cls.new(h, w, img, grid_ctrs, roi_batch_inds) + + local err = jac.testJacobian(module, input, nil, nil, 1e-3) + -- for debug + if err > precision then + local jac_fprop = jac.forward(module, input, nil, 1e-3) + local jac_bprop = jac.backward(module, input) + print(jac_fprop) + print(jac_bprop) + local err = jac_fprop-jac_bprop + local val, index = torch.max(err:view(-1):abs(), 1) + print(val) + print(index) + print(input:numel()) + print(input:size()) + print(pred_rois) + print(input) + print(bin_sizes) + end + -- til here + mytester:assertlt(err, precision, 'error on ROIWarpingBilinearSample ') + end +end + +function inntest.ROIWarpingBilinearSampleBinSizes() + local FixedROIWarpingBilinearSampleBinSizes, parent = torch.class('FixedROIWarpingBilinearSampleBinSizes', 'inn.ROIWarpingBilinearSample') + function FixedROIWarpingBilinearSampleBinSizes:__init(H, W, img, grid_ctrs, roi_batch_inds) + self.img = img:clone() + self.grid_ctrs = grid_ctrs:clone() + self.roi_batch_inds =
roi_batch_inds:clone() + parent.__init(self, H, W) + self:cuda() + end + + function FixedROIWarpingBilinearSampleBinSizes:updateOutput(input) + return parent.updateOutput(self, {self.img:cuda(), self.grid_ctrs:cuda(), input:cuda(), self.roi_batch_inds:cuda()}) + end + function FixedROIWarpingBilinearSampleBinSizes:updateGradInput(input, gradOutput) + return parent.updateGradInput(self, {self.img:cuda(), self.grid_ctrs:cuda(), input:cuda(), self.roi_batch_inds:cuda()}, gradOutput:cuda())[3] + end + + testJacobianWithRandomROIForROIWarpingBilinearSampleBinSizes(FixedROIWarpingBilinearSampleBinSizes) +end +]] + jac = nn.Jacobian mytester:add(inntest) mytester:run() diff --git a/test/test_roiwarping.lua b/test/test_roiwarping.lua new file mode 100644 index 0000000..ab4d3d1 --- /dev/null +++ b/test/test_roiwarping.lua @@ -0,0 +1,297 @@ +local function delta_rois_to_rois(rois, delta_rois) + local src_w = rois[{{},3}] - rois[{{},1}] + 1; + local src_h = rois[{{},4}] - rois[{{},2}] + 1; + local src_ctr_x = rois[{{},1}] + 0.5*(src_w-1.0); + local src_ctr_y = rois[{{},2}] + 0.5*(src_h-1.0); + + local dst_ctr_x = delta_rois[{{},1}]; -- dx (in fast-rcnn notation) = cx (in here) + local dst_ctr_y = delta_rois[{{},2}]; -- dy (in fast-rcnn notation) = cy (in here) + local dst_scl_x = delta_rois[{{},3}]; -- dw (in fast-rcnn notation) = sx (in here) + local dst_scl_y = delta_rois[{{},4}]; -- dh (in fast-rcnn notation) = sy (in here) + + local pred_ctr_x = torch.cmul(dst_ctr_x, src_w) + src_ctr_x; + local pred_ctr_y = torch.cmul(dst_ctr_y, src_h) + src_ctr_y; + local pred_w = torch.cmul(torch.exp(dst_scl_x), src_w); + local pred_h = torch.cmul(torch.exp(dst_scl_y), src_h); + + local roi_start_w = pred_ctr_x - 0.5*(pred_w-1) + local roi_start_h = pred_ctr_y - 0.5*(pred_h-1) + local roi_end_w = pred_ctr_x + 0.5*(pred_w-1) + local roi_end_h = pred_ctr_y + 0.5*(pred_h-1) + + return torch.cat({roi_start_w, roi_start_h, roi_end_w, roi_end_h}, 2) +end + +local inn = require 'inn' +local nn = require 'nn' + +torch.manualSeed(3) + +local n_images = 1 -- 2 +local channels = 3 +local H = 4 +local W = 3 +local height = 3 +local width = 6 + +local sz = torch.Tensor{channels, height, width} +local input_image = torch.Tensor(n_images, sz[1], sz[2], sz[3]):copy(torch.linspace(1, n_images * sz[1] * sz[2] * sz[3], n_images * sz[1] * sz[2] * sz[3]):reshape(n_images, sz[1], sz[2], sz[3])) + +print(input_image) + +local n_rois = 1 --3 --10 +local rois=torch.Tensor(n_rois,5) +for i=1,n_rois do + idx=torch.randperm(n_images)[1] + y=torch.randperm(sz[2])[{{1,2}}]:sort() + x=torch.randperm(sz[3])[{{1,2}}]:sort() + rois[{i,{}}] = torch.Tensor({idx,x[1],y[1],x[2],y[2]}) + --rois[{i,{}}] = torch.Tensor({idx,1,1,sz[3],sz[2]}) + --rois[{i,{}}] = torch.Tensor({idx,1,1,H/2,W/2}) + --rois[{i,{}}] = torch.Tensor({idx,1,1,1,1}) +end +--rois[{1,{}}] = torch.Tensor({1,2,2,3,5}) +--rois[{2,{}}] = torch.Tensor({1,1,5,3,6}) +--rois[{1,{}}] = torch.Tensor({1,1,5,3,6}) +--rois[{{}, {}}] = torch.Tensor({{1, 3, 1, 6, 5}, +-- {3, 4, 4, 5, 8}, +-- {2, 1, 1, 3, 5}}) +--print(rois) + +local model = inn.ROIPooling(W,H) +model.v2 = false +model:cuda() + +local output = model:forward({input_image:cuda(), rois:cuda()}) +--print(output) + +local model = inn.ROIWarping(W,H) +model:cuda() +--local output = model:forward({input_image:cuda(), rois:cuda()}) +--print(output) + +--------------- +print('-------------------------') +local delta_rois = rois:clone() +--delta_rois[{{}, {2,5}}] = 0 +--delta_rois[{{}, {2,5}}] = torch.ones(n_rois, 4) * 0.1 
+--delta_rois[{{}, {2,5}}] = 0.1 * torch.rand(n_rois, 4) +delta_rois[{{}, {2,5}}] = torch.rand(n_rois, 4) +--delta_rois[{{}, {2,5}}] = torch.Tensor{0.7887, 0.4103, 0.7086, 0.7714}:reshape(1,4) +--delta_rois[{{}, {2,5}}] = torch.Tensor{0.4694, 0.1311, 0.8265, 0.1495, 0.9336, 0.4434, 0.5211, 0.1230}:reshape(2,4) +--delta_rois[{{}, {2,5}}] = torch.Tensor{0.4694, 0.1311, 0.8265, 0.1495}:reshape(1,4) +--delta_rois[{{}, {2,5}}] = torch.Tensor{0.9336, 0.4434, 0.5211, 0.1230}:reshape(1,4) +--delta_rois[{{}, {}}] = torch.Tensor({{1.0000, 0.7253, 0.6597, 0.5013, 0.5332}, +-- {3.0000, 0.9561, 0.2305, 0.6440, 0.3949}, +-- {2.0000, 0.4239, 0.6188, 0.6064, 0.4749}}) +-- +print(rois) +print(delta_rois) +local pred_rois = delta_rois_to_rois(rois[{{}, {2,5}}], delta_rois[{{}, {2,5}}]) +print(pred_rois) +print(torch.round(pred_rois)) +--[[ +local output = model:forward({input_image:cuda(), rois:cuda(), delta_rois:cuda()}) +--local output = model:forward({input_image:clone():fill(1):cuda(), rois:cuda(), delta_rois:cuda()}) +print(output) +print(output:sum()) + +print('-------------------------') +local gradOutput = torch.ones(n_rois, channels, H, W):cuda() --torch.rand(n_rois, channels, H, W):cuda() --torch.Tensor(n_rois, channels, 3, 3):fill(1) +local gradInput = model:backward({input_image:cuda(), rois:cuda(), delta_rois:cuda()}, gradOutput) +--local gradInput = model:backward({input_image:clone():fill(1):cuda(), rois:cuda(), delta_rois:cuda()}, gradOutput) +print(gradInput[1]) +print(gradInput[1]:sum()) +print(gradInput[2]) +print(gradInput[3]) +print(gradInput[3]:sum()) +]] + +--print('------------------------------------------------------------') +local model = inn.ROIWarpingGridGenerator(H, W) +model:cuda() +local output = model:forward({rois:cuda(), delta_rois:cuda()}) +--print(output[1]:select(4,1)) +--print(output[1]:select(4,2)) +--print(output[2]) +--print(output[3]) +local grid_ctrs = output[1]:clone() +local bin_sizes = output[2]:clone() +local roi_batch_inds = output[3]:clone() + +local gradOutput = {torch.ones(n_rois, H, W, 2):cuda(), --torch.rand(n_rois, channels, H, W, 2):cuda() + torch.ones(n_rois, 2):cuda()} +local gradInput = model:backward({rois:cuda(), delta_rois:cuda()}, gradOutput) +--print(gradInput[1]) +--print(gradInput[2]) + +print('------------------------------------------------------------') +--local input_image = 10 * torch.rand(input_image:size()) --torch.ones(input_image:size()) +local input_image = torch.ones(input_image:size()) +local model = inn.ROIWarpingBilinearSample(H, W) +model:cuda() +local output = model:forward({input_image:cuda(), grid_ctrs:cuda(), bin_sizes:cuda(), roi_batch_inds:cuda()}) +--print(output) +--print(output:sum()) + +--print('hi0000000000000000000000') +local gradOutput = torch.ones(n_rois, channels, H, W):cuda() +local gradInput = model:backward({input_image:cuda(), grid_ctrs:cuda(), bin_sizes:cuda(), roi_batch_inds:cuda()}, gradOutput) +--print(gradInput) +--print(gradInput[1]) +--print(gradInput[1]:sum()) +--print(gradInput[2]:select(4,1)/3) +--print(gradInput[2]:select(4,2)/3) +--print(gradInput[2]:sum()) +--print(gradInput[3]) +--print(gradInput[4]) + +print('------------------------------------------------------------') +local FixedROIWarpingBilinearSampleGridCtrs, parent = torch.class('FixedROIWarpingBilinearSampleGridCtrs', 'inn.ROIWarpingBilinearSample') +function FixedROIWarpingBilinearSampleGridCtrs:__init(H, W, img, bin_sizes, roi_batch_inds) + self.img = img:clone() + self.bin_sizes = bin_sizes:clone() + self.roi_batch_inds = 
roi_batch_inds:clone() + parent.__init(self, H, W) + self:cuda() +end + +function FixedROIWarpingBilinearSampleGridCtrs:updateOutput(input) + return parent.updateOutput(self, {self.img:cuda(), input:cuda(), self.bin_sizes:cuda(), self.roi_batch_inds:cuda()}) +end +function FixedROIWarpingBilinearSampleGridCtrs:updateGradInput(input, gradOutput) + return parent.updateGradInput(self, {self.img:cuda(), input:cuda(), self.bin_sizes:cuda(), self.roi_batch_inds:cuda()}, gradOutput:cuda())[2] +end + +local jac = nn.Jacobian + +local module = FixedROIWarpingBilinearSampleGridCtrs.new(H, W, input_image, bin_sizes, roi_batch_inds) +local input = grid_ctrs:clone() + +local err = jac.testJacobian(module, input, nil, nil, 1e-3) + +local ind = 24 + + + + + + + + + + +local function jacforward(module, input, param, perturbation) + param = param or input + -- perturbation amount + perturbation = perturbation or 1e-6 + -- 1D view of input + --local tst = param:storage() + local sin = param.new(param):resize(param:nElement())--param.new(tst,1,tst:size()) + -- jacobian matrix to calculate + local jacobian = torch.Tensor():resize(param:nElement(),module:forward(input):nElement()) + + print('hi11') + print('input: ') + print(input) + print('param: ') + print(param) + print('sin: ') + print(sin) + + local outa = torch.Tensor(jacobian:size(2)) + local outb = torch.Tensor(jacobian:size(2)) + + for i=1,sin:nElement() do + local orig = sin[i] + if i == ind then + print(orig) + end + + sin[i] = orig - perturbation + outa:copy(module:forward(input)) + if i == ind then + print(sin[i]) + end + + sin[i] = orig + perturbation + outb:copy(module:forward(input)) + if i == ind then + print(sin[i]) + end + + sin[i] = orig + + if i == ind then + print(outa) + print(outb) + print(outb-outa) + end + + outb:add(-1,outa):div(2*perturbation) + + if i == 5 then + print(outb) + end + + jacobian:select(1,i):copy(outb) + end + + return jacobian +end + + +--local input = grid_ctrs:clone() +print(input) +print(grid_ctrs) +if err > 0.001 then + print('error!!!!!!!!!!!!!!!') + print(err) + --local jac_fprop = jac.forward(module, input, nil, 1e-3) + local jac_fprop = jacforward(module, input, nil, 1e-3) + local jac_bprop = jac.backward(module, input) -- input:numel() x output:numel() + print(jac_fprop) + print(jac_bprop) + local err = jac_fprop-jac_bprop + local val, index = torch.max(err:view(-1):abs(), 1) + print(val) + print(index) + --print(input:numel()) --print(input:nElement()) + print('input img: ') + print(input_image:size()) + print(input_image) + print('pred_rois: ') + print(pred_rois) + print(bin_sizes) + --print(input) + --print(input:select(4,1)) + --print(input:select(4,2)) + --print(grid_ctrs:select(4,1)) + --print(grid_ctrs:select(4,2)) + --print(grid_ctrs:view(-1)) + --print(grid_ctrs:view(-1):select(1,4)) + + print('test error!!!!!!!!!!!!!!!!!!!!!!!!!!') + local tmp = input:clone() --grid_ctrs:clone() + local tmp2 = tmp:view(-1) + local orig = tmp2[ind] + --local ind = 5 + + tmp2[ind] = orig + 1e-3 + local output1 = model:forward({input_image:cuda(), tmp:cuda(), bin_sizes:cuda(), roi_batch_inds:cuda()}):clone() + print('grid_ctrs1: ') + print(tmp:select(4,1)) + print(tmp:select(4,2)) + print('output1: ') + print(output1) + + tmp2[ind] = orig - 1e-3 + local output2 = model:forward({input_image:cuda(), tmp:cuda(), bin_sizes:cuda(), roi_batch_inds:cuda()}) + print('grid_ctrs2: ') + print(tmp:select(4,1)) + print(tmp:select(4,2)) + print('output2: ') + print(output2) + print(torch.sqrt(torch.pow(output1:view(-1) - 
output2:view(-1),2))) + print(torch.sum(torch.sqrt(torch.pow(output1:view(-1) - output2:view(-1),2)))) +end
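-- The ad-hoc instrumentation above boils down to comparing numeric and analytic
-- Jacobians; a condensed sketch of the same check (uses the module and input
-- defined above, and the nn.Jacobian calls already used by this script):
local jac = nn.Jacobian
local numeric = jac.forward(module, input, input, 1e-3) -- central differences
local analytic = jac.backward(module, input)            -- via updateGradInput
print((numeric - analytic):abs():max())                 -- should stay below ~1e-3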