From 86184157b9e0719dba019c78769b89788e2ce432 Mon Sep 17 00:00:00 2001
From: Shingo Kitagawa <shingogo@hotmail.co.jp>
Date: Sat, 9 Feb 2019 01:23:17 +0900
Subject: [PATCH 01/24] rename psroi_pooling_2d -> ps_roi_average_pooling_2d

---
 .../links/model/fcis/fcis_resnet101.py            |  6 +++---
 chainercv/functions/__init__.py                   |  3 +--
 ...pooling_2d.py => ps_roi_average_pooling_2d.py} | 14 +++++++-------
 ...ng_2d.py => test_ps_roi_average_pooling_2d.py} | 15 ++++++++-------
 4 files changed, 19 insertions(+), 19 deletions(-)
 rename chainercv/functions/{psroi_pooling_2d.py => ps_roi_average_pooling_2d.py} (97%)
 rename tests/functions_tests/{test_psroi_pooling_2d.py => test_ps_roi_average_pooling_2d.py} (89%)

diff --git a/chainercv/experimental/links/model/fcis/fcis_resnet101.py b/chainercv/experimental/links/model/fcis/fcis_resnet101.py
index c706fe68c6..70e5419d13 100644
--- a/chainercv/experimental/links/model/fcis/fcis_resnet101.py
+++ b/chainercv/experimental/links/model/fcis/fcis_resnet101.py
@@ -6,7 +6,7 @@
 import numpy as np
 
 from chainercv.experimental.links.model.fcis import FCIS
-from chainercv.functions import psroi_pooling_2d
+from chainercv.functions import ps_roi_average_pooling_2d
 from chainercv.links import Conv2DBNActiv
 from chainercv.links.model.faster_rcnn.region_proposal_network import \
     RegionProposalNetwork
@@ -365,7 +365,7 @@ def _pool(
             self, h_cls_seg, h_ag_loc, rois, roi_indices, gt_roi_labels):
         # PSROI Pooling
         # shape: (n_roi, n_class, 2, roi_size, roi_size)
-        roi_cls_ag_seg_scores = psroi_pooling_2d(
+        roi_cls_ag_seg_scores = ps_roi_average_pooling_2d(
             h_cls_seg, rois, roi_indices,
             self.n_class * 2, self.roi_size, self.roi_size,
             self.spatial_scale, self.group_size)
@@ -374,7 +374,7 @@ def _pool(
             (-1, self.n_class, 2, self.roi_size, self.roi_size))
 
         # shape: (n_roi, 2*4, roi_size, roi_size)
-        roi_ag_loc_scores = psroi_pooling_2d(
+        roi_ag_loc_scores = ps_roi_average_pooling_2d(
             h_ag_loc, rois, roi_indices,
             2 * 4, self.roi_size, self.roi_size,
             self.spatial_scale, self.group_size)
diff --git a/chainercv/functions/__init__.py b/chainercv/functions/__init__.py
index 5e5e7dde05..71fa9b6802 100644
--- a/chainercv/functions/__init__.py
+++ b/chainercv/functions/__init__.py
@@ -1,2 +1 @@
-from chainercv.functions.psroi_pooling_2d import psroi_pooling_2d  # NOQA
-from chainercv.functions.psroi_pooling_2d import PSROIPooling2D  # NOQA
+from chainercv.functions.ps_roi_average_pooling_2d import ps_roi_average_pooling_2d  # NOQA
diff --git a/chainercv/functions/psroi_pooling_2d.py b/chainercv/functions/ps_roi_average_pooling_2d.py
similarity index 97%
rename from chainercv/functions/psroi_pooling_2d.py
rename to chainercv/functions/ps_roi_average_pooling_2d.py
index 4944d9433d..08b2518927 100644
--- a/chainercv/functions/psroi_pooling_2d.py
+++ b/chainercv/functions/ps_roi_average_pooling_2d.py
@@ -47,7 +47,7 @@ def _roi_pooling_slice(size, stride, max_size, roi_offset):
     return slice(start, end), end - start
 
 
-class PSROIPooling2D(function.Function):
+class PSROIAveragePooling2D(function.Function):
 
     def __init__(self, out_c, out_h, out_w, spatial_scale, group_size):
         self.out_c, self.out_h, self.out_w = out_c, out_h, out_w
@@ -197,7 +197,7 @@ def forward_gpu(self, inputs):
 
             float bin_area = (hend - hstart) * (wend - wstart);
             top_data = is_empty? (float) 0. : out_sum / bin_area;
-            ''', 'psroi_pooling_2d_fwd'
+            ''', 'ps_roi_average_pooling_2d_fwd'
         )(bottom_data, bottom_rois, bottom_roi_indices,
           self.spatial_scale, channels, height, width,
           self.out_c, self.out_h, self.out_w, self.group_size,
@@ -327,7 +327,7 @@ def backward_gpu(self, inputs, gy):
                     &bottom_diff[bottom_diff_offset + bottom_index], diff_val);
               }
             }
-            ''', 'psroi_pooling_2d_bwd'
+            ''', 'ps_roi_average_pooling_2d_bwd'
         )(gy[0], bottom_rois, bottom_roi_indices,
           self.spatial_scale, channels, height, width,
           self.out_c, self.out_h, self.out_w,
@@ -336,11 +336,11 @@ def backward_gpu(self, inputs, gy):
         return bottom_diff, None, None
 
 
-def psroi_pooling_2d(
+def ps_roi_average_pooling_2d(
         x, rois, roi_indices, out_c, out_h, out_w,
         spatial_scale, group_size
 ):
-    """Position Sensitive Region of Interest (ROI) pooling function.
+    """Position Sensitive Region of Interest (ROI) Average pooling function.
 
     This function computes position sensitive average of input spatial patch
     with the given region of interests. Each ROI is splitted into
@@ -368,5 +368,5 @@ def psroi_pooling_2d(
     `R-FCN <https://arxiv.org/abs/1605.06409>`_.
 
     """
-    return PSROIPooling2D(out_c, out_h, out_w, spatial_scale,
-                          group_size)(x, rois, roi_indices)
+    return PSROIAveragePooling2D(out_c, out_h, out_w, spatial_scale,
+                                 group_size)(x, rois, roi_indices)
diff --git a/tests/functions_tests/test_psroi_pooling_2d.py b/tests/functions_tests/test_ps_roi_average_pooling_2d.py
similarity index 89%
rename from tests/functions_tests/test_psroi_pooling_2d.py
rename to tests/functions_tests/test_ps_roi_average_pooling_2d.py
index 59c66562d5..2809b9bf9d 100644
--- a/tests/functions_tests/test_psroi_pooling_2d.py
+++ b/tests/functions_tests/test_ps_roi_average_pooling_2d.py
@@ -10,7 +10,7 @@
 from chainercv import functions
 
 
-class TestPSROIPolling2D(unittest.TestCase):
+class TestPSROIAveragePolling2D(unittest.TestCase):
 
     def setUp(self):
         self.N = 3
@@ -43,7 +43,7 @@ def check_forward(self, x_data, roi_data, roi_index_data):
         x = chainer.Variable(x_data)
         rois = chainer.Variable(roi_data)
         roi_indices = chainer.Variable(roi_index_data)
-        y = functions.psroi_pooling_2d(
+        y = functions.ps_roi_average_pooling_2d(
             x, rois, roi_indices, self.out_c, self.out_h, self.out_w,
             self.spatial_scale, self.group_size)
         self.assertEqual(y.data.dtype, np.float32)
@@ -63,11 +63,12 @@ def test_forward_gpu(self):
             cuda.to_gpu(self.roi_indices))
 
     def check_backward(self, x_data, roi_data, roi_index_data, y_grad_data):
+        def f(x, rois, roi_indices):
+            return functions.ps_roi_average_pooling_2d(
+                x, rois, roi_indices, self.out_c, self.out_h, self.out_w,
+                self.spatial_scale, self.group_size)
         gradient_check.check_backward(
-            functions.PSROIPooling2D(
-                self.out_c, self.out_h, self.out_w,
-                self.spatial_scale, self.group_size),
-            (x_data, roi_data, roi_index_data), y_grad_data,
+            f, (x_data, roi_data, roi_index_data), y_grad_data,
             no_grads=[False, True, True], **self.check_backward_options)
 
     @condition.retry(3)
@@ -85,7 +86,7 @@ def apply_backward(self, x_data, roi_data, roi_index_data, y_grad_data):
         x = chainer.Variable(x_data)
         rois = chainer.Variable(roi_data)
         roi_indices = chainer.Variable(roi_index_data)
-        y = functions.psroi_pooling_2d(
+        y = functions.ps_roi_average_pooling_2d(
             x, rois, roi_indices, self.out_c, self.out_h, self.out_w,
             self.spatial_scale, self.group_size)
         x.cleargrad()

From d988ad6c8284307fdc51f6186af6e115d7fcd720 Mon Sep 17 00:00:00 2001
From: Shingo Kitagawa <shingogo@hotmail.co.jp>
Date: Sat, 9 Feb 2019 01:55:48 +0900
Subject: [PATCH 02/24] add ps_roi_average_align_2d

---
 chainercv/functions/__init__.py               |   1 +
 .../functions/ps_roi_average_align_2d.py      | 629 ++++++++++++++++++
 .../test_ps_roi_average_align_2d.py           | 115 ++++
 3 files changed, 745 insertions(+)
 create mode 100644 chainercv/functions/ps_roi_average_align_2d.py
 create mode 100644 tests/functions_tests/test_ps_roi_average_align_2d.py

diff --git a/chainercv/functions/__init__.py b/chainercv/functions/__init__.py
index 71fa9b6802..a78f903a83 100644
--- a/chainercv/functions/__init__.py
+++ b/chainercv/functions/__init__.py
@@ -1 +1,2 @@
+from chainercv.functions.ps_roi_average_align_2d import ps_roi_average_align_2d  # NOQA
 from chainercv.functions.ps_roi_average_pooling_2d import ps_roi_average_pooling_2d  # NOQA
diff --git a/chainercv/functions/ps_roi_average_align_2d.py b/chainercv/functions/ps_roi_average_align_2d.py
new file mode 100644
index 0000000000..d3b29f3ebc
--- /dev/null
+++ b/chainercv/functions/ps_roi_average_align_2d.py
@@ -0,0 +1,629 @@
+# Modified work:
+# -----------------------------------------------------------------------------
+# Copyright (c) 2018 Preferred Infrastructure, Inc.
+# Copyright (c) 2018 Preferred Networks, Inc.
+# -----------------------------------------------------------------------------
+
+# Original work:
+# -----------------------------------------------------------------------------
+# Copyright (c) 2015 by Contributors
+# \file roi_pooling.cu
+# \brief roi pooling operator
+# \author Ross Girshick, Kye-Hyeon Kim, Jian Guo
+# \changed to roi_align by Elaine Bao
+# \file roi_align.cu
+# \roi align operator described in Mask RCNN
+# -----------------------------------------------------------------------------
+
+from __future__ import division
+
+import numpy as np
+import six
+
+import chainer
+from chainer.backends import cuda
+from chainer import function
+from chainer.utils import type_check
+
+
+def _pair(x):
+    """Return ``x`` as is if it is iterable, otherwise ``(x, x)``."""
+    is_iterable = isinstance(x, chainer.utils.collections_abc.Iterable)
+    return x if is_iterable else (x, x)
+
+
+def _get_bilinear_interp_params(y, x, height, width):
+    """Return corner indices and weights for bilinear sampling at (y, x).
+
+    Args:
+        y, x (float): Sampling position on the feature map.
+        height, width (int): Size of the feature map.
+
+    Returns:
+        tuple: ``(y_low, x_low, y_high, x_high, w1, w2, w3, w4)`` where
+        ``w1`` to ``w4`` weight the corners (y_low, x_low), (y_low, x_high),
+        (y_high, x_low) and (y_high, x_high).  All entries are :obj:`None`
+        if ``y`` or ``x`` is below -1 or above the size of the map.
+    """
+    if y < -1 or y > height or x < -1 or x > width:
+        # out of range, so it is empty
+        return (None,) * 8
+
+    def _split(pos, size):
+        # Clamp to the map, then take both neighbours and the fraction.
+        if pos <= 0:
+            pos = 0
+        low = int(pos)
+        if low >= size - 1:
+            high = low = size - 1
+            pos = float(low)
+        else:
+            high = low + 1
+        return low, high, pos - low
+
+    y_low, y_high, ly = _split(y, height)
+    x_low, x_high, lx = _split(x, width)
+    hy = 1. - ly
+    hx = 1. - lx
+    return y_low, x_low, y_high, x_high, hy * hx, hy * lx, ly * hx, ly * lx
+
+
+# CUDA device function with the same logic as _get_bilinear_interp_params.
+# It takes the position as (x, y) and returns false for out-of-range points.
+_GET_BILINEAR_INTERP_KERNEL = '''
+__device__
+bool get_bilinear_interp_params(
+    T x, T y, const int height, const int width,
+    int &y_low, int &x_low, int &y_high, int &x_high,
+    T &w1, T &w2, T &w3, T &w4) {
+    // deal with cases that inverse elements are
+    // out of feature map boundary
+    if (y < -1. || y > height || x < -1. || x > width) {
+        // empty
+        return false;
+    }
+
+    if (y <= 0) {
+        y = 0;
+    }
+    if (x <= 0) {
+        x = 0;
+    }
+
+    y_low = (int)y;
+    x_low = (int)x;
+
+    if (y_low >= height - 1) {
+        y_high = y_low = height - 1;
+        y = (T)y_low;
+    } else {
+        y_high = y_low + 1;
+    }
+
+    if (x_low >= width - 1) {
+        x_high = x_low = width - 1;
+        x = (T)x_low;
+    } else {
+        x_high = x_low + 1;
+    }
+
+    T ly = y - y_low;
+    T lx = x - x_low;
+    T hy = 1. - ly;
+    T hx = 1. - lx;
+
+    w1 = hy * hx;
+    w2 = hy * lx;
+    w3 = ly * hx;
+    w4 = ly * lx;
+
+    return true;
+}
+'''
+
+
+class PSROIAverageAlign2D(function.Function):
+
+    def __init__(
+            self, out_c, out_h, out_w, spatial_scale,
+            group_size, sampling_ratio=None
+    ):
+        """Validate and store the configuration of the function.
+
+        Raises:
+            TypeError: If ``out_c``, ``out_h``, ``out_w`` or ``group_size``
+                is not a positive int, if ``spatial_scale`` is not a positive
+                int or float, or if ``sampling_ratio`` is not valid.
+        """
+        for name, value in (('out_c', out_c), ('out_h', out_h),
+                            ('out_w', out_w), ('group_size', group_size)):
+            if not (isinstance(value, int) and value > 0):
+                raise TypeError(
+                    '{} must be positive integer: {}, {}'
+                    .format(name, type(value), value))
+        if isinstance(spatial_scale, int):
+            spatial_scale = float(spatial_scale)
+        if not (isinstance(spatial_scale, float) and spatial_scale > 0):
+            raise TypeError(
+                'spatial_scale must be a positive float number: {}, {}'
+                .format(type(spatial_scale), spatial_scale))
+        # Freeze into a tuple so that a one-shot iterator is not exhausted by
+        # the check below and an input that is not a pair is rejected.
+        sampling_ratio = tuple(_pair(sampling_ratio))
+        if len(sampling_ratio) != 2 or not all(
+                (isinstance(s, int) and s >= 1) or s is None
+                for s in sampling_ratio):
+            raise TypeError(
+                'sampling_ratio must be integer >= 1 or a pair of it: {}'
+                .format(sampling_ratio))
+
+        self.out_c, self.out_h, self.out_w = out_c, out_h, out_w
+        self.spatial_scale = spatial_scale
+        self.group_size = group_size
+        self.sampling_ratio = sampling_ratio
+
+    def check_type_forward(self, in_types):
+        """Check the number, dtypes and ranks of the three inputs.
+
+        ``x`` must be a 4-d float32 array, ``rois`` a float32 array with
+        four columns and ``roi_indices`` a 1-d int32 array with one entry
+        per row of ``rois``.
+        """
+        type_check.expect(in_types.size() == 3)
+        x, rois, indices = in_types
+        type_check.expect(
+            x.dtype == np.float32, x.ndim == 4,
+            rois.dtype == np.float32, rois.ndim == 2, rois.shape[1] == 4,
+            indices.dtype == np.int32, indices.ndim == 1,
+            rois.shape[0] == indices.shape[0])
+
+    def forward_cpu(self, inputs):
+        """Compute the pooled output with NumPy, one output cell at a time.
+
+        Returns a 1-tuple holding a float32 array of shape
+        ``(n_roi, out_c, out_h, out_w)``.
+        """
+        self.retain_inputs((1, 2))
+        self._bottom_data_shape = inputs[0].shape
+
+        bottom_data, bottom_rois, bottom_roi_indices = inputs
+        height, width = bottom_data.shape[2:]
+        n_roi = bottom_rois.shape[0]
+        top_data = np.empty(
+            (n_roi, self.out_c, self.out_h, self.out_w), dtype=np.float32)
+
+        group_size = self.group_size
+        pooled_dim, pooled_width, pooled_height \
+            = self.out_c, self.out_w, self.out_h
+        spatial_scale = self.spatial_scale
+
+        for i in six.moves.range(top_data.size):
+            # Unravel the flat index into (n, ctop, ph, pw).
+            rest, pw = divmod(i, pooled_width)
+            rest, ph = divmod(rest, pooled_height)
+            n, ctop = divmod(rest, pooled_dim)
+
+            roi_batch_ind = int(bottom_roi_indices[n])
+            roi_start_h = bottom_rois[n, 0] * spatial_scale
+            roi_start_w = bottom_rois[n, 1] * spatial_scale
+            roi_end_h = bottom_rois[n, 2] * spatial_scale
+            roi_end_w = bottom_rois[n, 3] * spatial_scale
+
+            roi_height = max(roi_end_h - roi_start_h, 1.)
+            roi_width = max(roi_end_w - roi_start_w, 1.)
+            bin_size_h = 1. * roi_height / pooled_height
+            bin_size_w = 1. * roi_width / pooled_width
+
+            gh = np.floor(float(ph) * group_size / pooled_height)
+            gw = np.floor(float(pw) * group_size / pooled_width)
+            gh = int(min(max(gh, 0), group_size - 1))
+            gw = int(min(max(gw, 0), group_size - 1))
+            c = (ctop * group_size + gh) * group_size + gw
+
+            if self.sampling_ratio[0] is None:
+                roi_bin_grid_h = np.ceil(roi_height / pooled_height)
+            else:
+                roi_bin_grid_h = self.sampling_ratio[0]
+            if self.sampling_ratio[1] is None:
+                roi_bin_grid_w = np.ceil(roi_width / pooled_width)
+            else:
+                roi_bin_grid_w = self.sampling_ratio[1]
+
+            count = roi_bin_grid_h * roi_bin_grid_w
+            output_val = 0.
+            # ``for`` loops (not ``while``) so that ``continue`` below still
+            # advances to the next sample instead of spinning forever.
+            for iy in six.moves.range(int(roi_bin_grid_h)):
+                y = roi_start_h + ph * bin_size_h + \
+                    (iy + .5) * bin_size_h / roi_bin_grid_h
+                for ix in six.moves.range(int(roi_bin_grid_w)):
+                    x = roi_start_w + pw * bin_size_w + \
+                        (ix + .5) * bin_size_w / roi_bin_grid_w
+
+                    # bilinear interpolation {{
+                    y_low, x_low, y_high, x_high, w1, w2, w3, w4 = \
+                        _get_bilinear_interp_params(y, x, height, width)
+                    if y_low is None:
+                        continue
+
+                    v1 = bottom_data[roi_batch_ind, c, y_low, x_low]
+                    v2 = bottom_data[roi_batch_ind, c, y_low, x_high]
+                    v3 = bottom_data[roi_batch_ind, c, y_high, x_low]
+                    v4 = bottom_data[roi_batch_ind, c, y_high, x_high]
+
+                    output_val += w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4
+                    # }}
+
+            output_val /= count
+            top_data[n, ctop, ph, pw] = output_val
+
+        return top_data,
+
+    def forward_gpu(self, inputs):
+        """Compute the pooled output with a CuPy elementwise kernel."""
+        # NOTE(review): the kernel rounds the ROI coordinates and clamps the
+        # ROI size at 0.1, while forward_cpu does not round and clamps at 1.
+        # Results may differ between devices -- confirm which is intended.
+        self.retain_inputs((1, 2))
+        self._bottom_data_shape = inputs[0].shape
+
+        bottom_data, bottom_rois, bottom_roi_indices = inputs
+        channels, height, width = bottom_data.shape[1:]
+        top_data = cuda.cupy.empty(
+            (bottom_rois.shape[0], self.out_c, self.out_h, self.out_w),
+            dtype=np.float32)
+        # A ratio of 0 makes the kernel derive the grid size from the ROI.
+        ratio_h, ratio_w = self.sampling_ratio[0], self.sampling_ratio[1]
+        sampling_ratio_h = 0 if ratio_h is None else ratio_h
+        sampling_ratio_w = 0 if ratio_w is None else ratio_w
+        cuda.elementwise(
+            '''
+            raw T bottom_data, T spatial_scale, int32 channels,
+            int32 height, int32 width,
+            int32 pooled_dim, int32 pooled_height, int32 pooled_width,
+            int32 group_size, int32 sampling_ratio_h, int32 sampling_ratio_w,
+            raw T bottom_rois, raw int32 bottom_roi_indices
+
+            ''',
+            'T top_data',
+            '''
+            // pos in output filter
+            int ph = (i / pooled_width) % pooled_height;
+            int pw = i % pooled_width;
+            int ctop = (i / pooled_width / pooled_height) % pooled_dim;
+            int n = i / pooled_width / pooled_height / pooled_dim;
+
+            int roi_batch_ind = bottom_roi_indices[n];
+            T roi_start_h = static_cast<T>(
+                round(bottom_rois[n * 4 + 0])) * spatial_scale;
+            T roi_start_w = static_cast<T>(
+                round(bottom_rois[n * 4 + 1])) * spatial_scale;
+            T roi_end_h = static_cast<T>(
+                round(bottom_rois[n * 4 + 2])) * spatial_scale;
+            T roi_end_w = static_cast<T>(
+                round(bottom_rois[n * 4 + 3])) * spatial_scale;
+
+            // Force too small ROIs to be 1x1
+            T roi_height = max(roi_end_h - roi_start_h, 0.1);
+            T roi_width = max(roi_end_w - roi_start_w, 0.1);  // avoid 0
+
+            // Compute w and h at bottom
+            T bin_size_h = roi_height / static_cast<T>(pooled_height);
+            T bin_size_w = roi_width / static_cast<T>(pooled_width);
+
+            // Compute c at bottom
+            int gh = floor(
+                static_cast<T>(ph) * group_size / pooled_height);
+            int gw = floor(
+                static_cast<T>(pw) * group_size / pooled_width);
+            gh = min(max(gh, 0), group_size - 1);
+            gw = min(max(gw, 0), group_size - 1);
+            int c = (ctop * group_size + gh) * group_size + gw;
+
+            int bottom_data_offset =
+                (roi_batch_ind * channels + c) * height * width;
+
+            // We use roi_bin_grid to sample the grid and mimic integral
+            int roi_bin_grid_h = (sampling_ratio_h > 0)
+                ? sampling_ratio_h
+                : ceil(roi_height / pooled_height);  // e.g. = 2
+            int roi_bin_grid_w = (sampling_ratio_w > 0)
+                ? sampling_ratio_w
+                : ceil(roi_width / pooled_width);
+
+            T count = roi_bin_grid_h * roi_bin_grid_w;  // e.g. = 4
+
+            T output_val = 0.;
+            for (int iy = 0; iy < roi_bin_grid_h; iy++)  // e.g. iy = 0, 1
+            {
+                T y = roi_start_h + ph * bin_size_h +
+                    static_cast<T>(iy + .5f) * bin_size_h /
+                        static_cast<T>(roi_bin_grid_h);  // e.g. 0.5, 1.5
+                for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+                    T x = roi_start_w + pw * bin_size_w +
+                        static_cast<T>(ix + .5f) * bin_size_w /
+                            static_cast<T>(roi_bin_grid_w);
+
+                    // bilinear_interpolation {{
+                    int y_low, x_low, y_high, x_high;
+                    T w1, w2, w3, w4;
+                    bool ret = get_bilinear_interp_params(
+                        x, y, height, width,
+                        y_low, x_low, y_high, x_high,
+                        w1, w2, w3, w4
+                    );
+                    if (!ret) {
+                        continue;
+                    }
+
+                    T v1 = bottom_data[bottom_data_offset +
+                                           y_low * width + x_low];
+                    T v2 = bottom_data[bottom_data_offset +
+                                           y_low * width + x_high];
+                    T v3 = bottom_data[bottom_data_offset +
+                                           y_high * width + x_low];
+                    T v4 = bottom_data[bottom_data_offset +
+                                           y_high * width + x_high];
+                    // }}
+
+                    output_val += (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+                }
+            }
+            output_val /= count;
+
+            top_data = output_val;
+            ''',
+            'ps_roi_average_align_2d_fwd',
+            preamble=_GET_BILINEAR_INTERP_KERNEL,
+        )(bottom_data, self.spatial_scale, channels, height, width,
+          self.out_c, self.out_h, self.out_w, self.group_size,
+          sampling_ratio_h, sampling_ratio_w,
+          bottom_rois, bottom_roi_indices, top_data)
+
+        return top_data,
+
+    def backward_cpu(self, inputs, gy):
+        """Scatter ``gy[0]`` back to the input with NumPy.
+
+        Returns ``(gx, None, None)``; only the first input gets a gradient.
+        """
+        _, bottom_rois, bottom_roi_indices = inputs
+        height, width = self._bottom_data_shape[2:]
+        bottom_diff = np.zeros(self._bottom_data_shape, np.float32)
+
+        spatial_scale = self.spatial_scale
+        pooled_dim = self.out_c
+        pooled_height = self.out_h
+        pooled_width = self.out_w
+        group_size = self.group_size
+        top_diff = gy[0]
+
+        for i in six.moves.range(top_diff.size):
+            rest, pw = divmod(i, pooled_width)
+            rest, ph = divmod(rest, pooled_height)
+            n, ctop = divmod(rest, pooled_dim)
+
+            roi_batch_ind = int(bottom_roi_indices[n])
+            roi_start_h = bottom_rois[n, 0] * spatial_scale
+            roi_start_w = bottom_rois[n, 1] * spatial_scale
+            roi_end_h = bottom_rois[n, 2] * spatial_scale
+            roi_end_w = bottom_rois[n, 3] * spatial_scale
+
+            roi_width = max(roi_end_w - roi_start_w, 1.)
+            roi_height = max(roi_end_h - roi_start_h, 1.)
+            bin_size_h = 1. * roi_height / pooled_height
+            bin_size_w = 1. * roi_width / pooled_width
+
+            gh = np.floor(float(ph) * group_size / pooled_height)
+            gw = np.floor(float(pw) * group_size / pooled_width)
+            gh = int(min(max(gh, 0), group_size - 1))
+            gw = int(min(max(gw, 0), group_size - 1))
+            c = (ctop * group_size + gh) * group_size + gw
+
+            top_diff_this_bin = top_diff[n, ctop, ph, pw]
+
+            if self.sampling_ratio[0] is None:
+                roi_bin_grid_h = np.ceil(roi_height / pooled_height)
+            else:
+                roi_bin_grid_h = self.sampling_ratio[0]
+            if self.sampling_ratio[1] is None:
+                roi_bin_grid_w = np.ceil(roi_width / pooled_width)
+            else:
+                roi_bin_grid_w = self.sampling_ratio[1]
+
+            count = roi_bin_grid_h * roi_bin_grid_w
+            # ``for`` loops (not ``while``) so that ``continue`` below still
+            # advances to the next sample instead of spinning forever.
+            for iy in six.moves.range(int(roi_bin_grid_h)):
+                y = roi_start_h + ph * bin_size_h + \
+                    (iy + .5) * bin_size_h / roi_bin_grid_h
+                for ix in six.moves.range(int(roi_bin_grid_w)):
+                    x = roi_start_w + pw * bin_size_w + \
+                        (ix + .5) * bin_size_w / roi_bin_grid_w
+
+                    # bilinear_interpolation_gradient {{
+                    y_low, x_low, y_high, x_high, w1, w2, w3, w4 = \
+                        _get_bilinear_interp_params(y, x, height, width)
+                    if y_low is None:
+                        continue
+
+                    g1 = top_diff_this_bin * w1 / count
+                    g2 = top_diff_this_bin * w2 / count
+                    g3 = top_diff_this_bin * w3 / count
+                    g4 = top_diff_this_bin * w4 / count
+
+                    if (x_low >= 0 and x_high >= 0 and
+                            y_low >= 0 and y_high >= 0):
+                        bottom_diff[roi_batch_ind, c, y_low, x_low] += g1
+                        bottom_diff[roi_batch_ind, c, y_low, x_high] += g2
+                        bottom_diff[roi_batch_ind, c, y_high, x_low] += g3
+                        bottom_diff[roi_batch_ind, c, y_high, x_high] += g4
+
+        return bottom_diff, None, None
+
+    def backward_gpu(self, inputs, gy):
+        """Scatter ``gy[0]`` back to the input with a CuPy kernel."""
+        # NOTE(review): a comment in the kernel says not to round the ROI
+        # coordinates, yet the code wraps them in round() -- confirm which
+        # one is intended.
+        _, bottom_rois, bottom_roi_indices = inputs
+        channels, height, width = self._bottom_data_shape[1:]
+        bottom_diff = cuda.cupy.zeros(self._bottom_data_shape, np.float32)
+
+        # A ratio of 0 makes the kernel derive the grid size from the ROI.
+        ratio_h, ratio_w = self.sampling_ratio[0], self.sampling_ratio[1]
+        sampling_ratio_h = 0 if ratio_h is None else ratio_h
+        sampling_ratio_w = 0 if ratio_w is None else ratio_w
+        cuda.elementwise(
+            '''
+            raw T top_diff, T spatial_scale,
+            int32 channels, int32 height, int32 width,
+            int32 pooled_dim, int32 pooled_height, int32 pooled_width,
+            int32 group_size, int32 sampling_ratio_h, int32 sampling_ratio_w,
+            raw T bottom_rois, raw int32 bottom_roi_indices
+            ''',
+            'raw T bottom_diff',
+            '''
+            // (n, c, h, w) coords in bottom data
+            int pw = i % pooled_width;
+            int ph = (i / pooled_width) % pooled_height;
+            int ctop = (i / pooled_width / pooled_height) % pooled_dim;
+            int n = i / pooled_width / pooled_height / pooled_dim;
+
+            // Do not using rounding; this implementation detail is critical
+            int roi_batch_ind = bottom_roi_indices[n];
+            T roi_start_h = static_cast<T>(
+                round(bottom_rois[n * 4 + 0])) * spatial_scale;
+            T roi_start_w = static_cast<T>(
+                round(bottom_rois[n * 4 + 1])) * spatial_scale;
+            T roi_end_h = static_cast<T>(
+                round(bottom_rois[n * 4 + 2])) * spatial_scale;
+            T roi_end_w = static_cast<T>(
+                round(bottom_rois[n * 4 + 3])) * spatial_scale;
+
+            // Force too small ROIs to be 1x1
+            T roi_height = max(roi_end_h - roi_start_h, 0.1);
+            T roi_width = max(roi_end_w - roi_start_w, 0.1);  // avoid 0
+
+            // Compute w and h at bottom
+            T bin_size_h = roi_height / static_cast<T>(pooled_height);
+            T bin_size_w = roi_width / static_cast<T>(pooled_width);
+
+            // Compute c at bottom
+            int gh = floor(
+                static_cast<T>(ph) * group_size / pooled_height);
+            int gw = floor(
+                static_cast<T>(pw) * group_size / pooled_width);
+            gh = min(max(gh, 0), group_size - 1);
+            gw = min(max(gw, 0), group_size - 1);
+            int c = (ctop * group_size + gh) * group_size + gw;
+
+            int bottom_diff_offset =
+                (roi_batch_ind * channels + c) * height * width;
+
+            int top_offset =
+                (n * pooled_dim + ctop) * pooled_height * pooled_width;
+            T top_diff_this_bin =
+                top_diff[top_offset + ph * pooled_width + pw];
+
+            // We use roi_bin_grid to sample the grid and mimic integral
+            int roi_bin_grid_h = (sampling_ratio_h > 0)
+                ? sampling_ratio_h
+                : ceil(roi_height / pooled_height); // e.g. = 2
+            int roi_bin_grid_w = (sampling_ratio_w > 0)
+                ? sampling_ratio_w
+                : ceil(roi_width / pooled_width);
+
+            // We do average (integral) pooling inside a bin
+            T count = roi_bin_grid_h * roi_bin_grid_w;  // e.g. = 4
+
+            for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+                T y = roi_start_h + ph * bin_size_h +
+                    static_cast<T>(iy + .5f) * bin_size_h /
+                        static_cast<T>(roi_bin_grid_h);  // e.g. 0.5, 1.5
+                for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+                    T x = roi_start_w + pw * bin_size_w +
+                        static_cast<T>(ix + .5f) * bin_size_w /
+                            static_cast<T>(roi_bin_grid_w);
+
+                    // bilinear_interpolation_gradient {{
+                    int y_low, x_low, y_high, x_high;
+                    T w1, w2, w3, w4;
+                    bool ret = get_bilinear_interp_params(
+                        x, y, height, width,
+                        y_low, x_low, y_high, x_high,
+                        w1, w2, w3, w4
+                    );
+                    if (!ret) {
+                        continue;
+                    }
+
+                    T g1 = top_diff_this_bin * w1 / count;
+                    T g2 = top_diff_this_bin * w2 / count;
+                    T g3 = top_diff_this_bin * w3 / count;
+                    T g4 = top_diff_this_bin * w4 / count;
+
+                    if (x_low >= 0 && x_high >= 0 &&
+                            y_low >= 0 && y_high >= 0) {
+                        atomicAdd(&bottom_diff[bottom_diff_offset +
+                                               y_low * width + x_low], g1);
+                        atomicAdd(&bottom_diff[bottom_diff_offset +
+                                               y_low * width + x_high], g2);
+                        atomicAdd(&bottom_diff[bottom_diff_offset +
+                                               y_high * width + x_low], g3);
+                        atomicAdd(&bottom_diff[bottom_diff_offset +
+                                               y_high * width + x_high], g4);
+                    }
+                }
+            }
+            ''', 'ps_roi_average_align_2d_bwd',
+            preamble=_GET_BILINEAR_INTERP_KERNEL,
+        )(gy[0], self.spatial_scale, channels, height, width,
+          self.out_c, self.out_h, self.out_w,
+          self.group_size, sampling_ratio_h, sampling_ratio_w,
+          bottom_rois, bottom_roi_indices, bottom_diff, size=gy[0].size)
+
+        return bottom_diff, None, None
+
+
+def ps_roi_average_align_2d(
+        x, rois, roi_indices, out_c, out_h, out_w,
+        spatial_scale, group_size, sampling_ratio=None
+):
+    r"""Position Sensitive Region of Interest (ROI) Average align function.
+
+    This function computes position sensitive average of input spatial patch
+    with the given region of interests. Each ROI is split into
+    :math:`(group\_size, group\_size)` regions, and position sensitive values
+    in each region are computed.
+
+    Args:
+        x (~chainer.Variable): Input variable. The shape is expected to be
+            4 dimensional: (n: batch, c: channel, h: height, w: width).
+        rois (array): Input roi. The shape is expected to
+            be :math:`(R, 4)`, and each datum is set as below:
+            (y_min, x_min, y_max, x_max). The dtype is :obj:`numpy.float32`.
+        roi_indices (array): Input roi indices. The shape is expected to
+            be :math:`(R, )`. The dtype is :obj:`numpy.int32`.
+        out_c (int): Channels of output image after pooling.
+        out_h (int): Height of output image after pooling.
+        out_w (int): Width of output image after pooling.
+        spatial_scale (float): Scale by which the roi is resized.
+        group_size (int): Position sensitive group size.
+        sampling_ratio ((int, int) or int): Sampling step for the alignment.
+            It must be an integer :math:`\geq 1` or :obj:`None`, and the value
+            is automatically decided when :obj:`None` is passed.  Use of
+            different ratio in height and width axis is also supported by
+            passing tuple of int as ``(sampling_ratio_h, sampling_ratio_w)``.
+            ``sampling_ratio=s`` and ``sampling_ratio=(s, s)`` are equivalent.
+
+    Returns:
+        ~chainer.Variable: Output variable.
+
+    See the original paper proposing PSROIPooling:
+    `R-FCN <https://arxiv.org/abs/1605.06409>`_.
+
+    """
+    return PSROIAverageAlign2D(
+        out_c, out_h, out_w, spatial_scale,
+        group_size, sampling_ratio)(x, rois, roi_indices)
diff --git a/tests/functions_tests/test_ps_roi_average_align_2d.py b/tests/functions_tests/test_ps_roi_average_align_2d.py
new file mode 100644
index 0000000000..16425ad4f1
--- /dev/null
+++ b/tests/functions_tests/test_ps_roi_average_align_2d.py
@@ -0,0 +1,115 @@
+import chainer
+from chainer.backends import cuda
+from chainer import gradient_check
+from chainer import testing
+from chainer.testing import attr
+from chainer.testing import condition
+import numpy as np
+import unittest
+
+from chainercv import functions
+
+
+@testing.parameterize(*testing.product({
+    'sampling_ratio': [None, 1, 2, (None, 3), (1, 2)],
+}))
+class TestPSROIAveragePolling2D(unittest.TestCase):
+
+    def setUp(self):
+        self.N = 3
+        self.group_size = 2
+        self.out_c = 2
+        self.n_channels = self.group_size * self.group_size * self.out_c
+        # Distinct values rescaled into [-1, 1); ``shuffle`` permutes the
+        # batch axis only.
+        x_shape = (self.N, self.n_channels, 10, 12)
+        x = np.arange(np.prod(x_shape), dtype=np.float32).reshape(x_shape)
+        np.random.shuffle(x)
+        self.x = (2 * x / x.size - 1).astype(np.float32)
+        # Each roi is (y_min, x_min, y_max, x_max), with one index into the
+        # batch per roi.
+        self.rois = np.array(
+            [[0, 0, 7, 7],
+             [1, 0, 5, 12],
+             [0, 1, 10, 5],
+             [3, 3, 4, 4]],
+            dtype=np.float32
+        )
+        self.roi_indices = np.array([0, 2, 1, 0], dtype=np.int32)
+        self.n_roi = self.rois.shape[0]
+        self.out_h, self.out_w = 4, 4
+        self.spatial_scale = 1.0
+        # Upstream gradient used by the backward tests.
+        gy_shape = (self.n_roi, self.out_c, self.out_h, self.out_w)
+        self.gy = np.random.uniform(-1, 1, gy_shape).astype(np.float32)
+        self.check_backward_options = {'atol': 5e-4, 'rtol': 5e-3}
+
+    def _apply(self, x, rois, roi_indices):
+        # Single call site so that every check runs the function with the
+        # same pooling configuration.
+        return functions.ps_roi_average_align_2d(
+            x, rois, roi_indices, self.out_c, self.out_h, self.out_w,
+            self.spatial_scale, self.group_size,
+            sampling_ratio=self.sampling_ratio)
+
+    def check_forward(self, x_data, roi_data, roi_index_data):
+        # Only the dtype and the shape of the output are verified, not its
+        # values.
+        y = self._apply(
+            chainer.Variable(x_data), chainer.Variable(roi_data),
+            chainer.Variable(roi_index_data))
+        self.assertEqual(y.data.dtype, np.float32)
+        expected_shape = (self.n_roi, self.out_c, self.out_h, self.out_w)
+        self.assertEqual(expected_shape, cuda.to_cpu(y.data).shape)
+
+    @condition.retry(3)
+    def test_forward_cpu(self):
+        self.check_forward(self.x, self.rois, self.roi_indices)
+
+    @attr.gpu
+    @condition.retry(3)
+    def test_forward_gpu(self):
+        cpu_inputs = (self.x, self.rois, self.roi_indices)
+        self.check_forward(*[cuda.to_gpu(a) for a in cpu_inputs])
+
+    def check_backward(self, x_data, roi_data, roi_index_data, y_grad_data):
+        # Gradients are checked for x only; rois and roi indices are marked
+        # as having no gradient through ``no_grads``.
+        gradient_check.check_backward(
+            self._apply, (x_data, roi_data, roi_index_data), y_grad_data,
+            no_grads=[False, True, True], **self.check_backward_options)
+
+    @condition.retry(3)
+    def test_backward_cpu(self):
+        self.check_backward(self.x, self.rois, self.roi_indices, self.gy)
+
+    @attr.gpu
+    @condition.retry(3)
+    def test_backward_gpu(self):
+        cpu_inputs = (self.x, self.rois, self.roi_indices, self.gy)
+        self.check_backward(*[cuda.to_gpu(a) for a in cpu_inputs])
+
+    def apply_backward(self, x_data, roi_data, roi_index_data, y_grad_data):
+        # Runs forward and backward once and returns the input variable
+        # (holding its gradient) together with the output variable.
+        x = chainer.Variable(x_data)
+        y = self._apply(
+            x, chainer.Variable(roi_data), chainer.Variable(roi_index_data))
+        x.cleargrad()
+        y.grad = y_grad_data
+        y.backward()
+        return x, y
+
+    @attr.gpu
+    @condition.retry(3)
+    def test_consistency_with_gpu(self):
+        # CPU and GPU paths must agree on outputs and on input gradients.
+        cpu_args = (self.x, self.rois, self.roi_indices, self.gy)
+        gpu_args = tuple(cuda.to_gpu(a) for a in cpu_args)
+        x_cpu, y_cpu = self.apply_backward(*cpu_args)
+        x_gpu, y_gpu = self.apply_backward(*gpu_args)
+        testing.assert_allclose(y_cpu.data, y_gpu.data)
+        testing.assert_allclose(x_cpu.grad, x_gpu.grad)
+
+
+testing.run_module(__name__, __file__)

From 4f21814367e95f71af9f528039071d668a481986 Mon Sep 17 00:00:00 2001
From: Shingo Kitagawa <shingogo@hotmail.co.jp>
Date: Sat, 9 Feb 2019 02:00:47 +0900
Subject: [PATCH 03/24] check input types in ps_roi_average_pooling_2d

---
 .../functions/ps_roi_average_pooling_2d.py     | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/chainercv/functions/ps_roi_average_pooling_2d.py b/chainercv/functions/ps_roi_average_pooling_2d.py
index 08b2518927..6aace954cb 100644
--- a/chainercv/functions/ps_roi_average_pooling_2d.py
+++ b/chainercv/functions/ps_roi_average_pooling_2d.py
@@ -50,6 +50,24 @@ def _roi_pooling_slice(size, stride, max_size, roi_offset):
 class PSROIAveragePooling2D(function.Function):
 
     def __init__(self, out_c, out_h, out_w, spatial_scale, group_size):
+        if not (isinstance(out_c, int) and out_c > 0):
+            raise TypeError(
+                'out_c must be positive integer: {}, {}'
+                .format(type(out_c), out_c))
+        if not (isinstance(out_h, int) and out_h > 0):
+            raise TypeError(
+                'out_h must be positive integer: {}, {}'
+                .format(type(out_h), out_h))
+        if not (isinstance(out_w, int) and out_w > 0):
+            raise TypeError(
+                'out_w must be positive integer: {}, {}'
+                .format(type(out_w), out_w))
+        if isinstance(spatial_scale, int):
+            spatial_scale = float(spatial_scale)
+        if not (isinstance(group_size, int) and group_size > 0):
+            raise TypeError(
+                'group_size must be positive integer: {}, {}'
+                .format(type(group_size), group_size))
         self.out_c, self.out_h, self.out_w = out_c, out_h, out_w
         self.spatial_scale = spatial_scale
         self.group_size = group_size

From 3bd04523d54dd8a81b5c22c3176953bd7b6fde12 Mon Sep 17 00:00:00 2001
From: Shingo Kitagawa <shingogo@hotmail.co.jp>
Date: Sat, 9 Feb 2019 02:18:31 +0900
Subject: [PATCH 04/24] refactor ps_roi_average_align_2d.py

---
 .../functions/ps_roi_average_align_2d.py      | 34 +++++++++----------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/chainercv/functions/ps_roi_average_align_2d.py b/chainercv/functions/ps_roi_average_align_2d.py
index d3b29f3ebc..38e92def19 100644
--- a/chainercv/functions/ps_roi_average_align_2d.py
+++ b/chainercv/functions/ps_roi_average_align_2d.py
@@ -1,7 +1,7 @@
 # Modified work:
 # -----------------------------------------------------------------------------
-# Copyright (c) 2018 Preferred Infrastructure, Inc.
-# Copyright (c) 2018 Preferred Networks, Inc.
+# Copyright (c) 2019 Preferred Infrastructure, Inc.
+# Copyright (c) 2019 Preferred Networks, Inc.
 # -----------------------------------------------------------------------------
 
 # Original work:
@@ -268,6 +268,7 @@ def forward_gpu(self, inputs):
         n_roi = bottom_rois.shape[0]
         top_data = cuda.cupy.empty(
             (n_roi, self.out_c, self.out_h, self.out_w), dtype=np.float32)
+
         if self.sampling_ratio[0] is None:
             sampling_ratio_h = 0
         else:
@@ -278,12 +279,10 @@ def forward_gpu(self, inputs):
             sampling_ratio_w = self.sampling_ratio[1]
         cuda.elementwise(
             '''
-            raw T bottom_data, T spatial_scale, int32 channels,
-            int32 height, int32 width,
+            raw T bottom_data, raw T bottom_rois, raw int32 bottom_roi_indices,
+            T spatial_scale, int32 channels, int32 height, int32 width,
             int32 pooled_dim, int32 pooled_height, int32 pooled_width,
-            int32 group_size, int32 sampling_ratio_h, int32 sampling_ratio_w,
-            raw T bottom_rois, raw int32 bottom_roi_indices
-
+            int32 group_size, int32 sampling_ratio_h, int32 sampling_ratio_w
             ''',
             'T top_data',
             '''
@@ -375,10 +374,10 @@ def forward_gpu(self, inputs):
             ''',
             'ps_roi_average_align_2d_fwd',
             preamble=_GET_BILINEAR_INTERP_KERNEL,
-        )(bottom_data, self.spatial_scale, channels, height, width,
+        )(bottom_data, bottom_rois, bottom_roi_indices,
+          self.spatial_scale, channels, height, width,
           self.out_c, self.out_h, self.out_w, self.group_size,
-          sampling_ratio_h, sampling_ratio_w,
-          bottom_rois, bottom_roi_indices, top_data)
+          sampling_ratio_h, sampling_ratio_w, top_data)
 
         return top_data,
 
@@ -476,11 +475,10 @@ def backward_gpu(self, inputs, gy):
             sampling_ratio_w = self.sampling_ratio[1]
         cuda.elementwise(
             '''
-            raw T top_diff, T spatial_scale,
-            int32 channels, int32 height, int32 width,
+            raw T top_diff, raw T bottom_rois, raw int32 bottom_roi_indices,
+            T spatial_scale, int32 channels, int32 height, int32 width,
             int32 pooled_dim, int32 pooled_height, int32 pooled_width,
-            int32 group_size, int32 sampling_ratio_h, int32 sampling_ratio_w,
-            raw T bottom_rois, raw int32 bottom_roi_indices
+            int32 group_size, int32 sampling_ratio_h, int32 sampling_ratio_w
             ''',
             'raw T bottom_diff',
             '''
@@ -546,7 +544,6 @@ def backward_gpu(self, inputs, gy):
                         static_cast<T>(ix + .5f) * bin_size_w /
                             static_cast<T>(roi_bin_grid_w);
 
-                    // bilinear_interpolation_gradient {{
                     int y_low, x_low, y_high, x_high;
                     T w1, w2, w3, w4;
                     bool ret = get_bilinear_interp_params(
@@ -578,10 +575,11 @@ def backward_gpu(self, inputs, gy):
             }
             ''', 'ps_roi_average_align_2d_bwd',
             preamble=_GET_BILINEAR_INTERP_KERNEL,
-        )(gy[0], self.spatial_scale, channels, height, width,
+        )(gy[0], bottom_rois, bottom_roi_indices,
+          self.spatial_scale, channels, height, width,
           self.out_c, self.out_h, self.out_w,
           self.group_size, sampling_ratio_h, sampling_ratio_w,
-          bottom_rois, bottom_roi_indices, bottom_diff, size=gy[0].size)
+          bottom_diff, size=gy[0].size)
 
         return bottom_diff, None, None
 
@@ -622,6 +620,8 @@ def ps_roi_average_align_2d(
 
     See the original paper proposing PSROIPooling:
     `R-FCN <https://arxiv.org/abs/1605.06409>`_.
+    See the original paper proposing ROIAlign:
+    `Mask R-CNN <https://arxiv.org/abs/1703.06870>`_.
 
     """
     return PSROIAverageAlign2D(

From d8e0ad672d8e4623342299ec219c082d36201726 Mon Sep 17 00:00:00 2001
From: Shingo Kitagawa <shingogo@hotmail.co.jp>
Date: Sat, 9 Feb 2019 02:21:39 +0900
Subject: [PATCH 05/24] fix typo in functions_tests

---
 tests/functions_tests/test_ps_roi_average_align_2d.py   | 2 +-
 tests/functions_tests/test_ps_roi_average_pooling_2d.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/functions_tests/test_ps_roi_average_align_2d.py b/tests/functions_tests/test_ps_roi_average_align_2d.py
index 16425ad4f1..64cb0d0435 100644
--- a/tests/functions_tests/test_ps_roi_average_align_2d.py
+++ b/tests/functions_tests/test_ps_roi_average_align_2d.py
@@ -13,7 +13,7 @@
 @testing.parameterize(*testing.product({
     'sampling_ratio': [None, 1, 2, (None, 3), (1, 2)],
 }))
-class TestPSROIAveragePolling2D(unittest.TestCase):
+class TestPSROIAverageAlign2D(unittest.TestCase):
 
     def setUp(self):
         self.N = 3
diff --git a/tests/functions_tests/test_ps_roi_average_pooling_2d.py b/tests/functions_tests/test_ps_roi_average_pooling_2d.py
index 2809b9bf9d..1b1d74ced8 100644
--- a/tests/functions_tests/test_ps_roi_average_pooling_2d.py
+++ b/tests/functions_tests/test_ps_roi_average_pooling_2d.py
@@ -10,7 +10,7 @@
 from chainercv import functions
 
 
-class TestPSROIAveragePolling2D(unittest.TestCase):
+class TestPSROIAveragePooling2D(unittest.TestCase):
 
     def setUp(self):
         self.N = 3

From 6301201688c0cccacd3e7462565261fb577976a3 Mon Sep 17 00:00:00 2001
From: Shingo Kitagawa <shingogo@hotmail.co.jp>
Date: Sat, 9 Feb 2019 02:21:54 +0900
Subject: [PATCH 06/24] add ps_roi_max_align_2d.py

---
 chainercv/functions/__init__.py               |   1 +
 chainercv/functions/ps_roi_max_align_2d.py    | 549 ++++++++++++++++++
 .../functions_tests/test_ps_roi_max_align.py  | 115 ++++
 3 files changed, 665 insertions(+)
 create mode 100644 chainercv/functions/ps_roi_max_align_2d.py
 create mode 100644 tests/functions_tests/test_ps_roi_max_align.py

diff --git a/chainercv/functions/__init__.py b/chainercv/functions/__init__.py
index a78f903a83..cf68d729e6 100644
--- a/chainercv/functions/__init__.py
+++ b/chainercv/functions/__init__.py
@@ -1,2 +1,3 @@
 from chainercv.functions.ps_roi_average_align_2d import ps_roi_average_align_2d  # NOQA
 from chainercv.functions.ps_roi_average_pooling_2d import ps_roi_average_pooling_2d  # NOQA
+from chainercv.functions.ps_roi_max_align_2d import ps_roi_max_align_2d  # NOQA
diff --git a/chainercv/functions/ps_roi_max_align_2d.py b/chainercv/functions/ps_roi_max_align_2d.py
new file mode 100644
index 0000000000..fd1a6284db
--- /dev/null
+++ b/chainercv/functions/ps_roi_max_align_2d.py
@@ -0,0 +1,549 @@
+# Modified work:
+# -----------------------------------------------------------------------------
+# Copyright (c) 2019 Preferred Infrastructure, Inc.
+# Copyright (c) 2019 Preferred Networks, Inc.
+# -----------------------------------------------------------------------------
+
+# Original work:
+# -----------------------------------------------------------------------------
+# Copyright (c) 2015 by Contributors
+# \file roi_pooling.cu
+# \brief roi pooling operator
+# \author Ross Girshick, Kye-Hyeon Kim, Jian Guo
+# \changed to roi_align by Elaine Bao
+# \file roi_align.cu
+# \roi align operator described in Mask RCNN
+# -----------------------------------------------------------------------------
+
+from __future__ import division
+
+import numpy as np
+import six
+
+import chainer
+from chainer.backends import cuda
+from chainer import function
+from chainer.utils import type_check
+
+from chainercv.functions.ps_roi_average_align_2d \
+    import _GET_BILINEAR_INTERP_KERNEL
+from chainercv.functions.ps_roi_average_align_2d \
+    import _get_bilinear_interp_params
+
+
+def _pair(x):
+    """Return ``x`` unchanged if it is iterable, otherwise ``(x, x)``."""
+    is_iterable = isinstance(x, chainer.utils.collections_abc.Iterable)
+    return x if is_iterable else (x, x)
+
+
+class PSROIMaxAlign2D(function.Function):
+
+    def __init__(
+            self, out_c, out_h, out_w, spatial_scale,
+            group_size, sampling_ratio=None
+    ):
+        """Validate and store the pooling configuration.
+
+        Raises :obj:`TypeError` if a size, ``spatial_scale`` or
+        ``sampling_ratio`` has an unsupported type or value.
+        """
+        if not isinstance(out_c, int) or out_c <= 0:
+            raise TypeError('out_c must be positive integer: {}, {}'
+                            .format(type(out_c), out_c))
+        if not isinstance(out_h, int) or out_h <= 0:
+            raise TypeError('out_h must be positive integer: {}, {}'
+                            .format(type(out_h), out_h))
+        if not isinstance(out_w, int) or out_w <= 0:
+            raise TypeError('out_w must be positive integer: {}, {}'
+                            .format(type(out_w), out_w))
+        if isinstance(spatial_scale, int):
+            # An int scale is converted so that the float check below passes.
+            spatial_scale = float(spatial_scale)
+        if not isinstance(group_size, int) or group_size <= 0:
+            raise TypeError('group_size must be positive integer: {}, {}'
+                            .format(type(group_size), group_size))
+        if not (isinstance(spatial_scale, float) and spatial_scale > 0):
+            raise TypeError(
+                'spatial_scale must be a positive float number: {}, {}'
+                .format(type(spatial_scale), spatial_scale))
+        sampling_ratio = _pair(sampling_ratio)
+        if any(s is not None and not (isinstance(s, int) and s >= 1)
+               for s in sampling_ratio):
+            raise TypeError(
+                'sampling_ratio must be integer >= 1 or a pair of it: {}'
+                .format(sampling_ratio))
+        self.out_c, self.out_h, self.out_w = out_c, out_h, out_w
+        self.spatial_scale, self.group_size = spatial_scale, group_size
+        self.sampling_ratio = sampling_ratio
+
+    def check_type_forward(self, in_types):
+        """Check the number, dtypes and shapes of the three inputs."""
+        type_check.expect(in_types.size() == 3)
+        x_type, roi_type, roi_index_type = in_types
+        type_check.expect(
+            x_type.dtype == np.float32,
+            x_type.ndim == 4,
+            roi_type.dtype == np.float32,
+            roi_type.ndim == 2,
+            roi_type.shape[1] == 4,
+            roi_index_type.dtype == np.int32,
+            roi_index_type.ndim == 1,
+            roi_type.shape[0] == roi_index_type.shape[0]
+        )
+
+    def forward_cpu(self, inputs):
+        """Compute the pooled maxima and their sample indices on CPU.
+
+        The index of the winning sample in each bin's sampling grid is kept
+        in ``self.argmax_data``; it is ``-1`` when no sample was selected.
+        """
+        self.retain_inputs((1, 2))
+        self._bottom_data_shape = inputs[0].shape
+
+        bottom_data, bottom_rois, bottom_roi_indices = inputs
+        height, width = bottom_data.shape[2:]
+        n_roi = bottom_rois.shape[0]
+        top_data = np.empty(
+            (n_roi, self.out_c, self.out_h, self.out_w), dtype=np.float32)
+        self.argmax_data = np.empty(top_data.shape, dtype=np.int32)
+
+        group_size = self.group_size
+        pooled_dim, pooled_width, pooled_height \
+            = self.out_c, self.out_w, self.out_h
+        spatial_scale = self.spatial_scale
+
+        for i in six.moves.range(top_data.size):
+            pw = i % pooled_width
+            ph = int(i / pooled_width) % pooled_height
+            ctop = int(i / pooled_width / pooled_height) % pooled_dim
+            n = int(i / pooled_width / pooled_height / pooled_dim)
+
+            roi_batch_ind = int(bottom_roi_indices[n])
+            roi_start_h = bottom_rois[n, 0] * spatial_scale
+            roi_start_w = bottom_rois[n, 1] * spatial_scale
+            roi_end_h = bottom_rois[n, 2] * spatial_scale
+            roi_end_w = bottom_rois[n, 3] * spatial_scale
+
+            roi_height = max(roi_end_h - roi_start_h, 1.)
+            roi_width = max(roi_end_w - roi_start_w, 1.)
+            bin_size_h = 1. * roi_height / pooled_height
+            bin_size_w = 1. * roi_width / pooled_width
+
+            gh = np.floor(float(ph) * group_size / pooled_height)
+            gw = np.floor(float(pw) * group_size / pooled_width)
+            gh = int(min(max(gh, 0), group_size - 1))
+            gw = int(min(max(gw, 0), group_size - 1))
+            c = (ctop * group_size + gh) * group_size + gw
+
+            # Samples per bin; derived from the bin size when not specified.
+            roi_bin_grid_h = self.sampling_ratio[0]
+            if roi_bin_grid_h is None:
+                roi_bin_grid_h = int(np.ceil(roi_height / pooled_height))
+            roi_bin_grid_w = self.sampling_ratio[1]
+            if roi_bin_grid_w is None:
+                roi_bin_grid_w = int(np.ceil(roi_width / pooled_width))
+
+            maxval = -1e20
+            maxidx = -1
+            # ``for`` loops always advance to the next sample, so skipping
+            # one with ``continue`` below cannot stall the iteration.
+            for iy in six.moves.range(roi_bin_grid_h):
+                y = roi_start_h + ph * bin_size_h + \
+                    (iy + .5) * bin_size_h / roi_bin_grid_h
+                for ix in six.moves.range(roi_bin_grid_w):
+                    x = roi_start_w + pw * bin_size_w + \
+                        (ix + .5) * bin_size_w / roi_bin_grid_w
+
+                    # bilinear interpolation {{
+                    y_low, x_low, y_high, x_high, w1, w2, w3, w4 = \
+                        _get_bilinear_interp_params(y, x, height, width)
+                    if y_low is None:
+                        continue
+
+                    v1 = bottom_data[roi_batch_ind, c, y_low, x_low]
+                    v2 = bottom_data[roi_batch_ind, c, y_low, x_high]
+                    v3 = bottom_data[roi_batch_ind, c, y_high, x_low]
+                    v4 = bottom_data[roi_batch_ind, c, y_high, x_high]
+
+                    tmpval = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4
+                    if tmpval > maxval:
+                        maxval = tmpval
+                        maxidx = iy * roi_bin_grid_w + ix
+
+            top_data[n, ctop, ph, pw] = maxval
+            self.argmax_data[n, ctop, ph, pw] = maxidx
+
+        return top_data,
+
+    def forward_gpu(self, inputs):
+        """Compute pooled maxima and their argmax on GPU via a CUDA kernel."""
+        # NOTE(review): unlike forward_cpu, the kernel rounds ROI coordinates,
+        # clamps ROI size at 0.1 and passes (x, y) to the helper - confirm.
+        self.retain_inputs((1, 2))
+        self._bottom_data_shape = inputs[0].shape
+        bottom_data, bottom_rois, bottom_roi_indices = inputs
+        channels, height, width = bottom_data.shape[1:]
+        n_roi = bottom_rois.shape[0]
+        top_data = cuda.cupy.empty(
+            (n_roi, self.out_c, self.out_h, self.out_w), dtype=np.float32)
+        self.argmax_data = cuda.cupy.empty(top_data.shape, np.int32)
+        if self.sampling_ratio[0] is None:
+            sampling_ratio_h = 0
+        else:
+            sampling_ratio_h = self.sampling_ratio[0]
+        if self.sampling_ratio[1] is None:
+            sampling_ratio_w = 0
+        else:
+            sampling_ratio_w = self.sampling_ratio[1]
+        cuda.elementwise(
+            '''
+            raw T bottom_data, raw T bottom_rois,
+            raw int32 bottom_roi_indices,
+            T spatial_scale, int32 channels,
+            int32 height, int32 width,
+            int32 pooled_dim, int32 pooled_height, int32 pooled_width,
+            int32 group_size, int32 sampling_ratio_h, int32 sampling_ratio_w
+            ''',
+            'T top_data, int32 argmax_data',
+            '''
+            // pos in output filter
+            int ph = (i / pooled_width) % pooled_height;
+            int pw = i % pooled_width;
+            int ctop = (i / pooled_width / pooled_height) % pooled_dim;
+            int n = i / pooled_width / pooled_height / pooled_dim;
+
+            int roi_batch_ind = bottom_roi_indices[n];
+            T roi_start_h = static_cast<T>(
+                round(bottom_rois[n * 4 + 0])) * spatial_scale;
+            T roi_start_w = static_cast<T>(
+                round(bottom_rois[n * 4 + 1])) * spatial_scale;
+            T roi_end_h = static_cast<T>(
+                round(bottom_rois[n * 4 + 2])) * spatial_scale;
+            T roi_end_w = static_cast<T>(
+                round(bottom_rois[n * 4 + 3])) * spatial_scale;
+
+            // Force too small ROIs to be 1x1
+            T roi_height = max(roi_end_h - roi_start_h, 0.1);
+            T roi_width = max(roi_end_w - roi_start_w, 0.1);  // avoid 0
+
+            // Compute w and h at bottom
+            T bin_size_h = roi_height / static_cast<T>(pooled_height);
+            T bin_size_w = roi_width / static_cast<T>(pooled_width);
+
+            // Compute c at bottom
+            int gh = floor(
+                static_cast<T>(ph) * group_size / pooled_height);
+            int gw = floor(
+                static_cast<T>(pw) * group_size / pooled_width);
+            gh = min(max(gh, 0), group_size - 1);
+            gw = min(max(gw, 0), group_size - 1);
+            int c = (ctop * group_size + gh) * group_size + gw;
+
+            int bottom_data_offset =
+                (roi_batch_ind * channels + c) * height * width;
+
+            // We use roi_bin_grid to sample the grid and mimic integral
+            int roi_bin_grid_h = (sampling_ratio_h > 0)
+                ? sampling_ratio_h
+                : ceil(roi_height / pooled_height);  // e.g. = 2
+            int roi_bin_grid_w = (sampling_ratio_w > 0)
+                ? sampling_ratio_w
+                : ceil(roi_width / pooled_width);
+
+            T maxval = -1E+20;
+            int maxidx = -1;
+            for (int iy = 0; iy < roi_bin_grid_h; iy++)  // e.g. iy = 0, 1
+            {
+                T y = roi_start_h + ph * bin_size_h +
+                    static_cast<T>(iy + .5f) * bin_size_h /
+                        static_cast<T>(roi_bin_grid_h);  // e.g. 0.5, 1.5
+                for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+                    T x = roi_start_w + pw * bin_size_w +
+                        static_cast<T>(ix + .5f) * bin_size_w /
+                            static_cast<T>(roi_bin_grid_w);
+
+                    // bilinear_interpolation {{
+                    int y_low, x_low, y_high, x_high;
+                    T w1, w2, w3, w4;
+                    bool ret = get_bilinear_interp_params(
+                        x, y, height, width,
+                        y_low, x_low, y_high, x_high,
+                        w1, w2, w3, w4
+                    );
+                    if (!ret) {
+                        continue;
+                    }
+
+                    T v1 = bottom_data[bottom_data_offset +
+                                           y_low * width + x_low];
+                    T v2 = bottom_data[bottom_data_offset +
+                                           y_low * width + x_high];
+                    T v3 = bottom_data[bottom_data_offset +
+                                           y_high * width + x_low];
+                    T v4 = bottom_data[bottom_data_offset +
+                                           y_high * width + x_high];
+
+                    // }}
+
+                    T tmpval = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4;
+                    int bottom_index = iy * roi_bin_grid_w + ix;
+                    if (tmpval > maxval) {
+                        maxval = tmpval;
+                        maxidx =  bottom_index;
+                    }
+                }
+            }
+            top_data = maxval;
+            argmax_data = maxidx;
+            ''',
+            'ps_roi_max_align_2d_fwd',
+            preamble=_GET_BILINEAR_INTERP_KERNEL,
+        )(bottom_data, bottom_rois, bottom_roi_indices,
+          self.spatial_scale, channels, height, width,
+          self.out_c, self.out_h, self.out_w,
+          self.group_size, sampling_ratio_h, sampling_ratio_w,
+          top_data, self.argmax_data)
+        return top_data,
+
+    def backward_cpu(self, inputs, gy):
+        """Route each output gradient to the sample selected in forward.
+
+        Bins whose recorded argmax is negative had no sample and are skipped.
+        Returns ``(gx, None, None)``; rois and roi indices get no gradient.
+        """
+        _, bottom_rois, bottom_roi_indices = inputs
+        height, width = self._bottom_data_shape[2:]
+        bottom_diff = np.zeros(self._bottom_data_shape, np.float32)
+
+        spatial_scale = self.spatial_scale
+        pooled_dim, pooled_width, pooled_height \
+            = self.out_c, self.out_w, self.out_h
+        group_size = self.group_size
+        top_diff = gy[0]
+
+        for i in six.moves.range(top_diff.size):
+            pw = i % pooled_width
+            ph = int(i / pooled_width) % pooled_height
+            ctop = int(i / pooled_width / pooled_height) % pooled_dim
+            n = int(i / pooled_width / pooled_height / pooled_dim)
+
+            maxidx = int(self.argmax_data[n, ctop, ph, pw])
+            if maxidx < 0:
+                # Forward selected no sample here, so nothing to propagate.
+                continue
+
+            roi_batch_ind = int(bottom_roi_indices[n])
+            roi_start_h = bottom_rois[n, 0] * spatial_scale
+            roi_start_w = bottom_rois[n, 1] * spatial_scale
+            roi_end_h = bottom_rois[n, 2] * spatial_scale
+            roi_end_w = bottom_rois[n, 3] * spatial_scale
+
+            roi_width = max(roi_end_w - roi_start_w, 1.)
+            roi_height = max(roi_end_h - roi_start_h, 1.)
+            bin_size_h = 1. * roi_height / pooled_height
+            bin_size_w = 1. * roi_width / pooled_width
+
+            gh = np.floor(float(ph) * group_size / pooled_height)
+            gw = np.floor(float(pw) * group_size / pooled_width)
+            gh = int(min(max(gh, 0), group_size - 1))
+            gw = int(min(max(gw, 0), group_size - 1))
+            c = (ctop * group_size + gh) * group_size + gw
+
+            roi_bin_grid_h = self.sampling_ratio[0]
+            if roi_bin_grid_h is None:
+                roi_bin_grid_h = int(np.ceil(roi_height / pooled_height))
+            roi_bin_grid_w = self.sampling_ratio[1]
+            if roi_bin_grid_w is None:
+                roi_bin_grid_w = int(np.ceil(roi_width / pooled_width))
+
+            # Recover the grid position of the selected sample.
+            iy, ix = divmod(maxidx, roi_bin_grid_w)
+
+            y = roi_start_h + ph * bin_size_h + \
+                (iy + .5) * bin_size_h / roi_bin_grid_h
+            x = roi_start_w + pw * bin_size_w + \
+                (ix + .5) * bin_size_w / roi_bin_grid_w
+
+            # bilinear_interpolation_gradient {{
+            y_low, x_low, y_high, x_high, w1, w2, w3, w4 = \
+                _get_bilinear_interp_params(y, x, height, width)
+            if y_low is None:
+                continue
+
+            g = top_diff[n, ctop, ph, pw]
+            if (x_low >= 0 and x_high >= 0 and
+                    y_low >= 0 and y_high >= 0):
+                bottom_diff[roi_batch_ind, c, y_low, x_low] += g * w1
+                bottom_diff[roi_batch_ind, c, y_low, x_high] += g * w2
+                bottom_diff[roi_batch_ind, c, y_high, x_low] += g * w3
+                bottom_diff[roi_batch_ind, c, y_high, x_high] += g * w4
+
+        return bottom_diff, None, None
+
+    def backward_gpu(self, inputs, gy):
+        _, bottom_rois, bottom_roi_indices = inputs
+        channels, height, width = self._bottom_data_shape[1:]
+        bottom_diff = cuda.cupy.zeros(self._bottom_data_shape, np.float32)
+
+        if self.sampling_ratio[0] is None:
+            sampling_ratio_h = 0
+        else:
+            sampling_ratio_h = self.sampling_ratio[0]
+        if self.sampling_ratio[1] is None:
+            sampling_ratio_w = 0
+        else:
+            sampling_ratio_w = self.sampling_ratio[1]
+        cuda.elementwise(
+            '''
+            raw T top_diff, raw int32 argmax_data,
+            raw T bottom_rois, raw int32 bottom_roi_indices,
+            T spatial_scale, int32 channels, int32 height, int32 width,
+            int32 pooled_dim, int32 pooled_height, int32 pooled_width,
+            int32 group_size, int32 sampling_ratio_h, int32 sampling_ratio_w
+            ''',
+            'raw T bottom_diff',
+            '''
+            // (n, c, h, w) coords in bottom data
+            int pw = i % pooled_width;
+            int ph = (i / pooled_width) % pooled_height;
+            int ctop = (i / pooled_width / pooled_height) % pooled_dim;
+            int n = i / pooled_width / pooled_height / pooled_dim;
+
+            // Do not using rounding; this implementation detail is critical
+            int roi_batch_ind = bottom_roi_indices[n];
+            T roi_start_h = static_cast<T>(
+                round(bottom_rois[n * 4 + 0])) * spatial_scale;
+            T roi_start_w = static_cast<T>(
+                round(bottom_rois[n * 4 + 1])) * spatial_scale;
+            T roi_end_h = static_cast<T>(
+                round(bottom_rois[n * 4 + 2])) * spatial_scale;
+            T roi_end_w = static_cast<T>(
+                round(bottom_rois[n * 4 + 3])) * spatial_scale;
+
+            // Force too small ROIs to be 1x1
+            T roi_height = max(roi_end_h - roi_start_h, 0.1);
+            T roi_width = max(roi_end_w - roi_start_w, 0.1);  // avoid 0
+
+            // Compute w and h at bottom
+            T bin_size_h = roi_height / static_cast<T>(pooled_height);
+            T bin_size_w = roi_width / static_cast<T>(pooled_width);
+
+            // Compute c at bottom
+            int gh = floor(
+                static_cast<T>(ph) * group_size / pooled_height);
+            int gw = floor(
+                static_cast<T>(pw) * group_size / pooled_width);
+            gh = min(max(gh, 0), group_size - 1);
+            gw = min(max(gw, 0), group_size - 1);
+            int c = (ctop * group_size + gh) * group_size + gw;
+
+            int bottom_diff_offset =
+                (roi_batch_ind * channels + c) * height * width;
+
+            int top_offset =
+                (n * pooled_dim + ctop) * pooled_height * pooled_width;
+            T top_diff_this_bin =
+                top_diff[top_offset + ph * pooled_width + pw];
+
+            // We use roi_bin_grid to sample the grid and mimic integral
+            int roi_bin_grid_h = (sampling_ratio_h > 0)
+                ? sampling_ratio_h
+                : ceil(roi_height / pooled_height); // e.g. = 2
+            int roi_bin_grid_w = (sampling_ratio_w > 0)
+                ? sampling_ratio_w
+                : ceil(roi_width / pooled_width);
+
+            int maxidx = argmax_data[top_offset + ph * pooled_width + pw];
+            int iy = maxidx / roi_bin_grid_w;
+            int ix = maxidx % roi_bin_grid_w;
+
+            T y = roi_start_h + ph * bin_size_h +
+                static_cast<T>(iy + .5f) * bin_size_h /
+                    static_cast<T>(roi_bin_grid_h);  // e.g. 0.5, 1.5
+            T x = roi_start_w + pw * bin_size_w +
+                static_cast<T>(ix + .5f) * bin_size_w /
+                    static_cast<T>(roi_bin_grid_w);
+
+            int y_low, x_low, y_high, x_high;
+            T w1, w2, w3, w4;
+            bool ret = get_bilinear_interp_params(
+                x, y, height, width,
+                y_low, x_low, y_high, x_high,
+                w1, w2, w3, w4
+            );
+            if (!ret) {
+                continue;
+            }
+
+            T g1 = top_diff_this_bin * w1;
+            T g2 = top_diff_this_bin * w2;
+            T g3 = top_diff_this_bin * w3;
+            T g4 = top_diff_this_bin * w4;
+
+            if (x_low >= 0 && x_high >= 0 &&
+                    y_low >= 0 && y_high >= 0) {
+                atomicAdd(&bottom_diff[bottom_diff_offset +
+                                       y_low * width + x_low], g1);
+                atomicAdd(&bottom_diff[bottom_diff_offset +
+                                       y_low * width + x_high], g2);
+                atomicAdd(&bottom_diff[bottom_diff_offset +
+                                       y_high * width + x_low], g3);
+                atomicAdd(&bottom_diff[bottom_diff_offset +
+                                       y_high * width + x_high], g4);
+            }
+            ''',
+            'ps_roi_max_align_2d_bwd',
+            preamble=_GET_BILINEAR_INTERP_KERNEL,
+        )(gy[0], self.argmax_data, bottom_rois, bottom_roi_indices,
+          self.spatial_scale, channels, height, width,
+          self.out_c, self.out_h, self.out_w,
+          self.group_size, sampling_ratio_h, sampling_ratio_w,
+          bottom_diff, size=gy[0].size)
+
+        return bottom_diff, None, None
+
+
+def ps_roi_max_align_2d(
+        x, rois, roi_indices, out_c, out_h, out_w,
+        spatial_scale, group_size, sampling_ratio=None
+):
+    """Position Sensitive Region of Interest (ROI) Max align function.
+
+    This function computes position sensitive max values of the input spatial
+    patch for the given regions of interest. Each ROI is split into
+    :math:`(group\_size, group\_size)` regions, and position sensitive values
+    in each region are computed.
+
+    Args:
+        x (~chainer.Variable): Input variable. The shape is expected to be
+            4 dimensional: (n: batch, c: channel, h: height, w: width).
+        rois (array): Input roi. The shape is expected to
+            be :math:`(R, 4)`, and each datum is set as below:
+            (y_min, x_min, y_max, x_max). The dtype is :obj:`numpy.float32`.
+        roi_indices (array): Input roi indices. The shape is expected to
+            be :math:`(R, )`. The dtype is :obj:`numpy.int32`.
+        out_c (int): Number of channels of the output after pooling.
+        out_h (int): Height of the output after pooling.
+        out_w (int): Width of the output after pooling.
+        spatial_scale (float): Scale by which the roi is resized.
+        group_size (int): Position sensitive group size.
+        sampling_ratio ((int, int) or int): Sampling step for the alignment.
+            It must be an integer :math:`>= 1` or :obj:`None`, and the value
+            is automatically decided when :obj:`None` is passed.  Use of
+            different ratio in height and width axis is also supported by
+            passing tuple of int as ``(sampling_ratio_h, sampling_ratio_w)``.
+            ``sampling_ratio=s`` and ``sampling_ratio=(s, s)`` are equivalent.
+
+    Returns:
+        ~chainer.Variable: Output variable.
+
+    See the original paper proposing PSROIPooling:
+    `R-FCN <https://arxiv.org/abs/1605.06409>`_.
+    See the original paper proposing ROIAlign:
+    `Mask R-CNN <https://arxiv.org/abs/1703.06870>`_.
+
+    """
+    return PSROIMaxAlign2D(
+        out_c, out_h, out_w, spatial_scale,
+        group_size, sampling_ratio)(x, rois, roi_indices)
diff --git a/tests/functions_tests/test_ps_roi_max_align.py b/tests/functions_tests/test_ps_roi_max_align.py
new file mode 100644
index 0000000000..a48fe4661c
--- /dev/null
+++ b/tests/functions_tests/test_ps_roi_max_align.py
@@ -0,0 +1,115 @@
+import chainer
+from chainer.backends import cuda
+from chainer import gradient_check
+from chainer import testing
+from chainer.testing import attr
+from chainer.testing import condition
+import numpy as np
+import unittest
+
+from chainercv import functions
+
+
+@testing.parameterize(*testing.product({
+    'sampling_ratio': [None, 1, 2, (None, 3), (1, 2)],
+}))
+class TestPSROIMaxAlign2D(unittest.TestCase):
+
+    def setUp(self):
+        self.N = 3
+        self.group_size = 2
+        self.out_c = 2
+        self.n_channels = self.group_size * self.group_size * self.out_c
+        self.x = np.arange(
+            self.N * self.n_channels * 10 * 12,
+            dtype=np.float32).reshape((self.N, self.n_channels, 10, 12))
+        np.random.shuffle(self.x)
+        self.x = 2 * self.x / self.x.size - 1
+        self.x = self.x.astype(np.float32)
+        self.rois = np.array(
+            [[0, 0, 7, 7],
+             [1, 0, 5, 12],
+             [0, 1, 10, 5],
+             [3, 3, 4, 4]],
+            dtype=np.float32
+        )
+        self.roi_indices = np.array([0, 2, 1, 0], dtype=np.int32)
+        self.n_roi = self.rois.shape[0]
+        self.out_h, self.out_w = 4, 4
+        self.spatial_scale = 1.0
+        self.gy = np.random.uniform(
+            -1, 1, (self.n_roi, self.out_c, self.out_h, self.out_w))
+        self.gy = self.gy.astype(np.float32)
+        self.check_backward_options = {'atol': 5e-4, 'rtol': 5e-3}
+
+    def check_forward(self, x_data, roi_data, roi_index_data):
+        x = chainer.Variable(x_data)
+        rois = chainer.Variable(roi_data)
+        roi_indices = chainer.Variable(roi_index_data)
+        y = functions.ps_roi_max_align_2d(
+            x, rois, roi_indices, self.out_c, self.out_h, self.out_w,
+            self.spatial_scale, self.group_size,
+            sampling_ratio=self.sampling_ratio)
+        self.assertEqual(y.data.dtype, np.float32)
+        y_data = cuda.to_cpu(y.data)
+        self.assertEqual(
+            (self.n_roi, self.out_c, self.out_h, self.out_w), y_data.shape)
+
+    @condition.retry(3)
+    def test_forward_cpu(self):
+        self.check_forward(self.x, self.rois, self.roi_indices)
+
+    @attr.gpu
+    @condition.retry(3)
+    def test_forward_gpu(self):
+        self.check_forward(
+            cuda.to_gpu(self.x), cuda.to_gpu(self.rois),
+            cuda.to_gpu(self.roi_indices))
+
+    def check_backward(self, x_data, roi_data, roi_index_data, y_grad_data):
+        def f(x, rois, roi_indices):
+            return functions.ps_roi_max_align_2d(
+                x, rois, roi_indices, self.out_c, self.out_h, self.out_w,
+                self.spatial_scale, self.group_size,
+                sampling_ratio=self.sampling_ratio)
+        gradient_check.check_backward(
+            f, (x_data, roi_data, roi_index_data), y_grad_data,
+            no_grads=[False, True, True], **self.check_backward_options)
+
+    @condition.retry(3)
+    def test_backward_cpu(self):
+        self.check_backward(self.x, self.rois, self.roi_indices, self.gy)
+
+    @attr.gpu
+    @condition.retry(3)
+    def test_backward_gpu(self):
+        self.check_backward(
+            cuda.to_gpu(self.x), cuda.to_gpu(self.rois),
+            cuda.to_gpu(self.roi_indices), cuda.to_gpu(self.gy))
+
+    def apply_backward(self, x_data, roi_data, roi_index_data, y_grad_data):
+        x = chainer.Variable(x_data)
+        rois = chainer.Variable(roi_data)
+        roi_indices = chainer.Variable(roi_index_data)
+        y = functions.ps_roi_max_align_2d(
+            x, rois, roi_indices, self.out_c, self.out_h, self.out_w,
+            self.spatial_scale, self.group_size,
+            sampling_ratio=self.sampling_ratio)
+        x.cleargrad()
+        y.grad = y_grad_data
+        y.backward()
+        return x, y
+
+    @attr.gpu
+    @condition.retry(3)
+    def test_consistency_with_gpu(self):
+        x_cpu, y_cpu = self.apply_backward(
+            self.x, self.rois, self.roi_indices, self.gy)
+        x_gpu, y_gpu = self.apply_backward(
+            cuda.to_gpu(self.x), cuda.to_gpu(self.rois),
+            cuda.to_gpu(self.roi_indices), cuda.to_gpu(self.gy))
+        testing.assert_allclose(y_cpu.data, y_gpu.data)
+        testing.assert_allclose(x_cpu.grad, x_gpu.grad)
+
+
+testing.run_module(__name__, __file__)

From 127bd1206d6751c31914e8265e3cfe26e27fe518 Mon Sep 17 00:00:00 2001
From: Shingo Kitagawa <shingogo@hotmail.co.jp>
Date: Sat, 9 Feb 2019 02:23:58 +0900
Subject: [PATCH 07/24] remove unnecessary import

---
 chainercv/functions/ps_roi_average_pooling_2d.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/chainercv/functions/ps_roi_average_pooling_2d.py b/chainercv/functions/ps_roi_average_pooling_2d.py
index 6aace954cb..5c78b4b99d 100644
--- a/chainercv/functions/ps_roi_average_pooling_2d.py
+++ b/chainercv/functions/ps_roi_average_pooling_2d.py
@@ -33,9 +33,6 @@
 from chainer import function
 from chainer.utils import type_check
 
-if cuda.available:
-    import cupy as cp
-
 
 def _roi_pooling_slice(size, stride, max_size, roi_offset):
     start = int(np.floor(size * stride))
@@ -142,7 +139,7 @@ def forward_gpu(self, inputs):
         bottom_data, bottom_rois, bottom_roi_indices = inputs
         channels, height, width = bottom_data.shape[1:]
         n_roi = bottom_rois.shape[0]
-        top_data = cp.empty(
+        top_data = cuda.cupy.empty(
             (n_roi, self.out_c, self.out_h, self.out_w), dtype=np.float32)
         cuda.elementwise(
             '''

From 526f014729e5cab7a9906a29432fe75be0511bd6 Mon Sep 17 00:00:00 2001
From: Shingo Kitagawa <shingogo@hotmail.co.jp>
Date: Mon, 18 Feb 2019 13:21:43 +0900
Subject: [PATCH 08/24] update ps_roi_average_align_2d

---
 .../functions/ps_roi_average_align_2d.py      | 172 ++++++++----------
 1 file changed, 73 insertions(+), 99 deletions(-)

diff --git a/chainercv/functions/ps_roi_average_align_2d.py b/chainercv/functions/ps_roi_average_align_2d.py
index 38e92def19..78fb632b27 100644
--- a/chainercv/functions/ps_roi_average_align_2d.py
+++ b/chainercv/functions/ps_roi_average_align_2d.py
@@ -32,92 +32,66 @@ def _pair(x):
     return x, x
 
 
-def _get_bilinear_interp_params(y, x, height, width):
-    if y < -1 or y > height or x < -1 or x > width:
+def _get_bounds(p, limit):
+    if p < -1 or p > limit:
         # out of range, so it is empty
-        return (None,) * 8
-
-    if y <= 0:
-        y = 0
-    if x <= 0:
-        x = 0
-
-    y_low = int(y)
-    x_low = int(x)
-
-    if y_low >= height - 1:
-        y_high = y_low = height - 1
-        y = float(y_low)
+        return None, None, None
+    if p <= 0:
+        p = 0
+    low = int(np.floor(p))
+    if low >= limit - 1:
+        high = low = limit - 1
+        p = float(low)
     else:
-        y_high = y_low + 1
+        high = low + 1
+    return p, low, high
 
-    if x_low >= width - 1:
-        x_high = x_low = width - 1
-        x = float(x_low)
-    else:
-        x_high = x_low + 1
 
+def _get_bilinear_interp_params(y, x, y_low, x_low, y_high, x_high):
     ly = y - y_low
     lx = x - x_low
     hy = 1. - ly
     hx = 1. - lx
-
     w1 = hy * hx
     w2 = hy * lx
     w3 = ly * hx
     w4 = ly * lx
-
-    return y_low, x_low, y_high, x_high, w1, w2, w3, w4
+    return w1, w2, w3, w4
 
 
 _GET_BILINEAR_INTERP_KERNEL = '''
 __device__
-bool get_bilinear_interp_params(
-    T x, T y, const int height, const int width,
-    int &y_low, int &x_low, int &y_high, int &x_high,
-    T &w1, T &w2, T &w3, T &w4) {
-    // deal with cases that inverse elements are
-    // out of feature map boundary
-    if (y < -1. || y > height || x < -1. || x > width) {
+bool get_bounds(
+    T &p, const int limit, int &low, int &high) {
+    if (p < -1. || p > limit) {
         // empty
         return false;
     }
-
-    if (y <= 0) {
-        y = 0;
+    if (p <= 0) {
+        p = 0;
     }
-    if (x <= 0) {
-        x = 0;
-    }
-
-    y_low = (int)y;
-    x_low = (int)x;
-
-    if (y_low >= height - 1) {
-        y_high = y_low = height - 1;
-        y = (T)y_low;
+    low = (int)p;
+    if (low >= limit - 1) {
+        high = low = limit - 1;
+        p = (T)low;
     } else {
-        y_high = y_low + 1;
-    }
-
-    if (x_low >= width - 1) {
-        x_high = x_low = width - 1;
-        x = (T)x_low;
-    } else {
-        x_high = x_low + 1;
+        high = low + 1;
     }
+    return true;
+}
 
+__device__
+void get_bilinear_interp_params(
+    T y, T x, int y_low, int x_low, int y_high, int x_high,
+    T &w1, T &w2, T &w3, T &w4) {
     T ly = y - y_low;
     T lx = x - x_low;
     T hy = 1. - ly;
     T hx = 1. - lx;
-
     w1 = hy * hx;
     w2 = hy * lx;
     w3 = ly * hx;
     w4 = ly * lx;
-
-    return true;
 }
 '''
 
@@ -216,32 +190,35 @@ def forward_cpu(self, inputs):
             c = (ctop * group_size + gh) * group_size + gw
 
             if self.sampling_ratio[0] is None:
-                roi_bin_grid_h = np.ceil(roi_height / pooled_height)
+                roi_bin_grid_h = int(np.ceil(roi_height / pooled_height))
             else:
                 roi_bin_grid_h = self.sampling_ratio[0]
             if self.sampling_ratio[1] is None:
-                roi_bin_grid_w = np.ceil(roi_width / pooled_width)
+                roi_bin_grid_w = int(np.ceil(roi_width / pooled_width))
             else:
                 roi_bin_grid_w = self.sampling_ratio[1]
 
             count = roi_bin_grid_h * roi_bin_grid_w
 
             output_val = 0.
-            iy = 0
-            while iy < roi_bin_grid_h:
+            for iy in six.moves.range(roi_bin_grid_h):
                 y = roi_start_h + ph * bin_size_h + \
                     (iy + .5) * bin_size_h / roi_bin_grid_h
-                ix = 0
-                while ix < roi_bin_grid_w:
+                y, y_low, y_high = _get_bounds(y, height)
+                if y is None or y_low is None or y_high is None:
+                    continue
+                for ix in six.moves.range(roi_bin_grid_w):
                     x = roi_start_w + pw * bin_size_w + \
                         (ix + .5) * bin_size_w / roi_bin_grid_w
 
-                    # bilinear interpolation {{
-                    y_low, x_low, y_high, x_high, w1, w2, w3, w4 = \
-                        _get_bilinear_interp_params(y, x, height, width)
-                    if y_low is None:
+                    x, x_low, x_high = _get_bounds(x, width)
+                    if x is None or x_low is None or x_high is None:
                         continue
 
+                    # bilinear interpolation {{
+                    w1, w2, w3, w4 = _get_bilinear_interp_params(
+                        y, x, y_low, x_low, y_high, x_high)
+
                     v1 = bottom_data[roi_batch_ind, c, y_low, x_low]
                     v2 = bottom_data[roi_batch_ind, c, y_low, x_high]
                     v3 = bottom_data[roi_batch_ind, c, y_high, x_low]
@@ -251,9 +228,6 @@ def forward_cpu(self, inputs):
 
                     # }}
 
-                    ix += 1
-                iy += 1
-
             output_val /= count
             top_data[n, ctop, ph, pw] = output_val
 
@@ -338,22 +312,21 @@ def forward_gpu(self, inputs):
                 T y = roi_start_h + ph * bin_size_h +
                     static_cast<T>(iy + .5f) * bin_size_h /
                         static_cast<T>(roi_bin_grid_h);  // e.g. 0.5, 1.5
+                int y_low, y_high;
+                bool y_ret = get_bounds(y, height, y_low, y_high);
+                if (!y_ret) continue;
                 for (int ix = 0; ix < roi_bin_grid_w; ix++) {
                     T x = roi_start_w + pw * bin_size_w +
                         static_cast<T>(ix + .5f) * bin_size_w /
                             static_cast<T>(roi_bin_grid_w);
 
-                    // bilinear_interpolation {{
-                    int y_low, x_low, y_high, x_high;
+                    int x_low, x_high;
+                    bool x_ret = get_bounds(x, width, x_low, x_high);
+                    if (!x_ret) continue;
+                    // bilinear_interpolation_gradient {{
                     T w1, w2, w3, w4;
-                    bool ret = get_bilinear_interp_params(
-                        x, y, height, width,
-                        y_low, x_low, y_high, x_high,
-                        w1, w2, w3, w4
-                    );
-                    if (!ret) {
-                        continue;
-                    }
+                    get_bilinear_interp_params(
+                        y, x, y_low, x_low, y_high, x_high, w1, w2, w3, w4);
 
                     T v1 = bottom_data[bottom_data_offset +
                                            y_low * width + x_low];
@@ -419,30 +392,32 @@ def backward_cpu(self, inputs, gy):
             top_diff_this_bin = top_diff[n, ctop, ph, pw]
 
             if self.sampling_ratio[0] is None:
-                roi_bin_grid_h = np.ceil(roi_height / pooled_height)
+                roi_bin_grid_h = int(np.ceil(roi_height / pooled_height))
             else:
                 roi_bin_grid_h = self.sampling_ratio[0]
             if self.sampling_ratio[1] is None:
-                roi_bin_grid_w = np.ceil(roi_width / pooled_width)
+                roi_bin_grid_w = int(np.ceil(roi_width / pooled_width))
             else:
                 roi_bin_grid_w = self.sampling_ratio[1]
 
             count = roi_bin_grid_h * roi_bin_grid_w
 
-            iy = 0
-            while iy < roi_bin_grid_h:
+            for iy in six.moves.range(roi_bin_grid_h):
                 y = roi_start_h + ph * bin_size_h + \
                     (iy + .5) * bin_size_h / roi_bin_grid_h
-                ix = 0
-                while ix < roi_bin_grid_w:
+                y, y_low, y_high = _get_bounds(y, height)
+                if y is None or y_low is None or y_high is None:
+                    continue
+                for ix in six.moves.range(roi_bin_grid_w):
                     x = roi_start_w + pw * bin_size_w + \
                         (ix + .5) * bin_size_w / roi_bin_grid_w
 
-                    # bilinear_interpolation_gradient {{
-                    y_low, x_low, y_high, x_high, w1, w2, w3, w4 = \
-                        _get_bilinear_interp_params(y, x, height, width)
-                    if y_low is None:
+                    x, x_low, x_high = _get_bounds(x, width)
+                    if x is None or x_low is None or x_high is None:
                         continue
+                    # bilinear_interpolation_gradient {{
+                    w1, w2, w3, w4 = _get_bilinear_interp_params(
+                        y, x, y_low, x_low, y_high, x_high)
 
                     g1 = top_diff_this_bin * w1 / count
                     g2 = top_diff_this_bin * w2 / count
@@ -455,8 +430,7 @@ def backward_cpu(self, inputs, gy):
                         bottom_diff[roi_batch_ind, c, y_low, x_high] += g2
                         bottom_diff[roi_batch_ind, c, y_high, x_low] += g3
                         bottom_diff[roi_batch_ind, c, y_high, x_high] += g4
-                    ix += 1
-                iy += 1
+                    # }}
 
         return bottom_diff, None, None
 
@@ -539,21 +513,21 @@ def backward_gpu(self, inputs, gy):
                 T y = roi_start_h + ph * bin_size_h +
                     static_cast<T>(iy + .5f) * bin_size_h /
                         static_cast<T>(roi_bin_grid_h);  // e.g. 0.5, 1.5
+                int y_low, y_high;
+                bool y_ret = get_bounds(y, height, y_low, y_high);
+                if (!y_ret) continue;
                 for (int ix = 0; ix < roi_bin_grid_w; ix++) {
                     T x = roi_start_w + pw * bin_size_w +
                         static_cast<T>(ix + .5f) * bin_size_w /
                             static_cast<T>(roi_bin_grid_w);
 
-                    int y_low, x_low, y_high, x_high;
+                    int x_low, x_high;
+                    bool x_ret = get_bounds(x, width, x_low, x_high);
+                    if (!x_ret) continue;
+                    // bilinear_interpolation_gradient {{
                     T w1, w2, w3, w4;
-                    bool ret = get_bilinear_interp_params(
-                        x, y, height, width,
-                        y_low, x_low, y_high, x_high,
-                        w1, w2, w3, w4
-                    );
-                    if (!ret) {
-                        continue;
-                    }
+                    get_bilinear_interp_params(
+                        y, x, y_low, x_low, y_high, x_high, w1, w2, w3, w4);
 
                     T g1 = top_diff_this_bin * w1 / count;
                     T g2 = top_diff_this_bin * w2 / count;

From 40c0a236936888d2ed7cef0cd600391031c8bf28 Mon Sep 17 00:00:00 2001
From: Shingo Kitagawa <shingogo@hotmail.co.jp>
Date: Mon, 18 Feb 2019 13:30:52 +0900
Subject: [PATCH 09/24] update ps_roi_max_align_2d.py

---
 chainercv/functions/ps_roi_max_align_2d.py | 80 ++++++++++++----------
 1 file changed, 44 insertions(+), 36 deletions(-)

diff --git a/chainercv/functions/ps_roi_max_align_2d.py b/chainercv/functions/ps_roi_max_align_2d.py
index fd1a6284db..b056297904 100644
--- a/chainercv/functions/ps_roi_max_align_2d.py
+++ b/chainercv/functions/ps_roi_max_align_2d.py
@@ -29,6 +29,7 @@
     import _GET_BILINEAR_INTERP_KERNEL
 from chainercv.functions.ps_roi_average_align_2d \
     import _get_bilinear_interp_params
+from chainercv.functions.ps_roi_average_align_2d import _get_bounds
 
 
 def _pair(x):
@@ -132,31 +133,34 @@ def forward_cpu(self, inputs):
             c = (ctop * group_size + gh) * group_size + gw
 
             if self.sampling_ratio[0] is None:
-                roi_bin_grid_h = np.ceil(roi_height / pooled_height)
+                roi_bin_grid_h = int(np.ceil(roi_height / pooled_height))
             else:
                 roi_bin_grid_h = self.sampling_ratio[0]
             if self.sampling_ratio[1] is None:
-                roi_bin_grid_w = np.ceil(roi_width / pooled_width)
+                roi_bin_grid_w = int(np.ceil(roi_width / pooled_width))
             else:
                 roi_bin_grid_w = self.sampling_ratio[1]
 
             maxval = -1e20
             maxidx = -1
-            iy = 0
-            while iy < roi_bin_grid_h:
+            for iy in six.moves.range(roi_bin_grid_h):
                 y = roi_start_h + ph * bin_size_h + \
                     (iy + .5) * bin_size_h / roi_bin_grid_h
-                ix = 0
-                while ix < roi_bin_grid_w:
+                y, y_low, y_high = _get_bounds(y, height)
+                if y is None or y_low is None or y_high is None:
+                    continue
+                for ix in six.moves.range(roi_bin_grid_w):
                     x = roi_start_w + pw * bin_size_w + \
                         (ix + .5) * bin_size_w / roi_bin_grid_w
 
-                    # bilinear interpolation {{
-                    y_low, x_low, y_high, x_high, w1, w2, w3, w4 = \
-                        _get_bilinear_interp_params(y, x, height, width)
-                    if y_low is None:
+                    x, x_low, x_high = _get_bounds(x, width)
+                    if x is None or x_low is None or x_high is None:
                         continue
 
+                    # bilinear interpolation {{
+                    w1, w2, w3, w4 = _get_bilinear_interp_params(
+                        y, x, y_low, x_low, y_high, x_high)
+
                     v1 = bottom_data[roi_batch_ind, c, y_low, x_low]
                     v2 = bottom_data[roi_batch_ind, c, y_low, x_high]
                     v3 = bottom_data[roi_batch_ind, c, y_high, x_low]
@@ -168,8 +172,7 @@ def forward_cpu(self, inputs):
                         maxval = tmpval
                         maxidx = bottom_index
 
-                    ix += 1
-                iy += 1
+                    # }}
 
             top_data[n, ctop, ph, pw] = maxval
             self.argmax_data[n, ctop, ph, pw] = maxidx
@@ -257,22 +260,21 @@ def forward_gpu(self, inputs):
                 T y = roi_start_h + ph * bin_size_h +
                     static_cast<T>(iy + .5f) * bin_size_h /
                         static_cast<T>(roi_bin_grid_h);  // e.g. 0.5, 1.5
+                int y_low, y_high;
+                bool y_ret = get_bounds(y, height, y_low, y_high);
+                if (!y_ret) continue;
                 for (int ix = 0; ix < roi_bin_grid_w; ix++) {
                     T x = roi_start_w + pw * bin_size_w +
                         static_cast<T>(ix + .5f) * bin_size_w /
                             static_cast<T>(roi_bin_grid_w);
 
+                    int x_low, x_high;
+                    bool x_ret = get_bounds(x, width, x_low, x_high);
+                    if (!x_ret) continue;
                     // bilinear_interpolation {{
-                    int y_low, x_low, y_high, x_high;
                     T w1, w2, w3, w4;
-                    bool ret = get_bilinear_interp_params(
-                        x, y, height, width,
-                        y_low, x_low, y_high, x_high,
-                        w1, w2, w3, w4
-                    );
-                    if (!ret) {
-                        continue;
-                    }
+                    get_bilinear_interp_params(
+                        y, x, y_low, x_low, y_high, x_high, w1, w2, w3, w4);
 
                     T v1 = bottom_data[bottom_data_offset +
                                            y_low * width + x_low];
@@ -344,11 +346,11 @@ def backward_cpu(self, inputs, gy):
             top_diff_this_bin = top_diff[n, ctop, ph, pw]
 
             if self.sampling_ratio[0] is None:
-                roi_bin_grid_h = np.ceil(roi_height / pooled_height)
+                roi_bin_grid_h = int(np.ceil(roi_height / pooled_height))
             else:
                 roi_bin_grid_h = self.sampling_ratio[0]
             if self.sampling_ratio[1] is None:
-                roi_bin_grid_w = np.ceil(roi_width / pooled_width)
+                roi_bin_grid_w = int(np.ceil(roi_width / pooled_width))
             else:
                 roi_bin_grid_w = self.sampling_ratio[1]
 
@@ -361,11 +363,16 @@ def backward_cpu(self, inputs, gy):
             x = roi_start_w + pw * bin_size_w + \
                 (ix + .5) * bin_size_w / roi_bin_grid_w
 
-            # bilinear_interpolation_gradient {{
-            y_low, x_low, y_high, x_high, w1, w2, w3, w4 = \
-                _get_bilinear_interp_params(y, x, height, width)
-            if y_low is None:
+            y, y_low, y_high = _get_bounds(y, height)
+            if y is None or y_low is None or y_high is None:
                 continue
+            x, x_low, x_high = _get_bounds(x, width)
+            if x is None or x_low is None or x_high is None:
+                continue
+
+            # bilinear_interpolation_gradient {{
+            w1, w2, w3, w4 = _get_bilinear_interp_params(
+                y, x, y_low, x_low, y_high, x_high)
 
             g1 = top_diff_this_bin * w1
             g2 = top_diff_this_bin * w2
@@ -465,16 +472,17 @@ def backward_gpu(self, inputs, gy):
                 static_cast<T>(ix + .5f) * bin_size_w /
                     static_cast<T>(roi_bin_grid_w);
 
-            int y_low, x_low, y_high, x_high;
+            int y_low, y_high;
+            bool y_ret = get_bounds(y, height, y_low, y_high);
+            if (!y_ret) continue;
+            int x_low, x_high;
+            bool x_ret = get_bounds(x, width, x_low, x_high);
+            if (!x_ret) continue;
+
+            // bilinear_interpolation_gradient {{
             T w1, w2, w3, w4;
-            bool ret = get_bilinear_interp_params(
-                x, y, height, width,
-                y_low, x_low, y_high, x_high,
-                w1, w2, w3, w4
-            );
-            if (!ret) {
-                continue;
-            }
+            get_bilinear_interp_params(
+                y, x, y_low, x_low, y_high, x_high, w1, w2, w3, w4);
 
             T g1 = top_diff_this_bin * w1;
             T g2 = top_diff_this_bin * w2;

From 74d22742920c5540d48e1459b238d606ed0a264d Mon Sep 17 00:00:00 2001
From: Shingo Kitagawa <shingogo@hotmail.co.jp>
Date: Mon, 18 Feb 2019 13:48:09 +0900
Subject: [PATCH 10/24] do more spatial_scale test for ps roi pooling/align
 functions

---
 tests/functions_tests/test_ps_roi_average_align_2d.py   | 2 +-
 tests/functions_tests/test_ps_roi_average_pooling_2d.py | 4 +++-
 tests/functions_tests/test_ps_roi_max_align.py          | 2 +-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/tests/functions_tests/test_ps_roi_average_align_2d.py b/tests/functions_tests/test_ps_roi_average_align_2d.py
index 64cb0d0435..6197c52db3 100644
--- a/tests/functions_tests/test_ps_roi_average_align_2d.py
+++ b/tests/functions_tests/test_ps_roi_average_align_2d.py
@@ -12,6 +12,7 @@
 
 @testing.parameterize(*testing.product({
     'sampling_ratio': [None, 1, 2, (None, 3), (1, 2)],
+    'spatial_scale': [0.6, 1.0, 2.0],
 }))
 class TestPSROIAverageAlign2D(unittest.TestCase):
 
@@ -36,7 +37,6 @@ def setUp(self):
         self.roi_indices = np.array([0, 2, 1, 0], dtype=np.int32)
         self.n_roi = self.rois.shape[0]
         self.out_h, self.out_w = 4, 4
-        self.spatial_scale = 1.0
         self.gy = np.random.uniform(
             -1, 1, (self.n_roi, self.out_c, self.out_h, self.out_w))
         self.gy = self.gy.astype(np.float32)
diff --git a/tests/functions_tests/test_ps_roi_average_pooling_2d.py b/tests/functions_tests/test_ps_roi_average_pooling_2d.py
index 1b1d74ced8..232438515c 100644
--- a/tests/functions_tests/test_ps_roi_average_pooling_2d.py
+++ b/tests/functions_tests/test_ps_roi_average_pooling_2d.py
@@ -10,6 +10,9 @@
 from chainercv import functions
 
 
+@testing.parameterize(*testing.product({
+    'spatial_scale': [0.6, 1.0, 2.0],
+}))
 class TestPSROIAveragePooling2D(unittest.TestCase):
 
     def setUp(self):
@@ -33,7 +36,6 @@ def setUp(self):
         self.roi_indices = np.array([0, 2, 1, 0], dtype=np.int32)
         self.n_roi = self.rois.shape[0]
         self.out_h, self.out_w = 4, 4
-        self.spatial_scale = 1.0
         self.gy = np.random.uniform(
             -1, 1, (self.n_roi, self.out_c, self.out_h, self.out_w))
         self.gy = self.gy.astype(np.float32)
diff --git a/tests/functions_tests/test_ps_roi_max_align.py b/tests/functions_tests/test_ps_roi_max_align.py
index a48fe4661c..06a36f6947 100644
--- a/tests/functions_tests/test_ps_roi_max_align.py
+++ b/tests/functions_tests/test_ps_roi_max_align.py
@@ -12,6 +12,7 @@
 
 @testing.parameterize(*testing.product({
     'sampling_ratio': [None, 1, 2, (None, 3), (1, 2)],
+    'spatial_scale': [0.6, 1.0, 2.0],
 }))
 class TestPSROIMaxAlign2D(unittest.TestCase):
 
@@ -36,7 +37,6 @@ def setUp(self):
         self.roi_indices = np.array([0, 2, 1, 0], dtype=np.int32)
         self.n_roi = self.rois.shape[0]
         self.out_h, self.out_w = 4, 4
-        self.spatial_scale = 1.0
         self.gy = np.random.uniform(
             -1, 1, (self.n_roi, self.out_c, self.out_h, self.out_w))
         self.gy = self.gy.astype(np.float32)

From d6c07ebb3060520b7385e435eb8717a9a9b9353e Mon Sep 17 00:00:00 2001
From: Shingo Kitagawa <shingogo@hotmail.co.jp>
Date: Mon, 18 Feb 2019 14:37:24 +0900
Subject: [PATCH 11/24] refactor ps_roi_average/max_align_2d

---
 .../functions/ps_roi_average_align_2d.py      | 24 +++++++++----------
 chainercv/functions/ps_roi_max_align_2d.py    | 24 +++++++++----------
 2 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/chainercv/functions/ps_roi_average_align_2d.py b/chainercv/functions/ps_roi_average_align_2d.py
index 78fb632b27..1f27a0c039 100644
--- a/chainercv/functions/ps_roi_average_align_2d.py
+++ b/chainercv/functions/ps_roi_average_align_2d.py
@@ -180,13 +180,13 @@ def forward_cpu(self, inputs):
 
             roi_height = max(roi_end_h - roi_start_h, 1.)
             roi_width = max(roi_end_w - roi_start_w, 1.)
-            bin_size_h = 1. * roi_height / pooled_height
-            bin_size_w = 1. * roi_width / pooled_width
+            bin_size_h = roi_height / pooled_height
+            bin_size_w = roi_width / pooled_width
 
-            gh = np.floor(float(ph) * group_size / pooled_height)
-            gw = np.floor(float(pw) * group_size / pooled_width)
-            gh = int(min(max(gh, 0), group_size - 1))
-            gw = int(min(max(gw, 0), group_size - 1))
+            gh = int(np.floor(ph * group_size / pooled_height))
+            gw = int(np.floor(pw * group_size / pooled_width))
+            gh = min(max(gh, 0), group_size - 1)
+            gw = min(max(gw, 0), group_size - 1)
             c = (ctop * group_size + gh) * group_size + gw
 
             if self.sampling_ratio[0] is None:
@@ -380,13 +380,13 @@ def backward_cpu(self, inputs, gy):
 
             roi_width = max(roi_end_w - roi_start_w, 1.)
             roi_height = max(roi_end_h - roi_start_h, 1.)
-            bin_size_h = 1. * roi_height / pooled_height
-            bin_size_w = 1. * roi_width / pooled_width
+            bin_size_h = roi_height / pooled_height
+            bin_size_w = roi_width / pooled_width
 
-            gh = np.floor(float(ph) * group_size / pooled_height)
-            gw = np.floor(float(pw) * group_size / pooled_width)
-            gh = int(min(max(gh, 0), group_size - 1))
-            gw = int(min(max(gw, 0), group_size - 1))
+            gh = int(np.floor(ph * group_size / pooled_height))
+            gw = int(np.floor(pw * group_size / pooled_width))
+            gh = min(max(gh, 0), group_size - 1)
+            gw = min(max(gw, 0), group_size - 1)
             c = (ctop * group_size + gh) * group_size + gw
 
             top_diff_this_bin = top_diff[n, ctop, ph, pw]
diff --git a/chainercv/functions/ps_roi_max_align_2d.py b/chainercv/functions/ps_roi_max_align_2d.py
index b056297904..b011fabac5 100644
--- a/chainercv/functions/ps_roi_max_align_2d.py
+++ b/chainercv/functions/ps_roi_max_align_2d.py
@@ -123,13 +123,13 @@ def forward_cpu(self, inputs):
 
             roi_height = max(roi_end_h - roi_start_h, 1.)
             roi_width = max(roi_end_w - roi_start_w, 1.)
-            bin_size_h = 1. * roi_height / pooled_height
-            bin_size_w = 1. * roi_width / pooled_width
+            bin_size_h = roi_height / pooled_height
+            bin_size_w = roi_width / pooled_width
 
-            gh = np.floor(float(ph) * group_size / pooled_height)
-            gw = np.floor(float(pw) * group_size / pooled_width)
-            gh = int(min(max(gh, 0), group_size - 1))
-            gw = int(min(max(gw, 0), group_size - 1))
+            gh = int(np.floor(float(ph) * group_size / pooled_height))
+            gw = int(np.floor(float(pw) * group_size / pooled_width))
+            gh = min(max(gh, 0), group_size - 1)
+            gw = min(max(gw, 0), group_size - 1)
             c = (ctop * group_size + gh) * group_size + gw
 
             if self.sampling_ratio[0] is None:
@@ -334,13 +334,13 @@ def backward_cpu(self, inputs, gy):
 
             roi_width = max(roi_end_w - roi_start_w, 1.)
             roi_height = max(roi_end_h - roi_start_h, 1.)
-            bin_size_h = 1. * roi_height / pooled_height
-            bin_size_w = 1. * roi_width / pooled_width
+            bin_size_h = roi_height / pooled_height
+            bin_size_w = roi_width / pooled_width
 
-            gh = np.floor(float(ph) * group_size / pooled_height)
-            gw = np.floor(float(pw) * group_size / pooled_width)
-            gh = int(min(max(gh, 0), group_size - 1))
-            gw = int(min(max(gw, 0), group_size - 1))
+            gh = int(np.floor(float(ph) * group_size / pooled_height))
+            gw = int(np.floor(float(pw) * group_size / pooled_width))
+            gh = min(max(gh, 0), group_size - 1)
+            gw = min(max(gw, 0), group_size - 1)
             c = (ctop * group_size + gh) * group_size + gw
 
             top_diff_this_bin = top_diff[n, ctop, ph, pw]

From 8d4ab8c9803f63c0ffb52c8ce1b4d32d34d61549 Mon Sep 17 00:00:00 2001
From: Shingo Kitagawa <shingogo@hotmail.co.jp>
Date: Mon, 18 Feb 2019 16:05:19 +0900
Subject: [PATCH 12/24] fix typo in ps_roi_average_pooling_2d

---
 chainercv/functions/ps_roi_average_pooling_2d.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/chainercv/functions/ps_roi_average_pooling_2d.py b/chainercv/functions/ps_roi_average_pooling_2d.py
index 5c78b4b99d..481bff7a0a 100644
--- a/chainercv/functions/ps_roi_average_pooling_2d.py
+++ b/chainercv/functions/ps_roi_average_pooling_2d.py
@@ -3,7 +3,7 @@
 # Copyright (c) 2018 Preferred Networks, Inc.
 # ------------------------------------------------------------------------
 
-# Original works of CUDA kernel in forward_gpu and forward_gpu:
+# Original works of CUDA kernel in forward_gpu and backward_gpu:
 # ------------------------------------------------------------------------
 # Copyright (c) 2017 Microsoft
 #

From 635ce7427459ae0acffb17d16e1031fa5b307cbe Mon Sep 17 00:00:00 2001
From: Shingo Kitagawa <shingogo@hotmail.co.jp>
Date: Mon, 18 Feb 2019 17:27:51 +0900
Subject: [PATCH 13/24] update ps_roi_average_pooling_2d

---
 .../functions/ps_roi_average_pooling_2d.py    | 173 ++++++++++--------
 1 file changed, 98 insertions(+), 75 deletions(-)

diff --git a/chainercv/functions/ps_roi_average_pooling_2d.py b/chainercv/functions/ps_roi_average_pooling_2d.py
index 481bff7a0a..5ecf6d13b9 100644
--- a/chainercv/functions/ps_roi_average_pooling_2d.py
+++ b/chainercv/functions/ps_roi_average_pooling_2d.py
@@ -89,47 +89,60 @@ def forward_cpu(self, inputs):
         self._bottom_data_shape = inputs[0].shape
 
         bottom_data, bottom_rois, bottom_roi_indices = inputs
-        channels, height, width = bottom_data.shape[1:]
+        height, width = bottom_data.shape[2:]
         n_roi = bottom_rois.shape[0]
         top_data = np.empty(
             (n_roi, self.out_c, self.out_h, self.out_w), dtype=np.float32)
 
-        for i_roi in six.moves.range(n_roi):
-            y_min, x_min, y_max, x_max = bottom_rois[i_roi]
-            batch_index = bottom_roi_indices[i_roi]
-            y_min = round(y_min * self.spatial_scale)
-            x_min = round(x_min * self.spatial_scale)
-            y_max = round(y_max * self.spatial_scale)
-            x_max = round(x_max * self.spatial_scale)
-            roi_height = max(y_max - y_min, 0.1)
-            roi_width = max(x_max - x_min, 0.1)
-
-            stride_c = channels / self.out_c
-            stride_h = roi_height / self.out_h
-            stride_w = roi_width / self.out_w
-            group_h = int(round(self.out_h / self.group_size))
-            group_w = int(round(self.out_w / self.group_size))
-
-            for out_h in six.moves.range(self.out_h):
-                slice_h, len_h = _roi_pooling_slice(
-                    out_h, stride_h, height, int(y_min))
-                if slice_h.stop <= slice_h.start:
-                    continue
-                for out_w in six.moves.range(self.out_w):
-                    slice_w, len_w = _roi_pooling_slice(
-                        out_w, stride_w, width, int(x_min))
-                    if slice_w.stop <= slice_w.start:
-                        continue
-                    for out_c in six.moves.range(self.out_c):
-                        slice_c, len_c = _roi_pooling_slice(
-                            out_c, stride_c, channels, 0)
-                        roi_data = bottom_data[
-                            batch_index, slice_c, slice_h, slice_w]\
-                            .reshape((len_c, -1))
-                        c = (out_h // group_h) * self.group_size \
-                            + (out_w // group_w)
-                        top_data[i_roi, out_c, out_h, out_w] = np.average(
-                            roi_data[c])
+        group_size = self.group_size
+        pooled_dim, pooled_height, pooled_width \
+            = self.out_c, self.out_h, self.out_w
+        spatial_scale = self.spatial_scale
+
+        for i in six.moves.range(top_data.size):
+            pw = i % pooled_width
+            ph = int(i / pooled_width) % pooled_height
+            ctop = int(i / pooled_width / pooled_height) % pooled_dim
+            n = int(i / pooled_width / pooled_height / pooled_dim)
+
+            roi_batch_ind = bottom_roi_indices[n]
+            roi_start_h = bottom_rois[n, 0] * spatial_scale
+            roi_start_w = bottom_rois[n, 1] * spatial_scale
+            roi_end_h = bottom_rois[n, 2] * spatial_scale
+            roi_end_w = bottom_rois[n, 3] * spatial_scale
+
+            roi_height = max(roi_end_h - roi_start_h, 0.1)
+            roi_width = max(roi_end_w - roi_start_w, 0.1)
+            bin_size_h = roi_height / pooled_height
+            bin_size_w = roi_width / pooled_width
+
+            hstart = int(np.floor(ph * bin_size_h + roi_start_h))
+            wstart = int(np.floor(pw * bin_size_w + roi_start_w))
+            hend = int(np.floor((ph + 1) * bin_size_h + roi_start_h))
+            wend = int(np.floor((pw + 1) * bin_size_w + roi_start_w))
+            hstart = min(max(hstart, 0), height)
+            wstart = min(max(wstart, 0), width)
+            hend = min(max(hend, 0), height)
+            wend = min(max(wend, 0), width)
+
+            gh = int(np.floor(ph * group_size / pooled_height))
+            gw = int(np.floor(pw * group_size / pooled_width))
+            gh = min(max(gh, 0), group_size - 1)
+            gw = min(max(gw, 0), group_size - 1)
+            c = (ctop * group_size + gh) * group_size + gw
+
+            if hstart >= hend or wstart >= wend:
+                top_data[n, ctop, ph, pw] = 0
+                continue
+
+            output_val = 0.
+            count = (hend - hstart) * (wend - wstart)
+            for iy in six.moves.range(hstart, hend):
+                for ix in six.moves.range(wstart, wend):
+                    output_val += bottom_data[roi_batch_ind, c, iy, ix]
+            output_val /= count
+            top_data[n, ctop, ph, pw] = output_val
+
         return top_data,
 
     def forward_gpu(self, inputs):
@@ -222,46 +235,56 @@ def forward_gpu(self, inputs):
 
     def backward_cpu(self, inputs, gy):
         _, bottom_rois, bottom_roi_indices = inputs
-        channels, height, width = self._bottom_data_shape[1:]
-        n_roi = bottom_rois.shape[0]
+        top_diff = gy[0]
+        height, width = self._bottom_data_shape[2:]
         bottom_diff = np.zeros(self._bottom_data_shape, np.float32)
 
-        for i_roi in six.moves.range(n_roi):
-            y_min, x_min, y_max, x_max = bottom_rois[i_roi]
-            batch_index = bottom_roi_indices[i_roi]
-            y_min = round(y_min * self.spatial_scale)
-            x_min = round(x_min * self.spatial_scale)
-            y_max = round(y_max * self.spatial_scale)
-            x_max = round(x_max * self.spatial_scale)
-            roi_height = max(y_max - y_min, 0.1)
-            roi_width = max(x_max - x_min, 0.1)
-
-            stride_c = channels / self.out_c
-            stride_h = roi_height / self.out_h
-            stride_w = roi_width / self.out_w
-            group_h = int(round(self.out_h / self.group_size))
-            group_w = int(round(self.out_w / self.group_size))
-
-            for out_h in six.moves.range(self.out_h):
-                slice_h, len_h = _roi_pooling_slice(
-                    out_h, stride_h, height, int(y_min))
-                if slice_h.stop <= slice_h.start:
-                    continue
-                for out_w in six.moves.range(self.out_w):
-                    slice_w, len_w = _roi_pooling_slice(
-                        out_w, stride_w, width, int(x_min))
-                    if slice_w.stop <= slice_w.start:
-                        continue
-                    for out_c in six.moves.range(self.out_c):
-                        diff_val = gy[0][i_roi, out_c, out_h, out_w]
-                        diff_val = diff_val / len_h / len_w
-                        start_c = int(np.floor(out_c * stride_c))
-                        start_c = min(max(start_c, 0), channels)
-
-                        c = (out_h // group_h) * self.group_size \
-                            + (out_w // group_w) + start_c
-                        bottom_diff[batch_index, c, slice_h, slice_w] \
-                            += diff_val
+        group_size = self.group_size
+        pooled_dim, pooled_width, pooled_height \
+            = self.out_c, self.out_w, self.out_h
+        spatial_scale = self.spatial_scale
+
+        for i in six.moves.range(top_diff.size):
+            pw = i % pooled_width
+            ph = int(i / pooled_width) % pooled_height
+            ctop = int(i / pooled_width / pooled_height) % pooled_dim
+            n = int(i / pooled_width / pooled_height / pooled_dim)
+
+            roi_batch_ind = int(bottom_roi_indices[n])
+            roi_start_h = bottom_rois[n, 0] * spatial_scale
+            roi_start_w = bottom_rois[n, 1] * spatial_scale
+            roi_end_h = bottom_rois[n, 2] * spatial_scale
+            roi_end_w = bottom_rois[n, 3] * spatial_scale
+
+            roi_height = max(roi_end_h - roi_start_h, 0.1)
+            roi_width = max(roi_end_w - roi_start_w, 0.1)
+            bin_size_h = roi_height / pooled_height
+            bin_size_w = roi_width / pooled_width
+
+            hstart = int(np.floor(ph * bin_size_h + roi_start_h))
+            wstart = int(np.floor(pw * bin_size_w + roi_start_w))
+            hend = int(np.floor((ph + 1) * bin_size_h + roi_start_h))
+            wend = int(np.floor((pw + 1) * bin_size_w + roi_start_w))
+            hstart = min(max(hstart, 0), height)
+            wstart = min(max(wstart, 0), width)
+            hend = min(max(hend, 0), height)
+            wend = min(max(wend, 0), width)
+
+            gh = int(np.floor(ph * group_size / pooled_height))
+            gw = int(np.floor(pw * group_size / pooled_width))
+            gh = min(max(gh, 0), group_size - 1)
+            gw = min(max(gw, 0), group_size - 1)
+            c = (ctop * group_size + gh) * group_size + gw
+
+            if (hstart >= hend) or (wstart >= wend):
+                continue
+
+            count = (hend - hstart) * (wend - wstart)
+            diff_val = top_diff[n, ctop, ph, pw] / count
+            for iy in six.moves.range(hstart, hend):
+                for ix in six.moves.range(wstart, wend):
+                    bottom_diff[roi_batch_ind, c, iy, ix] += diff_val
+
         return bottom_diff, None, None
 
     def backward_gpu(self, inputs, gy):

From 65710e5bdc0c5e917eed67b1fde68665022920f9 Mon Sep 17 00:00:00 2001
From: Shingo Kitagawa <shingogo@hotmail.co.jp>
Date: Mon, 18 Feb 2019 18:00:10 +0900
Subject: [PATCH 14/24] fix bin end rounding (floor -> ceil) in ps_roi_average_pooling_2d

---
 chainercv/functions/ps_roi_average_pooling_2d.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/chainercv/functions/ps_roi_average_pooling_2d.py b/chainercv/functions/ps_roi_average_pooling_2d.py
index 5ecf6d13b9..8d0c26c854 100644
--- a/chainercv/functions/ps_roi_average_pooling_2d.py
+++ b/chainercv/functions/ps_roi_average_pooling_2d.py
@@ -118,8 +118,8 @@ def forward_cpu(self, inputs):
 
             hstart = int(np.floor(ph * bin_size_h + roi_start_h))
             wstart = int(np.floor(pw * bin_size_w + roi_start_w))
-            hend = int(np.floor((ph + 1) * bin_size_h + roi_start_h))
-            wend = int(np.floor((pw + 1) * bin_size_w + roi_start_w))
+            hend = int(np.ceil((ph + 1) * bin_size_h + roi_start_h))
+            wend = int(np.ceil((pw + 1) * bin_size_w + roi_start_w))
             hstart = min(max(hstart, 0), height)
             wstart = min(max(wstart, 0), width)
             hend = min(max(hend, 0), height)
@@ -263,8 +263,8 @@ def backward_cpu(self, inputs, gy):
 
             hstart = int(np.floor(ph * bin_size_h + roi_start_h))
             wstart = int(np.floor(pw * bin_size_w + roi_start_w))
-            hend = int(np.floor((ph + 1) * bin_size_h + roi_start_h))
-            wend = int(np.floor((pw + 1) * bin_size_w + roi_start_w))
+            hend = int(np.ceil((ph + 1) * bin_size_h + roi_start_h))
+            wend = int(np.ceil((pw + 1) * bin_size_w + roi_start_w))
             hstart = min(max(hstart, 0), height)
             wstart = min(max(wstart, 0), width)
             hend = min(max(hend, 0), height)

From 0aa99570ff53780af0e4f70a2d44881b9b606ddc Mon Sep 17 00:00:00 2001
From: Shingo Kitagawa <shingogo@hotmail.co.jp>
Date: Mon, 18 Feb 2019 18:02:26 +0900
Subject: [PATCH 15/24] drop int() cast of roi_batch_ind in ps_roi_max_align_2d
 and ps_roi_average_align_2d

---
 chainercv/functions/ps_roi_average_align_2d.py | 2 +-
 chainercv/functions/ps_roi_max_align_2d.py     | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/chainercv/functions/ps_roi_average_align_2d.py b/chainercv/functions/ps_roi_average_align_2d.py
index 1f27a0c039..cdcfefc83c 100644
--- a/chainercv/functions/ps_roi_average_align_2d.py
+++ b/chainercv/functions/ps_roi_average_align_2d.py
@@ -172,7 +172,7 @@ def forward_cpu(self, inputs):
             ctop = int(i / pooled_width / pooled_height) % pooled_dim
             n = int(i / pooled_width / pooled_height / pooled_dim)
 
-            roi_batch_ind = int(bottom_roi_indices[n])
+            roi_batch_ind = bottom_roi_indices[n]
             roi_start_h = bottom_rois[n, 0] * spatial_scale
             roi_start_w = bottom_rois[n, 1] * spatial_scale
             roi_end_h = bottom_rois[n, 2] * spatial_scale
diff --git a/chainercv/functions/ps_roi_max_align_2d.py b/chainercv/functions/ps_roi_max_align_2d.py
index b011fabac5..6503740c45 100644
--- a/chainercv/functions/ps_roi_max_align_2d.py
+++ b/chainercv/functions/ps_roi_max_align_2d.py
@@ -115,7 +115,7 @@ def forward_cpu(self, inputs):
             ctop = int(i / pooled_width / pooled_height) % pooled_dim
             n = int(i / pooled_width / pooled_height / pooled_dim)
 
-            roi_batch_ind = int(bottom_roi_indices[n])
+            roi_batch_ind = bottom_roi_indices[n]
             roi_start_h = bottom_rois[n, 0] * spatial_scale
             roi_start_w = bottom_rois[n, 1] * spatial_scale
             roi_end_h = bottom_rois[n, 2] * spatial_scale
@@ -326,7 +326,7 @@ def backward_cpu(self, inputs, gy):
             ctop = int(i / pooled_width / pooled_height) % pooled_dim
             n = int(i / pooled_width / pooled_height / pooled_dim)
 
-            roi_batch_ind = int(bottom_roi_indices[n])
+            roi_batch_ind = bottom_roi_indices[n]
             roi_start_h = bottom_rois[n, 0] * spatial_scale
             roi_start_w = bottom_rois[n, 1] * spatial_scale
             roi_end_h = bottom_rois[n, 2] * spatial_scale

From 7395fea7950ed5ae5955ead1f15b087c335aee39 Mon Sep 17 00:00:00 2001
From: Shingo Kitagawa <shingogo@hotmail.co.jp>
Date: Mon, 18 Feb 2019 18:16:23 +0900
Subject: [PATCH 16/24] refactor ps_roi_average_pooling_2d

---
 .../functions/ps_roi_average_pooling_2d.py    | 26 ++++++++++---------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/chainercv/functions/ps_roi_average_pooling_2d.py b/chainercv/functions/ps_roi_average_pooling_2d.py
index 8d0c26c854..e227652132 100644
--- a/chainercv/functions/ps_roi_average_pooling_2d.py
+++ b/chainercv/functions/ps_roi_average_pooling_2d.py
@@ -94,10 +94,11 @@ def forward_cpu(self, inputs):
         top_data = np.empty(
             (n_roi, self.out_c, self.out_h, self.out_w), dtype=np.float32)
 
-        group_size = self.group_size
-        pooled_dim, pooled_height, pooled_width \
-            = self.out_c, self.out_h, self.out_w
         spatial_scale = self.spatial_scale
+        pooled_dim = self.out_c
+        pooled_height = self.out_h
+        pooled_width = self.out_w
+        group_size = self.group_size
 
         for i in six.moves.range(top_data.size):
             pw = i % pooled_width
@@ -137,9 +138,9 @@ def forward_cpu(self, inputs):
 
             output_val = 0.
             count = (hend - hstart) * (wend - wstart)
-            for iy in six.moves.range(hstart, hend):
-                for ix in six.moves.range(wstart, wend):
-                    output_val += bottom_data[roi_batch_ind, c, iy, ix]
+            for y in six.moves.range(hstart, hend):
+                for x in six.moves.range(wstart, wend):
+                    output_val += bottom_data[roi_batch_ind, c, y, x]
             output_val /= count
             top_data[n, ctop, ph, pw] = output_val
 
@@ -239,10 +240,11 @@ def backward_cpu(self, inputs, gy):
         height, width = self._bottom_data_shape[2:]
         bottom_diff = np.zeros(self._bottom_data_shape, np.float32)
 
-        group_size = self.group_size
-        pooled_dim, pooled_width, pooled_height \
-            = self.out_c, self.out_w, self.out_h
         spatial_scale = self.spatial_scale
+        pooled_dim = self.out_c
+        pooled_height = self.out_h
+        pooled_width = self.out_w
+        group_size = self.group_size
 
         for i in six.moves.range(top_diff.size):
             pw = i % pooled_width
@@ -281,9 +283,9 @@ def backward_cpu(self, inputs, gy):
 
             count = (hend - hstart) * (wend - wstart)
             diff_val = top_diff[n, ctop, ph, pw] / count
-            for iy in six.moves.range(hstart, hend):
-                for ix in six.moves.range(wstart, wend):
-                    bottom_diff[roi_batch_ind, c, iy, ix] += diff_val
+            for y in six.moves.range(hstart, hend):
+                for x in six.moves.range(wstart, wend):
+                    bottom_diff[roi_batch_ind, c, y, x] += diff_val
 
         return bottom_diff, None, None
 

From f166c7eca212fb1b74af841e8a6adef845e08384 Mon Sep 17 00:00:00 2001
From: Shingo Kitagawa <shingogo@hotmail.co.jp>
Date: Mon, 18 Feb 2019 18:24:06 +0900
Subject: [PATCH 17/24] refactor ps_roi_average_align_2d

---
 chainercv/functions/ps_roi_average_align_2d.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/chainercv/functions/ps_roi_average_align_2d.py b/chainercv/functions/ps_roi_average_align_2d.py
index cdcfefc83c..424b2fabd0 100644
--- a/chainercv/functions/ps_roi_average_align_2d.py
+++ b/chainercv/functions/ps_roi_average_align_2d.py
@@ -161,10 +161,11 @@ def forward_cpu(self, inputs):
         top_data = np.empty(
             (n_roi, self.out_c, self.out_h, self.out_w), dtype=np.float32)
 
-        group_size = self.group_size
-        pooled_dim, pooled_width, pooled_height \
-            = self.out_c, self.out_w, self.out_h
         spatial_scale = self.spatial_scale
+        pooled_dim = self.out_c
+        pooled_height = self.out_h
+        pooled_width = self.out_w
+        group_size = self.group_size
 
         for i in six.moves.range(top_data.size):
             pw = i % pooled_width

From 098d5f274ed03b6cbdf06dd1a0af8b588012755a Mon Sep 17 00:00:00 2001
From: Shingo Kitagawa <shingogo@hotmail.co.jp>
Date: Mon, 18 Feb 2019 18:24:12 +0900
Subject: [PATCH 18/24] use 0.1 as minimum roi size in ps_roi_average_align_2d

---
 chainercv/functions/ps_roi_average_align_2d.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/chainercv/functions/ps_roi_average_align_2d.py b/chainercv/functions/ps_roi_average_align_2d.py
index 424b2fabd0..8833247683 100644
--- a/chainercv/functions/ps_roi_average_align_2d.py
+++ b/chainercv/functions/ps_roi_average_align_2d.py
@@ -179,8 +179,8 @@ def forward_cpu(self, inputs):
             roi_end_h = bottom_rois[n, 2] * spatial_scale
             roi_end_w = bottom_rois[n, 3] * spatial_scale
 
-            roi_height = max(roi_end_h - roi_start_h, 1.)
-            roi_width = max(roi_end_w - roi_start_w, 1.)
+            roi_height = max(roi_end_h - roi_start_h, 0.1)
+            roi_width = max(roi_end_w - roi_start_w, 0.1)
             bin_size_h = roi_height / pooled_height
             bin_size_w = roi_width / pooled_width
 
@@ -379,8 +379,8 @@ def backward_cpu(self, inputs, gy):
             roi_end_h = bottom_rois[n, 2] * spatial_scale
             roi_end_w = bottom_rois[n, 3] * spatial_scale
 
-            roi_width = max(roi_end_w - roi_start_w, 1.)
-            roi_height = max(roi_end_h - roi_start_h, 1.)
+            roi_width = max(roi_end_w - roi_start_w, 0.1)
+            roi_height = max(roi_end_h - roi_start_h, 0.1)
             bin_size_h = roi_height / pooled_height
             bin_size_w = roi_width / pooled_width
 

From 92afda6d475950216626995ba822009020573987 Mon Sep 17 00:00:00 2001
From: Shingo Kitagawa <shingogo@hotmail.co.jp>
Date: Mon, 18 Feb 2019 18:40:54 +0900
Subject: [PATCH 19/24] refactor cuda kernel in ps_roi_average_pooling_2d

---
 .../functions/ps_roi_average_pooling_2d.py    | 96 +++++++++----------
 1 file changed, 44 insertions(+), 52 deletions(-)

diff --git a/chainercv/functions/ps_roi_average_pooling_2d.py b/chainercv/functions/ps_roi_average_pooling_2d.py
index e227652132..ae5074f6b5 100644
--- a/chainercv/functions/ps_roi_average_pooling_2d.py
+++ b/chainercv/functions/ps_roi_average_pooling_2d.py
@@ -157,14 +157,14 @@ def forward_gpu(self, inputs):
             (n_roi, self.out_c, self.out_h, self.out_w), dtype=np.float32)
         cuda.elementwise(
             '''
-            raw float32 bottom_data, raw float32 bottom_rois,
+            raw T bottom_data, raw T bottom_rois,
             raw int32 bottom_roi_indices,
-            float32 spatial_scale, int32 channels,
+            T spatial_scale, int32 channels,
             int32 height, int32 width,
             int32 pooled_dim, int32 pooled_height, int32 pooled_width,
             int32 group_size
             ''',
-            'float32 top_data',
+            'T top_data',
             '''
             // pos in output filter
             int ph = (i / pooled_width) % pooled_height;
@@ -173,31 +173,27 @@ def forward_gpu(self, inputs):
             int n = i / pooled_width / pooled_height / pooled_dim;
 
             int roi_batch_ind = bottom_roi_indices[n];
-            float roi_start_h = static_cast<float>(
-                round(bottom_rois[n * 4 + 0])) * spatial_scale;
-            float roi_start_w = static_cast<float>(
-                round(bottom_rois[n * 4 + 1])) * spatial_scale;
-            float roi_end_h = static_cast<float>(
-                round(bottom_rois[n * 4 + 2])) * spatial_scale;
-            float roi_end_w = static_cast<float>(
-                round(bottom_rois[n * 4 + 3])) * spatial_scale;
+            T roi_start_h = bottom_rois[n * 4 + 0] * spatial_scale;
+            T roi_start_w = bottom_rois[n * 4 + 1] * spatial_scale;
+            T roi_end_h = bottom_rois[n * 4 + 2] * spatial_scale;
+            T roi_end_w = bottom_rois[n * 4 + 3] * spatial_scale;
 
             // Force too small ROIs to be 1x1
-            float roi_height = max(roi_end_h - roi_start_h, 0.1);
-            float roi_width = max(roi_end_w - roi_start_w, 0.1);  // avoid 0
+            T roi_height = max(roi_end_h - roi_start_h, 0.1);
+            T roi_width = max(roi_end_w - roi_start_w, 0.1);  // avoid 0
 
             // Compute w and h at bottom
-            float bin_size_h = roi_height / static_cast<float>(pooled_height);
-            float bin_size_w = roi_width / static_cast<float>(pooled_width);
-
-            int hstart = static_cast<int>(floor(static_cast<float>(ph)
-                                                * bin_size_h + roi_start_h));
-            int wstart = static_cast<int>(floor(static_cast<float>(pw)
-                                                * bin_size_w + roi_start_w));
-            int hend = static_cast<int>(ceil(static_cast<float>(ph + 1)
-                                             * bin_size_h + roi_start_h));
-            int wend = static_cast<int>(ceil(static_cast<float>(pw + 1)
-                                            * bin_size_w + roi_start_w));
+            T bin_size_h = roi_height / static_cast<T>(pooled_height);
+            T bin_size_w = roi_width / static_cast<T>(pooled_width);
+
+            int hstart = floor(
+                static_cast<T>(ph) * bin_size_h + roi_start_h);
+            int wstart = floor(
+                static_cast<T>(pw) * bin_size_w + roi_start_w);
+            int hend = ceil(
+                static_cast<T>(ph + 1) * bin_size_h + roi_start_h);
+            int wend = ceil(
+                static_cast<T>(pw + 1) * bin_size_w + roi_start_w);
 
             // Add roi offsets and clip to input boundaries
             hstart = min(max(hstart, 0), height);
@@ -208,15 +204,15 @@ def forward_gpu(self, inputs):
 
             // Compute c at bottom
             int gh = floor(
-                static_cast<float>(ph) * group_size / pooled_height);
+                static_cast<T>(ph) * group_size / pooled_height);
             int gw = floor(
-                static_cast<float>(pw) * group_size / pooled_width);
+                static_cast<T>(pw) * group_size / pooled_width);
             gh = min(max(gh, 0), group_size - 1);
             gw = min(max(gw, 0), group_size - 1);
             int c = (ctop * group_size + gh) * group_size + gw;
 
             int data_offset = (roi_batch_ind * channels + c) * height * width;
-            float out_sum = 0;
+            T out_sum = 0;
             for (int h = hstart; h < hend; ++h){
               for (int w = wstart; w < wend; ++w){
                  int bottom_index = h * width + w;
@@ -224,8 +220,8 @@ def forward_gpu(self, inputs):
               }
             }
 
-            float bin_area = (hend - hstart) * (wend - wstart);
-            top_data = is_empty? (float) 0. : out_sum / bin_area;
+            T bin_area = (hend - hstart) * (wend - wstart);
+            top_data = is_empty? (T) 0. : out_sum / bin_area;
             ''', 'ps_roi_average_pooling_2d_fwd'
         )(bottom_data, bottom_rois, bottom_roi_indices,
           self.spatial_scale, channels, height, width,
@@ -295,13 +291,13 @@ def backward_gpu(self, inputs, gy):
         bottom_diff = cuda.cupy.zeros(self._bottom_data_shape, np.float32)
         cuda.elementwise(
             '''
-            raw float32 top_diff, raw float32 bottom_rois,
+            raw T top_diff, raw T bottom_rois,
             raw int32 bottom_roi_indices,
-            float32 spatial_scale, int32 channels, int32 height, int32 width,
+            T spatial_scale, int32 channels, int32 height, int32 width,
             int32 pooled_dim, int32 pooled_height, int32 pooled_width,
             int32 group_size
             ''',
-            'raw float32 bottom_diff',
+            'raw T bottom_diff',
             '''
             int ph = (i / pooled_width) % pooled_height;
             int pw = i % pooled_width;
@@ -310,31 +306,27 @@ def backward_gpu(self, inputs, gy):
 
             // [start, end) interval for spatial sampling
             int roi_batch_ind = bottom_roi_indices[n];
-            float roi_start_h = static_cast<float>(
-                round(bottom_rois[n * 4 + 0])) * spatial_scale;
-            float roi_start_w = static_cast<float>(
-                round(bottom_rois[n * 4 + 1])) * spatial_scale;
-            float roi_end_h = static_cast<float>(
-                round(bottom_rois[n * 4 + 2])) * spatial_scale;
-            float roi_end_w = static_cast<float>(
-                round(bottom_rois[n * 4 + 3])) * spatial_scale;
+            T roi_start_h = bottom_rois[n * 4 + 0] * spatial_scale;
+            T roi_start_w = bottom_rois[n * 4 + 1] * spatial_scale;
+            T roi_end_h = bottom_rois[n * 4 + 2] * spatial_scale;
+            T roi_end_w = bottom_rois[n * 4 + 3] * spatial_scale;
 
             // Force too small ROIs to be 1x1
-            float roi_height = max(roi_end_h - roi_start_h, 0.1);
-            float roi_width = max(roi_end_w - roi_start_w, 0.1); // avoid 0
+            T roi_height = max(roi_end_h - roi_start_h, 0.1);
+            T roi_width = max(roi_end_w - roi_start_w, 0.1); // avoid 0
 
             // Compute w and h at bottom
-            float bin_size_h = roi_height / static_cast<float>(pooled_height);
-            float bin_size_w = roi_width / static_cast<float>(pooled_width);
+            T bin_size_h = roi_height / static_cast<T>(pooled_height);
+            T bin_size_w = roi_width / static_cast<T>(pooled_width);
 
             int hstart = floor(
-                static_cast<float>(ph) * bin_size_h + roi_start_h);
+                static_cast<T>(ph) * bin_size_h + roi_start_h);
             int wstart = floor(
-                static_cast<float>(pw) * bin_size_w + roi_start_w);
+                static_cast<T>(pw) * bin_size_w + roi_start_w);
             int hend = ceil(
-                static_cast<float>(ph + 1.0) * bin_size_h + roi_start_h);
+                static_cast<T>(ph + 1.0) * bin_size_h + roi_start_h);
             int wend = ceil(
-                static_cast<float>(pw + 1.0) * bin_size_w + roi_start_w);
+                static_cast<T>(pw + 1.0) * bin_size_w + roi_start_w);
 
             // Add roi offsets and clip to input boundaries
             hstart = min(max(hstart, 0), height);
@@ -345,9 +337,9 @@ def backward_gpu(self, inputs, gy):
 
             // Compute c at bottom
             int gh = floor(
-                static_cast<float>(ph) * group_size / pooled_height);
+                static_cast<T>(ph) * group_size / pooled_height);
             int gw = floor(
-                static_cast<float>(pw) * group_size / pooled_width);
+                static_cast<T>(pw) * group_size / pooled_width);
             gh = min(max(gh, 0), group_size - 1);
             gw = min(max(gw, 0), group_size - 1);
             int c = (ctop * group_size + gh) * group_size + gw;
@@ -357,8 +349,8 @@ def backward_gpu(self, inputs, gy):
             int top_offset =
                 (n * pooled_dim + ctop) * pooled_height * pooled_width;
 
-            float bin_area = (hend - hstart) * (wend - wstart);
-            float diff_val = is_empty ? (float) 0. :
+            T bin_area = (hend - hstart) * (wend - wstart);
+            T diff_val = is_empty ? (T) 0. :
                 top_diff[top_offset + ph * pooled_width + pw] / bin_area;
             for (int h = hstart; h < hend; ++h){
               for (int w = wstart; w < wend; ++w){

From 7001c903485cadd146956e812e7c05e47040c0cd Mon Sep 17 00:00:00 2001
From: Shingo Kitagawa <shingogo@hotmail.co.jp>
Date: Mon, 18 Feb 2019 18:41:11 +0900
Subject: [PATCH 20/24] refactor cuda kernel in ps_roi_average_align_2d

---
 .../functions/ps_roi_average_align_2d.py      | 24 +++++++------------
 1 file changed, 8 insertions(+), 16 deletions(-)

diff --git a/chainercv/functions/ps_roi_average_align_2d.py b/chainercv/functions/ps_roi_average_align_2d.py
index 8833247683..80d198cce9 100644
--- a/chainercv/functions/ps_roi_average_align_2d.py
+++ b/chainercv/functions/ps_roi_average_align_2d.py
@@ -268,14 +268,10 @@ def forward_gpu(self, inputs):
             int n = i / pooled_width / pooled_height / pooled_dim;
 
             int roi_batch_ind = bottom_roi_indices[n];
-            T roi_start_h = static_cast<T>(
-                round(bottom_rois[n * 4 + 0])) * spatial_scale;
-            T roi_start_w = static_cast<T>(
-                round(bottom_rois[n * 4 + 1])) * spatial_scale;
-            T roi_end_h = static_cast<T>(
-                round(bottom_rois[n * 4 + 2])) * spatial_scale;
-            T roi_end_w = static_cast<T>(
-                round(bottom_rois[n * 4 + 3])) * spatial_scale;
+            T roi_start_h = bottom_rois[n * 4 + 0] * spatial_scale;
+            T roi_start_w = bottom_rois[n * 4 + 1] * spatial_scale;
+            T roi_end_h = bottom_rois[n * 4 + 2] * spatial_scale;
+            T roi_end_w = bottom_rois[n * 4 + 3] * spatial_scale;
 
             // Force too small ROIs to be 1x1
             T roi_height = max(roi_end_h - roi_start_h, 0.1);
@@ -465,14 +461,10 @@ def backward_gpu(self, inputs, gy):
 
             // Do not using rounding; this implementation detail is critical
             int roi_batch_ind = bottom_roi_indices[n];
-            T roi_start_h = static_cast<T>(
-                round(bottom_rois[n * 4 + 0])) * spatial_scale;
-            T roi_start_w = static_cast<T>(
-                round(bottom_rois[n * 4 + 1])) * spatial_scale;
-            T roi_end_h = static_cast<T>(
-                round(bottom_rois[n * 4 + 2])) * spatial_scale;
-            T roi_end_w = static_cast<T>(
-                round(bottom_rois[n * 4 + 3])) * spatial_scale;
+            T roi_start_h = bottom_rois[n * 4 + 0] * spatial_scale;
+            T roi_start_w = bottom_rois[n * 4 + 1] * spatial_scale;
+            T roi_end_h = bottom_rois[n * 4 + 2] * spatial_scale;
+            T roi_end_w = bottom_rois[n * 4 + 3] * spatial_scale;
 
             // Force too small ROIs to be 1x1
             T roi_height = max(roi_end_h - roi_start_h, 0.1);

From 4dbb9136a9d24a100ded372b42cf62a609f6c709 Mon Sep 17 00:00:00 2001
From: Shingo Kitagawa <shingogo@hotmail.co.jp>
Date: Mon, 18 Feb 2019 19:21:06 +0900
Subject: [PATCH 21/24] refactor ps_roi_max_align_2d

---
 chainercv/functions/ps_roi_max_align_2d.py | 36 +++++++++-------------
 1 file changed, 14 insertions(+), 22 deletions(-)

diff --git a/chainercv/functions/ps_roi_max_align_2d.py b/chainercv/functions/ps_roi_max_align_2d.py
index 6503740c45..0907b52ca4 100644
--- a/chainercv/functions/ps_roi_max_align_2d.py
+++ b/chainercv/functions/ps_roi_max_align_2d.py
@@ -121,13 +121,13 @@ def forward_cpu(self, inputs):
             roi_end_h = bottom_rois[n, 2] * spatial_scale
             roi_end_w = bottom_rois[n, 3] * spatial_scale
 
-            roi_height = max(roi_end_h - roi_start_h, 1.)
-            roi_width = max(roi_end_w - roi_start_w, 1.)
+            roi_height = max(roi_end_h - roi_start_h, 0.1)
+            roi_width = max(roi_end_w - roi_start_w, 0.1)
             bin_size_h = roi_height / pooled_height
             bin_size_w = roi_width / pooled_width
 
-            gh = int(np.floor(float(ph) * group_size / pooled_height))
-            gw = int(np.floor(float(pw) * group_size / pooled_width))
+            gh = int(np.floor(ph * group_size / pooled_height))
+            gw = int(np.floor(pw * group_size / pooled_width))
             gh = min(max(gh, 0), group_size - 1)
             gw = min(max(gw, 0), group_size - 1)
             c = (ctop * group_size + gh) * group_size + gw
@@ -216,14 +216,10 @@ def forward_gpu(self, inputs):
             int n = i / pooled_width / pooled_height / pooled_dim;
 
             int roi_batch_ind = bottom_roi_indices[n];
-            T roi_start_h = static_cast<T>(
-                round(bottom_rois[n * 4 + 0])) * spatial_scale;
-            T roi_start_w = static_cast<T>(
-                round(bottom_rois[n * 4 + 1])) * spatial_scale;
-            T roi_end_h = static_cast<T>(
-                round(bottom_rois[n * 4 + 2])) * spatial_scale;
-            T roi_end_w = static_cast<T>(
-                round(bottom_rois[n * 4 + 3])) * spatial_scale;
+            T roi_start_h = bottom_rois[n * 4 + 0] * spatial_scale;
+            T roi_start_w = bottom_rois[n * 4 + 1] * spatial_scale;
+            T roi_end_h = bottom_rois[n * 4 + 2] * spatial_scale;
+            T roi_end_w = bottom_rois[n * 4 + 3] * spatial_scale;
 
             // Force too small ROIs to be 1x1
             T roi_height = max(roi_end_h - roi_start_h, 0.1);
@@ -332,8 +328,8 @@ def backward_cpu(self, inputs, gy):
             roi_end_h = bottom_rois[n, 2] * spatial_scale
             roi_end_w = bottom_rois[n, 3] * spatial_scale
 
-            roi_width = max(roi_end_w - roi_start_w, 1.)
-            roi_height = max(roi_end_h - roi_start_h, 1.)
+            roi_height = max(roi_end_h - roi_start_h, 0.1)
+            roi_width = max(roi_end_w - roi_start_w, 0.1)
             bin_size_h = roi_height / pooled_height
             bin_size_w = roi_width / pooled_width
 
@@ -419,14 +415,10 @@ def backward_gpu(self, inputs, gy):
 
             // Do not using rounding; this implementation detail is critical
             int roi_batch_ind = bottom_roi_indices[n];
-            T roi_start_h = static_cast<T>(
-                round(bottom_rois[n * 4 + 0])) * spatial_scale;
-            T roi_start_w = static_cast<T>(
-                round(bottom_rois[n * 4 + 1])) * spatial_scale;
-            T roi_end_h = static_cast<T>(
-                round(bottom_rois[n * 4 + 2])) * spatial_scale;
-            T roi_end_w = static_cast<T>(
-                round(bottom_rois[n * 4 + 3])) * spatial_scale;
+            T roi_start_h = bottom_rois[n * 4 + 0] * spatial_scale;
+            T roi_start_w = bottom_rois[n * 4 + 1] * spatial_scale;
+            T roi_end_h = bottom_rois[n * 4 + 2] * spatial_scale;
+            T roi_end_w = bottom_rois[n * 4 + 3] * spatial_scale;
 
             // Force too small ROIs to be 1x1
             T roi_height = max(roi_end_h - roi_start_h, 0.1);

From 86c33ba849869edfe9563d08ca4e9f970a1666c5 Mon Sep 17 00:00:00 2001
From: Shingo Kitagawa <shingogo@hotmail.co.jp>
Date: Mon, 18 Feb 2019 19:41:44 +0900
Subject: [PATCH 22/24] skip bins without a valid argmax (maxidx == -1) in ps_roi_max_align_2d backward

---
 chainercv/functions/ps_roi_max_align_2d.py | 173 +++++++++++----------
 1 file changed, 89 insertions(+), 84 deletions(-)

diff --git a/chainercv/functions/ps_roi_max_align_2d.py b/chainercv/functions/ps_roi_max_align_2d.py
index 0907b52ca4..a679d9b38c 100644
--- a/chainercv/functions/ps_roi_max_align_2d.py
+++ b/chainercv/functions/ps_roi_max_align_2d.py
@@ -340,47 +340,49 @@ def backward_cpu(self, inputs, gy):
             c = (ctop * group_size + gh) * group_size + gw
 
             top_diff_this_bin = top_diff[n, ctop, ph, pw]
+            maxidx = self.argmax_data[n, ctop, ph, pw]
 
-            if self.sampling_ratio[0] is None:
-                roi_bin_grid_h = int(np.ceil(roi_height / pooled_height))
-            else:
-                roi_bin_grid_h = self.sampling_ratio[0]
-            if self.sampling_ratio[1] is None:
-                roi_bin_grid_w = int(np.ceil(roi_width / pooled_width))
-            else:
-                roi_bin_grid_w = self.sampling_ratio[1]
+            if maxidx != -1:
+                if self.sampling_ratio[0] is None:
+                    roi_bin_grid_h = int(np.ceil(roi_height / pooled_height))
+                else:
+                    roi_bin_grid_h = self.sampling_ratio[0]
+                if self.sampling_ratio[1] is None:
+                    roi_bin_grid_w = int(np.ceil(roi_width / pooled_width))
+                else:
+                    roi_bin_grid_w = self.sampling_ratio[1]
 
-            maxidx = self.argmax_data[n, ctop, ph, pw]
-            iy = int(maxidx / roi_bin_grid_w)
-            ix = maxidx % roi_bin_grid_w
-
-            y = roi_start_h + ph * bin_size_h + \
-                (iy + .5) * bin_size_h / roi_bin_grid_h
-            x = roi_start_w + pw * bin_size_w + \
-                (ix + .5) * bin_size_w / roi_bin_grid_w
-
-            y, y_low, y_high = _get_bounds(y, height)
-            if y is None or y_low is None or y_high is None:
-                continue
-            x, x_low, x_high = _get_bounds(x, width)
-            if x is None or x_low is None or x_high is None:
-                continue
-
-            # bilinear_interpolation_gradient {{
-            w1, w2, w3, w4 = _get_bilinear_interp_params(
-                y, x, y_low, x_low, y_high, x_high)
-
-            g1 = top_diff_this_bin * w1
-            g2 = top_diff_this_bin * w2
-            g3 = top_diff_this_bin * w3
-            g4 = top_diff_this_bin * w4
-
-            if (x_low >= 0 and x_high >= 0 and
-                    y_low >= 0 and y_high >= 0):
-                bottom_diff[roi_batch_ind, c, y_low, x_low] += g1
-                bottom_diff[roi_batch_ind, c, y_low, x_high] += g2
-                bottom_diff[roi_batch_ind, c, y_high, x_low] += g3
-                bottom_diff[roi_batch_ind, c, y_high, x_high] += g4
+                iy = int(maxidx / roi_bin_grid_w)
+                ix = maxidx % roi_bin_grid_w
+
+                y = roi_start_h + ph * bin_size_h + \
+                    (iy + .5) * bin_size_h / roi_bin_grid_h
+                x = roi_start_w + pw * bin_size_w + \
+                    (ix + .5) * bin_size_w / roi_bin_grid_w
+
+                y, y_low, y_high = _get_bounds(y, height)
+                if y is None or y_low is None or y_high is None:
+                    continue
+                x, x_low, x_high = _get_bounds(x, width)
+                if x is None or x_low is None or x_high is None:
+                    continue
+
+                # bilinear_interpolation_gradient {{
+                w1, w2, w3, w4 = _get_bilinear_interp_params(
+                    y, x, y_low, x_low, y_high, x_high)
+
+                g1 = top_diff_this_bin * w1
+                g2 = top_diff_this_bin * w2
+                g3 = top_diff_this_bin * w3
+                g4 = top_diff_this_bin * w4
+
+                if (x_low >= 0 and x_high >= 0 and
+                        y_low >= 0 and y_high >= 0):
+                    bottom_diff[roi_batch_ind, c, y_low, x_low] += g1
+                    bottom_diff[roi_batch_ind, c, y_low, x_high] += g2
+                    bottom_diff[roi_batch_ind, c, y_high, x_low] += g3
+                    bottom_diff[roi_batch_ind, c, y_high, x_high] += g4
+                # }}
 
         return bottom_diff, None, None
 
@@ -444,53 +446,56 @@ def backward_gpu(self, inputs, gy):
                 (n * pooled_dim + ctop) * pooled_height * pooled_width;
             T top_diff_this_bin =
                 top_diff[top_offset + ph * pooled_width + pw];
+            int maxidx = argmax_data[top_offset + ph * pooled_width + pw];
 
-            // We use roi_bin_grid to sample the grid and mimic integral
-            int roi_bin_grid_h = (sampling_ratio_h > 0)
-                ? sampling_ratio_h
-                : ceil(roi_height / pooled_height); // e.g. = 2
-            int roi_bin_grid_w = (sampling_ratio_w > 0)
-                ? sampling_ratio_w
-                : ceil(roi_width / pooled_width);
+            if (maxidx != -1) {
+                // We use roi_bin_grid to sample the grid and mimic integral
+                int roi_bin_grid_h = (sampling_ratio_h > 0)
+                    ? sampling_ratio_h
+                    : ceil(roi_height / pooled_height); // e.g. = 2
+                int roi_bin_grid_w = (sampling_ratio_w > 0)
+                    ? sampling_ratio_w
+                    : ceil(roi_width / pooled_width);
 
-            int maxidx = argmax_data[top_offset + ph * pooled_width + pw];
-            int iy = maxidx / roi_bin_grid_w;
-            int ix = maxidx % roi_bin_grid_w;
-
-            T y = roi_start_h + ph * bin_size_h +
-                static_cast<T>(iy + .5f) * bin_size_h /
-                    static_cast<T>(roi_bin_grid_h);  // e.g. 0.5, 1.5
-            T x = roi_start_w + pw * bin_size_w +
-                static_cast<T>(ix + .5f) * bin_size_w /
-                    static_cast<T>(roi_bin_grid_w);
-
-            int y_low, y_high;
-            bool y_ret = get_bounds(y, height, y_low, y_high);
-            if (!y_ret) continue;
-            int x_low, x_high;
-            bool x_ret = get_bounds(x, width, x_low, x_high);
-            if (!x_ret) continue;
-
-            // bilinear_interpolation_gradient {{
-            T w1, w2, w3, w4;
-            get_bilinear_interp_params(
-                y, x, y_low, x_low, y_high, x_high, w1, w2, w3, w4);
-
-            T g1 = top_diff_this_bin * w1;
-            T g2 = top_diff_this_bin * w2;
-            T g3 = top_diff_this_bin * w3;
-            T g4 = top_diff_this_bin * w4;
-
-            if (x_low >= 0 && x_high >= 0 &&
-                    y_low >= 0 && y_high >= 0) {
-                atomicAdd(&bottom_diff[bottom_diff_offset +
-                                       y_low * width + x_low], g1);
-                atomicAdd(&bottom_diff[bottom_diff_offset +
-                                       y_low * width + x_high], g2);
-                atomicAdd(&bottom_diff[bottom_diff_offset +
-                                       y_high * width + x_low], g3);
-                atomicAdd(&bottom_diff[bottom_diff_offset +
-                                       y_high * width + x_high], g4);
+                int iy = maxidx / roi_bin_grid_w;
+                int ix = maxidx % roi_bin_grid_w;
+
+                T y = roi_start_h + ph * bin_size_h +
+                    static_cast<T>(iy + .5f) * bin_size_h /
+                        static_cast<T>(roi_bin_grid_h);  // e.g. 0.5, 1.5
+                T x = roi_start_w + pw * bin_size_w +
+                    static_cast<T>(ix + .5f) * bin_size_w /
+                        static_cast<T>(roi_bin_grid_w);
+
+                int y_low, y_high;
+                bool y_ret = get_bounds(y, height, y_low, y_high);
+                if (!y_ret) continue;
+                int x_low, x_high;
+                bool x_ret = get_bounds(x, width, x_low, x_high);
+                if (!x_ret) continue;
+
+                // bilinear_interpolation_gradient {{
+                T w1, w2, w3, w4;
+                get_bilinear_interp_params(
+                    y, x, y_low, x_low, y_high, x_high, w1, w2, w3, w4);
+
+                T g1 = top_diff_this_bin * w1;
+                T g2 = top_diff_this_bin * w2;
+                T g3 = top_diff_this_bin * w3;
+                T g4 = top_diff_this_bin * w4;
+
+                if (x_low >= 0 && x_high >= 0 &&
+                        y_low >= 0 && y_high >= 0) {
+                    atomicAdd(&bottom_diff[bottom_diff_offset +
+                                           y_low * width + x_low], g1);
+                    atomicAdd(&bottom_diff[bottom_diff_offset +
+                                           y_low * width + x_high], g2);
+                    atomicAdd(&bottom_diff[bottom_diff_offset +
+                                           y_high * width + x_low], g3);
+                    atomicAdd(&bottom_diff[bottom_diff_offset +
+                                           y_high * width + x_high], g4);
+                }
+                // }}
             }
             ''',
             'ps_roi_max_align_2d_bwd',

From 64b8ec4fcab84c8ac13254e33ed8c785b73fded0 Mon Sep 17 00:00:00 2001
From: Shingo Kitagawa <shingogo@hotmail.co.jp>
Date: Mon, 18 Feb 2019 20:02:45 +0900
Subject: [PATCH 23/24] set initial maxval to -np.inf for ps_roi_max_align_2d

---
 chainercv/functions/ps_roi_max_align_2d.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/chainercv/functions/ps_roi_max_align_2d.py b/chainercv/functions/ps_roi_max_align_2d.py
index a679d9b38c..66a648129d 100644
--- a/chainercv/functions/ps_roi_max_align_2d.py
+++ b/chainercv/functions/ps_roi_max_align_2d.py
@@ -141,7 +141,7 @@ def forward_cpu(self, inputs):
             else:
                 roi_bin_grid_w = self.sampling_ratio[1]
 
-            maxval = -1e20
+            maxval = - np.inf
             maxidx = -1
             for iy in six.moves.range(roi_bin_grid_h):
                 y = roi_start_h + ph * bin_size_h + \
@@ -249,7 +249,7 @@ def forward_gpu(self, inputs):
                 ? sampling_ratio_w
                 : ceil(roi_width / pooled_width);
 
-            T maxval = -1E+20;
+            T maxval = - (T) (1.0 / 0.0);
             int maxidx = -1;
             for (int iy = 0; iy < roi_bin_grid_h; iy++)  // e.g. iy = 0, 1
             {

From a2cfcc19a6b65f7528d4f1597dffb765cbc41cd0 Mon Sep 17 00:00:00 2001
From: Shingo Kitagawa <shingogo@hotmail.co.jp>
Date: Mon, 18 Feb 2019 20:03:07 +0900
Subject: [PATCH 24/24] fix ps_roi_max_align_2d test to pass

---
 tests/functions_tests/test_ps_roi_max_align.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tests/functions_tests/test_ps_roi_max_align.py b/tests/functions_tests/test_ps_roi_max_align.py
index 06a36f6947..8998a3652f 100644
--- a/tests/functions_tests/test_ps_roi_max_align.py
+++ b/tests/functions_tests/test_ps_roi_max_align.py
@@ -1,5 +1,6 @@
 import chainer
 from chainer.backends import cuda
+import chainer.functions as F
 from chainer import gradient_check
 from chainer import testing
 from chainer.testing import attr
@@ -68,10 +69,15 @@ def test_forward_gpu(self):
 
     def check_backward(self, x_data, roi_data, roi_index_data, y_grad_data):
         def f(x, rois, roi_indices):
-            return functions.ps_roi_max_align_2d(
+            y = functions.ps_roi_max_align_2d(
                 x, rois, roi_indices, self.out_c, self.out_h, self.out_w,
                 self.spatial_scale, self.group_size,
                 sampling_ratio=self.sampling_ratio)
+            xp = cuda.get_array_module(y)
+            y = F.where(
+                xp.isinf(y.array), xp.zeros(y.shape, dtype=y.dtype), y)
+            return y
+
         gradient_check.check_backward(
             f, (x_data, roi_data, roi_index_data), y_grad_data,
             no_grads=[False, True, True], **self.check_backward_options)