Feature/bugfix #2970

Merged 3 commits on Jul 26, 2024
13 changes: 12 additions & 1 deletion docs/faq.md
@@ -117,7 +117,15 @@ opConverter ==> MNN Converter NOT_SUPPORTED_OP: [ ANY_OP_NAME ]
Temporary workaround: upgrade numpy to version 1.20.0 or later

## Runtime Issues
### Incorrect results / Tensor elementSize is not the product of its dimensions
### Incorrect results
- First test with a tool such as testMNNFromOnnx.py; see the correctness verification section of the model converter documentation for details
- If the test tool passes but your own code produces wrong results, possible causes are (a conversion sketch for case 3 follows below):
1. Using the Session API to run a model it cannot handle; switch to the Module API in that case
2. The input memory layout is wrong
3. The input data type is wrong: int64 must be replaced with int32_t, and double with float
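
For case 3, a minimal conversion sketch in plain C++ (the helper name is illustrative, not an MNN API):

```cpp
#include <cstdint>
#include <vector>

// Illustrative helper (not an MNN API): narrow int64 host data, such as ONNX
// index inputs, to int32 before copying it into the input tensor.
std::vector<int32_t> narrowToInt32(const std::vector<int64_t>& src) {
    std::vector<int32_t> dst(src.size());
    for (size_t i = 0; i < src.size(); ++i) {
        dst[i] = static_cast<int32_t>(src[i]); // assumes the values fit in 32 bits
    }
    return dst;
}
```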


### Layout conversion issues (Tensor elementSize is not the product of its dimensions)
MNN uses the NC4HW4 layout internally for CV-related operators, so when elementSize is computed, the channel count is rounded up to a multiple of 4 (for example, a 1x3x224x224 tensor reports 1x4x224x224 elements). This layout lets each hardware backend choose its own memory arrangement, which is not visible to the user; however, the user can supply or retrieve Tensors / VARPs in an explicitly specified NCHW / NHWC layout with the code below.

#### Interpreter-Session API
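A minimal sketch of the Session-API path, assuming the standard `Interpreter` / `Tensor` headers (the helper name is illustrative):

```cpp
#include <MNN/Interpreter.hpp>
#include <MNN/Tensor.hpp>
#include <cstring>
#include <memory>

// Illustrative helper: feed NCHW float data to a session input tensor.
// deviceTensor comes from Interpreter::getSessionInput(session, nullptr).
void fillInputNCHW(MNN::Tensor* deviceTensor, const float* data, size_t count) {
    // Wrap a host-side tensor with an explicit NCHW (Tensor::CAFFE) layout.
    std::unique_ptr<MNN::Tensor> host(new MNN::Tensor(deviceTensor, MNN::Tensor::CAFFE));
    std::memcpy(host->host<float>(), data, count * sizeof(float));
    // copyFromHostTensor converts NCHW into the backend's internal NC4HW4 layout.
    deviceTensor->copyFromHostTensor(host.get());
}
```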
@@ -237,6 +245,9 @@ OpenCL / Vulkan register their backends with the main MNN library via self-registering static variables.
- Currently the OpenCL and CUDA backends support this setting
- See tools/cpp/testModel.cpp for a concrete example

### Register-related memory leak reports
When checking with valgrind, memory leaks related to MNN's Register mechanism are reported. This is one-time initialization memory that never grows afterwards, so it can be treated as a false positive.


## Performance
### copyToHostTensor / copyFromHostTensor is very slow when using the GPU
8 changes: 7 additions & 1 deletion docs/tools/convert.md
@@ -145,7 +145,13 @@ model_script.save('model_script.pt')
- testMNNFromOnnx.py: for onnx
- testMNNFromTorch.py: for pt (torchscript)

Note: for models converted from Torchscript, you need to modify the input information in `testMNNFromTorch.py` yourself before testing
Note:

- If the model has dynamic input shapes, the script by default sets every unfixed dimension to 1, which may cause errors during the Tensorflow / OnnxRuntime / Torch verification stage. In that case, modify the corresponding input section of the script, e.g. the run_onnx(self) function in testMNNFromOnnx.py, to supply valid input shapes and contents.
- For models converted from Torchscript, you generally need to modify the input information in `testMNNFromTorch.py` yourself before testing.
- If the model's output layer is produced by an Identity op, it will be lost to MNN graph optimization. Verify the previous layer's output instead by appending its output name to the script invocation, e.g.: python3 ../tools/scripts/testMNNFromTf.py XXX.pb $NAME$


### Prerequisites
- To test pb / tflite: install `tensorflow` (`pip install tensorflow`)
- To test onnx: install `onnxruntime` (`pip install onnxruntime`)
46 changes: 23 additions & 23 deletions source/backend/cpu/arm/arm64/MNNSamplerC4BilinearOpt.S
@@ -90,31 +90,31 @@ L1:
cmp x3, #0
beq End
mov v16.s[0], w4
mov v16.s[1], w5
mov v16.s[1], w5 // v16:[xMax, yMax]
mov w12, #4
mov v7.s[0], w12
mov v7.s[1], w6
mov v7.s[0], w12 // bpp=4
mov v7.s[1], w6 // yStride
dup v20.2d, x0

L1Loop:

fcvtms v2.2s, v0.2s
fcvtzs v2.2s, v0.2s // [x0, y0]
frintm v4.2s, v0.2s
smax v2.2s, v2.2s, v19.2s
fcvtps v3.2s, v0.2s
fabd v4.2s, v0.2s, v4.2s
smax v2.2s, v2.2s, v19.2s // max(0, y)
fcvtps v3.2s, v0.2s // [x1, y1]
fabd v4.2s, v0.2s, v4.2s // (xF, yF)
smax v3.2s, v3.2s, v19.2s
smin v2.2s, v2.2s, v16.2s
smin v3.2s, v3.2s, v16.2s
mul v2.2s, v2.2s, v7.2s
mul v3.2s, v3.2s, v7.2s
mov v2.s[2], v3.s[0]
mov v3.s[2], v2.s[0]
mul v2.2s, v2.2s, v7.2s // [bpp * x0, y0 * yStride]
mul v3.2s, v3.2s, v7.2s // [bpp * x1, y1 * yStride]
mov v2.s[2], v3.s[0] // v2: [bpp*x0, y0*yStride, bpp*x1, y0*yStride]
mov v3.s[2], v2.s[0] // v3: [bpp*x1, y1*yStride, bpp*x0, y1*yStride]
mov v2.s[3], v2.s[1]
mov v3.s[3], v3.s[1]

uaddlp v2.2d, v2.4s
uaddlp v3.2d, v3.4s
uaddlp v2.2d, v2.4s // [c00, c01]
uaddlp v3.2d, v3.4s // [c11, c10]

add v2.2d, v20.2d, v2.2d
add v3.2d, v20.2d, v3.2d
@@ -131,25 +131,25 @@ uxtl v6.8h, v6.8b
//Now v2, v3 is of no use

//v2: LT, v3: RT, v5: LB, v6: RB
uxtl v2.4s, v5.4h
uxtl2 v3.4s, v5.8h
uxtl v2.4s, v5.4h // c00
uxtl2 v3.4s, v5.8h // c01

ucvtf v2.4s, v2.4s
uxtl v5.4s, v6.4h
uxtl v5.4s, v6.4h // c11
ucvtf v3.4s, v3.4s
uxtl2 v6.4s, v6.8h
uxtl2 v6.4s, v6.8h // c10
ucvtf v5.4s, v5.4s
ucvtf v6.4s, v6.4s

fsub v3.4s, v3.4s, v2.4s
fsub v6.4s, v6.4s, v5.4s
fmla v2.4s, v3.4s, v4.s[0]
fmla v5.4s, v6.4s, v4.s[0]
fsub v5.4s, v5.4s, v6.4s
fmla v2.4s, v3.4s, v4.s[0] // (c01-c00)*xF+c00
fmla v6.4s, v5.4s, v4.s[0] // (c11-c10)*xF+c10

fsub v5.4s, v5.4s, v2.4s
fmla v2.4s, v5.4s, v4.s[1]
fsub v6.4s, v6.4s, v2.4s
fmla v2.4s, v6.4s, v4.s[1]

fcvtns v2.4s, v2.4s
fcvtzs v2.4s, v2.4s
uqxtn v2.4h, v2.4s
uqxtn v2.8b, v2.8h
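
For reference, the fixed-point sequence above implements the standard bilinear blend (c00..c11 are the four neighbouring pixels, xF / yF the fractional offsets held in v4):

```latex
\begin{aligned}
t &= c_{00} + (c_{01} - c_{00})\,x_F \\
b &= c_{10} + (c_{11} - c_{10})\,x_F \\
\text{out} &= t + (b - t)\,y_F
\end{aligned}
```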

10 changes: 5 additions & 5 deletions source/backend/cpu/arm/arm64/MNNSamplerC4NearestOpt.S
@@ -44,21 +44,21 @@ mov v5.s[2], v3.s[1]
mov v4.s[3], v2.s[0]
mov v5.s[3], v2.s[1]

dup v23.4s, w6
dup v23.4s, w6 // yStride
movi v24.4s, #4
dup v22.2d, x0

L4Loop:
fcvtns v6.4s, v4.4s
fcvtns v7.4s, v5.4s
fcvtas v6.4s, v4.4s // x
fcvtas v7.4s, v5.4s // y

smin v6.4s, v6.4s, v16.4s
smin v7.4s, v7.4s, v17.4s
smax v6.4s, v6.4s, v19.4s
smax v7.4s, v7.4s, v19.4s

mul v7.4s, v7.4s, v23.4s
mla v7.4s, v6.4s, v24.4s
mla v7.4s, v6.4s, v24.4s // offset = y * yStride + bpp * x
uxtl v6.2d, v7.2s
uxtl2 v7.2d, v7.4s
add v6.2d, v6.2d, v22.2d
@@ -95,7 +95,7 @@ mov w12, #4

L1Loop:

fcvtns v2.2s, v0.2s
fcvtas v2.2s, v0.2s
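// round source coords to the nearest integer (fcvtas: ties round away from zero)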
smax v2.2s, v2.2s, v19.2s
smin v2.2s, v2.2s, v6.2s
mov w4, v2.s[0]
26 changes: 15 additions & 11 deletions source/backend/cpu/compute/ConvInt8TiledExecutor.cpp
@@ -37,21 +37,21 @@ ErrorCode ConvInt8TiledExecutor::onResize(const std::vector<Tensor*>& inputs, co
return NO_ERROR;
}

void ConvInt8TiledExecutor::reorderWeight(Tensor* weight, const uint8_t* weightSrc, int SRC_UNIT, int UNIT, int ic, int oc, int kernelCount) {
void ConvInt8TiledExecutor::reorderWeight(Tensor* weight, const uint8_t* weightSrc, int SRC_UNIT, int UNIT, int ic, int oc, int kernelCount, int pack) {
auto weightDst = weight->host<uint8_t>();
memset(weightDst, 0, weight->size());
if (SRC_UNIT > UNIT) {
auto icDivU = UP_DIV(ic, UNIT);
if (SRC_UNIT > pack) {
auto icDivU = UP_DIV(ic, pack);
for (int k = 0; k < kernelCount; ++k) {
const auto srcK = weightSrc + k;
for (int y = 0; y < ic; ++y) {
const int yOutSide = y / UNIT;
const int yInSide = y % UNIT;
const int yOutSide = y / pack;
const int yInSide = y % pack;
const int yIndex = yOutSide + k * icDivU;
const int ySubOutSide = yIndex / (SRC_UNIT / UNIT);
const int ySubInSide = yIndex % (SRC_UNIT / UNIT);
const int ySubOutSide = yIndex / (SRC_UNIT / pack);
const int ySubInSide = yIndex % (SRC_UNIT / pack);

auto dstY = weightDst + ySubOutSide * weight->stride(1) + ySubInSide * UNIT + yInSide;
auto dstY = weightDst + ySubOutSide * weight->stride(1) + ySubInSide * pack + yInSide;
const auto srcY = srcK + y * kernelCount;
for (int x = 0; x < oc; ++x) {
const int xOutSide = x / UNIT;
@@ -94,9 +94,13 @@ static bool _reorderWeightInside(Backend* bn, const Convolution2DCommon* common,
// reorder weight, [oc, ic, k^2] => [oc/unit, ((ic/unit)*k^2)/(src_unit/unit), unit(oc), (src_unit/unit), unit(ic)]
int oc = common->outputCount(), ic = common->inputCount(), kernelCount = common->kernelX() * common->kernelY();
std::vector<int> shape;
if (SRC_UNIT > UNIT) {
int pack = gcore->pack;
if (gcore->bytes == 2 && gcore->pack == 8) {
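        // assumption: 2-byte (fp16/bf16) cores report pack == 8, but the int8 weight layout still groups input channels by 4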
pack = 4;
}
if (SRC_UNIT > pack) {
MNN_ASSERT(SRC_UNIT % UNIT == 0);
shape = {UP_DIV(oc, UNIT), UP_DIV(UP_DIV(ic, UNIT) * kernelCount, SRC_UNIT / UNIT), UNIT, SRC_UNIT};
shape = {UP_DIV(oc, UNIT), UP_DIV(UP_DIV(ic, pack) * kernelCount, SRC_UNIT / pack), UNIT, SRC_UNIT};
} else {
shape = {UP_DIV(oc, UNIT), UP_DIV(ic, SRC_UNIT) * kernelCount, UNIT, SRC_UNIT};
}
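
To make the shape math concrete, a standalone sketch with assumed unit sizes (the real values come from getPackParameter and gcore->pack; the numbers below are illustrative only):

```cpp
#include <cstdio>

#define UP_DIV(x, y) (((x) + (y) - 1) / (y))

int main() {
    // Assumed values: a 3x3 conv with oc = 16, ic = 3, and a core reporting
    // UNIT = 4, SRC_UNIT = 16, pack = 4 (hypothetical, for illustration).
    const int oc = 16, ic = 3, kernelCount = 9;
    const int UNIT = 4, SRC_UNIT = 16, pack = 4;
    // SRC_UNIT > pack, so the first branch applies:
    const int d0 = UP_DIV(oc, UNIT);                                        // 4
    const int d1 = UP_DIV(UP_DIV(ic, pack) * kernelCount, SRC_UNIT / pack); // UP_DIV(9, 4) = 3
    std::printf("shape = {%d, %d, %d, %d}\n", d0, d1, UNIT, SRC_UNIT);      // {4, 3, 4, 16}
    return 0;
}
```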
@@ -108,7 +112,7 @@ static bool _reorderWeightInside(Backend* bn, const Convolution2DCommon* common,
MNN_ERROR("Memory not enough");
return false;
}
ConvInt8TiledExecutor::reorderWeight(weight.get(), weightOrigin->host<uint8_t>(), SRC_UNIT, UNIT, ic, oc, kernelCount);
ConvInt8TiledExecutor::reorderWeight(weight.get(), weightOrigin->host<uint8_t>(), SRC_UNIT, UNIT, ic, oc, kernelCount, pack);
return true;
}

2 changes: 1 addition & 1 deletion source/backend/cpu/compute/ConvInt8TiledExecutor.hpp
@@ -23,7 +23,7 @@ class ConvInt8TiledExecutor : public CPUConvolution {
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override;
virtual void getPackParameter(int* Unit, int* SrcUnit, int* DestUnit, const CoreInt8Functions* core) = 0;
static void reorderWeight(Tensor* weight, const uint8_t* weightSrc, int SRC_UNIT, int UNIT, int ic, int oc, int kernelCount);
static void reorderWeight(Tensor* weight, const uint8_t* weightSrc, int SRC_UNIT, int UNIT, int ic, int oc, int kernelCount, int pack);

protected:
ConvolutionCommon::Im2ColParameter mIm2ColParamter;
7 changes: 1 addition & 6 deletions source/backend/cpu/compute/ConvolutionTiledExecutor.cpp
@@ -122,12 +122,7 @@ void ConvolutionTiledExecutor:: setIm2ColParameter(ConvolutionCommon::Im2ColPara
int UNIT, SRC_UNIT, DynamicDestUnit;
auto core = int8Core;
core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DynamicDestUnit);
if (floatCore->bytes == 2 && DynamicDestUnit == 20) {
UNIT = 8;
SRC_UNIT= 8;
DynamicDestUnit = 10;
}
if (SRC_UNIT > UNIT) {
if (SRC_UNIT > pack) {
const auto srcCountUnit = UP_DIV(input->channel(), pack);
dstIm2ColParamter.kernelCountUnit = UP_DIV(srcCountUnit * kernelCount, SRC_UNIT / pack);
dstIm2ColParamter.ic = dstIm2ColParamter.icDiv4 * pack;
4 changes: 2 additions & 2 deletions source/backend/cpu/compute/GemmInt8Executor.cpp
@@ -82,12 +82,12 @@ ErrorCode GemmInt8Executor::onResize(const std::vector<Tensor *> &inputs, const
mIm2ColParamter.padX = 0;
mIm2ColParamter.padY = 0;
mIm2ColParamter.kernelCountUnit = UP_DIV(input->channel(), SRC_UNIT);
if (SRC_UNIT > UNIT___) {
if (SRC_UNIT > UNIT___ && UNIT___ == pack) {
const auto srcCountUnit = UP_DIV(input->channel(), pack);
mIm2ColParamter.ic = mIm2ColParamter.icDiv4 * pack;
} else {
const auto srcCountUnit = UP_DIV(input->channel(), SRC_UNIT);
mIm2ColParamter.ic = srcCountUnit * SRC_UNIT;
mIm2ColParamter.ic = mIm2ColParamter.icDiv4 * pack;
}

mTileCnt = UP_DIV(input->height() * input->width() * input->batch(), DST_XUNIT);
4 changes: 2 additions & 2 deletions source/backend/cpu/compute/IdstConvolutionInt8.cpp
@@ -65,7 +65,7 @@ IdstConvolutionInt8::IdstConvolutionInt8(const Convolution2DCommon* convOp, Back
auto kernelCount = kx * ky;
auto srcCount = mSrcCount;
std::vector<int> shape;
if (SRC_UNIT > UNIT) {
if (SRC_UNIT > UNIT && UNIT == PackUnit) {
MNN_ASSERT(SRC_UNIT % UNIT == 0);
shape = {UP_DIV(outputCount, UNIT), UP_DIV(UP_DIV(srcCount, UNIT) * kernelCount, SRC_UNIT / UNIT), UNIT, SRC_UNIT};
} else {
@@ -81,7 +81,7 @@ IdstConvolutionInt8::IdstConvolutionInt8(const Convolution2DCommon* convOp, Back
MNN_ERROR("Memory not enough\n");
return;
}
ConvInt8TiledExecutor::reorderWeight(mWeight.get(), (uint8_t*)common->weight.get(), SRC_UNIT, UNIT, srcCount, outputCount, kernelCount);
ConvInt8TiledExecutor::reorderWeight(mWeight.get(), (uint8_t*)common->weight.get(), SRC_UNIT, UNIT, srcCount, outputCount, kernelCount, PackUnit);
::memset(mFakeBias->host<float>(), 0, mFakeBias->size());
::memset(mFakeWeightBias->host<float>(), 0, mFakeWeightBias->size());
#ifdef MNN_USE_SSE
108 changes: 108 additions & 0 deletions test/cv/ImageProcessTest.cpp
@@ -696,3 +696,111 @@ class ImageProcessYUVBlitterTest : public ImageProcessYUVTestCommmon {
};
// {YUV_NV21, YUV_NV12, YUV_I420} -> {RGBA, RGB, BGRA, BGR, GRAY} unit test
MNNTestSuiteRegister(ImageProcessYUVBlitterTest, "cv/image_process/yuv_blitter");

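// Checks that color conversion and resizing commute: converting then resizing
// must produce exactly the same bytes as resizing then converting.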
static bool funcToColorResize(int iw, int ih, int ic, int ow, int oh, int oc, Filter filtertype, ImageFormat srcFormat, ImageFormat dstFormat) {
auto srcImg = genSourceData(ih, iw, ic);
auto dstType = halide_type_of<uint8_t>();

float fx = static_cast<float>(iw) / ow;
float fy = static_cast<float>(ih) / oh;
ImageProcess::Config config0, config1;

// resize first
config0.sourceFormat = srcFormat;
config0.destFormat = srcFormat;
config0.filterType = filtertype;
std::unique_ptr<ImageProcess> process0(ImageProcess::create(config0));
auto resizeTensor = Tensor::create({1, oh, ow, ic}, dstType);
Matrix tr;
tr.postScale(fx, fy);
tr.postTranslate(0.5 * (fx - 1), 0.5 * (fy - 1));
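    // Together these map destination pixel centers onto source pixel centers:
    // x_src = fx * (x_dst + 0.5) - 0.5, the usual half-pixel alignment.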
process0->setMatrix(tr);
process0->convert(srcImg.data(), iw, ih, 0, resizeTensor->host<uint8_t>(), ow, oh, ic, 0, dstType);

// then convert color
config1.sourceFormat = srcFormat;
config1.destFormat = dstFormat;
config1.filterType = filtertype;
std::unique_ptr<ImageProcess> process1(ImageProcess::create(config1));
auto colorTensor = Tensor::create({1, oh, ow, oc}, dstType);
Matrix tr1;
tr1.postScale(1.f, 1.f);
tr1.postTranslate(0, 0);
process1->setMatrix(tr1);
process1->convert(resizeTensor->host<uint8_t>(), ow, oh, 0, colorTensor->host<uint8_t>(), ow, oh, oc, 0, dstType);

// convert color first
ImageProcess::Config config2, config3;
config2.sourceFormat = srcFormat;
config2.destFormat = dstFormat;
config2.filterType = filtertype;

std::unique_ptr<ImageProcess> process2(ImageProcess::create(config2));
auto colorTensor2 = Tensor::create({1, ih, iw, oc}, dstType);
Matrix tr2;
tr2.postScale(1.f, 1.f);
tr2.postTranslate(0.f, 0.f);
process2->setMatrix(tr2);
process2->convert(srcImg.data(), iw, ih, 0, colorTensor2->host<uint8_t>(), iw, ih, oc, 0, dstType);

// Second: resize
config3.sourceFormat = dstFormat;
config3.destFormat = dstFormat;
config3.filterType = filtertype;

std::unique_ptr<ImageProcess> process3(ImageProcess::create(config3));
auto resizeTensor3 = Tensor::create({1, oh, ow, oc}, dstType);
Matrix tr3;
tr3.postScale(fx, fy);
tr3.postTranslate(0.5 * (fx - 1), 0.5 * (fy - 1));
process3->setMatrix(tr3);
process3->convert(colorTensor2->host<uint8_t>(), iw, ih, 0, resizeTensor3->host<uint8_t>(), ow, oh, oc, 0, dstType);

// compare these two results
auto res1Ptr = colorTensor->host<uint8_t>();
auto res2Ptr = resizeTensor3->host<uint8_t>();
auto size_ = resizeTensor3->size();
for (int i = 0; i < (int)size_; ++i) {
if (res1Ptr[i] != res2Ptr[i]) {
return false;
}
}
return true;
}

class ImageProcessColorResizeTest: public MNNTestCase {
// Test: first color then resize and first resize then color, these two results are same.
virtual ~ImageProcessColorResizeTest() = default;
virtual bool run(int precision) {
std::vector<Filter> filters = {NEAREST, BILINEAR};
for (int iw = 2; iw < 200; iw += 17) {
for (int ih = 7; ih < 200; ih += 19) {
for (int ow = 2; ow < 200; ow += 17) {
for (int oh = 8; oh < 240; oh += 30) {
for (int f = 0; f < (int)filters.size(); ++f) {
int ic = 4;
int oc = 3;
bool res = funcToColorResize(iw, ih, ic, ow, oh, oc, filters[f], RGBA, RGB);
if (!res) {
MNN_PRINT("iw=%d, ih=%d, ic=%d, ow=%d, oh=%d, oc=%d, filtertype=%d, RGBA->RGB\n", iw, ih, ic, ow, oh, oc, filters[f]);
return false;
}
ic = 3;
oc = 4;
res &= funcToColorResize(iw, ih, ic, ow, oh, oc, filters[f], RGB, RGBA);
if (!res) {
MNN_PRINT("iw=%d, ih=%d, ic=%d, ow=%d, oh=%d, oc=%d, filtertype=%d, RGB->RGBA\n", iw, ih, ic, ow, oh, oc, filters[f]);
return false;
}

}

}
}
}
}
return true;
}
};
MNNTestSuiteRegister(ImageProcessColorResizeTest, "cv/image_process/color_resize_test");