Merge pull request #2970 from alibaba/feature/bugfix
Feature/bugfix
jxt1234 authored Jul 26, 2024
2 parents 11a65d7 + 602b2e2 commit d9f7679
Showing 10 changed files with 176 additions and 52 deletions.
13 changes: 12 additions & 1 deletion docs/faq.md
@@ -117,7 +117,15 @@ opConverter ==> MNN Converter NOT_SUPPORTED_OP: [ ANY_OP_NAME ]
Temporary workaround: upgrade numpy to version 1.20.0 or later

## Runtime Issues
### Incorrect inference results / Tensor elementSize is not the product of its dimensions
### Incorrect inference results
- First verify the model with a test tool such as testMNNFromOnnx.py; see the correctness-verification section of the model conversion documentation for details
- If the test tool passes but running your own code gives wrong results, the possible causes are:
  1. Using the Session API to run a model that does not meet its requirements; switch to the Module API in this case
  2. The input memory layout is wrong
  3. The input data type is wrong: int64 must be replaced with int32_t, and double with float (see the sketch after this list)
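A minimal sketch of point 3 with the Module API, assuming a hypothetical model file `model.mnn` whose single int32 input is named "ids" and whose output is named "output" (these names are illustrative, not from the FAQ):

```cpp
#include <MNN/expr/Module.hpp>
#include <MNN/expr/Expr.hpp>
#include <cstdint>
#include <memory>
#include <vector>

using namespace MNN::Express;

int main() {
    // Hypothetical model and tensor names; substitute your own.
    std::shared_ptr<Module> net(Module::load({"ids"}, {"output"}, "model.mnn"));
    std::vector<int64_t> raw = {1, 2, 3, 4};           // original int64 host data
    auto input = _Input({1, 4}, NCHW, halide_type_of<int32_t>());
    auto ptr = input->writeMap<int32_t>();
    for (size_t i = 0; i < raw.size(); ++i) {
        ptr[i] = static_cast<int32_t>(raw[i]);         // narrow int64 -> int32 for MNN
    }
    auto output = net->onForward({input})[0];
    auto result = output->readMap<float>();            // read back the prediction
    return 0;
}
```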


### Layout conversion (Tensor elementSize is not the product of its dimensions)
MNN uses the NC4HW4 layout internally for CV-related operators, so when elementSize is computed the channel count is rounded up to a multiple of 4; for example, a 1x3x224x224 Tensor reports 1*4*224*224 elements. This layout lets each hardware implementation choose its own memory arrangement, which is not visible to the user, but the code below shows how to supply or retrieve a Tensor / VARP in an explicitly specified NCHW / NHWC layout.

#### Interpreter-Session API
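A minimal sketch of the Session-API half of the pattern, assuming an `interpreter` and `session` have already been created (`MNN::Tensor::CAFFE` selects NCHW, `MNN::Tensor::TENSORFLOW` NHWC):

```cpp
// Input: wrap the device tensor with a host tensor in an explicit NCHW layout.
auto deviceInput = interpreter->getSessionInput(session, nullptr);
std::shared_ptr<MNN::Tensor> hostInput(new MNN::Tensor(deviceInput, MNN::Tensor::CAFFE));
// ... fill hostInput->host<float>() with NCHW-ordered data ...
deviceInput->copyFromHostTensor(hostInput.get());

// Output: copy back into a host tensor that is also laid out as NCHW.
auto deviceOutput = interpreter->getSessionOutput(session, nullptr);
std::shared_ptr<MNN::Tensor> hostOutput(new MNN::Tensor(deviceOutput, MNN::Tensor::CAFFE));
deviceOutput->copyToHostTensor(hostOutput.get());
```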
@@ -237,6 +245,9 @@ The OpenCL / Vulkan backends register themselves with the main MNN library through self-registering static variables.
- Currently only the OpenCL and CUDA backends support this setting
- For details, see tools/cpp/testModel.cpp
### Note on Register-related memory leaks
valgrind reports memory leaks related to MNN Register code; these are one-time initialization allocations that never grow afterwards, and can be treated as false positives
## Performance
### copyToHostTensor / copyFromHostTensor is very slow when using the GPU
8 changes: 7 additions & 1 deletion docs/tools/convert.md
@@ -145,7 +145,13 @@ model_script.save('model_script.pt')
- testMNNFromOnnx.py: for onnx
- testMNNFromTorch.py: for pt (TorchScript)

Note: for models converted from TorchScript, you need to modify the input information in `testMNNFromTorch.py` yourself before testing
Note:

- If the model has dynamic input shapes, the script defaults every unfixed dimension to 1, which may cause errors during the Tensorflow / OnnxRuntime / Torch verification stage. In that case, modify the corresponding input section of the script, e.g. the run_onnx(self) function in testMNNFromOnnx.py, replacing the inputs with valid shapes and contents (see the sketch after this list).
- For models converted from TorchScript, you generally need to modify the input information in `testMNNFromTorch.py` yourself before testing.
- If the model's output layer is produced by an Identity op, it is removed by MNN graph optimization. In that case, verify the previous layer's output by appending that output's name to the script invocation, e.g.: python3 ../tools/scripts/testMNNFromTf.py XXX.pb $NAME$
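A minimal sketch of such an edit (the exact body of run_onnx differs between MNN versions, and `self.modelName` is an illustrative attribute name):

```python
# Inside testMNNFromOnnx.py: build fixed-shape random inputs in run_onnx(self)
# instead of relying on the defaults (shapes and dtypes here are illustrative).
def run_onnx(self):
    import numpy as np
    import onnxruntime as ort
    sess = ort.InferenceSession(self.modelName)
    inputs = {}
    for inp in sess.get_inputs():
        # Dynamic dimensions appear as None or as strings such as 'batch'.
        shape = [d if isinstance(d, int) else 1 for d in inp.shape]
        inputs[inp.name] = np.random.rand(*shape).astype(np.float32)
    output_names = [o.name for o in sess.get_outputs()]
    return sess.run(output_names, inputs)
```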


### Prerequisites
- To test pb / tflite: install `tensorflow` (`pip install tensorflow`)
- To test onnx: install `onnxruntime` (`pip install onnxruntime`)
46 changes: 23 additions & 23 deletions source/backend/cpu/arm/arm64/MNNSamplerC4BilinearOpt.S
@@ -90,31 +90,31 @@ L1:
cmp x3, #0
beq End
mov v16.s[0], w4
mov v16.s[1], w5
mov v16.s[1], w5 // v16:[xMax, yMax]
mov w12, #4
mov v7.s[0], w12
mov v7.s[1], w6
mov v7.s[0], w12 // bpp=4
mov v7.s[1], w6 // yStride
dup v20.2d, x0

L1Loop:

fcvtms v2.2s, v0.2s
fcvtzs v2.2s, v0.2s // [x0, y0]
frintm v4.2s, v0.2s
smax v2.2s, v2.2s, v19.2s
fcvtps v3.2s, v0.2s
fabd v4.2s, v0.2s, v4.2s
smax v2.2s, v2.2s, v19.2s // max(0, y)
fcvtps v3.2s, v0.2s // [x1, y1]
fabd v4.2s, v0.2s, v4.2s // (xF, yF)
smax v3.2s, v3.2s, v19.2s
smin v2.2s, v2.2s, v16.2s
smin v3.2s, v3.2s, v16.2s
mul v2.2s, v2.2s, v7.2s
mul v3.2s, v3.2s, v7.2s
mov v2.s[2], v3.s[0]
mov v3.s[2], v2.s[0]
mul v2.2s, v2.2s, v7.2s // [bpp * x0, y0 * yStride]
mul v3.2s, v3.2s, v7.2s // [bpp * x1, y1 * yStride]
mov v2.s[2], v3.s[0] // v2: [bpp*x0, y0*yStride, bpp*x1, y0*yStride]
mov v3.s[2], v2.s[0] // v3: [bpp*x1, y1*yStride, bpp*x0, y1*yStride]
mov v2.s[3], v2.s[1]
mov v3.s[3], v3.s[1]

uaddlp v2.2d, v2.4s
uaddlp v3.2d, v3.4s
uaddlp v2.2d, v2.4s // [c00, c01]
uaddlp v3.2d, v3.4s // [c11, c10]

add v2.2d, v20.2d, v2.2d
add v3.2d, v20.2d, v3.2d
@@ -131,25 +131,25 @@ uxtl v6.8h, v6.8b
//Now v2, v3 is of no use

//v2: LT, v3: RT, v5: LB, v6:BT
uxtl v2.4s, v5.4h
uxtl2 v3.4s, v5.8h
uxtl v2.4s, v5.4h // c00
uxtl2 v3.4s, v5.8h // c01

ucvtf v2.4s, v2.4s
uxtl v5.4s, v6.4h
uxtl v5.4s, v6.4h // c11
ucvtf v3.4s, v3.4s
uxtl2 v6.4s, v6.8h
uxtl2 v6.4s, v6.8h // c10
ucvtf v5.4s, v5.4s
ucvtf v6.4s, v6.4s

fsub v3.4s, v3.4s, v2.4s
fsub v6.4s, v6.4s, v5.4s
fmla v2.4s, v3.4s, v4.s[0]
fmla v5.4s, v6.4s, v4.s[0]
fsub v5.4s, v5.4s, v6.4s
fmla v2.4s, v3.4s, v4.s[0] // (c01-c00)*xF+c00
fmla v6.4s, v5.4s, v4.s[0] // (c11-c10)*xF+c10

fsub v5.4s, v5.4s, v2.4s
fmla v2.4s, v5.4s, v4.s[1]
fsub v6.4s, v6.4s, v2.4s
fmla v2.4s, v6.4s, v4.s[1]

fcvtns v2.4s, v2.4s
fcvtzs v2.4s, v2.4s
uqxtn v2.4h, v2.4s
uqxtn v2.8b, v2.8h

10 changes: 5 additions & 5 deletions source/backend/cpu/arm/arm64/MNNSamplerC4NearestOpt.S
@@ -44,21 +44,21 @@ mov v5.s[2], v3.s[1]
mov v4.s[3], v2.s[0]
mov v5.s[3], v2.s[1]

dup v23.4s, w6
dup v23.4s, w6 // yStride
movi v24.4s, #4
dup v22.2d, x0

L4Loop:
fcvtns v6.4s, v4.4s
fcvtns v7.4s, v5.4s
fcvtas v6.4s, v4.4s // x
fcvtas v7.4s, v5.4s // y

smin v6.4s, v6.4s, v16.4s
smin v7.4s, v7.4s, v17.4s
smax v6.4s, v6.4s, v19.4s
smax v7.4s, v7.4s, v19.4s

mul v7.4s, v7.4s, v23.4s
mla v7.4s, v6.4s, v24.4s
mla v7.4s, v6.4s, v24.4s // offset = y * yStride + bpp * x
uxtl v6.2d, v7.2s
uxtl2 v7.2d, v7.4s
add v6.2d, v6.2d, v22.2d
@@ -95,7 +95,7 @@ mov w12, #4

L1Loop:

fcvtns v2.2s, v0.2s
fcvtas v2.2s, v0.2s
smax v2.2s, v2.2s, v19.2s
smin v2.2s, v2.2s, v6.2s
mov w4, v2.s[0]
26 changes: 15 additions & 11 deletions source/backend/cpu/compute/ConvInt8TiledExecutor.cpp
@@ -37,21 +37,21 @@ ErrorCode ConvInt8TiledExecutor::onResize(const std::vector<Tensor*>& inputs, co
return NO_ERROR;
}

void ConvInt8TiledExecutor::reorderWeight(Tensor* weight, const uint8_t* weightSrc, int SRC_UNIT, int UNIT, int ic, int oc, int kernelCount) {
void ConvInt8TiledExecutor::reorderWeight(Tensor* weight, const uint8_t* weightSrc, int SRC_UNIT, int UNIT, int ic, int oc, int kernelCount, int pack) {
auto weightDst = weight->host<uint8_t>();
memset(weightDst, 0, weight->size());
if (SRC_UNIT > UNIT) {
auto icDivU = UP_DIV(ic, UNIT);
if (SRC_UNIT > pack) {
auto icDivU = UP_DIV(ic, pack);
for (int k = 0; k < kernelCount; ++k) {
const auto srcK = weightSrc + k;
for (int y = 0; y < ic; ++y) {
const int yOutSide = y / UNIT;
const int yInSide = y % UNIT;
const int yOutSide = y / pack;
const int yInSide = y % pack;
const int yIndex = yOutSide + k * icDivU;
const int ySubOutSide = yIndex / (SRC_UNIT / UNIT);
const int ySubInSide = yIndex % (SRC_UNIT / UNIT);
const int ySubOutSide = yIndex / (SRC_UNIT / pack);
const int ySubInSide = yIndex % (SRC_UNIT / pack);

auto dstY = weightDst + ySubOutSide * weight->stride(1) + ySubInSide * UNIT + yInSide;
auto dstY = weightDst + ySubOutSide * weight->stride(1) + ySubInSide * pack + yInSide;
const auto srcY = srcK + y * kernelCount;
for (int x = 0; x < oc; ++x) {
const int xOutSide = x / UNIT;
@@ -94,9 +94,13 @@ static bool _reorderWeightInside(Backend* bn, const Convolution2DCommon* common,
// reorder weight, [oc, ic, k^2] => [oc/unit, ((ic/unit)*k^2)/(src_unit/unit), unit(oc), (src_unit/unit), unit(ic)]
int oc = common->outputCount(), ic = common->inputCount(), kernelCount = common->kernelX() * common->kernelY();
std::vector<int> shape;
if (SRC_UNIT > UNIT) {
int pack = gcore->pack;
if (gcore->bytes == 2 && gcore->pack == 8) {
pack = 4;
}
if (SRC_UNIT > pack) {
MNN_ASSERT(SRC_UNIT % UNIT == 0);
shape = {UP_DIV(oc, UNIT), UP_DIV(UP_DIV(ic, UNIT) * kernelCount, SRC_UNIT / UNIT), UNIT, SRC_UNIT};
shape = {UP_DIV(oc, UNIT), UP_DIV(UP_DIV(ic, pack) * kernelCount, SRC_UNIT / pack), UNIT, SRC_UNIT};
} else {
shape = {UP_DIV(oc, UNIT), UP_DIV(ic, SRC_UNIT) * kernelCount, UNIT, SRC_UNIT};
}
Expand All @@ -108,7 +112,7 @@ static bool _reorderWeightInside(Backend* bn, const Convolution2DCommon* common,
MNN_ERROR("Memory not enough");
return false;
}
ConvInt8TiledExecutor::reorderWeight(weight.get(), weightOrigin->host<uint8_t>(), SRC_UNIT, UNIT, ic, oc, kernelCount);
ConvInt8TiledExecutor::reorderWeight(weight.get(), weightOrigin->host<uint8_t>(), SRC_UNIT, UNIT, ic, oc, kernelCount, pack);
return true;
}

2 changes: 1 addition & 1 deletion source/backend/cpu/compute/ConvInt8TiledExecutor.hpp
@@ -23,7 +23,7 @@ class ConvInt8TiledExecutor : public CPUConvolution {
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override;
virtual void getPackParameter(int* Unit, int* SrcUnit, int* DestUnit, const CoreInt8Functions* core) = 0;
static void reorderWeight(Tensor* weight, const uint8_t* weightSrc, int SRC_UNIT, int UNIT, int ic, int oc, int kernelCount);
static void reorderWeight(Tensor* weight, const uint8_t* weightSrc, int SRC_UNIT, int UNIT, int ic, int oc, int kernelCount, int pack);

protected:
ConvolutionCommon::Im2ColParameter mIm2ColParamter;
7 changes: 1 addition & 6 deletions source/backend/cpu/compute/ConvolutionTiledExecutor.cpp
@@ -122,12 +122,7 @@ void ConvolutionTiledExecutor:: setIm2ColParameter(ConvolutionCommon::Im2ColPara
int UNIT, SRC_UNIT, DynamicDestUnit;
auto core = int8Core;
core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DynamicDestUnit);
if (floatCore->bytes == 2 && DynamicDestUnit == 20) {
UNIT = 8;
SRC_UNIT= 8;
DynamicDestUnit = 10;
}
if (SRC_UNIT > UNIT) {
if (SRC_UNIT > pack) {
const auto srcCountUnit = UP_DIV(input->channel(), pack);
dstIm2ColParamter.kernelCountUnit = UP_DIV(srcCountUnit * kernelCount, SRC_UNIT / pack);
dstIm2ColParamter.ic = dstIm2ColParamter.icDiv4 * pack;
4 changes: 2 additions & 2 deletions source/backend/cpu/compute/GemmInt8Executor.cpp
@@ -82,12 +82,12 @@ ErrorCode GemmInt8Executor::onResize(const std::vector<Tensor *> &inputs, const
mIm2ColParamter.padX = 0;
mIm2ColParamter.padY = 0;
mIm2ColParamter.kernelCountUnit = UP_DIV(input->channel(), SRC_UNIT);
if (SRC_UNIT > UNIT___) {
if (SRC_UNIT > UNIT___ && UNIT___ == pack) {
const auto srcCountUnit = UP_DIV(input->channel(), pack);
mIm2ColParamter.ic = mIm2ColParamter.icDiv4 * pack;
} else {
const auto srcCountUnit = UP_DIV(input->channel(), SRC_UNIT);
mIm2ColParamter.ic = srcCountUnit * SRC_UNIT;
mIm2ColParamter.ic = mIm2ColParamter.icDiv4 * pack;
}

mTileCnt = UP_DIV(input->height() * input->width() * input->batch(), DST_XUNIT);
4 changes: 2 additions & 2 deletions source/backend/cpu/compute/IdstConvolutionInt8.cpp
@@ -65,7 +65,7 @@ IdstConvolutionInt8::IdstConvolutionInt8(const Convolution2DCommon* convOp, Back
auto kernelCount = kx * ky;
auto srcCount = mSrcCount;
std::vector<int> shape;
if (SRC_UNIT > UNIT) {
if (SRC_UNIT > UNIT && UNIT == PackUnit) {
MNN_ASSERT(SRC_UNIT % UNIT == 0);
shape = {UP_DIV(outputCount, UNIT), UP_DIV(UP_DIV(srcCount, UNIT) * kernelCount, SRC_UNIT / UNIT), UNIT, SRC_UNIT};
} else {
@@ -81,7 +81,7 @@ IdstConvolutionInt8::IdstConvolutionInt8(const Convolution2DCommon* convOp, Back
MNN_ERROR("Memory not enough\n");
return;
}
ConvInt8TiledExecutor::reorderWeight(mWeight.get(), (uint8_t*)common->weight.get(), SRC_UNIT, UNIT, srcCount, outputCount, kernelCount);
ConvInt8TiledExecutor::reorderWeight(mWeight.get(), (uint8_t*)common->weight.get(), SRC_UNIT, UNIT, srcCount, outputCount, kernelCount, PackUnit);
::memset(mFakeBias->host<float>(), 0, mFakeBias->size());
::memset(mFakeWeightBias->host<float>(), 0, mFakeWeightBias->size());
#ifdef MNN_USE_SSE
108 changes: 108 additions & 0 deletions test/cv/ImageProcessTest.cpp
@@ -696,3 +696,111 @@ class ImageProcessYUVBlitterTest : public ImageProcessYUVTestCommmon {
};
// {YUV_NV21, YUV_NV12, YUV_I420} -> {RGBA, RGB, BGRA, BGR, GRAY} unit test
MNNTestSuiteRegister(ImageProcessYUVBlitterTest, "cv/image_process/yuv_blitter");

static bool funcToColorResize(int iw, int ih, int ic, int ow, int oh, int oc, Filter filtertype, ImageFormat srcFormat, ImageFormat dstFormat) {
auto srcImg = genSourceData(ih, iw, ic);
auto dstType = halide_type_of<uint8_t>();

float fx = static_cast<float>(iw) / ow;
float fy = static_cast<float>(ih) / oh;
ImageProcess::Config config0, config1;

// resize first
config0.sourceFormat = srcFormat;
config0.destFormat = srcFormat;
config0.filterType = filtertype;
std::unique_ptr<ImageProcess> process0(ImageProcess::create(config0));
auto resizeTensor = Tensor::create({1, oh, ow, ic}, dstType);
Matrix tr;
tr.postScale(fx, fy);
tr.postTranslate(0.5 * (fx - 1), 0.5 * (fy - 1));
process0->setMatrix(tr);
process0->convert(srcImg.data(), iw, ih, 0, resizeTensor->host<uint8_t>(), ow, oh, ic, 0, dstType);

// then convert color
config1.sourceFormat = srcFormat;
config1.destFormat = dstFormat;
config1.filterType = filtertype;
std::unique_ptr<ImageProcess> process1(ImageProcess::create(config1));
auto colorTensor = Tensor::create({1, oh, ow, oc}, dstType);
Matrix tr1;
tr1.postScale(1.f, 1.f);
tr1.postTranslate(0, 0);
process1->setMatrix(tr1);
process1->convert(resizeTensor->host<uint8_t>(), ow, oh, 0, colorTensor->host<uint8_t>(), ow, oh, oc, 0, dstType);

// convert color first
ImageProcess::Config config2, config3;
config2.sourceFormat = srcFormat;
config2.destFormat = dstFormat;
config2.filterType = filtertype;

std::unique_ptr<ImageProcess> process2(ImageProcess::create(config2));
auto colorTensor2 = Tensor::create({1, ih, iw, oc}, dstType);
Matrix tr2;
tr2.postScale(1.f, 1.f);
tr2.postTranslate(0.f, 0.f);
process2->setMatrix(tr2);
process2->convert(srcImg.data(), iw, ih, 0, colorTensor2->host<uint8_t>(), iw, ih, oc, 0, dstType);

// Second: resize
config3.sourceFormat = dstFormat;
config3.destFormat = dstFormat;
config3.filterType = filtertype;

std::unique_ptr<ImageProcess> process3(ImageProcess::create(config3));
auto resizeTensor3 = Tensor::create({1, oh, ow, oc}, dstType);
Matrix tr3;
tr3.postScale(fx, fy);
tr3.postTranslate(0.5 * (fx - 1), 0.5 * (fy - 1));
process3->setMatrix(tr3);
process3->convert(colorTensor2->host<uint8_t>(), iw, ih, 0, resizeTensor3->host<uint8_t>(), ow, oh, oc, 0, dstType);

// compare these two results
auto res1Ptr = colorTensor->host<uint8_t>();
auto res2Ptr = resizeTensor3->host<uint8_t>();
auto size_ = resizeTensor3->size();
for (int i = 0; i < (int)size_; ++i) {
if (res1Ptr[i] != res2Ptr[i]) {
return false;
}
}
return true;
}

class ImageProcessColorResizeTest: public MNNTestCase {
// Test that color-convert-then-resize and resize-then-color-convert produce the same result.
virtual ~ImageProcessColorResizeTest() = default;
virtual bool run(int precision) {
std::vector<Filter> filters = {NEAREST, BILINEAR};
for (int iw = 2; iw < 200; iw += 17) {
for (int ih = 7; ih < 200; ih += 19) {
for (int ow = 2; ow < 200; ow += 17) {
for (int oh = 8; oh < 240; oh += 30) {
for (int f = 0; f < filters.size(); ++f) {
int ic = 4;
int oc = 3;
bool res = funcToColorResize(iw, ih, ic, ow, oh, oc, filters[f], RGBA, RGB);
if (!res) {
MNN_PRINT("iw=%d, ih=%d, ic=%d, ow=%d, oh=%d, oc=%d, filtertype=%d, RGBA->RGB\n", iw, ih, ic, ow, oh, oc, filters[f]);
return false;
}
ic = 3;
oc = 4;
res &= funcToColorResize(iw, ih, ic, ow, oh, oc, filters[f], RGB, RGBA);
if (!res) {
MNN_PRINT("iw=%d, ih=%d, ic=%d, ow=%d, oh=%d, oc=%d, filtertype=%d, RGB->RGBA\n", iw, ih, ic, ow, oh, oc, filters[f]);
return false;
}

}

}
}
}
}
return true;
}
};
MNNTestSuiteRegister(ImageProcessColorResizeTest, "cv/image_process/color_resize_test");
