From bed0bbbdad93538e7749f36399a2bf8b40a9db8b Mon Sep 17 00:00:00 2001 From: zhangwm Date: Thu, 31 Aug 2023 17:11:22 +0800 Subject: [PATCH] SHL: version 2.6.0 --- CMakeLists.txt | 16 + Makefile | 3 + cmake/c906_elf.cmake | 24 +- cmake/c906_share.cmake | 24 +- cmake/c906_static.cmake | 26 +- cmake/c908.cmake | 24 +- cmake/c920.cmake | 34 +- cmake/c920v2.cmake | 421 + cmake/rules.cmake | 24 +- cmake/rvm.cmake | 24 +- cmake/rvv.cmake | 24 +- include/{shl_c906.h => backend/c906/c906.h} | 6 +- .../backend/c906/cap.h | 0 include/{shl_c908.h => backend/c908/c908.h} | 16 +- include/{shl_c920.h => backend/c920/c920.h} | 9 +- .../backend/c920/cap.h | 0 include/backend/c920v2/c920v2.h | 129 + include/backend/c920v2/cap.h | 29 + include/{shl_e907.h => backend/e907/e907.h} | 2 +- include/backend/pnna/pnna.h | 274 + include/backend/pnna/wrapper.h | 172 + .../{shl_ref.h => backend/reference/ref.h} | 4 - .../{shl_thead_rvm.h => backend/rvm/rvm.h} | 2 +- .../backend/rvv/cap.h | 13 + .../{shl_thead_rvv.h => backend/rvv/rvv.h} | 325 +- include/{ => backend/tvmgen}/shl_tvmgen.h | 2 +- include/{ => csinn}/csi_nn.h | 2 - include/{ => csinn}/csinn_data_structure.h | 11 + include/{ => csinn}/csinn_runtime.h | 0 include/{ => graph}/shl_gref.h | 1 + include/{ => graph}/shl_node.h | 0 include/shl_debug.h | 4 +- include/shl_public/shl_c906.h | 66 + include/shl_public/shl_c920.h | 50 + include/shl_public/shl_pnna.h | 27 + include/shl_public/shl_ref.h | 46 + include/shl_utils.h | 11 +- module/json/json.hpp | 25447 ++++++++++++++++ source/c906_opt/Kconfig | 20 +- source/c906_opt/capability.c | 21 +- source/c906_opt/fp16/abs.c | 2 +- source/c906_opt/fp16/add.c | 2 +- source/c906_opt/fp16/avgpool.c | 2 +- source/c906_opt/fp16/cache_conv1d.c | 2 +- source/c906_opt/fp16/cache_matmul.c | 2 +- source/c906_opt/fp16/clip.c | 2 +- source/c906_opt/fp16/concat.c | 2 +- source/c906_opt/fp16/convolution.c | 42 +- source/c906_opt/fp16/convolution1d.c | 6 +- source/c906_opt/fp16/convolution_1x1_fp16.c | 16 +- 
source/c906_opt/fp16/convolution_3x3_fp16.c | 2 +- source/c906_opt/fp16/convolution_gemm_fp16.c | 146 +- source/c906_opt/fp16/depthwise_convolution.c | 2 +- .../c906_opt/fp16/depthwise_convolution1d.c | 38 +- .../fp16/depthwise_convolution_3x3_fp16.c | 66 +- .../depthwise_convolution_3x3_pack8_fp16.c | 2 +- .../fp16/depthwise_convolution_fp16.c | 2 +- source/c906_opt/fp16/div.c | 2 +- source/c906_opt/fp16/fullyconnected.c | 37 +- source/c906_opt/fp16/gemm_fp16.c | 2 +- source/c906_opt/fp16/gemv_fp16.c | 2 +- source/c906_opt/fp16/global_avgpool.c | 2 +- source/c906_opt/fp16/global_maxpool.c | 2 +- source/c906_opt/fp16/leaky_relu.c | 2 +- source/c906_opt/fp16/lrn.c | 2 +- source/c906_opt/fp16/matmul.c | 66 +- source/c906_opt/fp16/maxpool.c | 2 +- source/c906_opt/fp16/minimum.c | 2 +- source/c906_opt/fp16/mul.c | 2 +- source/c906_opt/fp16/pad.c | 2 +- source/c906_opt/fp16/prelu.c | 2 +- source/c906_opt/fp16/reduce_sum.c | 2 +- source/c906_opt/fp16/relu.c | 2 +- source/c906_opt/fp16/relu1.c | 2 +- source/c906_opt/fp16/relu6.c | 2 +- source/c906_opt/fp16/reshape.c | 2 +- source/c906_opt/fp16/split.c | 2 +- source/c906_opt/fp16/sub.c | 2 +- source/c906_opt/fp32/abs.c | 2 +- source/c906_opt/fp32/add.c | 2 +- source/c906_opt/fp32/avgpool.c | 2 +- source/c906_opt/fp32/broadcast_to.c | 2 +- source/c906_opt/fp32/clip.c | 2 +- source/c906_opt/fp32/concat.c | 2 +- source/c906_opt/fp32/convolution.c | 12 +- source/c906_opt/fp32/convolution1d.c | 6 +- source/c906_opt/fp32/convolution_1x1_fp32.c | 2 +- source/c906_opt/fp32/convolution_3x3_fp32.c | 2 +- source/c906_opt/fp32/convolution_sgemm_fp32.c | 2 +- source/c906_opt/fp32/depthwise_convolution.c | 2 +- .../fp32/depthwise_convolution_3x3_fp32.c | 2 +- .../depthwise_convolution_3x3_pack4_fp32.c | 2 +- .../fp32/depthwise_convolution_5x5_fp32.c | 2 +- source/c906_opt/fp32/div.c | 2 +- source/c906_opt/fp32/gemm_fp32.c | 2 +- source/c906_opt/fp32/global_avgpool.c | 2 +- source/c906_opt/fp32/global_maxpool.c | 2 +- 
source/c906_opt/fp32/leaky_relu.c | 2 +- source/c906_opt/fp32/matmul.c | 26 +- source/c906_opt/fp32/maxpool.c | 2 +- source/c906_opt/fp32/minimum.c | 2 +- source/c906_opt/fp32/mul.c | 2 +- source/c906_opt/fp32/pad.c | 2 +- source/c906_opt/fp32/prelu.c | 2 +- source/c906_opt/fp32/relu.c | 2 +- source/c906_opt/fp32/relu1.c | 2 +- source/c906_opt/fp32/relu6.c | 2 +- source/c906_opt/fp32/split.c | 2 +- source/c906_opt/fp32/sub.c | 2 +- source/c906_opt/hpm.c | 94 +- source/c906_opt/setup.c | 10 +- source/c906_opt/utils.c | 391 +- source/c908_opt/CMakeLists.txt | 50 +- source/c908_opt/fp16/avgpool.c | 4 +- source/c908_opt/fp16/convolution.c | 83 +- source/c908_opt/fp16/convolution_1x1_fp16.c | 56 +- .../fp16/convolution_1x1_fp16_pack1ton.c | 61 +- .../fp16/convolution_1x1_fp16_packn.c | 53 +- .../fp16/convolution_1x1_fp16_packnto1.c | 51 +- source/c908_opt/fp16/convolution_3x3_fp16.c | 2 +- .../fp16/convolution_3x3_fp16_packn.c | 2 +- .../fp16/convolution_3x3_fp16_packn_1.c | 2 +- source/c908_opt/fp16/convolution_gemm_fp16.c | 98 +- .../fp16/convolution_gemm_fp16_pack1ton.c | 110 +- .../fp16/convolution_gemm_fp16_packn.c | 103 +- .../fp16/convolution_gemm_fp16_packnto1.c | 97 +- source/c908_opt/fp16/depthwise_convolution.c | 5 +- source/c908_opt/fp16/fullyconnected.c | 2 +- source/c908_opt/fp16/gemm_fp16.c | 2 +- source/c908_opt/fp16/gemm_fp16_ncxhwx.S | 454 +- source/c908_opt/fp16/gemm_fp16_packn.c | 4 +- source/c908_opt/fp16/gemm_fp16_v256.c | 2 +- source/c908_opt/fp16/maxpool.c | 4 +- source/c908_opt/fp32/avgpool.c | 4 +- source/c908_opt/fp32/convolution.c | 27 +- source/c908_opt/fp32/convolution_1x1_fp32.c | 54 +- .../fp32/convolution_1x1_fp32_pack1ton.c | 61 +- .../fp32/convolution_1x1_fp32_packn.c | 52 +- .../fp32/convolution_1x1_fp32_packnto1.c | 49 +- source/c908_opt/fp32/convolution_3x3_fp32.c | 2 +- .../fp32/convolution_3x3_fp32_packn.c | 2 +- .../fp32/convolution_3x3_fp32_packn_1.c | 2 +- source/c908_opt/fp32/convolution_gemm_fp32.c | 96 +- 
.../fp32/convolution_gemm_fp32_pack1ton.c | 108 +- .../fp32/convolution_gemm_fp32_packn.c | 104 +- .../fp32/convolution_gemm_fp32_packnto1.c | 96 +- source/c908_opt/fp32/depthwise_convolution.c | 5 +- source/c908_opt/fp32/fullyconnected.c | 2 +- source/c908_opt/fp32/gemm_fp32.c | 2 +- source/c908_opt/fp32/gemm_fp32_ncxhwx.S | 464 +- source/c908_opt/fp32/gemm_fp32_packn.c | 6 +- source/c908_opt/fp32/gemm_fp32_v256.c | 2 +- source/c908_opt/fp32/maxpool.c | 4 +- source/c908_opt/int4/convolution.c | 10 +- source/c908_opt/int4/depthwise_convolution.c | 2 +- source/c908_opt/int4/fullyconnected.c | 2 +- source/c908_opt/int4/gemm_int4_dot_ncxhwx.S | 200 + source/c908_opt/int8/avgpool.c | 4 +- source/c908_opt/int8/convolution.c | 27 +- source/c908_opt/int8/convolution_1x1_int8.c | 78 +- .../int8/convolution_1x1_int8_pack1ton.c | 108 +- .../int8/convolution_1x1_int8_packn.c | 70 +- .../int8/convolution_1x1_int8_packnto1.c | 70 +- source/c908_opt/int8/convolution_3x3_int8.c | 2 +- .../int8/convolution_3x3_int8_packn.c | 2 +- .../int8/convolution_3x3_int8_packn_1.c | 2 +- source/c908_opt/int8/convolution_gemm_int8.c | 121 +- .../int8/convolution_gemm_int8_pack1ton.c | 168 +- .../int8/convolution_gemm_int8_packn.c | 126 +- .../int8/convolution_gemm_int8_packnto1.c | 123 +- source/c908_opt/int8/depthwise_convolution.c | 5 +- source/c908_opt/int8/fullyconnected.c | 2 +- source/c908_opt/int8/gemm_int16_ncxhwx.S | 130 + source/c908_opt/int8/gemm_int16_packn.c | 2 +- source/c908_opt/int8/gemm_int8_dot.c | 2 +- source/c908_opt/int8/gemm_int8_dot_ncxhwx.S | 276 +- source/c908_opt/int8/gemm_int8_dot_packn.c | 6 +- source/c908_opt/int8/gemm_int8_dot_v256.c | 2 +- source/c908_opt/int8/gemm_int8_ncxhwx.S | 214 +- source/c908_opt/int8/gemm_int8_packn.c | 4 +- source/c908_opt/int8/maxpool.c | 4 +- source/c908_opt/reorder.c | 2 +- source/c908_opt/setup.c | 2 +- source/c920_opt/CMakeLists.txt | 47 + source/c920_opt/Kconfig | 52 + source/c920_opt/capability.c | 15 +- 
source/c920_opt/convolution.c | 281 - source/c920_opt/convolution_1x1_fp16_packn.c | 73 - source/c920_opt/convolution_1x1_fp32_packn.c | 71 - source/c920_opt/convolution_gemm_fp16_packn.c | 121 - source/c920_opt/convolution_gemm_fp32_packn.c | 120 - source/c920_opt/fp16/convolution.c | 196 + .../fp16/convolution_1x1_fp16_packn.c | 28 + .../{ => fp16}/convolution_3x3_fp16_packn.c | 2 +- .../fp16/convolution_gemm_fp16_packn.c | 28 + source/c920_opt/{ => fp16}/gemm_fp16_block.c | 31 +- source/c920_opt/{ => fp16}/gemm_fp16_packn.c | 5 +- source/c920_opt/{ => fp16}/matmul_fp16.c | 74 +- source/c920_opt/fp32/convolution.c | 150 + .../fp32/convolution_1x1_fp32_packn.c | 28 + .../{ => fp32}/convolution_3x3_fp32_packn.c | 2 +- .../fp32/convolution_gemm_fp32_packn.c | 28 + source/c920_opt/{ => fp32}/gemm_fp32_block.c | 16 +- source/c920_opt/{ => fp32}/gemm_fp32_packn.c | 5 +- source/c920_opt/{ => fp32}/matmul_fp32.c | 48 +- source/c920_opt/reorder.c | 18 +- source/c920_opt/setup.c | 20 +- source/c920_opt/utils.c | 2 +- source/c920_opt/yolov5.c | 2 +- source/c920_opt/yolox.c | 97 + source/c920v2_opt/CMakeLists.txt | 50 + source/c920v2_opt/Kconfig | 52 + source/c920v2_opt/capability.c | 36 + source/c920v2_opt/fp16/convolution.c | 186 + .../fp16/convolution_1x1_fp16_pack1ton.c | 28 + .../fp16/convolution_1x1_fp16_packn.c | 28 + .../fp16/convolution_1x1_fp16_packnto1.c | 28 + .../fp16/convolution_gemm_fp16_pack1ton.c | 30 + .../fp16/convolution_gemm_fp16_packn.c | 28 + .../fp16/convolution_gemm_fp16_packnto1.c | 30 + source/c920v2_opt/fp16/gemm_fp16_ncxhwx.S | 21 + source/c920v2_opt/fp16/gemm_fp16_packn.c | 52 + source/c920v2_opt/fp32/convolution.c | 149 + .../fp32/convolution_1x1_fp32_pack1ton.c | 28 + .../fp32/convolution_1x1_fp32_packn.c | 28 + .../fp32/convolution_1x1_fp32_packnto1.c | 28 + .../fp32/convolution_gemm_fp32_pack1ton.c | 30 + .../fp32/convolution_gemm_fp32_packn.c | 28 + .../fp32/convolution_gemm_fp32_packnto1.c | 30 + source/c920v2_opt/fp32/gemm_fp32_ncxhwx.S | 
21 + source/c920v2_opt/fp32/gemm_fp32_packn.c | 52 + source/c920v2_opt/int8/convolution.c | 203 + .../int8/convolution_1x1_int8_pack1ton.c | 34 + .../int8/convolution_1x1_int8_packn.c | 34 + .../int8/convolution_1x1_int8_packnto1.c | 34 + source/c920v2_opt/int8/gemm_int8_dot_ncxhwx.S | 21 + source/c920v2_opt/int8/gemm_int8_dot_packn.c | 47 + source/c920v2_opt/int8/gemm_int8_ncxhwx.S | 21 + source/c920v2_opt/int8/gemm_int8_packn.c | 52 + source/c920v2_opt/setup.c | 402 + source/c920v2_opt/utils.c | 35 + source/e907_opt/concat.c | 2 +- source/e907_opt/convolution.c | 2 +- source/e907_opt/fullyconnected.c | 2 +- source/e907_opt/fullyconnected_int8.c | 2 +- source/e907_opt/mul.c | 2 +- source/e907_opt/relu.c | 2 +- source/e907_opt/setup.c | 2 +- source/e907_opt/softmax.c | 2 +- source/e907_opt/sum.c | 2 +- source/e907_opt/utils.c | 2 +- source/graph_ref/avgpool3d.c | 16 +- source/graph_ref/batch_to_space.c | 6 +- source/graph_ref/batch_to_space_nd.c | 1 + source/graph_ref/concat.c | 4 +- source/graph_ref/convolution.c | 15 +- source/graph_ref/convolution1d.c | 21 +- source/graph_ref/convolution3d.c | 17 +- source/graph_ref/deconvolution.c | 16 +- source/graph_ref/deconvolution3d.c | 12 +- source/graph_ref/depth_to_space.c | 5 +- source/graph_ref/expand_dims.c | 1 + source/graph_ref/flatten.c | 1 + source/graph_ref/fullyconnected.c | 1 + source/graph_ref/gather.c | 1 + source/graph_ref/im2col.c | 6 +- source/graph_ref/instance_norm.c | 1 + source/graph_ref/matmul.c | 20 +- source/graph_ref/one_hot.c | 1 + source/graph_ref/pad.c | 11 +- source/graph_ref/sequence_mask.c | 2 + source/graph_ref/setup.c | 66 +- source/graph_ref/shape.c | 1 + source/graph_ref/slice.c | 1 + source/graph_ref/space_to_batch.c | 2 + source/graph_ref/space_to_batch_nd.c | 1 + source/graph_ref/space_to_depth.c | 2 + source/graph_ref/split.c | 1 + source/graph_ref/squeeze.c | 1 + source/graph_ref/strided_slice.c | 1 + source/graph_ref/subgraph.c | 1 + source/graph_ref/tile.c | 1 + 
source/graph_ref/topk.c | 1 + source/graph_ref/transpose.c | 1 + source/graph_ref/utils.c | 67 +- source/graph_ref/where.c | 4 + source/graph_ref/where_softmax.c | 2 + source/nn2/format.c | 342 +- source/nn2/setup.c | 4 + source/nn2/utils.c | 170 +- source/reference/abs.c | 2 +- source/reference/acos.c | 2 +- source/reference/acosh.c | 2 +- source/reference/add.c | 2 +- source/reference/and.c | 2 +- source/reference/arange.c | 2 +- source/reference/argmax.c | 2 +- source/reference/argmin.c | 2 +- source/reference/asin.c | 2 +- source/reference/asinh.c | 2 +- source/reference/atan.c | 2 +- source/reference/atanh.c | 2 +- source/reference/averagepool.c | 2 +- source/reference/averagepool3d.c | 2 +- source/reference/batch_normalization.c | 2 +- source/reference/batch_to_space.c | 2 +- source/reference/broadcast_to.c | 2 +- source/reference/cache_conv1d.c | 2 +- source/reference/cache_matmul.c | 2 +- source/reference/cast.c | 2 +- source/reference/ceil.c | 2 +- source/reference/clip.c | 2 +- source/reference/col2im.c | 2 +- source/reference/concat.c | 2 +- source/reference/convolution.c | 2 +- source/reference/convolution1d.c | 2 +- source/reference/convolution3d.c | 2 +- source/reference/convolution_channel.c | 2 +- source/reference/convolution_relu.c | 2 +- source/reference/convolution_relu6.c | 2 +- source/reference/cos.c | 2 +- source/reference/cosh.c | 2 +- source/reference/cumprod.c | 2 +- source/reference/cumsum.c | 2 +- source/reference/data_convert.c | 2 +- source/reference/deconvolution.c | 2 +- source/reference/deconvolution3d.c | 2 +- source/reference/depth_to_space.c | 2 +- source/reference/div.c | 2 +- source/reference/elu.c | 2 +- source/reference/equal.c | 2 +- source/reference/erf.c | 2 +- source/reference/exp.c | 2 +- source/reference/expand_dims.c | 2 +- source/reference/expm1.c | 2 +- source/reference/flatten.c | 2 +- source/reference/floor.c | 2 +- source/reference/floor_divide.c | 2 +- source/reference/floor_mod.c | 2 +- source/reference/fsmn.c | 
2 +- source/reference/fullyconnected.c | 2 +- source/reference/gather.c | 2 +- source/reference/gather_nd.c | 2 +- source/reference/global_averagepool.c | 2 +- source/reference/global_maxpool.c | 2 +- source/reference/greater.c | 2 +- source/reference/greater_equal.c | 2 +- source/reference/hard_sigmoid.c | 2 +- source/reference/im2col.c | 2 +- source/reference/instance_norm.c | 2 +- source/reference/isnan.c | 2 +- source/reference/l2_normalization.c | 2 +- source/reference/l2pool.c | 2 +- source/reference/layer_norm.c | 2 +- source/reference/leaky_relu.c | 2 +- source/reference/less.c | 2 +- source/reference/less_equal.c | 2 +- source/reference/log.c | 2 +- source/reference/log1p.c | 2 +- source/reference/log_softmax.c | 2 +- source/reference/logical_and.c | 2 +- source/reference/logical_not.c | 2 +- source/reference/logical_or.c | 2 +- source/reference/logical_xor.c | 2 +- source/reference/lrn.c | 2 +- source/reference/matmul.c | 2 +- source/reference/max.c | 2 +- source/reference/maximum.c | 2 +- source/reference/maxpool.c | 2 +- source/reference/maxpool2d_locat.c | 2 +- source/reference/maxpool3d.c | 2 +- source/reference/mean.c | 2 +- source/reference/min.c | 2 +- source/reference/minimum.c | 2 +- source/reference/mod.c | 2 +- source/reference/mul.c | 2 +- source/reference/ndarray_size.c | 2 +- source/reference/negative.c | 2 +- source/reference/non_max_suppression.c | 2 +- source/reference/not.c | 2 +- source/reference/not_equal.c | 2 +- source/reference/one_hot.c | 2 +- source/reference/or.c | 2 +- source/reference/pad.c | 2 +- source/reference/power.c | 2 +- source/reference/prelu.c | 2 +- source/reference/prod.c | 2 +- source/reference/proposal.c | 2 +- source/reference/psroipooling.c | 2 +- source/reference/reduce_logsumexp.c | 2 +- source/reference/reduce_max.c | 2 +- source/reference/reduce_mean.c | 2 +- source/reference/reduce_min.c | 2 +- source/reference/reduce_prod.c | 2 +- source/reference/reduce_sum.c | 2 +- source/reference/relu.c | 2 +- 
source/reference/relu1.c | 2 +- source/reference/relu6.c | 2 +- source/reference/relun.c | 2 +- source/reference/reshape.c | 2 +- source/reference/resize.c | 2 +- source/reference/reverse.c | 2 +- source/reference/roialign.c | 2 +- source/reference/roipool.c | 2 +- source/reference/round.c | 2 +- source/reference/rsqrt.c | 2 +- source/reference/scatter.c | 2 +- source/reference/segment_max.c | 2 +- source/reference/segment_mean.c | 2 +- source/reference/segment_min.c | 2 +- source/reference/segment_prod.c | 2 +- source/reference/segment_sum.c | 2 +- source/reference/select.c | 2 +- source/reference/setup.c | 6 +- source/reference/shape.c | 2 +- source/reference/shuffle_channel.c | 2 +- source/reference/sigmoid.c | 2 +- source/reference/sign.c | 2 +- source/reference/sin.c | 2 +- source/reference/sinh.c | 2 +- source/reference/slice.c | 2 +- source/reference/softmax.c | 2 +- source/reference/softplus.c | 2 +- source/reference/softrelu.c | 2 +- source/reference/softsign.c | 2 +- source/reference/space_to_batch.c | 2 +- source/reference/space_to_depth.c | 2 +- source/reference/split.c | 2 +- source/reference/sqrt.c | 2 +- source/reference/square.c | 2 +- source/reference/squeeze.c | 2 +- source/reference/stack.c | 2 +- source/reference/strided_slice.c | 4 +- source/reference/sub.c | 2 +- source/reference/sum.c | 2 +- source/reference/tan.c | 2 +- source/reference/tanh.c | 2 +- source/reference/threshold_relu.c | 2 +- source/reference/tile.c | 2 +- source/reference/topk.c | 2 +- source/reference/transpose.c | 2 +- source/reference/trunc.c | 2 +- source/reference/unpooling.c | 2 +- source/reference/unstack.c | 2 +- source/reference/utils.c | 2 +- source/reference/where.c | 2 +- source/reference/where_softmax.c | 2 +- source/reference/xor.c | 2 +- source/reference/yuv_rgb_scale.c | 2 +- source/thead_matrix/avgpool.c | 2 +- source/thead_matrix/convolution.c | 20 +- .../convolution_1x1_fp16_matrix.c | 2 +- .../convolution_1x1_int8_matrix.c | 2 +- 
.../convolution_3x3_fp16_matrix.c | 2 +- .../convolution_gemm_fp16_matrix.c | 2 +- .../convolution_gemm_int8_matrix.c | 2 +- source/thead_matrix/depthwise_convolution.c | 2 +- source/thead_matrix/gemm_fp16_matrix.c | 2 +- .../thead_matrix/gemm_fp16_matrix_intrinsic.c | 5 +- source/thead_matrix/gemm_int8_matrix.c | 2 +- source/thead_matrix/maxpool.c | 2 +- source/thead_matrix/setup.c | 2 +- source/thead_matrix/utils.c | 2 +- source/thead_rvv/CMakeLists.txt | 103 +- source/thead_rvv/Kconfig | 157 +- source/thead_rvv/binary_broadcast.c | 569 + source/thead_rvv/capability.c | 178 +- source/thead_rvv/data_convert.c | 2 +- source/thead_rvv/fp16/add.c | 44 +- source/thead_rvv/fp16/avgpool.c | 4 +- source/thead_rvv/fp16/avgpool_2x2_fp16.c | 2 +- .../thead_rvv/fp16/avgpool_2x2_fp16_packn.c | 2 +- source/thead_rvv/fp16/avgpool_3x3_fp16.c | 2 +- .../thead_rvv/fp16/avgpool_3x3_fp16_packn.c | 2 +- source/thead_rvv/fp16/avgpool_fp16_nhwc.c | 2 +- source/thead_rvv/fp16/avgpool_fp16_packn.c | 2 +- source/thead_rvv/fp16/clip.c | 2 +- source/thead_rvv/fp16/concat.c | 6 +- source/thead_rvv/fp16/convolution.c | 87 +- source/thead_rvv/fp16/convolution1d.c | 51 + .../thead_rvv/fp16/convolution1d_gemm_fp16.c | 237 + source/thead_rvv/fp16/convolution_1x1_fp16.c | 66 +- .../fp16/convolution_1x1_fp16_pack1ton.c | 60 +- .../fp16/convolution_1x1_fp16_packn.c | 57 +- .../fp16/convolution_1x1_fp16_packnto1.c | 57 +- .../fp16/convolution_3x3_fp16_packn.c | 2 +- .../thead_rvv/fp16/convolution_direct_fp16.c | 2 +- source/thead_rvv/fp16/convolution_gemm_fp16.c | 153 +- .../fp16/convolution_gemm_fp16_pack1ton.c | 157 +- .../fp16/convolution_gemm_fp16_packn.c | 183 +- .../fp16/convolution_gemm_fp16_packnto1.c | 219 +- source/thead_rvv/fp16/deconvolution.c | 49 + .../thead_rvv/fp16/deconvolution_gemm_fp16.c | 324 + source/thead_rvv/fp16/depthwise_convolution.c | 11 +- .../fp16/depthwise_convolution_3x3_fp16.c | 66 +- .../depthwise_convolution_3x3_fp16_packn.c | 114 +- 
.../fp16/depthwise_convolution_fp16_nhwc.c | 2 +- .../fp16/depthwise_convolution_fp16_packn.c | 46 +- source/thead_rvv/fp16/div.c | 72 + source/thead_rvv/fp16/erf.c | 86 + source/thead_rvv/fp16/fullyconnected.c | 2 +- source/thead_rvv/fp16/fullyconnected_fp16.c | 24 +- source/thead_rvv/fp16/gather.c | 6 +- source/thead_rvv/fp16/gemm_fp16.c | 2 +- source/thead_rvv/fp16/gemm_fp16_block.c | 21 +- source/thead_rvv/fp16/gemm_fp16_packn.c | 8 +- source/thead_rvv/fp16/global_avgpool.c | 2 +- source/thead_rvv/fp16/global_avgpool_nhwc.c | 2 +- source/thead_rvv/fp16/global_avgpool_packn.c | 2 +- source/thead_rvv/fp16/global_maxpool.c | 2 +- source/thead_rvv/fp16/global_maxpool_nhwc.c | 2 +- source/thead_rvv/fp16/global_maxpool_packn.c | 2 +- source/thead_rvv/fp16/layer_norm.c | 2 +- source/thead_rvv/fp16/leaky_relu.c | 2 +- source/thead_rvv/fp16/matmul.c | 78 +- source/thead_rvv/fp16/maxpool.c | 4 +- source/thead_rvv/fp16/maxpool_2x2_fp16.c | 2 +- .../thead_rvv/fp16/maxpool_2x2_fp16_packn.c | 2 +- source/thead_rvv/fp16/maxpool_3x3_fp16.c | 2 +- .../thead_rvv/fp16/maxpool_3x3_fp16_packn.c | 2 +- source/thead_rvv/fp16/maxpool_fp16_nhwc.c | 2 +- source/thead_rvv/fp16/maxpool_fp16_packn.c | 2 +- source/thead_rvv/fp16/mul.c | 44 +- source/thead_rvv/fp16/pad.c | 2 +- source/thead_rvv/fp16/prelu.c | 2 +- source/thead_rvv/fp16/relu.c | 2 +- source/thead_rvv/fp16/relu6.c | 2 +- source/thead_rvv/fp16/reshape.c | 4 +- source/thead_rvv/fp16/sigmoid.c | 2 +- source/thead_rvv/fp16/softmax.c | 2 +- source/thead_rvv/fp16/strided_slice.c | 8 +- source/thead_rvv/fp16/sub.c | 72 + source/thead_rvv/fp16/transpose.c | 7 +- source/thead_rvv/fp32/add.c | 44 +- source/thead_rvv/fp32/avgpool.c | 4 +- source/thead_rvv/fp32/avgpool_2x2_fp32.c | 2 +- .../thead_rvv/fp32/avgpool_2x2_fp32_packn.c | 2 +- source/thead_rvv/fp32/avgpool_3x3_fp32.c | 2 +- .../thead_rvv/fp32/avgpool_3x3_fp32_packn.c | 2 +- source/thead_rvv/fp32/avgpool_fp32_nhwc.c | 2 +- source/thead_rvv/fp32/avgpool_fp32_packn.c | 2 +- 
source/thead_rvv/fp32/clip.c | 2 +- source/thead_rvv/fp32/concat.c | 2 +- source/thead_rvv/fp32/convolution.c | 27 +- source/thead_rvv/fp32/convolution1d.c | 47 + .../thead_rvv/fp32/convolution1d_gemm_fp32.c | 113 + source/thead_rvv/fp32/convolution_1x1_fp32.c | 34 +- .../fp32/convolution_1x1_fp32_pack1ton.c | 23 +- .../fp32/convolution_1x1_fp32_packn.c | 24 +- .../fp32/convolution_1x1_fp32_packnto1.c | 24 +- .../fp32/convolution_3x3_fp32_packn.c | 2 +- source/thead_rvv/fp32/convolution_gemm_fp32.c | 30 +- .../fp32/convolution_gemm_fp32_pack1ton.c | 24 +- .../fp32/convolution_gemm_fp32_packn.c | 23 +- .../fp32/convolution_gemm_fp32_packnto1.c | 23 +- source/thead_rvv/fp32/deconvolution.c | 45 + .../thead_rvv/fp32/deconvolution_gemm_fp32.c | 179 + source/thead_rvv/fp32/depthwise_convolution.c | 5 +- .../fp32/depthwise_convolution_3x3_fp32.c | 2 +- .../depthwise_convolution_3x3_fp32_packn.c | 2 +- .../fp32/depthwise_convolution_fp32_nhwc.c | 2 +- .../fp32/depthwise_convolution_fp32_packn.c | 2 +- source/thead_rvv/fp32/div.c | 72 + source/thead_rvv/fp32/erf.c | 86 + source/thead_rvv/fp32/fullyconnected.c | 2 +- source/thead_rvv/fp32/fullyconnected_fp32.c | 6 +- source/thead_rvv/fp32/gather.c | 72 + source/thead_rvv/fp32/gemm_fp32.c | 4 +- source/thead_rvv/fp32/gemm_fp32_block.c | 21 +- source/thead_rvv/fp32/gemm_fp32_packn.c | 8 +- source/thead_rvv/fp32/global_avgpool.c | 2 +- source/thead_rvv/fp32/global_avgpool_nhwc.c | 2 +- source/thead_rvv/fp32/global_avgpool_packn.c | 2 +- source/thead_rvv/fp32/global_maxpool.c | 2 +- source/thead_rvv/fp32/global_maxpool_nhwc.c | 2 +- source/thead_rvv/fp32/global_maxpool_packn.c | 2 +- source/thead_rvv/fp32/layer_norm.c | 2 +- source/thead_rvv/fp32/leaky_relu.c | 2 +- source/thead_rvv/fp32/matmul.c | 51 +- source/thead_rvv/fp32/maxpool.c | 4 +- source/thead_rvv/fp32/maxpool_2x2_fp32.c | 2 +- .../thead_rvv/fp32/maxpool_2x2_fp32_packn.c | 2 +- source/thead_rvv/fp32/maxpool_3x3_fp32.c | 2 +- 
.../thead_rvv/fp32/maxpool_3x3_fp32_packn.c | 2 +- source/thead_rvv/fp32/maxpool_fp32_nhwc.c | 2 +- source/thead_rvv/fp32/maxpool_fp32_packn.c | 2 +- source/thead_rvv/fp32/mul.c | 44 +- source/thead_rvv/fp32/pad.c | 2 +- source/thead_rvv/fp32/prelu.c | 2 +- source/thead_rvv/fp32/relu.c | 2 +- source/thead_rvv/fp32/relu6.c | 2 +- source/thead_rvv/fp32/reshape.c | 4 +- source/thead_rvv/fp32/sigmoid.c | 2 +- source/thead_rvv/fp32/softmax.c | 2 +- source/thead_rvv/fp32/sub.c | 72 + source/thead_rvv/fp32/transpose.c | 7 +- source/thead_rvv/int4/convolution.c | 10 +- source/thead_rvv/int4/convolution_1x1_int4.c | 2 +- .../int4/convolution_1x1_int4_packn.c | 5 +- source/thead_rvv/int4/convolution_gemm_int4.c | 2 +- .../int4/convolution_gemm_int4_packn.c | 2 +- source/thead_rvv/int4/depthwise_convolution.c | 2 +- .../int4/depthwise_convolution_3x3_int4.c | 2 +- source/thead_rvv/int4/fullyconnected_int4.c | 2 +- source/thead_rvv/int4/gemm_int4_dot.c | 2 +- source/thead_rvv/int4/gemm_int4_dot_packn.c | 2 +- source/thead_rvv/int8/add.c | 91 +- source/thead_rvv/int8/avgpool.c | 4 +- .../thead_rvv/int8/avgpool_2x2_int8_packn.c | 2 +- .../thead_rvv/int8/avgpool_3x3_int8_packn.c | 2 +- source/thead_rvv/int8/avgpool_int8_nhwc.c | 2 +- source/thead_rvv/int8/avgpool_int8_packn.c | 2 +- source/thead_rvv/int8/clip.c | 2 +- source/thead_rvv/int8/concat.c | 2 +- source/thead_rvv/int8/convolution.c | 27 +- source/thead_rvv/int8/convolution1d.c | 6 +- source/thead_rvv/int8/convolution1d_1_int8.c | 6 +- source/thead_rvv/int8/convolution_1x1_int8.c | 39 +- .../int8/convolution_1x1_int8_pack1ton.c | 38 +- .../int8/convolution_1x1_int8_packn.c | 40 +- .../int8/convolution_1x1_int8_packnto1.c | 40 +- .../int8/convolution_3x3_int8_packn.c | 5 +- source/thead_rvv/int8/convolution_gemm_int8.c | 36 +- .../int8/convolution_gemm_int8_pack1ton.c | 66 +- .../int8/convolution_gemm_int8_packn.c | 52 +- .../int8/convolution_gemm_int8_packnto1.c | 52 +- source/thead_rvv/int8/depthwise_convolution.c | 5 +- 
.../int8/depthwise_convolution1d_int8.c | 6 +- .../int8/depthwise_convolution_3x3_int8.c | 2 +- .../depthwise_convolution_3x3_int8_packn.c | 2 +- .../int8/depthwise_convolution_int8_nhwc.c | 2 +- .../int8/depthwise_convolution_int8_packn.c | 2 +- source/thead_rvv/int8/div.c | 109 + source/thead_rvv/int8/erf.c | 25 + source/thead_rvv/int8/fullyconnected.c | 2 +- source/thead_rvv/int8/fullyconnected_int8.c | 6 +- source/thead_rvv/int8/gather.c | 9 +- source/thead_rvv/int8/gemm_int8.c | 2 +- source/thead_rvv/int8/gemm_int8_4xn.c | 2 +- source/thead_rvv/int8/gemm_int8_dot.c | 2 +- source/thead_rvv/int8/gemm_int8_dot_packn.c | 12 +- source/thead_rvv/int8/gemm_int8_packn.c | 5 +- source/thead_rvv/int8/global_avgpool_nhwc.c | 2 +- source/thead_rvv/int8/global_avgpool_packn.c | 2 +- source/thead_rvv/int8/global_maxpool_nhwc.c | 2 +- source/thead_rvv/int8/global_maxpool_packn.c | 2 +- source/thead_rvv/int8/layer_norm.c | 58 +- source/thead_rvv/int8/leaky_relu.c | 2 +- source/thead_rvv/int8/matmul.c | 182 + source/thead_rvv/int8/matmul_int8.c | 329 +- source/thead_rvv/int8/matmul_int8_dot.c | 466 + source/thead_rvv/int8/maxpool.c | 4 +- source/thead_rvv/int8/maxpool_2x2_int8.c | 2 +- .../thead_rvv/int8/maxpool_2x2_int8_packn.c | 2 +- source/thead_rvv/int8/maxpool_3x3_int8.c | 2 +- .../thead_rvv/int8/maxpool_3x3_int8_packn.c | 2 +- source/thead_rvv/int8/maxpool_int8_nhwc.c | 2 +- source/thead_rvv/int8/maxpool_int8_packn.c | 2 +- source/thead_rvv/int8/mul.c | 95 +- source/thead_rvv/int8/pad.c | 2 +- source/thead_rvv/int8/prelu.c | 2 +- source/thead_rvv/int8/reduce_sum.c | 6 +- source/thead_rvv/int8/relu.c | 2 +- source/thead_rvv/int8/relu6.c | 25 + source/thead_rvv/int8/reshape.c | 4 +- source/thead_rvv/int8/sigmoid.c | 25 + source/thead_rvv/int8/softmax.c | 25 + source/thead_rvv/int8/sub.c | 111 + source/thead_rvv/int8/transpose.c | 7 +- source/thead_rvv/reorder.c | 40 +- source/thead_rvv/setup.c | 91 +- source/thead_rvv/utils.c | 1011 +- source/tvm_gen/setup.c | 2 +- 
source/tvm_gen/utils.c | 2 +- source/utils/debug.c | 11 +- source/utils/export.c | 30 + source/utils/export_json_wrapper.cpp | 688 + source/utils/export_json_wrapper.h | 36 + source/utils/memory.c | 2 +- version | 2 +- 680 files changed, 41164 insertions(+), 4724 deletions(-) create mode 100644 cmake/c920v2.cmake rename include/{shl_c906.h => backend/c906/c906.h} (99%) rename source/c906_opt/shl_c906_cap.h => include/backend/c906/cap.h (100%) rename include/{shl_c908.h => backend/c908/c908.h} (97%) rename include/{shl_c920.h => backend/c920/c920.h} (96%) rename source/c920_opt/shl_c920_cap.h => include/backend/c920/cap.h (100%) create mode 100644 include/backend/c920v2/c920v2.h create mode 100644 include/backend/c920v2/cap.h rename include/{shl_e907.h => backend/e907/e907.h} (99%) create mode 100644 include/backend/pnna/pnna.h create mode 100644 include/backend/pnna/wrapper.h rename include/{shl_ref.h => backend/reference/ref.h} (99%) rename include/{shl_thead_rvm.h => backend/rvm/rvm.h} (99%) rename source/thead_rvv/shl_thead_rvv_cap.h => include/backend/rvv/cap.h (87%) rename include/{shl_thead_rvv.h => backend/rvv/rvv.h} (73%) rename include/{ => backend/tvmgen}/shl_tvmgen.h (98%) rename include/{ => csinn}/csi_nn.h (99%) rename include/{ => csinn}/csinn_data_structure.h (99%) rename include/{ => csinn}/csinn_runtime.h (100%) rename include/{ => graph}/shl_gref.h (99%) rename include/{ => graph}/shl_node.h (100%) create mode 100644 include/shl_public/shl_c906.h create mode 100644 include/shl_public/shl_c920.h create mode 100644 include/shl_public/shl_pnna.h create mode 100644 include/shl_public/shl_ref.h create mode 100644 module/json/json.hpp create mode 100644 source/c920_opt/CMakeLists.txt create mode 100644 source/c920_opt/Kconfig delete mode 100644 source/c920_opt/convolution.c delete mode 100644 source/c920_opt/convolution_1x1_fp16_packn.c delete mode 100644 source/c920_opt/convolution_1x1_fp32_packn.c delete mode 100644 
source/c920_opt/convolution_gemm_fp16_packn.c delete mode 100644 source/c920_opt/convolution_gemm_fp32_packn.c create mode 100644 source/c920_opt/fp16/convolution.c create mode 100644 source/c920_opt/fp16/convolution_1x1_fp16_packn.c rename source/c920_opt/{ => fp16}/convolution_3x3_fp16_packn.c (99%) create mode 100644 source/c920_opt/fp16/convolution_gemm_fp16_packn.c rename source/c920_opt/{ => fp16}/gemm_fp16_block.c (96%) rename source/c920_opt/{ => fp16}/gemm_fp16_packn.c (99%) rename source/c920_opt/{ => fp16}/matmul_fp16.c (80%) create mode 100644 source/c920_opt/fp32/convolution.c create mode 100644 source/c920_opt/fp32/convolution_1x1_fp32_packn.c rename source/c920_opt/{ => fp32}/convolution_3x3_fp32_packn.c (99%) create mode 100644 source/c920_opt/fp32/convolution_gemm_fp32_packn.c rename source/c920_opt/{ => fp32}/gemm_fp32_block.c (98%) rename source/c920_opt/{ => fp32}/gemm_fp32_packn.c (99%) rename source/c920_opt/{ => fp32}/matmul_fp32.c (81%) create mode 100644 source/c920_opt/yolox.c create mode 100644 source/c920v2_opt/CMakeLists.txt create mode 100644 source/c920v2_opt/Kconfig create mode 100644 source/c920v2_opt/capability.c create mode 100644 source/c920v2_opt/fp16/convolution.c create mode 100644 source/c920v2_opt/fp16/convolution_1x1_fp16_pack1ton.c create mode 100644 source/c920v2_opt/fp16/convolution_1x1_fp16_packn.c create mode 100644 source/c920v2_opt/fp16/convolution_1x1_fp16_packnto1.c create mode 100644 source/c920v2_opt/fp16/convolution_gemm_fp16_pack1ton.c create mode 100644 source/c920v2_opt/fp16/convolution_gemm_fp16_packn.c create mode 100644 source/c920v2_opt/fp16/convolution_gemm_fp16_packnto1.c create mode 100644 source/c920v2_opt/fp16/gemm_fp16_ncxhwx.S create mode 100644 source/c920v2_opt/fp16/gemm_fp16_packn.c create mode 100644 source/c920v2_opt/fp32/convolution.c create mode 100644 source/c920v2_opt/fp32/convolution_1x1_fp32_pack1ton.c create mode 100644 source/c920v2_opt/fp32/convolution_1x1_fp32_packn.c create mode 
100644 source/c920v2_opt/fp32/convolution_1x1_fp32_packnto1.c create mode 100644 source/c920v2_opt/fp32/convolution_gemm_fp32_pack1ton.c create mode 100644 source/c920v2_opt/fp32/convolution_gemm_fp32_packn.c create mode 100644 source/c920v2_opt/fp32/convolution_gemm_fp32_packnto1.c create mode 100644 source/c920v2_opt/fp32/gemm_fp32_ncxhwx.S create mode 100644 source/c920v2_opt/fp32/gemm_fp32_packn.c create mode 100644 source/c920v2_opt/int8/convolution.c create mode 100644 source/c920v2_opt/int8/convolution_1x1_int8_pack1ton.c create mode 100644 source/c920v2_opt/int8/convolution_1x1_int8_packn.c create mode 100644 source/c920v2_opt/int8/convolution_1x1_int8_packnto1.c create mode 100644 source/c920v2_opt/int8/gemm_int8_dot_ncxhwx.S create mode 100644 source/c920v2_opt/int8/gemm_int8_dot_packn.c create mode 100644 source/c920v2_opt/int8/gemm_int8_ncxhwx.S create mode 100644 source/c920v2_opt/int8/gemm_int8_packn.c create mode 100644 source/c920v2_opt/setup.c create mode 100644 source/c920v2_opt/utils.c create mode 100644 source/thead_rvv/binary_broadcast.c create mode 100644 source/thead_rvv/fp16/convolution1d.c create mode 100644 source/thead_rvv/fp16/convolution1d_gemm_fp16.c create mode 100644 source/thead_rvv/fp16/deconvolution.c create mode 100644 source/thead_rvv/fp16/deconvolution_gemm_fp16.c create mode 100644 source/thead_rvv/fp16/div.c create mode 100644 source/thead_rvv/fp16/erf.c create mode 100644 source/thead_rvv/fp16/sub.c create mode 100644 source/thead_rvv/fp32/convolution1d.c create mode 100644 source/thead_rvv/fp32/convolution1d_gemm_fp32.c create mode 100644 source/thead_rvv/fp32/deconvolution.c create mode 100644 source/thead_rvv/fp32/deconvolution_gemm_fp32.c create mode 100644 source/thead_rvv/fp32/div.c create mode 100644 source/thead_rvv/fp32/erf.c create mode 100644 source/thead_rvv/fp32/gather.c create mode 100644 source/thead_rvv/fp32/sub.c create mode 100644 source/thead_rvv/int8/div.c create mode 100644 source/thead_rvv/int8/erf.c 
create mode 100644 source/thead_rvv/int8/matmul.c create mode 100644 source/thead_rvv/int8/matmul_int8_dot.c create mode 100644 source/thead_rvv/int8/relu6.c create mode 100644 source/thead_rvv/int8/sigmoid.c create mode 100644 source/thead_rvv/int8/softmax.c create mode 100644 source/thead_rvv/int8/sub.c create mode 100644 source/utils/export.c create mode 100644 source/utils/export_json_wrapper.cpp create mode 100644 source/utils/export_json_wrapper.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 94253829..ee7d50ef 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -109,6 +109,22 @@ if(CONFIG_BUILD_RISCV_C920) target_include_directories(${SHL_LIB_TARGET} PRIVATE module/dlpack/include/) endif() +if(CONFIG_BUILD_RISCV_C920V2) + # build c920v2 lib + if (NOT CONFIG_CUSTOM_SOURCE_SELECT) + include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/c920v2.cmake) + endif() + include(cmake/rules.cmake) + + set(SHL_LIB_TARGET "c920v2_lib") + set(SHL_LIB_NAME shl_c920v2) + LIST(APPEND SHL_BUILD_SRC_LST ${NN2_SRCS} ${REF_SRCS} ${GREF_SRCS} ${THEAD_RVV_SRCS} ${C920V2_SRCS}) + set(SHL_BUILD_C_FLAGS -ffp-contract=off -march=rv64gcv_zfh_xtheadc_xtheadvdot -mabi=lp64d -DSHL_BUILD_C920V2 -DSHL_BUILD_REF -DSHL_BUILD_GREF -DSHL_BUILD_RVV) + include(cmake/target_build.cmake) + target_include_directories(${SHL_LIB_TARGET} PRIVATE module/dlpack/include/) +endif() + + if(CONFIG_BUILD_RISCV_ELF_C906) # build c906 elf a if (NOT CONFIG_CUSTOM_SOURCE_SELECT) diff --git a/Makefile b/Makefile index 00a67349..b12a4bca 100644 --- a/Makefile +++ b/Makefile @@ -28,6 +28,9 @@ nn2_c908: nn2_c920: mkdir -p c920_build; cd c920_build; cmake ../ -DCONFIG_BUILD_RISCV_C920=ON -DCONFIG_SHL_BUILD_STATIC=ON -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=${INSTALL_DIR}/c920/; make -j${USE_CORE}; make install; cd - +nn2_c920v2: + mkdir -p c920v2_build; cd c920v2_build; cmake ../ -DCONFIG_BUILD_RISCV_C920V2=ON -DCONFIG_SHL_BUILD_STATIC=ON -DCMAKE_BUILD_TYPE=${BUILD_TYPE} 
-DCMAKE_INSTALL_PREFIX=${INSTALL_DIR}/c920v2/; make -j${USE_CORE}; make install; cd - + nn2_c920_so: mkdir -p c920_build_so; cd c920_build_so; cmake ../ -DCONFIG_BUILD_RISCV_C920=ON -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=${INSTALL_DIR}/c920/; make -j${USE_CORE}; make install; cd - diff --git a/cmake/c906_elf.cmake b/cmake/c906_elf.cmake index bfb56989..1ce73c27 100644 --- a/cmake/c906_elf.cmake +++ b/cmake/c906_elf.cmake @@ -338,11 +338,24 @@ set(CONFIG_THEAD_RVV_CONVOLUTION_INT8 ON) set(CONFIG_THEAD_RVV_DEPTHWISE_CONVOLUTION_FP32 ON) set(CONFIG_THEAD_RVV_DEPTHWISE_CONVOLUTION_FP16 ON) set(CONFIG_THEAD_RVV_DEPTHWISE_CONVOLUTION_INT8 ON) +set(CONFIG_THEAD_RVV_CONVOLUTION1D_FP32 ON) +set(CONFIG_THEAD_RVV_CONVOLUTION1D_FP16 ON) set(CONFIG_THEAD_RVV_CONVOLUTION1D_INT8 ON) set(CONFIG_THEAD_RVV_DEPTHWISE_CONVOLUTION1D_INT8 ON) +set(CONFIG_THEAD_RVV_DECONVOLUTION_FP32 ON) +set(CONFIG_THEAD_RVV_DECONVOLUTION_FP16 ON) +set(CONFIG_THEAD_RVV_DIV_FP32 ON) +set(CONFIG_THEAD_RVV_DIV_FP16 ON) +set(CONFIG_THEAD_RVV_DIV_INT8 ON) +set(CONFIG_THEAD_RVV_ERF_FP32 ON) +set(CONFIG_THEAD_RVV_ERF_FP16 ON) +set(CONFIG_THEAD_RVV_ERF_INT8 ON) set(CONFIG_THEAD_RVV_FULLYCONNECTED_FP32 ON) set(CONFIG_THEAD_RVV_FULLYCONNECTED_FP16 ON) set(CONFIG_THEAD_RVV_FULLYCONNECTED_INT8 ON) +set(CONFIG_THEAD_RVV_GATHER_FP32 ON) +set(CONFIG_THEAD_RVV_GATHER_FP16 ON) +set(CONFIG_THEAD_RVV_GATHER_INT8 ON) set(CONFIG_THEAD_RVV_GEMM_FP32 ON) set(CONFIG_THEAD_RVV_GEMM_FP16 ON) set(CONFIG_THEAD_RVV_GEMM_INT8 ON) @@ -373,6 +386,7 @@ set(CONFIG_THEAD_RVV_PAD_INT8 ON) set(CONFIG_THEAD_RVV_PRELU_FP32 ON) set(CONFIG_THEAD_RVV_PRELU_FP16 ON) set(CONFIG_THEAD_RVV_PRELU_INT8 ON) +set(CONFIG_THEAD_RVV_REDUCE_SUM_INT8 ON) set(CONFIG_THEAD_RVV_RELU_FP32 ON) set(CONFIG_THEAD_RVV_RELU_FP16 ON) set(CONFIG_THEAD_RVV_RELU_INT8 ON) @@ -384,15 +398,17 @@ set(CONFIG_THEAD_RVV_RESHAPE_FP16 ON) set(CONFIG_THEAD_RVV_RESHAPE_INT8 ON) set(CONFIG_THEAD_RVV_SIGMOID_FP32 ON) set(CONFIG_THEAD_RVV_SIGMOID_FP16 ON) 
+set(CONFIG_THEAD_RVV_SIGMOID_INT8 ON) +set(CONFIG_THEAD_RVV_SUB_FP32 ON) +set(CONFIG_THEAD_RVV_SUB_FP16 ON) +set(CONFIG_THEAD_RVV_SUB_INT8 ON) set(CONFIG_THEAD_RVV_SOFTMAX_FP32 ON) set(CONFIG_THEAD_RVV_SOFTMAX_FP16 ON) -set(CONFIG_THEAD_RVV_REDUCE_SUM_INT8 ON) +set(CONFIG_THEAD_RVV_SOFTMAX_INT8 ON) +set(CONFIG_THEAD_RVV_STRIDED_SLICE_FP16 ON) set(CONFIG_THEAD_RVV_TRANSPOSE_FP32 ON) set(CONFIG_THEAD_RVV_TRANSPOSE_FP16 ON) set(CONFIG_THEAD_RVV_TRANSPOSE_INT8 ON) -set(CONFIG_THEAD_RVV_GATHER_FP16 ON) -set(CONFIG_THEAD_RVV_GATHER_INT8 ON) -set(CONFIG_THEAD_RVV_STRIDED_SLICE_FP16 ON) set(CONFIG_C906_SOURCE ON) set(CONFIG_C906_ABS_FP32 ON) set(CONFIG_C906_ABS_FP16 ON) diff --git a/cmake/c906_share.cmake b/cmake/c906_share.cmake index bfb56989..1ce73c27 100644 --- a/cmake/c906_share.cmake +++ b/cmake/c906_share.cmake @@ -338,11 +338,24 @@ set(CONFIG_THEAD_RVV_CONVOLUTION_INT8 ON) set(CONFIG_THEAD_RVV_DEPTHWISE_CONVOLUTION_FP32 ON) set(CONFIG_THEAD_RVV_DEPTHWISE_CONVOLUTION_FP16 ON) set(CONFIG_THEAD_RVV_DEPTHWISE_CONVOLUTION_INT8 ON) +set(CONFIG_THEAD_RVV_CONVOLUTION1D_FP32 ON) +set(CONFIG_THEAD_RVV_CONVOLUTION1D_FP16 ON) set(CONFIG_THEAD_RVV_CONVOLUTION1D_INT8 ON) set(CONFIG_THEAD_RVV_DEPTHWISE_CONVOLUTION1D_INT8 ON) +set(CONFIG_THEAD_RVV_DECONVOLUTION_FP32 ON) +set(CONFIG_THEAD_RVV_DECONVOLUTION_FP16 ON) +set(CONFIG_THEAD_RVV_DIV_FP32 ON) +set(CONFIG_THEAD_RVV_DIV_FP16 ON) +set(CONFIG_THEAD_RVV_DIV_INT8 ON) +set(CONFIG_THEAD_RVV_ERF_FP32 ON) +set(CONFIG_THEAD_RVV_ERF_FP16 ON) +set(CONFIG_THEAD_RVV_ERF_INT8 ON) set(CONFIG_THEAD_RVV_FULLYCONNECTED_FP32 ON) set(CONFIG_THEAD_RVV_FULLYCONNECTED_FP16 ON) set(CONFIG_THEAD_RVV_FULLYCONNECTED_INT8 ON) +set(CONFIG_THEAD_RVV_GATHER_FP32 ON) +set(CONFIG_THEAD_RVV_GATHER_FP16 ON) +set(CONFIG_THEAD_RVV_GATHER_INT8 ON) set(CONFIG_THEAD_RVV_GEMM_FP32 ON) set(CONFIG_THEAD_RVV_GEMM_FP16 ON) set(CONFIG_THEAD_RVV_GEMM_INT8 ON) @@ -373,6 +386,7 @@ set(CONFIG_THEAD_RVV_PAD_INT8 ON) set(CONFIG_THEAD_RVV_PRELU_FP32 ON) 
set(CONFIG_THEAD_RVV_PRELU_FP16 ON) set(CONFIG_THEAD_RVV_PRELU_INT8 ON) +set(CONFIG_THEAD_RVV_REDUCE_SUM_INT8 ON) set(CONFIG_THEAD_RVV_RELU_FP32 ON) set(CONFIG_THEAD_RVV_RELU_FP16 ON) set(CONFIG_THEAD_RVV_RELU_INT8 ON) @@ -384,15 +398,17 @@ set(CONFIG_THEAD_RVV_RESHAPE_FP16 ON) set(CONFIG_THEAD_RVV_RESHAPE_INT8 ON) set(CONFIG_THEAD_RVV_SIGMOID_FP32 ON) set(CONFIG_THEAD_RVV_SIGMOID_FP16 ON) +set(CONFIG_THEAD_RVV_SIGMOID_INT8 ON) +set(CONFIG_THEAD_RVV_SUB_FP32 ON) +set(CONFIG_THEAD_RVV_SUB_FP16 ON) +set(CONFIG_THEAD_RVV_SUB_INT8 ON) set(CONFIG_THEAD_RVV_SOFTMAX_FP32 ON) set(CONFIG_THEAD_RVV_SOFTMAX_FP16 ON) -set(CONFIG_THEAD_RVV_REDUCE_SUM_INT8 ON) +set(CONFIG_THEAD_RVV_SOFTMAX_INT8 ON) +set(CONFIG_THEAD_RVV_STRIDED_SLICE_FP16 ON) set(CONFIG_THEAD_RVV_TRANSPOSE_FP32 ON) set(CONFIG_THEAD_RVV_TRANSPOSE_FP16 ON) set(CONFIG_THEAD_RVV_TRANSPOSE_INT8 ON) -set(CONFIG_THEAD_RVV_GATHER_FP16 ON) -set(CONFIG_THEAD_RVV_GATHER_INT8 ON) -set(CONFIG_THEAD_RVV_STRIDED_SLICE_FP16 ON) set(CONFIG_C906_SOURCE ON) set(CONFIG_C906_ABS_FP32 ON) set(CONFIG_C906_ABS_FP16 ON) diff --git a/cmake/c906_static.cmake b/cmake/c906_static.cmake index cb722f6c..39a76d82 100644 --- a/cmake/c906_static.cmake +++ b/cmake/c906_static.cmake @@ -319,6 +319,7 @@ set(CONFIG_GRAPH_REFERENCE_XOR ON) set(CONFIG_GRAPH_REFERENCE_YUV_RGB_SCALE ON) set(CONFIG_GRAPH_REFERENCE_ONE_HOT ON) set(CONFIG_GRAPH_REFERENCE_INSTANCE_NORM ON) +set(CONFIG_GRAPH_REFERENCE_TVMGEN ON) set(CONFIG_THEAD_RVV_SOURCE ON) set(CONFIG_THEAD_RVV_ADD_FP32 ON) set(CONFIG_THEAD_RVV_ADD_FP16 ON) @@ -340,12 +341,25 @@ set(CONFIG_THEAD_RVV_DEPTHWISE_CONVOLUTION_FP32 ON) set(CONFIG_THEAD_RVV_DEPTHWISE_CONVOLUTION_FP16 ON) set(CONFIG_THEAD_RVV_DEPTHWISE_CONVOLUTION_INT8 ON) set(CONFIG_THEAD_RVV_DEPTHWISE_CONVOLUTION_INT4 ON) +set(CONFIG_THEAD_RVV_CONVOLUTION1D_FP32 ON) +set(CONFIG_THEAD_RVV_CONVOLUTION1D_FP16 ON) set(CONFIG_THEAD_RVV_CONVOLUTION1D_INT8 ON) set(CONFIG_THEAD_RVV_DEPTHWISE_CONVOLUTION1D_INT8 ON) 
+set(CONFIG_THEAD_RVV_DECONVOLUTION_FP32 ON) +set(CONFIG_THEAD_RVV_DECONVOLUTION_FP16 ON) +set(CONFIG_THEAD_RVV_DIV_FP32 ON) +set(CONFIG_THEAD_RVV_DIV_FP16 ON) +set(CONFIG_THEAD_RVV_DIV_INT8 ON) +set(CONFIG_THEAD_RVV_ERF_FP32 ON) +set(CONFIG_THEAD_RVV_ERF_FP16 ON) +set(CONFIG_THEAD_RVV_ERF_INT8 ON) set(CONFIG_THEAD_RVV_FULLYCONNECTED_FP32 ON) set(CONFIG_THEAD_RVV_FULLYCONNECTED_FP16 ON) set(CONFIG_THEAD_RVV_FULLYCONNECTED_INT8 ON) set(CONFIG_THEAD_RVV_FULLYCONNECTED_INT4 ON) +set(CONFIG_THEAD_RVV_GATHER_FP32 ON) +set(CONFIG_THEAD_RVV_GATHER_FP16 ON) +set(CONFIG_THEAD_RVV_GATHER_INT8 ON) set(CONFIG_THEAD_RVV_GEMM_FP32 ON) set(CONFIG_THEAD_RVV_GEMM_FP16 ON) set(CONFIG_THEAD_RVV_GEMM_INT8 ON) @@ -377,6 +391,7 @@ set(CONFIG_THEAD_RVV_PAD_INT8 ON) set(CONFIG_THEAD_RVV_PRELU_FP32 ON) set(CONFIG_THEAD_RVV_PRELU_FP16 ON) set(CONFIG_THEAD_RVV_PRELU_INT8 ON) +set(CONFIG_THEAD_RVV_REDUCE_SUM_INT8 ON) set(CONFIG_THEAD_RVV_RELU_FP32 ON) set(CONFIG_THEAD_RVV_RELU_FP16 ON) set(CONFIG_THEAD_RVV_RELU_INT8 ON) @@ -388,16 +403,17 @@ set(CONFIG_THEAD_RVV_RESHAPE_FP16 ON) set(CONFIG_THEAD_RVV_RESHAPE_INT8 ON) set(CONFIG_THEAD_RVV_SIGMOID_FP32 ON) set(CONFIG_THEAD_RVV_SIGMOID_FP16 ON) +set(CONFIG_THEAD_RVV_SIGMOID_INT8 ON) +set(CONFIG_THEAD_RVV_SUB_FP32 ON) +set(CONFIG_THEAD_RVV_SUB_FP16 ON) +set(CONFIG_THEAD_RVV_SUB_INT8 ON) set(CONFIG_THEAD_RVV_SOFTMAX_FP32 ON) set(CONFIG_THEAD_RVV_SOFTMAX_FP16 ON) -set(CONFIG_THEAD_RVV_REDUCE_SUM_INT8 ON) +set(CONFIG_THEAD_RVV_SOFTMAX_INT8 ON) +set(CONFIG_THEAD_RVV_STRIDED_SLICE_FP16 ON) set(CONFIG_THEAD_RVV_TRANSPOSE_FP32 ON) set(CONFIG_THEAD_RVV_TRANSPOSE_FP16 ON) set(CONFIG_THEAD_RVV_TRANSPOSE_INT8 ON) -set(CONFIG_GRAPH_REFERENCE_TVMGEN ON) -set(CONFIG_THEAD_RVV_GATHER_FP16 ON) -set(CONFIG_THEAD_RVV_GATHER_INT8 ON) -set(CONFIG_THEAD_RVV_STRIDED_SLICE_FP16 ON) set(CONFIG_C906_SOURCE ON) set(CONFIG_C906_ABS_FP32 ON) set(CONFIG_C906_ABS_FP16 ON) diff --git a/cmake/c908.cmake b/cmake/c908.cmake index 8e8d96b3..0fa5483a 100644 --- a/cmake/c908.cmake 
+++ b/cmake/c908.cmake @@ -340,12 +340,25 @@ set(CONFIG_THEAD_RVV_DEPTHWISE_CONVOLUTION_FP32 ON) set(CONFIG_THEAD_RVV_DEPTHWISE_CONVOLUTION_FP16 ON) set(CONFIG_THEAD_RVV_DEPTHWISE_CONVOLUTION_INT8 ON) set(CONFIG_THEAD_RVV_DEPTHWISE_CONVOLUTION_INT4 ON) +set(CONFIG_THEAD_RVV_CONVOLUTION1D_FP32 ON) +set(CONFIG_THEAD_RVV_CONVOLUTION1D_FP16 ON) set(CONFIG_THEAD_RVV_CONVOLUTION1D_INT8 ON) set(CONFIG_THEAD_RVV_DEPTHWISE_CONVOLUTION1D_INT8 ON) +set(CONFIG_THEAD_RVV_DECONVOLUTION_FP32 ON) +set(CONFIG_THEAD_RVV_DECONVOLUTION_FP16 ON) +set(CONFIG_THEAD_RVV_DIV_FP32 ON) +set(CONFIG_THEAD_RVV_DIV_FP16 ON) +set(CONFIG_THEAD_RVV_DIV_INT8 ON) +set(CONFIG_THEAD_RVV_ERF_FP32 ON) +set(CONFIG_THEAD_RVV_ERF_FP16 ON) +set(CONFIG_THEAD_RVV_ERF_INT8 ON) set(CONFIG_THEAD_RVV_FULLYCONNECTED_FP32 ON) set(CONFIG_THEAD_RVV_FULLYCONNECTED_FP16 ON) set(CONFIG_THEAD_RVV_FULLYCONNECTED_INT8 ON) set(CONFIG_THEAD_RVV_FULLYCONNECTED_INT4 ON) +set(CONFIG_THEAD_RVV_GATHER_FP32 ON) +set(CONFIG_THEAD_RVV_GATHER_FP16 ON) +set(CONFIG_THEAD_RVV_GATHER_INT8 ON) set(CONFIG_THEAD_RVV_GEMM_FP32 ON) set(CONFIG_THEAD_RVV_GEMM_FP16 ON) set(CONFIG_THEAD_RVV_GEMM_INT8 ON) @@ -377,6 +390,7 @@ set(CONFIG_THEAD_RVV_PAD_INT8 ON) set(CONFIG_THEAD_RVV_PRELU_FP32 ON) set(CONFIG_THEAD_RVV_PRELU_FP16 ON) set(CONFIG_THEAD_RVV_PRELU_INT8 ON) +set(CONFIG_THEAD_RVV_REDUCE_SUM_INT8 ON) set(CONFIG_THEAD_RVV_RELU_FP32 ON) set(CONFIG_THEAD_RVV_RELU_FP16 ON) set(CONFIG_THEAD_RVV_RELU_INT8 ON) @@ -388,15 +402,17 @@ set(CONFIG_THEAD_RVV_RESHAPE_FP16 ON) set(CONFIG_THEAD_RVV_RESHAPE_INT8 ON) set(CONFIG_THEAD_RVV_SIGMOID_FP32 ON) set(CONFIG_THEAD_RVV_SIGMOID_FP16 ON) +set(CONFIG_THEAD_RVV_SIGMOID_INT8 ON) +set(CONFIG_THEAD_RVV_SUB_FP32 ON) +set(CONFIG_THEAD_RVV_SUB_FP16 ON) +set(CONFIG_THEAD_RVV_SUB_INT8 ON) set(CONFIG_THEAD_RVV_SOFTMAX_FP32 ON) set(CONFIG_THEAD_RVV_SOFTMAX_FP16 ON) -set(CONFIG_THEAD_RVV_REDUCE_SUM_INT8 ON) +set(CONFIG_THEAD_RVV_SOFTMAX_INT8 ON) +set(CONFIG_THEAD_RVV_STRIDED_SLICE_FP16 ON) 
set(CONFIG_THEAD_RVV_TRANSPOSE_FP32 ON) set(CONFIG_THEAD_RVV_TRANSPOSE_FP16 ON) set(CONFIG_THEAD_RVV_TRANSPOSE_INT8 ON) -set(CONFIG_THEAD_RVV_GATHER_FP16 ON) -set(CONFIG_THEAD_RVV_GATHER_INT8 ON) -set(CONFIG_THEAD_RVV_STRIDED_SLICE_FP16 ON) set(CONFIG_C908_SOURCE ON) set(CONFIG_C908_AVERAGEPOOL_FP32 ON) set(CONFIG_C908_AVERAGEPOOL_FP16 ON) diff --git a/cmake/c920.cmake b/cmake/c920.cmake index f1e18d07..4be329bf 100644 --- a/cmake/c920.cmake +++ b/cmake/c920.cmake @@ -339,11 +339,24 @@ set(CONFIG_THEAD_RVV_CONVOLUTION_INT8 ON) set(CONFIG_THEAD_RVV_DEPTHWISE_CONVOLUTION_FP32 ON) set(CONFIG_THEAD_RVV_DEPTHWISE_CONVOLUTION_FP16 ON) set(CONFIG_THEAD_RVV_DEPTHWISE_CONVOLUTION_INT8 ON) +set(CONFIG_THEAD_RVV_CONVOLUTION1D_FP32 ON) +set(CONFIG_THEAD_RVV_CONVOLUTION1D_FP16 ON) set(CONFIG_THEAD_RVV_CONVOLUTION1D_INT8 ON) set(CONFIG_THEAD_RVV_DEPTHWISE_CONVOLUTION1D_INT8 ON) +set(CONFIG_THEAD_RVV_DECONVOLUTION_FP32 ON) +set(CONFIG_THEAD_RVV_DECONVOLUTION_FP16 ON) +set(CONFIG_THEAD_RVV_DIV_FP32 ON) +set(CONFIG_THEAD_RVV_DIV_FP16 ON) +set(CONFIG_THEAD_RVV_DIV_INT8 ON) +set(CONFIG_THEAD_RVV_ERF_FP32 ON) +set(CONFIG_THEAD_RVV_ERF_FP16 ON) +set(CONFIG_THEAD_RVV_ERF_INT8 ON) set(CONFIG_THEAD_RVV_FULLYCONNECTED_FP32 ON) set(CONFIG_THEAD_RVV_FULLYCONNECTED_FP16 ON) set(CONFIG_THEAD_RVV_FULLYCONNECTED_INT8 ON) +set(CONFIG_THEAD_RVV_GATHER_FP32 ON) +set(CONFIG_THEAD_RVV_GATHER_FP16 ON) +set(CONFIG_THEAD_RVV_GATHER_INT8 ON) set(CONFIG_THEAD_RVV_GEMM_FP32 ON) set(CONFIG_THEAD_RVV_GEMM_FP16 ON) set(CONFIG_THEAD_RVV_GEMM_INT8 ON) @@ -374,6 +387,7 @@ set(CONFIG_THEAD_RVV_PAD_INT8 ON) set(CONFIG_THEAD_RVV_PRELU_FP32 ON) set(CONFIG_THEAD_RVV_PRELU_FP16 ON) set(CONFIG_THEAD_RVV_PRELU_INT8 ON) +set(CONFIG_THEAD_RVV_REDUCE_SUM_INT8 ON) set(CONFIG_THEAD_RVV_RELU_FP32 ON) set(CONFIG_THEAD_RVV_RELU_FP16 ON) set(CONFIG_THEAD_RVV_RELU_INT8 ON) @@ -385,14 +399,24 @@ set(CONFIG_THEAD_RVV_RESHAPE_FP16 ON) set(CONFIG_THEAD_RVV_RESHAPE_INT8 ON) set(CONFIG_THEAD_RVV_SIGMOID_FP32 ON) 
set(CONFIG_THEAD_RVV_SIGMOID_FP16 ON) +set(CONFIG_THEAD_RVV_SIGMOID_INT8 ON) +set(CONFIG_THEAD_RVV_SUB_FP32 ON) +set(CONFIG_THEAD_RVV_SUB_FP16 ON) +set(CONFIG_THEAD_RVV_SUB_INT8 ON) set(CONFIG_THEAD_RVV_SOFTMAX_FP32 ON) set(CONFIG_THEAD_RVV_SOFTMAX_FP16 ON) -set(CONFIG_THEAD_RVV_REDUCE_SUM_INT8 ON) +set(CONFIG_THEAD_RVV_SOFTMAX_INT8 ON) +set(CONFIG_THEAD_RVV_STRIDED_SLICE_FP16 ON) set(CONFIG_THEAD_RVV_TRANSPOSE_FP32 ON) set(CONFIG_THEAD_RVV_TRANSPOSE_FP16 ON) set(CONFIG_THEAD_RVV_TRANSPOSE_INT8 ON) -set(CONFIG_THEAD_RVV_GATHER_FP16 ON) -set(CONFIG_THEAD_RVV_GATHER_INT8 ON) -set(CONFIG_THEAD_RVV_STRIDED_SLICE_FP16 ON) +set(CONFIG_C920_SOURCE ON) +set(CONFIG_C920_CONVOLUTION_FP32 ON) +set(CONFIG_C920_CONVOLUTION_FP16 ON) +set(CONFIG_C920_GEMM_FP32 ON) +set(CONFIG_C920_GEMM_FP16 ON) +set(CONFIG_C920_MATMUL_FP32 ON) +set(CONFIG_C920_MATMUL_FP16 ON) set(CONFIG_USE_SHL_DEBUG ON) -set(CONFIG_SHL_LAYER_BENCHMARK ON) \ No newline at end of file +set(CONFIG_SHL_LAYER_BENCHMARK ON) +set(CONFIG_USE_EXPORT_MODEL ON) \ No newline at end of file diff --git a/cmake/c920v2.cmake b/cmake/c920v2.cmake new file mode 100644 index 00000000..179e0ace --- /dev/null +++ b/cmake/c920v2.cmake @@ -0,0 +1,421 @@ +set(CONFIG_C_REFERENCE_SOURCE ON) +set(CONFIG_C_REFERENCE_ABS ON) +set(CONFIG_C_REFERENCE_ACOS ON) +set(CONFIG_C_REFERENCE_ACOSH ON) +set(CONFIG_C_REFERENCE_ADD ON) +set(CONFIG_C_REFERENCE_AND ON) +set(CONFIG_C_REFERENCE_ARANGE ON) +set(CONFIG_C_REFERENCE_ARGMAX ON) +set(CONFIG_C_REFERENCE_ARGMIN ON) +set(CONFIG_C_REFERENCE_ASIN ON) +set(CONFIG_C_REFERENCE_ASINH ON) +set(CONFIG_C_REFERENCE_ATAN ON) +set(CONFIG_C_REFERENCE_ATANH ON) +set(CONFIG_C_REFERENCE_AVERAGEPOOL ON) +set(CONFIG_C_REFERENCE_AVERAGEPOOL3D ON) +set(CONFIG_C_REFERENCE_BATCH_NORMALIZATION ON) +set(CONFIG_C_REFERENCE_BATCH_TO_SPACE ON) +set(CONFIG_C_REFERENCE_BROADCAST_TO ON) +set(CONFIG_C_REFERENCE_CACHE_CONV1D ON) +set(CONFIG_C_REFERENCE_CACHE_MATMUL ON) +set(CONFIG_C_REFERENCE_CAST ON) +set(CONFIG_C_REFERENCE_CEIL 
ON) +set(CONFIG_C_REFERENCE_CLIP ON) +set(CONFIG_C_REFERENCE_COL2IM ON) +set(CONFIG_C_REFERENCE_CONCAT ON) +set(CONFIG_C_REFERENCE_CONVOLUTION_CHANNEL ON) +set(CONFIG_C_REFERENCE_CONVOLUTION_RELU ON) +set(CONFIG_C_REFERENCE_CONVOLUTION_RELU6 ON) +set(CONFIG_C_REFERENCE_CONVOLUTION ON) +set(CONFIG_C_REFERENCE_CONVOLUTION1D ON) +set(CONFIG_C_REFERENCE_CONVOLUTION3D ON) +set(CONFIG_C_REFERENCE_COS ON) +set(CONFIG_C_REFERENCE_COSH ON) +set(CONFIG_C_REFERENCE_CUMPROD ON) +set(CONFIG_C_REFERENCE_CUMSUM ON) +set(CONFIG_C_REFERENCE_DATA_CONVERT ON) +set(CONFIG_C_REFERENCE_DECONVOLUTION ON) +set(CONFIG_C_REFERENCE_DECONVOLUTION3D ON) +set(CONFIG_C_REFERENCE_DEPTH_TO_SPACE ON) +set(CONFIG_C_REFERENCE_DIV ON) +set(CONFIG_C_REFERENCE_ELU ON) +set(CONFIG_C_REFERENCE_EQUAL ON) +set(CONFIG_C_REFERENCE_ERF ON) +set(CONFIG_C_REFERENCE_EXP ON) +set(CONFIG_C_REFERENCE_EXPAND_DIMS ON) +set(CONFIG_C_REFERENCE_EXPM1 ON) +set(CONFIG_C_REFERENCE_FLATTEN ON) +set(CONFIG_C_REFERENCE_FLOOR_DIVIDE ON) +set(CONFIG_C_REFERENCE_FLOOR_MOD ON) +set(CONFIG_C_REFERENCE_FLOOR ON) +set(CONFIG_C_REFERENCE_FSMN ON) +set(CONFIG_C_REFERENCE_FULLYCONNECTED ON) +set(CONFIG_C_REFERENCE_GATHER_ND ON) +set(CONFIG_C_REFERENCE_GATHER ON) +set(CONFIG_C_REFERENCE_GLOBAL_AVERAGEPOOL ON) +set(CONFIG_C_REFERENCE_GLOBAL_MAXPOOL ON) +set(CONFIG_C_REFERENCE_GREATER_EQUAL ON) +set(CONFIG_C_REFERENCE_GREATER ON) +set(CONFIG_C_REFERENCE_HARD_SIGMOID ON) +set(CONFIG_C_REFERENCE_IM2COL ON) +set(CONFIG_C_REFERENCE_ISNAN ON) +set(CONFIG_C_REFERENCE_L2_NORMALIZATION ON) +set(CONFIG_C_REFERENCE_L2POOL ON) +set(CONFIG_C_REFERENCE_LAYER_NORM ON) +set(CONFIG_C_REFERENCE_LEAKY_RELU ON) +set(CONFIG_C_REFERENCE_LESS_EQUAL ON) +set(CONFIG_C_REFERENCE_LESS ON) +set(CONFIG_C_REFERENCE_LOG_SOFTMAX ON) +set(CONFIG_C_REFERENCE_LOG ON) +set(CONFIG_C_REFERENCE_LOG1P ON) +set(CONFIG_C_REFERENCE_LOGICAL_AND ON) +set(CONFIG_C_REFERENCE_LOGICAL_NOT ON) +set(CONFIG_C_REFERENCE_LOGICAL_OR ON) +set(CONFIG_C_REFERENCE_LOGICAL_XOR ON) 
+set(CONFIG_C_REFERENCE_LRN ON) +set(CONFIG_C_REFERENCE_MATMUL ON) +set(CONFIG_C_REFERENCE_MAX ON) +set(CONFIG_C_REFERENCE_MAXIMUM ON) +set(CONFIG_C_REFERENCE_MAXPOOL ON) +set(CONFIG_C_REFERENCE_MAXPOOL2D_LOCAT ON) +set(CONFIG_C_REFERENCE_MAXPOOL3D ON) +set(CONFIG_C_REFERENCE_MEAN ON) +set(CONFIG_C_REFERENCE_MIN ON) +set(CONFIG_C_REFERENCE_MINIMUM ON) +set(CONFIG_C_REFERENCE_MOD ON) +set(CONFIG_C_REFERENCE_MUL ON) +set(CONFIG_C_REFERENCE_NDARRAY_SIZE ON) +set(CONFIG_C_REFERENCE_NEGATIVE ON) +set(CONFIG_C_REFERENCE_NON_MAX_SUPPRESSION ON) +set(CONFIG_C_REFERENCE_NOT_EQUAL ON) +set(CONFIG_C_REFERENCE_NOT ON) +set(CONFIG_C_REFERENCE_OR ON) +set(CONFIG_C_REFERENCE_PAD ON) +set(CONFIG_C_REFERENCE_POWER ON) +set(CONFIG_C_REFERENCE_PRELU ON) +set(CONFIG_C_REFERENCE_PROD ON) +set(CONFIG_C_REFERENCE_PROPOSAL ON) +set(CONFIG_C_REFERENCE_PSROIPOOLING ON) +set(CONFIG_C_REFERENCE_REDUCE_LOGSUMEXP ON) +set(CONFIG_C_REFERENCE_REDUCE_MAX ON) +set(CONFIG_C_REFERENCE_REDUCE_MEAN ON) +set(CONFIG_C_REFERENCE_REDUCE_MIN ON) +set(CONFIG_C_REFERENCE_REDUCE_PROD ON) +set(CONFIG_C_REFERENCE_REDUCE_SUM ON) +set(CONFIG_C_REFERENCE_RELU ON) +set(CONFIG_C_REFERENCE_RELU1 ON) +set(CONFIG_C_REFERENCE_RELU6 ON) +set(CONFIG_C_REFERENCE_RELUN ON) +set(CONFIG_C_REFERENCE_RESHAPE ON) +set(CONFIG_C_REFERENCE_RESIZE ON) +set(CONFIG_C_REFERENCE_REVERSE ON) +set(CONFIG_C_REFERENCE_ROIALIGN ON) +set(CONFIG_C_REFERENCE_ROIPOOL ON) +set(CONFIG_C_REFERENCE_ROUND ON) +set(CONFIG_C_REFERENCE_RSQRT ON) +set(CONFIG_C_REFERENCE_SCATTER ON) +set(CONFIG_C_REFERENCE_SEGMENT_MAX ON) +set(CONFIG_C_REFERENCE_SEGMENT_MEAN ON) +set(CONFIG_C_REFERENCE_SEGMENT_MIN ON) +set(CONFIG_C_REFERENCE_SEGMENT_PROD ON) +set(CONFIG_C_REFERENCE_SEGMENT_SUM ON) +set(CONFIG_C_REFERENCE_SELECT ON) +set(CONFIG_C_REFERENCE_SHAPE ON) +set(CONFIG_C_REFERENCE_SHUFFLE_CHANNEL ON) +set(CONFIG_C_REFERENCE_SIGMOID ON) +set(CONFIG_C_REFERENCE_SIGN ON) +set(CONFIG_C_REFERENCE_SIN ON) +set(CONFIG_C_REFERENCE_SINH ON) +set(CONFIG_C_REFERENCE_SLICE ON) 
+set(CONFIG_C_REFERENCE_SOFTMAX ON) +set(CONFIG_C_REFERENCE_SOFTPLUS ON) +set(CONFIG_C_REFERENCE_SOFTRELU ON) +set(CONFIG_C_REFERENCE_SOFTSIGN ON) +set(CONFIG_C_REFERENCE_SPACE_TO_BATCH ON) +set(CONFIG_C_REFERENCE_SPACE_TO_DEPTH ON) +set(CONFIG_C_REFERENCE_SPLIT ON) +set(CONFIG_C_REFERENCE_SQRT ON) +set(CONFIG_C_REFERENCE_SQUARE ON) +set(CONFIG_C_REFERENCE_SQUEEZE ON) +set(CONFIG_C_REFERENCE_STACK ON) +set(CONFIG_C_REFERENCE_STRIDED_SLICE ON) +set(CONFIG_C_REFERENCE_SUB ON) +set(CONFIG_C_REFERENCE_SUM ON) +set(CONFIG_C_REFERENCE_TAN ON) +set(CONFIG_C_REFERENCE_TANH ON) +set(CONFIG_C_REFERENCE_THRESHOLD_RELU ON) +set(CONFIG_C_REFERENCE_TILE ON) +set(CONFIG_C_REFERENCE_TOPK ON) +set(CONFIG_C_REFERENCE_TRANSPOSE ON) +set(CONFIG_C_REFERENCE_TRUNC ON) +set(CONFIG_C_REFERENCE_UNPOOLING ON) +set(CONFIG_C_REFERENCE_UNSTACK ON) +set(CONFIG_C_REFERENCE_XOR ON) +set(CONFIG_C_REFERENCE_YUV_RGB_SCALE ON) +set(CONFIG_C_REFERENCE_ONE_HOT ON) +set(CONFIG_C_REFERENCE_WHERE ON) +set(CONFIG_C_REFERENCE_WHERE_SOFTMAX ON) +set(CONFIG_C_REFERENCE_INSTANCE_NORM ON) +set(CONFIG_GRAPH_REFERENCE_SOURCE ON) +set(CONFIG_GRAPH_REFERENCE_ABS ON) +set(CONFIG_GRAPH_REFERENCE_ACOS ON) +set(CONFIG_GRAPH_REFERENCE_ACOSH ON) +set(CONFIG_GRAPH_REFERENCE_ADD ON) +set(CONFIG_GRAPH_REFERENCE_ALL ON) +set(CONFIG_GRAPH_REFERENCE_AND ON) +set(CONFIG_GRAPH_REFERENCE_ANY ON) +set(CONFIG_GRAPH_REFERENCE_ARANGE ON) +set(CONFIG_GRAPH_REFERENCE_ARGMAX ON) +set(CONFIG_GRAPH_REFERENCE_ARGMIN ON) +set(CONFIG_GRAPH_REFERENCE_ASIN ON) +set(CONFIG_GRAPH_REFERENCE_ASINH ON) +set(CONFIG_GRAPH_REFERENCE_ATAN ON) +set(CONFIG_GRAPH_REFERENCE_ATANH ON) +set(CONFIG_GRAPH_REFERENCE_AVERAGEPOOL ON) +set(CONFIG_GRAPH_REFERENCE_AVERAGEPOOL3D ON) +set(CONFIG_GRAPH_REFERENCE_BATCH_NORMALIZATION ON) +set(CONFIG_GRAPH_REFERENCE_BATCH_TO_SPACE ON) +set(CONFIG_GRAPH_REFERENCE_BATCH_TO_SPACE_ND ON) +set(CONFIG_GRAPH_REFERENCE_BROADCAST_TO ON) +set(CONFIG_GRAPH_REFERENCE_CACHE_CONV1D ON) +set(CONFIG_GRAPH_REFERENCE_CACHE_MATMUL ON) 
+set(CONFIG_GRAPH_REFERENCE_CAST ON) +set(CONFIG_GRAPH_REFERENCE_CEIL ON) +set(CONFIG_GRAPH_REFERENCE_CLIP ON) +set(CONFIG_GRAPH_REFERENCE_COL2IM ON) +set(CONFIG_GRAPH_REFERENCE_CONCAT ON) +set(CONFIG_GRAPH_REFERENCE_CONVOLUTION_CHANNEL ON) +set(CONFIG_GRAPH_REFERENCE_CONVOLUTION_RELU ON) +set(CONFIG_GRAPH_REFERENCE_CONVOLUTION_RELU6 ON) +set(CONFIG_GRAPH_REFERENCE_CONVOLUTION ON) +set(CONFIG_GRAPH_REFERENCE_CONVOLUTION1D ON) +set(CONFIG_GRAPH_REFERENCE_CONVOLUTION3D ON) +set(CONFIG_GRAPH_REFERENCE_COS ON) +set(CONFIG_GRAPH_REFERENCE_COSH ON) +set(CONFIG_GRAPH_REFERENCE_CUMPROD ON) +set(CONFIG_GRAPH_REFERENCE_CUMSUM ON) +set(CONFIG_GRAPH_REFERENCE_DATA_CONVERT ON) +set(CONFIG_GRAPH_REFERENCE_DECONVOLUTION ON) +set(CONFIG_GRAPH_REFERENCE_DECONVOLUTION3D ON) +set(CONFIG_GRAPH_REFERENCE_DEPTH_TO_SPACE ON) +set(CONFIG_GRAPH_REFERENCE_DIV ON) +set(CONFIG_GRAPH_REFERENCE_ELU ON) +set(CONFIG_GRAPH_REFERENCE_EQUAL ON) +set(CONFIG_GRAPH_REFERENCE_ERF ON) +set(CONFIG_GRAPH_REFERENCE_EXP ON) +set(CONFIG_GRAPH_REFERENCE_EXPAND_DIMS ON) +set(CONFIG_GRAPH_REFERENCE_EXPM1 ON) +set(CONFIG_GRAPH_REFERENCE_FLATTEN ON) +set(CONFIG_GRAPH_REFERENCE_FLOOR_DIVIDE ON) +set(CONFIG_GRAPH_REFERENCE_FLOOR_MOD ON) +set(CONFIG_GRAPH_REFERENCE_FLOOR ON) +set(CONFIG_GRAPH_REFERENCE_FSMN ON) +set(CONFIG_GRAPH_REFERENCE_FULLYCONNECTED ON) +set(CONFIG_GRAPH_REFERENCE_GATHER_ND ON) +set(CONFIG_GRAPH_REFERENCE_GATHER ON) +set(CONFIG_GRAPH_REFERENCE_GLOBAL_AVERAGEPOOL ON) +set(CONFIG_GRAPH_REFERENCE_GLOBAL_MAXPOOL ON) +set(CONFIG_GRAPH_REFERENCE_GREATER_EQUAL ON) +set(CONFIG_GRAPH_REFERENCE_GREATER ON) +set(CONFIG_GRAPH_REFERENCE_HARD_SIGMOID ON) +set(CONFIG_GRAPH_REFERENCE_IM2COL ON) +set(CONFIG_GRAPH_REFERENCE_ISNAN ON) +set(CONFIG_GRAPH_REFERENCE_L2_NORMALIZATION ON) +set(CONFIG_GRAPH_REFERENCE_L2POOL ON) +set(CONFIG_GRAPH_REFERENCE_LAYER_NORMAL ON) +set(CONFIG_GRAPH_REFERENCE_LEAKY_RELU ON) +set(CONFIG_GRAPH_REFERENCE_LESS_EQUAL ON) +set(CONFIG_GRAPH_REFERENCE_LESS ON) 
+set(CONFIG_GRAPH_REFERENCE_LOG_SOFTMAX ON) +set(CONFIG_GRAPH_REFERENCE_LOG ON) +set(CONFIG_GRAPH_REFERENCE_LOG1P ON) +set(CONFIG_GRAPH_REFERENCE_LOGICAL_AND ON) +set(CONFIG_GRAPH_REFERENCE_LOGICAL_NOT ON) +set(CONFIG_GRAPH_REFERENCE_LOGICAL_OR ON) +set(CONFIG_GRAPH_REFERENCE_LOGICAL_XOR ON) +set(CONFIG_GRAPH_REFERENCE_LRN ON) +set(CONFIG_GRAPH_REFERENCE_MATMUL ON) +set(CONFIG_GRAPH_REFERENCE_MAX ON) +set(CONFIG_GRAPH_REFERENCE_MAXIMUM ON) +set(CONFIG_GRAPH_REFERENCE_MAXPOOL ON) +set(CONFIG_GRAPH_REFERENCE_MAXPOOL2D_LOCAT ON) +set(CONFIG_GRAPH_REFERENCE_MAXPOOL3D ON) +set(CONFIG_GRAPH_REFERENCE_MEAN ON) +set(CONFIG_GRAPH_REFERENCE_MIN ON) +set(CONFIG_GRAPH_REFERENCE_MINIMUM ON) +set(CONFIG_GRAPH_REFERENCE_MOD ON) +set(CONFIG_GRAPH_REFERENCE_MUL ON) +set(CONFIG_GRAPH_REFERENCE_NDARRAY_SIZE ON) +set(CONFIG_GRAPH_REFERENCE_NEGATIVE ON) +set(CONFIG_GRAPH_REFERENCE_NON_MAX_SUPPRESSION ON) +set(CONFIG_GRAPH_REFERENCE_NOT_EQUAL ON) +set(CONFIG_GRAPH_REFERENCE_NOT ON) +set(CONFIG_GRAPH_REFERENCE_OR ON) +set(CONFIG_GRAPH_REFERENCE_PAD ON) +set(CONFIG_GRAPH_REFERENCE_POWER ON) +set(CONFIG_GRAPH_REFERENCE_PRELU ON) +set(CONFIG_GRAPH_REFERENCE_PROD ON) +set(CONFIG_GRAPH_REFERENCE_PROPOSAL ON) +set(CONFIG_GRAPH_REFERENCE_PSROIPOOLING ON) +set(CONFIG_GRAPH_REFERENCE_REDUCE_LOGSUMEXP ON) +set(CONFIG_GRAPH_REFERENCE_REDUCE_MAX ON) +set(CONFIG_GRAPH_REFERENCE_REDUCE_MEAN ON) +set(CONFIG_GRAPH_REFERENCE_REDUCE_MIN ON) +set(CONFIG_GRAPH_REFERENCE_REDUCE_PROD ON) +set(CONFIG_GRAPH_REFERENCE_REDUCE_SUM ON) +set(CONFIG_GRAPH_REFERENCE_RELU ON) +set(CONFIG_GRAPH_REFERENCE_RELU1 ON) +set(CONFIG_GRAPH_REFERENCE_RELU6 ON) +set(CONFIG_GRAPH_REFERENCE_RELUN ON) +set(CONFIG_GRAPH_REFERENCE_RESHAPE ON) +set(CONFIG_GRAPH_REFERENCE_RESIZE ON) +set(CONFIG_GRAPH_REFERENCE_REVERSE ON) +set(CONFIG_GRAPH_REFERENCE_ROIALIGN ON) +set(CONFIG_GRAPH_REFERENCE_ROIPOOL ON) +set(CONFIG_GRAPH_REFERENCE_ROUND ON) +set(CONFIG_GRAPH_REFERENCE_RSQRT ON) +set(CONFIG_GRAPH_REFERENCE_SCATTER ON) 
+set(CONFIG_GRAPH_REFERENCE_SEGMENT_MAX ON) +set(CONFIG_GRAPH_REFERENCE_SEGMENT_MEAN ON) +set(CONFIG_GRAPH_REFERENCE_SEGMENT_MIN ON) +set(CONFIG_GRAPH_REFERENCE_SEGMENT_PROD ON) +set(CONFIG_GRAPH_REFERENCE_SEGMENT_SUM ON) +set(CONFIG_GRAPH_REFERENCE_SELECT ON) +set(CONFIG_GRAPH_REFERENCE_SEQUENCE_MASK ON) +set(CONFIG_GRAPH_REFERENCE_SHAPE ON) +set(CONFIG_GRAPH_REFERENCE_SHUFFLE_CHANNEL ON) +set(CONFIG_GRAPH_REFERENCE_SIGMOID ON) +set(CONFIG_GRAPH_REFERENCE_SIGN ON) +set(CONFIG_GRAPH_REFERENCE_SIN ON) +set(CONFIG_GRAPH_REFERENCE_SINH ON) +set(CONFIG_GRAPH_REFERENCE_SLICE ON) +set(CONFIG_GRAPH_REFERENCE_SOFTMAX ON) +set(CONFIG_GRAPH_REFERENCE_SOFTPLUS ON) +set(CONFIG_GRAPH_REFERENCE_SOFTRELU ON) +set(CONFIG_GRAPH_REFERENCE_SOFTSIGN ON) +set(CONFIG_GRAPH_REFERENCE_SPACE_TO_BATCH ON) +set(CONFIG_GRAPH_REFERENCE_SPACE_TO_BATCH_ND ON) +set(CONFIG_GRAPH_REFERENCE_SPACE_TO_DEPTH ON) +set(CONFIG_GRAPH_REFERENCE_SPLIT ON) +set(CONFIG_GRAPH_REFERENCE_SQRT ON) +set(CONFIG_GRAPH_REFERENCE_SQUARE ON) +set(CONFIG_GRAPH_REFERENCE_SQUEEZE ON) +set(CONFIG_GRAPH_REFERENCE_STACK ON) +set(CONFIG_GRAPH_REFERENCE_STRIDED_SLICE ON) +set(CONFIG_GRAPH_REFERENCE_SUB ON) +set(CONFIG_GRAPH_REFERENCE_SUM ON) +set(CONFIG_GRAPH_REFERENCE_TAN ON) +set(CONFIG_GRAPH_REFERENCE_TANH ON) +set(CONFIG_GRAPH_REFERENCE_THRESHOLD_RELU ON) +set(CONFIG_GRAPH_REFERENCE_TILE ON) +set(CONFIG_GRAPH_REFERENCE_TOPK ON) +set(CONFIG_GRAPH_REFERENCE_TRANSPOSE ON) +set(CONFIG_GRAPH_REFERENCE_TRUNC ON) +set(CONFIG_GRAPH_REFERENCE_UNPOOLING ON) +set(CONFIG_GRAPH_REFERENCE_UNSTACK ON) +set(CONFIG_GRAPH_REFERENCE_WHERE ON) +set(CONFIG_GRAPH_REFERENCE_WHERE_SOFTMAX ON) +set(CONFIG_GRAPH_REFERENCE_XOR ON) +set(CONFIG_GRAPH_REFERENCE_YUV_RGB_SCALE ON) +set(CONFIG_GRAPH_REFERENCE_ONE_HOT ON) +set(CONFIG_GRAPH_REFERENCE_INSTANCE_NORM ON) +set(CONFIG_GRAPH_REFERENCE_TVMGEN ON) +set(CONFIG_THEAD_RVV_SOURCE ON) +set(CONFIG_THEAD_RVV_ADD_FP32 ON) +set(CONFIG_THEAD_RVV_ADD_FP16 ON) +set(CONFIG_THEAD_RVV_ADD_INT8 ON) 
+set(CONFIG_THEAD_RVV_AVERAGEPOOL_FP32 ON) +set(CONFIG_THEAD_RVV_AVERAGEPOOL_FP16 ON) +set(CONFIG_THEAD_RVV_AVERAGEPOOL_INT8 ON) +set(CONFIG_THEAD_RVV_CLIP_FP32 ON) +set(CONFIG_THEAD_RVV_CLIP_FP16 ON) +set(CONFIG_THEAD_RVV_CLIP_INT8 ON) +set(CONFIG_THEAD_RVV_CONCAT_FP32 ON) +set(CONFIG_THEAD_RVV_CONCAT_FP16 ON) +set(CONFIG_THEAD_RVV_CONCAT_INT8 ON) +set(CONFIG_THEAD_RVV_CONVOLUTION_FP32 ON) +set(CONFIG_THEAD_RVV_CONVOLUTION_FP16 ON) +set(CONFIG_THEAD_RVV_CONVOLUTION_INT8 ON) +set(CONFIG_THEAD_RVV_DEPTHWISE_CONVOLUTION_FP32 ON) +set(CONFIG_THEAD_RVV_DEPTHWISE_CONVOLUTION_FP16 ON) +set(CONFIG_THEAD_RVV_DEPTHWISE_CONVOLUTION_INT8 ON) +set(CONFIG_THEAD_RVV_CONVOLUTION1D_FP32 ON) +set(CONFIG_THEAD_RVV_CONVOLUTION1D_FP16 ON) +set(CONFIG_THEAD_RVV_CONVOLUTION1D_INT8 ON) +set(CONFIG_THEAD_RVV_DEPTHWISE_CONVOLUTION1D_INT8 ON) +set(CONFIG_THEAD_RVV_DECONVOLUTION_FP32 ON) +set(CONFIG_THEAD_RVV_DECONVOLUTION_FP16 ON) +set(CONFIG_THEAD_RVV_DIV_FP32 ON) +set(CONFIG_THEAD_RVV_DIV_FP16 ON) +set(CONFIG_THEAD_RVV_DIV_INT8 ON) +set(CONFIG_THEAD_RVV_ERF_FP32 ON) +set(CONFIG_THEAD_RVV_ERF_FP16 ON) +set(CONFIG_THEAD_RVV_ERF_INT8 ON) +set(CONFIG_THEAD_RVV_FULLYCONNECTED_FP32 ON) +set(CONFIG_THEAD_RVV_FULLYCONNECTED_FP16 ON) +set(CONFIG_THEAD_RVV_FULLYCONNECTED_INT8 ON) +set(CONFIG_THEAD_RVV_GATHER_FP32 ON) +set(CONFIG_THEAD_RVV_GATHER_FP16 ON) +set(CONFIG_THEAD_RVV_GATHER_INT8 ON) +set(CONFIG_THEAD_RVV_GEMM_FP32 ON) +set(CONFIG_THEAD_RVV_GEMM_FP16 ON) +set(CONFIG_THEAD_RVV_GEMM_INT8 ON) +set(CONFIG_THEAD_RVV_GLOBAL_AVERAGEPOOL_FP32 ON) +set(CONFIG_THEAD_RVV_GLOBAL_AVERAGEPOOL_FP16 ON) +set(CONFIG_THEAD_RVV_GLOBAL_AVERAGEPOOL_INT8 ON) +set(CONFIG_THEAD_RVV_GLOBAL_MAXPOOL_FP32 ON) +set(CONFIG_THEAD_RVV_GLOBAL_MAXPOOL_FP16 ON) +set(CONFIG_THEAD_RVV_GLOBAL_MAXPOOL_INT8 ON) +set(CONFIG_THEAD_RVV_LAYER_NORM_FP32 ON) +set(CONFIG_THEAD_RVV_LAYER_NORM_FP16 ON) +set(CONFIG_THEAD_RVV_LAYER_NORM_INT8 ON) +set(CONFIG_THEAD_RVV_LEAKY_RELU_FP32 ON) +set(CONFIG_THEAD_RVV_LEAKY_RELU_FP16 ON) 
+set(CONFIG_THEAD_RVV_LEAKY_RELU_INT8 ON) +set(CONFIG_THEAD_RVV_MATMUL_FP32 ON) +set(CONFIG_THEAD_RVV_MATMUL_FP16 ON) +set(CONFIG_THEAD_RVV_MATMUL_INT8 ON) +set(CONFIG_THEAD_RVV_MAXPOOL_FP32 ON) +set(CONFIG_THEAD_RVV_MAXPOOL_FP16 ON) +set(CONFIG_THEAD_RVV_MAXPOOL_INT8 ON) +set(CONFIG_THEAD_RVV_MUL_FP32 ON) +set(CONFIG_THEAD_RVV_MUL_FP16 ON) +set(CONFIG_THEAD_RVV_MUL_INT8 ON) +set(CONFIG_THEAD_RVV_PAD_FP32 ON) +set(CONFIG_THEAD_RVV_PAD_FP16 ON) +set(CONFIG_THEAD_RVV_PAD_INT8 ON) +set(CONFIG_THEAD_RVV_PRELU_FP32 ON) +set(CONFIG_THEAD_RVV_PRELU_FP16 ON) +set(CONFIG_THEAD_RVV_PRELU_INT8 ON) +set(CONFIG_THEAD_RVV_REDUCE_SUM_INT8 ON) +set(CONFIG_THEAD_RVV_RELU_FP32 ON) +set(CONFIG_THEAD_RVV_RELU_FP16 ON) +set(CONFIG_THEAD_RVV_RELU_INT8 ON) +set(CONFIG_THEAD_RVV_RELU6_FP32 ON) +set(CONFIG_THEAD_RVV_RELU6_FP16 ON) +set(CONFIG_THEAD_RVV_RELU6_INT8 ON) +set(CONFIG_THEAD_RVV_RESHAPE_FP32 ON) +set(CONFIG_THEAD_RVV_RESHAPE_FP16 ON) +set(CONFIG_THEAD_RVV_RESHAPE_INT8 ON) +set(CONFIG_THEAD_RVV_SIGMOID_FP32 ON) +set(CONFIG_THEAD_RVV_SIGMOID_FP16 ON) +set(CONFIG_THEAD_RVV_SIGMOID_INT8 ON) +set(CONFIG_THEAD_RVV_SUB_FP32 ON) +set(CONFIG_THEAD_RVV_SUB_FP16 ON) +set(CONFIG_THEAD_RVV_SUB_INT8 ON) +set(CONFIG_THEAD_RVV_SOFTMAX_FP32 ON) +set(CONFIG_THEAD_RVV_SOFTMAX_FP16 ON) +set(CONFIG_THEAD_RVV_SOFTMAX_INT8 ON) +set(CONFIG_THEAD_RVV_STRIDED_SLICE_FP16 ON) +set(CONFIG_THEAD_RVV_TRANSPOSE_FP32 ON) +set(CONFIG_THEAD_RVV_TRANSPOSE_FP16 ON) +set(CONFIG_THEAD_RVV_TRANSPOSE_INT8 ON) +set(CONFIG_C920V2_SOURCE ON) +set(CONFIG_C920V2_CONVOLUTION_FP32 ON) +set(CONFIG_C920V2_CONVOLUTION_FP16 ON) +set(CONFIG_C920V2_CONVOLUTION_INT8 ON) +set(CONFIG_C920V2_GEMM_FP32 ON) +set(CONFIG_C920V2_GEMM_FP16 ON) +set(CONFIG_C920V2_GEMM_INT8 ON) +set(CONFIG_USE_SHL_DEBUG ON) +set(CONFIG_SHL_LAYER_BENCHMARK ON) \ No newline at end of file diff --git a/cmake/rules.cmake b/cmake/rules.cmake index 69196a86..ce16eec5 100644 --- a/cmake/rules.cmake +++ b/cmake/rules.cmake @@ -3,8 +3,8 @@ if (NOT 
CONFIG_USE_COMPILER_PATH) # riscv linux compiler if (CONFIG_BUILD_RISCV_RVV OR CONFIG_BUILD_RISCV_C906 OR CONFIG_BUILD_RISCV_RVM OR CONFIG_BUILD_RISCV_C908 OR - CONFIG_BUILD_RISCV_C920 OR CONFIG_BUILD_RISCV_PNNA OR - CONFIG_BUILD_TH1520) + CONFIG_BUILD_RISCV_C920 OR CONFIG_BUILD_RISCV_C920V2 OR + CONFIG_BUILD_RISCV_PNNA OR CONFIG_BUILD_TH1520) set(CMAKE_C_COMPILER riscv64-unknown-linux-gnu-gcc) set(CMAKE_CXX_COMPILER riscv64-unknown-linux-gnu-g++) set(CMAKE_ASM_COMPILER riscv64-unknown-linux-gnu-gcc) @@ -14,12 +14,7 @@ endif() if (CONFIG_BUILD_RISCV_ELF_C906 OR CONFIG_BUILD_RISCV_ELF_E907) set(CMAKE_ASM_COMPILER riscv64-unknown-elf-gcc) set(CMAKE_C_COMPILER riscv64-unknown-elf-gcc) -endif() - -# csky linux compiler -if (CONFIG_BUILD_CSKY_OPENVX) - set(CMAKE_C_COMPILER csky-abiv2-linux-gcc) - set(CMAKE_ASM_COMPILER csky-abiv2-linux-gcc) + set(CMAKE_CXX_COMPILER riscv64-unknown-elf-gcc) endif() endif() @@ -29,6 +24,11 @@ if(CONFIG_USE_SHL_DEBUG) add_definitions(-D SHL_DEBUG) endif() +# SHL export model +if(CONFIG_USE_EXPORT_MODEL) + add_definitions(-D SHL_EXPORT_MODEL) +endif() + # reduce elf size if (CONFIG_BUILD_ANDROID_TH1520) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DBUILD_ANDROID") @@ -39,17 +39,19 @@ endif() # set warning as error set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror") -file(GLOB_RECURSE NN2_SRCS source/nn2/*.c source/utils/*.c) +file(GLOB_RECURSE NN2_SRCS source/nn2/*.c source/utils/*.c source/utils/*.cpp) include(source/reference/CMakeLists.txt) include(source/graph_ref/CMakeLists.txt) include(source/thead_rvv/CMakeLists.txt) +file(GLOB_RECURSE PNNA_SRCS source/pnna/*.c source/pnna/*.cpp) file(GLOB_RECURSE THEAD_MATRIX_SRCS source/thead_matrix/*.c source/thead_matrix/*.S) -file(GLOB_RECURSE C920_SRCS source/c920_opt/*.c source/c920_opt/*.S) +include(source/c920_opt/CMakeLists.txt) include(source/c906_opt/CMakeLists.txt) include(source/c908_opt/CMakeLists.txt) include(source/e907_opt/CMakeLists.txt) +include(source/c920v2_opt/CMakeLists.txt) 
-include_directories(include) +include_directories(include include/csinn include/graph include/backend) if(CONFIG_SHL_LAYER_BENCHMARK) add_definitions(-DSHL_LAYER_BENCHMARK) diff --git a/cmake/rvm.cmake b/cmake/rvm.cmake index 6adf0a4d..a7ccba45 100644 --- a/cmake/rvm.cmake +++ b/cmake/rvm.cmake @@ -340,12 +340,25 @@ set(CONFIG_THEAD_RVV_DEPTHWISE_CONVOLUTION_FP32 ON) set(CONFIG_THEAD_RVV_DEPTHWISE_CONVOLUTION_FP16 ON) set(CONFIG_THEAD_RVV_DEPTHWISE_CONVOLUTION_INT8 ON) set(CONFIG_THEAD_RVV_DEPTHWISE_CONVOLUTION_INT4 ON) +set(CONFIG_THEAD_RVV_CONVOLUTION1D_FP32 ON) +set(CONFIG_THEAD_RVV_CONVOLUTION1D_FP16 ON) set(CONFIG_THEAD_RVV_CONVOLUTION1D_INT8 ON) set(CONFIG_THEAD_RVV_DEPTHWISE_CONVOLUTION1D_INT8 ON) +set(CONFIG_THEAD_RVV_DECONVOLUTION_FP32 ON) +set(CONFIG_THEAD_RVV_DECONVOLUTION_FP16 ON) +set(CONFIG_THEAD_RVV_DIV_FP32 ON) +set(CONFIG_THEAD_RVV_DIV_FP16 ON) +set(CONFIG_THEAD_RVV_DIV_INT8 ON) +set(CONFIG_THEAD_RVV_ERF_FP32 ON) +set(CONFIG_THEAD_RVV_ERF_FP16 ON) +set(CONFIG_THEAD_RVV_ERF_INT8 ON) set(CONFIG_THEAD_RVV_FULLYCONNECTED_FP32 ON) set(CONFIG_THEAD_RVV_FULLYCONNECTED_FP16 ON) set(CONFIG_THEAD_RVV_FULLYCONNECTED_INT8 ON) set(CONFIG_THEAD_RVV_FULLYCONNECTED_INT4 ON) +set(CONFIG_THEAD_RVV_GATHER_FP32 ON) +set(CONFIG_THEAD_RVV_GATHER_FP16 ON) +set(CONFIG_THEAD_RVV_GATHER_INT8 ON) set(CONFIG_THEAD_RVV_GEMM_FP32 ON) set(CONFIG_THEAD_RVV_GEMM_FP16 ON) set(CONFIG_THEAD_RVV_GEMM_INT8 ON) @@ -377,6 +390,7 @@ set(CONFIG_THEAD_RVV_PAD_INT8 ON) set(CONFIG_THEAD_RVV_PRELU_FP32 ON) set(CONFIG_THEAD_RVV_PRELU_FP16 ON) set(CONFIG_THEAD_RVV_PRELU_INT8 ON) +set(CONFIG_THEAD_RVV_REDUCE_SUM_INT8 ON) set(CONFIG_THEAD_RVV_RELU_FP32 ON) set(CONFIG_THEAD_RVV_RELU_FP16 ON) set(CONFIG_THEAD_RVV_RELU_INT8 ON) @@ -388,13 +402,15 @@ set(CONFIG_THEAD_RVV_RESHAPE_FP16 ON) set(CONFIG_THEAD_RVV_RESHAPE_INT8 ON) set(CONFIG_THEAD_RVV_SIGMOID_FP32 ON) set(CONFIG_THEAD_RVV_SIGMOID_FP16 ON) +set(CONFIG_THEAD_RVV_SIGMOID_INT8 ON) +set(CONFIG_THEAD_RVV_SUB_FP32 ON) 
+set(CONFIG_THEAD_RVV_SUB_FP16 ON) +set(CONFIG_THEAD_RVV_SUB_INT8 ON) set(CONFIG_THEAD_RVV_SOFTMAX_FP32 ON) set(CONFIG_THEAD_RVV_SOFTMAX_FP16 ON) -set(CONFIG_THEAD_RVV_REDUCE_SUM_INT8 ON) +set(CONFIG_THEAD_RVV_SOFTMAX_INT8 ON) +set(CONFIG_THEAD_RVV_STRIDED_SLICE_FP16 ON) set(CONFIG_THEAD_RVV_TRANSPOSE_FP32 ON) set(CONFIG_THEAD_RVV_TRANSPOSE_FP16 ON) set(CONFIG_THEAD_RVV_TRANSPOSE_INT8 ON) -set(CONFIG_THEAD_RVV_GATHER_FP16 ON) -set(CONFIG_THEAD_RVV_GATHER_INT8 ON) -set(CONFIG_THEAD_RVV_STRIDED_SLICE_FP16 ON) set(CONFIG_USE_SHL_DEBUG ON) \ No newline at end of file diff --git a/cmake/rvv.cmake b/cmake/rvv.cmake index 6adf0a4d..a7ccba45 100644 --- a/cmake/rvv.cmake +++ b/cmake/rvv.cmake @@ -340,12 +340,25 @@ set(CONFIG_THEAD_RVV_DEPTHWISE_CONVOLUTION_FP32 ON) set(CONFIG_THEAD_RVV_DEPTHWISE_CONVOLUTION_FP16 ON) set(CONFIG_THEAD_RVV_DEPTHWISE_CONVOLUTION_INT8 ON) set(CONFIG_THEAD_RVV_DEPTHWISE_CONVOLUTION_INT4 ON) +set(CONFIG_THEAD_RVV_CONVOLUTION1D_FP32 ON) +set(CONFIG_THEAD_RVV_CONVOLUTION1D_FP16 ON) set(CONFIG_THEAD_RVV_CONVOLUTION1D_INT8 ON) set(CONFIG_THEAD_RVV_DEPTHWISE_CONVOLUTION1D_INT8 ON) +set(CONFIG_THEAD_RVV_DECONVOLUTION_FP32 ON) +set(CONFIG_THEAD_RVV_DECONVOLUTION_FP16 ON) +set(CONFIG_THEAD_RVV_DIV_FP32 ON) +set(CONFIG_THEAD_RVV_DIV_FP16 ON) +set(CONFIG_THEAD_RVV_DIV_INT8 ON) +set(CONFIG_THEAD_RVV_ERF_FP32 ON) +set(CONFIG_THEAD_RVV_ERF_FP16 ON) +set(CONFIG_THEAD_RVV_ERF_INT8 ON) set(CONFIG_THEAD_RVV_FULLYCONNECTED_FP32 ON) set(CONFIG_THEAD_RVV_FULLYCONNECTED_FP16 ON) set(CONFIG_THEAD_RVV_FULLYCONNECTED_INT8 ON) set(CONFIG_THEAD_RVV_FULLYCONNECTED_INT4 ON) +set(CONFIG_THEAD_RVV_GATHER_FP32 ON) +set(CONFIG_THEAD_RVV_GATHER_FP16 ON) +set(CONFIG_THEAD_RVV_GATHER_INT8 ON) set(CONFIG_THEAD_RVV_GEMM_FP32 ON) set(CONFIG_THEAD_RVV_GEMM_FP16 ON) set(CONFIG_THEAD_RVV_GEMM_INT8 ON) @@ -377,6 +390,7 @@ set(CONFIG_THEAD_RVV_PAD_INT8 ON) set(CONFIG_THEAD_RVV_PRELU_FP32 ON) set(CONFIG_THEAD_RVV_PRELU_FP16 ON) set(CONFIG_THEAD_RVV_PRELU_INT8 ON) 
+set(CONFIG_THEAD_RVV_REDUCE_SUM_INT8 ON) set(CONFIG_THEAD_RVV_RELU_FP32 ON) set(CONFIG_THEAD_RVV_RELU_FP16 ON) set(CONFIG_THEAD_RVV_RELU_INT8 ON) @@ -388,13 +402,15 @@ set(CONFIG_THEAD_RVV_RESHAPE_FP16 ON) set(CONFIG_THEAD_RVV_RESHAPE_INT8 ON) set(CONFIG_THEAD_RVV_SIGMOID_FP32 ON) set(CONFIG_THEAD_RVV_SIGMOID_FP16 ON) +set(CONFIG_THEAD_RVV_SIGMOID_INT8 ON) +set(CONFIG_THEAD_RVV_SUB_FP32 ON) +set(CONFIG_THEAD_RVV_SUB_FP16 ON) +set(CONFIG_THEAD_RVV_SUB_INT8 ON) set(CONFIG_THEAD_RVV_SOFTMAX_FP32 ON) set(CONFIG_THEAD_RVV_SOFTMAX_FP16 ON) -set(CONFIG_THEAD_RVV_REDUCE_SUM_INT8 ON) +set(CONFIG_THEAD_RVV_SOFTMAX_INT8 ON) +set(CONFIG_THEAD_RVV_STRIDED_SLICE_FP16 ON) set(CONFIG_THEAD_RVV_TRANSPOSE_FP32 ON) set(CONFIG_THEAD_RVV_TRANSPOSE_FP16 ON) set(CONFIG_THEAD_RVV_TRANSPOSE_INT8 ON) -set(CONFIG_THEAD_RVV_GATHER_FP16 ON) -set(CONFIG_THEAD_RVV_GATHER_INT8 ON) -set(CONFIG_THEAD_RVV_STRIDED_SLICE_FP16 ON) set(CONFIG_USE_SHL_DEBUG ON) \ No newline at end of file diff --git a/include/shl_c906.h b/include/backend/c906/c906.h similarity index 99% rename from include/shl_c906.h rename to include/backend/c906/c906.h index 6f6adf6c..d1f1e169 100644 --- a/include/shl_c906.h +++ b/include/backend/c906/c906.h @@ -20,9 +20,9 @@ #define INCLUDE_SHL_C906_H_ #include "csi_nn.h" +#include "reference/ref.h" +#include "rvv/rvv.h" #include "shl_gref.h" -#include "shl_ref.h" -#include "shl_thead_rvv.h" /************************** f32 func declaration ***************************/ int shl_c906_abs_f32(struct csinn_tensor *input, struct csinn_tensor *output, @@ -375,6 +375,8 @@ void shl_c906_conv1x1s1_sgemm_transform_kernel_fp16_w_int8(struct csinn_tensor * struct csinn_conv2d_params *params); void shl_c906_conv_im2col_sgemm_transform_kernel_fp16(struct csinn_tensor *kernel, struct csinn_conv2d_params *params); +void shl_c906_conv_im2col_sgemm_transform_kernel_fp16_w_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); void 
shl_c906_conv3x3s1_winograd43_transform_kernel_pack8_fp16(struct csinn_tensor *o_kernel, struct csinn_tensor *t_kernel); diff --git a/source/c906_opt/shl_c906_cap.h b/include/backend/c906/cap.h similarity index 100% rename from source/c906_opt/shl_c906_cap.h rename to include/backend/c906/cap.h diff --git a/include/shl_c908.h b/include/backend/c908/c908.h similarity index 97% rename from include/shl_c908.h rename to include/backend/c908/c908.h index 834e1a33..0d86bec8 100644 --- a/include/shl_c908.h +++ b/include/backend/c908/c908.h @@ -20,9 +20,9 @@ #define INCLUDE_SHL_C908_H_ #include "csi_nn.h" +#include "reference/ref.h" +#include "rvv/rvv.h" #include "shl_gref.h" -#include "shl_ref.h" -#include "shl_thead_rvv.h" /*********************************** initialization ***********************************/ int shl_c908_conv2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, @@ -281,16 +281,16 @@ int shl_c908_ncxhwx_wg_b4f3s1_packn_int8(struct csinn_tensor *input, struct csin struct csinn_conv2d_params *params); /*********************************** gemm ncxhwx kernel ***********************************/ -void shl_c908_ncxhwx_gemm_12xpack2n_fp32(float *dst, const float *sa, const float *sb, - const float *bias, int m, int k, int n, bool fuse_relu); +void shl_c908_ncxhwx_gemm_12xpack2n_fp32(float *dst, const float *sa, const float *sb, float *bias, + int m, int k, int n, bool fuse_relu); void shl_c908_ncxhwx_gemm_12xpack2n_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, - const __fp16 *bias, int m, int k, int n, bool fuse_relu); + __fp16 *bias, int m, int k, int n, bool fuse_relu); void shl_c908_ncxhwx_gemm_4xpack2n_int8(int8_t *dst, const int8_t *sa, const int8_t *sb, - const int32_t *bias, int m, int k, int n, int32_t out_zp, + int32_t *bias, int m, int k, int n, int32_t out_zp, int32_t *mult, int32_t *shift); void shl_c908_ncxhwx_gemm_12xpackn_int8_dot(int8_t *dst, const int8_t *sa, const int8_t *sb, - const int32_t *bias, int m, int k, int n, 
- int32_t out_zp, int32_t *mult, int32_t *shift); + int32_t *bias, int m, int k, int n, int32_t out_zp, + int32_t *mult, int32_t *shift); void shl_c908_ncxhwx_gemm_12xpackn_int16(int32_t *dst, const int16_t *sa, const int16_t *sb, int m, int k, int n); diff --git a/include/shl_c920.h b/include/backend/c920/c920.h similarity index 96% rename from include/shl_c920.h rename to include/backend/c920/c920.h index 54130ad6..a3362058 100644 --- a/include/shl_c920.h +++ b/include/backend/c920/c920.h @@ -20,9 +20,9 @@ #define INCLUDE_SHL_C920_H_ #include "csi_nn.h" +#include "reference/ref.h" +#include "rvv/rvv.h" #include "shl_gref.h" -#include "shl_ref.h" -#include "shl_thead_rvv.h" #ifdef __cplusplus extern "C" { @@ -71,9 +71,9 @@ int shl_c920_wg_b6f3s1_packn_fp16(struct csinn_tensor *input, struct csinn_tenso /************************************* gemm ncxhwx ************************************/ void shl_c920_ncxhwx_gemm_8xpack2n_fp32(float *dst, const float *sa, const float *sb, float *bias, - int m, int k, int n, int ldc); + int m, int k, int n, bool fuse_relu); void shl_c920_ncxhwx_gemm_8xpack2n_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, - __fp16 *bias, int m, int k, int n, int ldc); + __fp16 *bias, int m, int k, int n, bool fuse_relu); /************************************* gemm block *************************************/ void shl_c920_reorder_kernel_block_8xk_fp32(float *src, float *dst, int m, int k, const int M_BLK, @@ -119,6 +119,7 @@ void shl_c920_set_binary_model_op_init(struct csinn_session *sess, bool value); int shl_c920_detect_yolov5_postprocess(struct csinn_tensor **input_tensors, struct shl_yolov5_box *out, struct shl_yolov5_params *params); +int shl_c920_yolox_preprocess(struct csinn_tensor *input, struct csinn_tensor *output); #ifdef __cplusplus } diff --git a/source/c920_opt/shl_c920_cap.h b/include/backend/c920/cap.h similarity index 100% rename from source/c920_opt/shl_c920_cap.h rename to include/backend/c920/cap.h diff --git 
a/include/backend/c920v2/c920v2.h b/include/backend/c920v2/c920v2.h new file mode 100644 index 00000000..9a4f054d --- /dev/null +++ b/include/backend/c920v2/c920v2.h @@ -0,0 +1,129 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef INCLUDE_SHL_C920V2_H_ +#define INCLUDE_SHL_C920V2_H_ + +#include "csi_nn.h" +#include "reference/ref.h" +#include "rvv/rvv.h" +#include "shl_gref.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*********************************** initialization ***********************************/ +int shl_c920v2_conv2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c920v2_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c920v2_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +/************************************* convolution ************************************/ +/*********************************** im2col + gemm ********************************/ +int shl_c920v2_conv_im2col_gemm_packn_fp32(struct csinn_tensor *input, struct csinn_tensor 
*output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c920v2_conv_im2col_gemm_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c920v2_conv_im2col_gemm_pack1ton_fp32(struct csinn_tensor *input, + struct csinn_tensor *output, + struct csinn_tensor *kernel, + struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c920v2_conv_im2col_gemm_pack1ton_fp16(struct csinn_tensor *input, + struct csinn_tensor *output, + struct csinn_tensor *kernel, + struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c920v2_conv_im2col_gemm_packnto1_fp32(struct csinn_tensor *input, + struct csinn_tensor *output, + struct csinn_tensor *kernel, + struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c920v2_conv_im2col_gemm_packnto1_fp16(struct csinn_tensor *input, + struct csinn_tensor *output, + struct csinn_tensor *kernel, + struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +/******************************** conv2d1x1s1 + gemm ******************************/ +int shl_c920v2_conv1x1s1_gemm_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c920v2_conv1x1s1_gemm_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c920v2_conv1x1s1_gemm_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c920v2_conv1x1s1_gemm_pack1ton_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params 
*params); +int shl_c920v2_conv1x1s1_gemm_pack1ton_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c920v2_conv1x1s1_gemm_pack1ton_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c920v2_conv1x1s1_gemm_packnto1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c920v2_conv1x1s1_gemm_packnto1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c920v2_conv1x1s1_gemm_packnto1_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +/************************************* gemm ncxhwx ************************************/ +void shl_c920v2_ncxhwx_gemm_12xpack2n_fp32(float *dst, const float *sa, const float *sb, + float *bias, int m, int k, int n, bool fuse_relu); +void shl_c920v2_ncxhwx_gemm_12xpack2n_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, + __fp16 *bias, int m, int k, int n, bool fuse_relu); +void shl_c920v2_ncxhwx_gemm_12xpackn_int8_dot(int8_t *dst, const int8_t *sa, const int8_t *sb, + int32_t *bias, int m, int k, int n, int32_t out_zp, + int32_t *mult, int32_t *shift); +void shl_c920v2_ncxhwx_gemm_4xpack2n_int8(int8_t *dst, const int8_t *sa, const int8_t *sb, + int32_t *bias, int m, int k, int n, int32_t out_zp, + int32_t *mult, int32_t *shift); + +struct shl_c920v2_option { + struct shl_rvv_option base; +}; + +int shl_c920v2_set_packn_layout(struct csinn_session *sess, bool packn_layout); +struct shl_c920v2_option *shl_c920v2_get_graph_option(struct csinn_session *sess); +bool 
shl_c920v2_get_binary_model_op_init(struct csinn_session *sess); +void shl_c920v2_set_binary_model_op_init(struct csinn_session *sess, bool value); + +#ifdef __cplusplus +} +#endif + +#endif // INCLUDE_SHL_C920V2_H_ diff --git a/include/backend/c920v2/cap.h b/include/backend/c920v2/cap.h new file mode 100644 index 00000000..c95a7fd3 --- /dev/null +++ b/include/backend/c920v2/cap.h @@ -0,0 +1,29 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef INCLUDE_SHL_C920V2_CAP_H_ +#define INCLUDE_SHL_C920V2_CAP_H_ + +#include "csi_nn.h" +#include "shl_utils.h" + +int shl_c920v2_conv2d_cap(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +#endif // INCLUDE_SHL_C920V2_CAP_H_ diff --git a/include/shl_e907.h b/include/backend/e907/e907.h similarity index 99% rename from include/shl_e907.h rename to include/backend/e907/e907.h index e13c44c5..827d078d 100644 --- a/include/shl_e907.h +++ b/include/backend/e907/e907.h @@ -24,8 +24,8 @@ #endif //__riscv_dsp #include "csi_nn.h" +#include "reference/ref.h" #include "shl_gref.h" -#include "shl_ref.h" int shl_e907_fullyconnected_init(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *weights, struct csinn_tensor *bias, diff --git a/include/backend/pnna/pnna.h b/include/backend/pnna/pnna.h new file mode 100644 index 00000000..6dbec972 --- /dev/null +++ b/include/backend/pnna/pnna.h @@ -0,0 +1,274 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef INCLUDE_SHL_PNNA_H_ +#define INCLUDE_SHL_PNNA_H_ +#include "csi_nn.h" +#include "shl_utils.h" + +int shl_pnna_conv2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_pnna_depthwise_conv2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_pnna_group_conv2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_pnna_deconv2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_pnna_depthwise_deconv2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_pnna_fullyconnected(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); + +int shl_pnna_maxpool2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_pnna_avgpool2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_pnna_global_avgpool2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_pnna_global_maxpool2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_pnna_negative(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_pnna_tanh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_pnna_sigmoid(struct csinn_tensor *input, struct 
csinn_tensor *output, + struct csinn_sigmoid_params *params); + +int shl_pnna_elu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_pnna_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_pnna_relu1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_pnna_relu6(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_pnna_leaky_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_pnna_prelu(struct csinn_tensor *input, struct csinn_tensor *alpha, + struct csinn_tensor *output, struct csinn_prelu_params *params); + +int shl_pnna_softmax(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params); + +int shl_pnna_batch_normalization(struct csinn_tensor *input, struct csinn_tensor *mean, + struct csinn_tensor *variance, struct csinn_tensor *gamma, + struct csinn_tensor *beta, struct csinn_tensor *output, + struct csinn_bn_params *params); + +int shl_pnna_l2_normalization(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_l2n_params *params); + +int shl_pnna_lrn(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_lrn_params *params); + +int shl_pnna_matmul(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, struct csinn_matmul_params *params); + +int shl_pnna_add(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_pnna_sub(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_pnna_mul(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_pnna_div(struct 
csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_pnna_maximum(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_pnna_minimum(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_pnna_power(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_pnna_greater(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_pnna_less(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_pnna_equal(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_pnna_not_equal(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_pnna_greater_equal(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_pnna_less_equal(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_pnna_select(struct csinn_tensor *condition, struct csinn_tensor *input0, + struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params); + +int shl_pnna_and(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_pnna_or(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_pnna_pad(struct csinn_tensor *input, struct csinn_tensor *output, + 
struct csinn_pad_params *params); + +int shl_pnna_resize(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_resize_params *params); + +int shl_pnna_concat(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params); + +int shl_pnna_transpose(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params); + +int shl_pnna_reshape(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params); + +int shl_pnna_shape(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shape_params *params); + +int shl_pnna_flatten(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_flatten_params *params); + +int shl_pnna_crop(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_crop_params *params); + +int shl_pnna_slice(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_slice_params *params); + +int shl_pnna_split(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params); + +int shl_pnna_squeeze(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_squeeze_params *params); + +int shl_pnna_space_to_batch_nd(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_batch_nd_params *params); + +int shl_pnna_batch_to_space_nd(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_batch_to_space_nd_params *params); + +int shl_pnna_space_to_depth(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_depth_params *params); + +int shl_pnna_depth_to_space(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_depth_to_space_params *params); + +int shl_pnna_sum(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_pnna_mean(struct csinn_tensor *input, struct csinn_tensor *output, + struct 
csinn_reduce_params *params); + +int shl_pnna_max(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_pnna_min(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_pnna_prod(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_pnna_argmin(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_pnna_argmax(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_pnna_all(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_pnna_any(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_pnna_strided_slice(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_strided_slice_params *params); + +int shl_pnna_roipool(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_roi_pool_params *params); + +int shl_pnna_proposal(struct csinn_tensor *cls_prob, struct csinn_tensor *bbox_pred, + struct csinn_tensor *im_info, struct csinn_tensor *output, + struct csinn_proposal_params *params); + +int shl_pnna_unpooling(struct csinn_tensor *input, struct csinn_tensor *mask, + struct csinn_tensor *output, struct csinn_unpooling_params *params); + +int shl_pnna_maxpool2d_locat(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_pnna_sqrt(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); +int shl_pnna_matmul(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_matmul_params *params); + +int shl_pnna_data_covert(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int 
shl_pnna_set_input_strides(struct csinn_session *sess, int input_byte_size, int input_fix_h, + int input_fix_w); + +struct shl_pnna_tensor_fix { + int height; + int width; +}; + +struct shl_pnna_target_data { + void *network; + void *net_obj; + void *context; + void *binding; + void *attrs; + void *graph; + void *nodes; + void *in_buffers; + void *out_buffers; + void *th1520_hwconfig; + void *th1520_mapconfig; + void *to_free; + int priority; + struct shl_pnna_tensor_fix **input_fix; + enum csinn_quant_enum quant_type; +}; + +#endif // INCLUDE_SHL_PNNA_H_ diff --git a/include/backend/pnna/wrapper.h b/include/backend/pnna/wrapper.h new file mode 100644 index 00000000..18b86182 --- /dev/null +++ b/include/backend/pnna/wrapper.h @@ -0,0 +1,172 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef INCLUDE_SHL_PNNA_WRAPPER_H_ +#define INCLUDE_SHL_PNNA_WRAPPER_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif +int shl_pnna_session_init_internal(struct shl_pnna_target_data *td); +int shl_pnna_session_deinit_internal(struct shl_pnna_target_data *td); +int shl_pnna_session_setup_internal(struct shl_pnna_target_data *td); +int shl_pnna_session_create_network_binary(struct csinn_session *sess, + struct shl_pnna_target_data *td); +int shl_pnna_session_run_internal(struct csinn_session *sess, int input_num, int output_num); +void shl_pnna_load_binary_model_internal(void *addr, size_t size, struct shl_pnna_target_data *td); +int shl_pnna_create_tensor_internal(struct csinn_tensor *t, struct shl_pnna_target_data *td); +int shl_pnna_set_output_internal(int index, struct csinn_tensor *t, + struct shl_pnna_target_data *td); +int shl_pnna_update_input_internal(int index, void *buffer, struct csinn_session *sess); +int shl_pnna_get_output_internal(int index, struct csinn_tensor *output, + struct shl_pnna_target_data *td); +void shl_pnna_set_input_strides_internal(struct shl_pnna_target_data *td, int byte_size, + int input_fix_h, int input_fix_w); +int shl_pnna_create_io_memory(struct csinn_session *sess); + +/* internal op */ +int shl_pnna_create_argmax_internal(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params, + struct shl_pnna_target_data *td); +int shl_pnna_create_avgpool_internal(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, + struct shl_pnna_target_data *td); +int shl_pnna_create_batch_to_space_nd_internal(struct csinn_tensor *input, + struct csinn_tensor *output, + struct csinn_batch_to_space_nd_params *params, + struct shl_pnna_target_data *td); +int shl_pnna_create_concat_internal(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params, + struct shl_pnna_target_data *td); +int 
shl_pnna_create_conv2d_internal(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + struct shl_pnna_target_data *td); +int shl_pnna_create_deconv2d_internal(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + struct shl_pnna_target_data *td); +int shl_pnna_create_dense_internal(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_fc_params *params, struct shl_pnna_target_data *td); +int shl_pnna_create_depth_to_space_internal(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_depth_to_space_params *params, + struct shl_pnna_target_data *td); +int shl_pnna_create_depthwise_conv2d_internal( + struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, + struct csinn_tensor *bias, struct csinn_conv2d_params *params, struct shl_pnna_target_data *td); +int shl_pnna_create_diso_internal(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, int op, + struct shl_pnna_target_data *td); +int shl_pnna_create_flatten_internal(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_flatten_params *params, + struct shl_pnna_target_data *td); +int shl_pnna_create_group_conv2d_internal(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + struct shl_pnna_target_data *td); +int shl_pnna_create_global_avgpool_internal(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, + struct shl_pnna_target_data *td); +int shl_pnna_create_global_maxpool_internal(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, + struct shl_pnna_target_data 
*td); +int shl_pnna_create_leaky_relu_internal(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, + struct shl_pnna_target_data *td); +int shl_pnna_create_lrn_internal(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_lrn_params *params, struct shl_pnna_target_data *td); +int shl_pnna_create_mean_internal(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params, + struct shl_pnna_target_data *td); +int shl_pnna_create_maxpool_internal(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, + struct shl_pnna_target_data *td); +int shl_pnna_create_maxpool2d_locat_internal(struct csinn_tensor *data, struct csinn_tensor *output, + struct csinn_pool_params *params, + struct shl_pnna_target_data *td); +int shl_pnna_create_pad_internal(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params, struct shl_pnna_target_data *td); +int shl_pnna_create_prelu_internal(struct csinn_tensor *input, struct csinn_tensor *alpha, + struct csinn_tensor *output, struct csinn_prelu_params *params, + struct shl_pnna_target_data *td); +int shl_pnna_create_proposal_internal(struct csinn_tensor *cls_prob, struct csinn_tensor *bbox_pred, + struct csinn_tensor *im_info, struct csinn_tensor *output, + struct csinn_proposal_params *params, + struct shl_pnna_target_data *td); +int shl_pnna_create_relu1_internal(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, + struct shl_pnna_target_data *td); +int shl_pnna_create_relu6_internal(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, + struct shl_pnna_target_data *td); +int shl_pnna_create_reshape_internal(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params, + struct shl_pnna_target_data *td); +int shl_pnna_create_resize_internal(struct csinn_tensor 
*input, struct csinn_tensor *output, + struct csinn_resize_params *params, + struct shl_pnna_target_data *td); +int shl_pnna_create_roipool_internal(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, + struct csinn_roi_pool_params *params, + struct shl_pnna_target_data *td); +int shl_pnna_create_siso_internal(struct csinn_tensor *input, struct csinn_tensor *output, int op, + struct shl_pnna_target_data *td); +int shl_pnna_create_softmax_internal(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params, + struct shl_pnna_target_data *td); +int shl_pnna_create_space_to_depth_internal(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_depth_params *params, + struct shl_pnna_target_data *td); +int shl_pnna_create_space_to_batch_nd_internal(struct csinn_tensor *input, + struct csinn_tensor *output, + struct csinn_space_to_batch_nd_params *params, + struct shl_pnna_target_data *td); +int shl_pnna_create_split_internal(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params, + struct shl_pnna_target_data *td); +int shl_pnna_create_squeeze_internal(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_squeeze_params *params, + struct shl_pnna_target_data *td); +int shl_pnna_create_strided_slice_internal(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_strided_slice_params *params, + struct shl_pnna_target_data *td); +int shl_pnna_create_cus_strided_slice_internal(struct csinn_tensor *input, + struct csinn_tensor *output, + struct csinn_strided_slice_params *params, + struct shl_pnna_target_data *td); +int shl_pnna_create_transpose_internal(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params, + struct shl_pnna_target_data *td); +int shl_pnna_create_unpooling_internal(struct csinn_tensor *input, struct csinn_tensor *mask, + struct csinn_tensor *output, 
+ struct csinn_unpooling_params *params, + struct shl_pnna_target_data *td); +int shl_pnna_create_matmul_internal(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_matmul_params *params, + struct shl_pnna_target_data *td); +int shl_pnna_create_data_convert_internal(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, + struct shl_pnna_target_data *td); +#ifdef __cplusplus +} +#endif + +#endif // INCLUDE_SHL_PNNA_WRAPPER_H_ diff --git a/include/shl_ref.h b/include/backend/reference/ref.h similarity index 99% rename from include/shl_ref.h rename to include/backend/reference/ref.h index 463cac1d..2da2477a 100644 --- a/include/shl_ref.h +++ b/include/backend/reference/ref.h @@ -1253,10 +1253,6 @@ int shl_ref_conv_callback_base(struct csinn_tensor *input, struct csinn_tensor * struct csinn_tensor *kernel, struct csinn_tensor *bias, void *params, void *cb); -void shl_ref_nn_init(struct csinn_tensor *input, struct csinn_tensor *output); - -void shl_ref_nn_deinit(struct csinn_tensor *input, struct csinn_tensor *output); - int shl_ref_flatten_init(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_reshape_params *params); diff --git a/include/shl_thead_rvm.h b/include/backend/rvm/rvm.h similarity index 99% rename from include/shl_thead_rvm.h rename to include/backend/rvm/rvm.h index 41ba18b2..9f0d2f50 100644 --- a/include/shl_thead_rvm.h +++ b/include/backend/rvm/rvm.h @@ -19,7 +19,7 @@ #ifndef INCLUDE_SHL_THEAD_RVM_H_ #define INCLUDE_SHL_THEAD_RVM_H_ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" #ifdef __riscv_xtheadmatrix #include diff --git a/source/thead_rvv/shl_thead_rvv_cap.h b/include/backend/rvv/cap.h similarity index 87% rename from source/thead_rvv/shl_thead_rvv_cap.h rename to include/backend/rvv/cap.h index 93fc173b..24ee42ae 100644 --- a/source/thead_rvv/shl_thead_rvv_cap.h +++ b/include/backend/rvv/cap.h @@ -34,6 +34,10 @@ int 
shl_rvv_conv1d_cap(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, struct csinn_tensor *bias, struct csinn_conv1d_params *params); +int shl_rvv_deconv2d_cap(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + int shl_rvv_fullyconnected_cap(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *weights, struct csinn_tensor *bias, struct csinn_fc_params *params); @@ -47,9 +51,15 @@ int shl_rvv_avgpool2d_cap(struct csinn_tensor *input, struct csinn_tensor *outpu int shl_rvv_add_cap(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, struct csinn_diso_params *params); +int shl_rvv_sub_cap(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + int shl_rvv_mul_cap(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, struct csinn_diso_params *params); +int shl_rvv_div_cap(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + int shl_rvv_concat_cap(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_clip_params *params); @@ -99,4 +109,7 @@ int shl_rvv_matmul_cap(struct csinn_tensor *mat0, struct csinn_tensor *mat1, int shl_rvv_gather_cap(struct csinn_tensor *input, struct csinn_tensor *indices, struct csinn_tensor *output, struct csinn_gather_params *params); +int shl_rvv_erf_cap(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_clip_params *params); + #endif // INCLUDE_SHL_RVV_CAP_H_ diff --git a/include/shl_thead_rvv.h b/include/backend/rvv/rvv.h similarity index 73% rename from include/shl_thead_rvv.h rename to include/backend/rvv/rvv.h index 6f16f4ff..a8a0d413 100644 --- a/include/shl_thead_rvv.h +++ b/include/backend/rvv/rvv.h @@ -37,8 +37,8 @@ #endif // 
__riscv_vector #include "csi_nn.h" +#include "reference/ref.h" #include "shl_gref.h" -#include "shl_ref.h" #ifdef __cplusplus extern "C" { @@ -55,6 +55,12 @@ int shl_rvv_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *ou struct csinn_tensor *kernel, struct csinn_tensor *bias, struct csinn_conv2d_params *params); +int shl_rvv_conv1d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv1d_params *params); +int shl_rvv_conv1d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv1d_params *params); int shl_rvv_conv1d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, struct csinn_tensor *bias, struct csinn_conv1d_params *params); @@ -72,6 +78,13 @@ int shl_rvv_depthwise_conv2d_init_int4(struct csinn_tensor *input, struct csinn_ struct csinn_tensor *kernel, struct csinn_tensor *bias, struct csinn_conv2d_params *params); +int shl_rvv_deconv2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_deconv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + int shl_rvv_avgpool2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_pool_params *params); int shl_rvv_avgpool2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, @@ -118,16 +131,189 @@ int shl_rvv_matmul_init_fp32(struct csinn_tensor *mat0, struct csinn_tensor *mat struct csinn_tensor *output, struct csinn_matmul_params *params); int shl_rvv_matmul_init_fp16(struct csinn_tensor *mat0, struct csinn_tensor *mat1, struct csinn_tensor *output, struct csinn_matmul_params *params); +int 
shl_rvv_matmul_init_int8(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, struct csinn_matmul_params *params); /************************************ convolution *********************************/ +/********************************* common im2col+gemm *****************************/ +int shl_rvv_common_conv_gemm_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + void (*reorder_input)(float *, float *, int, int, int), + void (*gemm)(float *, const float *, const float *, float *, int, + int, int, int)); +int shl_rvv_common_conv_gemm_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + void (*reorder_input)(__fp16 *, __fp16 *, int, int, int), + void (*gemm)(__fp16 *, const __fp16 *, const __fp16 *, __fp16 *, + int, int, int, int)); +int shl_rvv_common_conv_gemm_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + void (*reorder_input)(int8_t *, int8_t *, int, int, int), + void (*gemm)(int8_t *, const int8_t *, const int8_t *, int32_t *, + int, int, int, int, int32_t, int32_t *, int32_t *)); + +int shl_rvv_common_conv_gemm_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + void (*reorder_input)(float *, float *, int, int, int), + void (*gemm)(float *, const float *, const float *, float *, + int, int, int, bool)); +int shl_rvv_common_conv_gemm_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + void (*reorder_input)(__fp16 *, __fp16 *, int, int, int), + void (*gemm)(__fp16 *, const __fp16 *, 
const __fp16 *, + __fp16 *, int, int, int, bool)); +int shl_rvv_common_conv_gemm_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + void (*reorder_input)(int8_t *, int8_t *, int, int, int), + void (*gemm)(int8_t *, const int8_t *, const int8_t *, + int32_t *, int, int, int, int32_t, int32_t *, + int32_t *)); + +int shl_rvv_common_conv_gemm_pack1ton_fp32( + struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, + struct csinn_tensor *bias, struct csinn_conv2d_params *params, + void (*reorder_input)(float *, float *, int, int, int, int), + void (*gemm)(float *, const float *, const float *, float *, int, int, int, bool)); +int shl_rvv_common_conv_gemm_pack1ton_fp16( + struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, + struct csinn_tensor *bias, struct csinn_conv2d_params *params, + void (*reorder_input)(__fp16 *, __fp16 *, int, int, int, int), + void (*gemm)(__fp16 *, const __fp16 *, const __fp16 *, __fp16 *, int, int, int, bool)); +int shl_rvv_common_conv_gemm_pack1ton_int8( + struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, + struct csinn_tensor *bias, struct csinn_conv2d_params *params, + void (*reorder_input)(int8_t *, int8_t *, int, int, int, int), + void (*gemm)(int8_t *, const int8_t *, const int8_t *, int32_t *, int, int, int, int32_t, + int32_t *, int32_t *)); + +int shl_rvv_common_conv_gemm_packnto1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + void (*reorder_input)(float *, float *, int, int, int), + void (*gemm)(float *, const float *, const float *, + float *, int, int, int, bool)); +int shl_rvv_common_conv_gemm_packnto1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct 
csinn_tensor *bias, + struct csinn_conv2d_params *params, + void (*reorder_input)(__fp16 *, __fp16 *, int, int, int), + void (*gemm)(__fp16 *, const __fp16 *, const __fp16 *, + __fp16 *, int, int, int, bool)); +int shl_rvv_common_conv_gemm_packnto1_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + void (*reorder_input)(int8_t *, int8_t *, int, int, int), + void (*gemm)(int8_t *, const int8_t *, const int8_t *, + int32_t *, int, int, int, int32_t, + int32_t *, int32_t *)); + +int shl_rvv_common_conv1x1_gemm_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + void (*reorder_input)(float *, float *, int, int, int), + void (*gemm)(float *, const float *, const float *, float *, + int, int, int, int)); +int shl_rvv_common_conv1x1_gemm_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + void (*reorder_input)(__fp16 *, __fp16 *, int, int, int), + void (*gemm)(__fp16 *, const __fp16 *, const __fp16 *, + __fp16 *, int, int, int, int)); +int shl_rvv_common_conv1x1_gemm_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + void (*reorder_input)(int8_t *, int8_t *, int, int, int), + void (*gemm)(int8_t *, const int8_t *, const int8_t *, + int32_t *, int, int, int, int, int32_t, int32_t *, + int32_t *)); + +int shl_rvv_common_conv1x1_gemm_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + void (*reorder_input)(float *, float *, int, int, int), + void (*gemm)(float *, const float *, const float *, + float *, int, int, int, bool)); +int 
shl_rvv_common_conv1x1_gemm_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + void (*reorder_input)(__fp16 *, __fp16 *, int, int, int), + void (*gemm)(__fp16 *, const __fp16 *, const __fp16 *, + __fp16 *, int, int, int, bool)); +int shl_rvv_common_conv1x1_gemm_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + void (*reorder_input)(int8_t *, int8_t *, int, int, int), + void (*gemm)(int8_t *, const int8_t *, const int8_t *, + int32_t *, int, int, int, int32_t, + int32_t *, int32_t *)); + +int shl_rvv_common_conv1x1_gemm_pack1ton_fp32( + struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, + struct csinn_tensor *bias, struct csinn_conv2d_params *params, + void (*reorder_input)(float *, float *, int, int, int, int), + void (*gemm)(float *, const float *, const float *, float *, int, int, int, bool)); +int shl_rvv_common_conv1x1_gemm_pack1ton_fp16( + struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, + struct csinn_tensor *bias, struct csinn_conv2d_params *params, + void (*reorder_input)(__fp16 *, __fp16 *, int, int, int, int), + void (*gemm)(__fp16 *, const __fp16 *, const __fp16 *, __fp16 *, int, int, int, bool)); +int shl_rvv_common_conv1x1_gemm_pack1ton_int8( + struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, + struct csinn_tensor *bias, struct csinn_conv2d_params *params, + void (*reorder_input)(int8_t *, int8_t *, int, int, int, int), + void (*gemm)(int8_t *, const int8_t *, const int8_t *, int32_t *, int, int, int, int32_t, + int32_t *, int32_t *)); + +int shl_rvv_common_conv1x1_gemm_packnto1_fp32( + struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, + struct csinn_tensor *bias, struct 
csinn_conv2d_params *params, + void (*reorder_input)(float *, float *, int, int, int), + void (*gemm)(float *, const float *, const float *, float *, int, int, int, bool)); +int shl_rvv_common_conv1x1_gemm_packnto1_fp16( + struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, + struct csinn_tensor *bias, struct csinn_conv2d_params *params, + void (*reorder_input)(__fp16 *, __fp16 *, int, int, int), + void (*gemm)(__fp16 *, const __fp16 *, const __fp16 *, __fp16 *, int, int, int, bool)); +int shl_rvv_common_conv1x1_gemm_packnto1_int8( + struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, + struct csinn_tensor *bias, struct csinn_conv2d_params *params, + void (*reorder_input)(int8_t *, int8_t *, int, int, int), + void (*gemm)(int8_t *, const int8_t *, const int8_t *, int32_t *, int, int, int, int32_t, + int32_t *, int32_t *)); + /*********************************** im2col + gemm ********************************/ +void shl_rvv_conv1d_im2col_gemm_reorder_kernel_fp32(struct csinn_tensor *kernel, + struct csinn_conv1d_params *params); +void shl_rvv_conv1d_im2col_gemm_reorder_kernel_fp16(struct csinn_tensor *kernel, + struct csinn_conv1d_params *params); +void shl_rvv_conv1d_im2col_gemm_reorder_kernel_fp16_w_int8(struct csinn_tensor *kernel, + struct csinn_conv1d_params *params); +void shl_rvv_conv1d_im2col_gemm_dequantize_per_channel_i8_to_f16(struct csinn_tensor *kernel, + struct csinn_conv1d_params *params, + __fp16 *kernel_fp16); void shl_rvv_conv_im2col_gemm_reorder_kernel_fp32(struct csinn_tensor *kernel, struct csinn_conv2d_params *params); void shl_rvv_conv_im2col_gemm_reorder_kernel_fp16(struct csinn_tensor *kernel, struct csinn_conv2d_params *params); +void shl_rvv_conv_im2col_gemm_reorder_kernel_fp16_w_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_rvv_conv_im2col_gemm_dequantize_per_channel_i8_to_f16(struct csinn_tensor *kernel, + struct csinn_conv2d_params 
*params, + __fp16 *kernel_fp16); void shl_rvv_conv_im2col_gemm_reorder_kernel_int8(struct csinn_tensor *kernel, struct csinn_conv2d_params *params); +int shl_rvv_conv1d_im2col_gemm_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv1d_params *params); +int shl_rvv_conv1d_im2col_gemm_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv1d_params *params); int shl_rvv_conv_im2col_gemm_fp32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, struct csinn_tensor *bias, struct csinn_conv2d_params *params); @@ -142,6 +328,10 @@ void shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp32(struct csinn_tensor *ker struct csinn_conv2d_params *params); void shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp16(struct csinn_tensor *kernel, struct csinn_conv2d_params *params); +void shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp16_w_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_rvv_conv_im2col_gemm_packn_dequantize_per_channel_i8_to_f16( + struct csinn_tensor *kernel, struct csinn_conv2d_params *params, __fp16 *kernel_fp16); void shl_rvv_conv_im2col_gemm_reorder_kernel_packn_int8(struct csinn_tensor *kernel, struct csinn_conv2d_params *params); @@ -159,6 +349,10 @@ void shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp32(struct csinn_tensor * struct csinn_conv2d_params *params); void shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp16(struct csinn_tensor *kernel, struct csinn_conv2d_params *params); +void shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp16_w_int8( + struct csinn_tensor *kernel, struct csinn_conv2d_params *params); +void shl_rvv_conv_im2col_gemm_pack1ton_dequantize_per_channel_i8_to_f16( + struct csinn_tensor *kernel, struct csinn_conv2d_params *params, __fp16 *kernel_fp16); void 
shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_int8(struct csinn_tensor *kernel, struct csinn_conv2d_params *params); @@ -176,6 +370,10 @@ void shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp32(struct csinn_tensor * struct csinn_conv2d_params *params); void shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp16(struct csinn_tensor *kernel, struct csinn_conv2d_params *params); +void shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp16_w_int8( + struct csinn_tensor *kernel, struct csinn_conv2d_params *params); +void shl_rvv_conv_im2col_gemm_packnto1_dequantize_per_channel_i8_to_f16( + struct csinn_tensor *kernel, struct csinn_conv2d_params *params, __fp16 *kernel_fp16); void shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_int8(struct csinn_tensor *kernel, struct csinn_conv2d_params *params); @@ -194,6 +392,8 @@ void shl_rvv_conv1x1s1_gemm_reorder_kernel_fp32(struct csinn_tensor *kernel, struct csinn_conv2d_params *params); void shl_rvv_conv1x1s1_gemm_reorder_kernel_fp16(struct csinn_tensor *kernel, struct csinn_conv2d_params *params); +void shl_rvv_conv1x1s1_gemm_reorder_kernel_fp16_w_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); void shl_rvv_conv1x1s1_gemm_reorder_kernel_int8(struct csinn_tensor *kernel, struct csinn_conv2d_params *params); @@ -211,6 +411,8 @@ void shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_fp32(struct csinn_tensor *kerne struct csinn_conv2d_params *params); void shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_fp16(struct csinn_tensor *kernel, struct csinn_conv2d_params *params); +void shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_fp16_w_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); void shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_int8(struct csinn_tensor *kernel, struct csinn_conv2d_params *params); @@ -228,6 +430,8 @@ void shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_fp32(struct csinn_tensor *ke struct csinn_conv2d_params *params); void 
shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_fp16(struct csinn_tensor *kernel, struct csinn_conv2d_params *params); +void shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_fp16_w_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); void shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_int8(struct csinn_tensor *kernel, struct csinn_conv2d_params *params); @@ -245,6 +449,8 @@ void shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_fp32(struct csinn_tensor *ke struct csinn_conv2d_params *params); void shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_fp16(struct csinn_tensor *kernel, struct csinn_conv2d_params *params); +void shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_fp16_w_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); void shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_int8(struct csinn_tensor *kernel, struct csinn_conv2d_params *params); @@ -325,6 +531,8 @@ void shl_rvv_dwconv_reorder_kernel_packn_fp32(struct csinn_tensor *kernel, struct csinn_conv2d_params *params); void shl_rvv_dwconv_reorder_kernel_packn_fp16(struct csinn_tensor *kernel, struct csinn_conv2d_params *params); +void shl_rvv_dwconv_reorder_kernel_packn_fp16_w_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); void shl_rvv_dwconv_reorder_kernel_packn_int8(struct csinn_tensor *kernel, struct csinn_conv2d_params *params); @@ -367,6 +575,28 @@ int shl_rvv_dwconv_nhwc_int8(struct csinn_tensor *input, struct csinn_tensor *ou struct csinn_tensor *kernel, struct csinn_tensor *bias, struct csinn_conv2d_params *params); +/************************************ deconvolution *********************************/ +/************************************ gemm + col2im *********************************/ +void shl_rvv_deconv2d_gemm_col2im_reorder_kernel_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); + +int shl_rvv_deconv2d_gemm_col2im_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct 
csinn_tensor *bias, + struct csinn_conv2d_params *params); + +void shl_rvv_deconv2d_gemm_col2im_reorder_kernel_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); + +void shl_rvv_deconv2d_gemm_col2im_reorder_kernel_fp16_w_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); + +void shl_rvv_deconv2d_gemm_col2im_dequantize_per_channel_i8_to_f16( + struct csinn_tensor *kernel, struct csinn_conv2d_params *params, __fp16 *kernel_fp16); + +int shl_rvv_deconv2d_gemm_col2im_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + /*************************************** gemm *************************************/ void shl_rvv_reorder_kernel_n8_fp32(float *a, float *sa, int m, int k, int ldx); void shl_rvv_reorder_input_z8_fp32(float *b, float *sb, int k, int n, int ldx); @@ -421,27 +651,27 @@ void shl_rvv_gemm_4xn_int8(int8_t *dst, const int8_t *sa, const int8_t *sb, cons void shl_rvv_reorder_kernel_packn_fp32(float *a, float *sa, int m, int k, int ldx); void shl_rvv_reorder_input_z8_packn_fp32(float *b, float *sb, int k, int n, int ldx); void shl_rvv_ncxhwx_gemm_8xpack2n_fp32(float *dst, const float *sa, const float *sb, float *bias, - int m, int k, int n, int ldc); + int m, int k, int n, bool fuse_relu); void shl_rvv_reorder_input_z12_packn_fp32(float *b, float *sb, int k, int n, int ldx); void shl_rvv_ncxhwx_gemm_12xpack2n_fp32(float *dst, const float *sa, const float *sb, float *bias, - int m, int k, int n, int ldc); + int m, int k, int n, bool fuse_relu); void shl_rvv_reorder_kernel_packn_fp16(__fp16 *a, __fp16 *sa, int m, int k, int ldx); void shl_rvv_reorder_input_z8_packn_fp16(__fp16 *b, __fp16 *sb, int k, int n, int ldx); void shl_rvv_ncxhwx_gemm_8xpack2n_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, - __fp16 *bias, int m, int k, int n, int ldc); + __fp16 *bias, int m, int k, int n, bool fuse_relu); void 
shl_rvv_reorder_input_z12_packn_fp16(__fp16 *b, __fp16 *sb, int k, int n, int ldx); void shl_rvv_ncxhwx_gemm_12xpack2n_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, - __fp16 *bias, int m, int k, int n, int ldc); + __fp16 *bias, int m, int k, int n, bool fuse_relu); void shl_rvv_reorder_input_z8_packn_int8_dot(int8_t *b, int8_t *sb, int k, int n, int ldx); void shl_rvv_ncxhwx_gemm_8xpackn_int8_dot(int8_t *dst, const int8_t *sa, const int8_t *sb, - int32_t *bias, int m, int k, int n, int ldc, - int32_t out_zp, int32_t *mult, int32_t *shift); + int32_t *bias, int m, int k, int n, int32_t out_zp, + int32_t *mult, int32_t *shift); void shl_rvv_reorder_input_z12_packn_int8_dot(int8_t *b, int8_t *sb, int k, int n, int ldx); void shl_rvv_ncxhwx_gemm_12xpackn_int8_dot(int8_t *dst, const int8_t *sa, const int8_t *sb, - int32_t *bias, int m, int k, int n, int ldc, - int32_t out_zp, int32_t *mult, int32_t *shift); + int32_t *bias, int m, int k, int n, int32_t out_zp, + int32_t *mult, int32_t *shift); void shl_rvv_reorder_input_z8_packn_int4(int8_t *b, int8_t *sb, int k, int n, int ldx); void shl_rvv_ncxhwx_gemm_8xpackn_int4(int8_t *dst, const int8_t *sa, const int8_t *sb, @@ -464,7 +694,7 @@ void shl_rvv_reorder_input_z12_pack1ton_int8_dot(int8_t *b, int8_t *sb, int inc, void shl_rvv_reorder_input_z4_packn_int8(int8_t *b, int8_t *sb, int k, int n, int ldx); void shl_rvv_ncxhwx_gemm_4xpack2n_int8(int8_t *dst, const int8_t *sa, const int8_t *sb, - int32_t *bias, int m, int k, int n, int ldc, int32_t out_zp, + int32_t *bias, int m, int k, int n, int32_t out_zp, int32_t *mult, int32_t *shift); /************************************ gemm block **********************************/ @@ -671,6 +901,8 @@ int shl_rvv_relu6_fp32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_relu_params *params); int shl_rvv_relu6_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_relu_params *params); +int shl_rvv_relu6_int8(struct csinn_tensor *input, 
struct csinn_tensor *output, + struct csinn_relu_params *params); int shl_rvv_leaky_relu_fp32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_relu_params *params); @@ -683,11 +915,15 @@ int shl_rvv_sigmoid_fp32(struct csinn_tensor *input, struct csinn_tensor *output struct csinn_sigmoid_params *params); int shl_rvv_sigmoid_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_sigmoid_params *params); +int shl_rvv_sigmoid_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params); int shl_rvv_softmax_fp32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_softmax_params *params); int shl_rvv_softmax_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_softmax_params *params); +int shl_rvv_softmax_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params); int shl_rvv_prelu_fp32(struct csinn_tensor *input, struct csinn_tensor *alpha, struct csinn_tensor *output, struct csinn_prelu_params *params); @@ -725,6 +961,8 @@ int shl_rvv_transpose_fp16(struct csinn_tensor *input, struct csinn_tensor *outp int shl_rvv_transpose_int8(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_transpose_params *params); +int shl_rvv_gather_fp32(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_params *params); int shl_rvv_gather_fp16(struct csinn_tensor *input, struct csinn_tensor *indices, struct csinn_tensor *output, struct csinn_gather_params *params); int shl_rvv_gather_int8(struct csinn_tensor *input, struct csinn_tensor *indices, @@ -741,6 +979,13 @@ int shl_rvv_add_fp16(struct csinn_tensor *input0, struct csinn_tensor *input1, int shl_rvv_add_int8(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, struct csinn_diso_params *params); +int shl_rvv_sub_fp32(struct csinn_tensor *input0, struct csinn_tensor 
*input1, + struct csinn_tensor *output, struct csinn_diso_params *params); +int shl_rvv_sub_fp16(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); +int shl_rvv_sub_int8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + int shl_rvv_mul_fp32(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, struct csinn_diso_params *params); int shl_rvv_mul_fp16(struct csinn_tensor *input0, struct csinn_tensor *input1, @@ -748,9 +993,23 @@ int shl_rvv_mul_fp16(struct csinn_tensor *input0, struct csinn_tensor *input1, int shl_rvv_mul_int8(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, struct csinn_diso_params *params); +int shl_rvv_div_fp32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); +int shl_rvv_div_fp16(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); +int shl_rvv_div_int8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + int shl_rvv_reduce_sum_int8(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_reduce_params *params); +int shl_rvv_erf_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); +int shl_rvv_erf_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); +int shl_rvv_erf_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + /******************************** normalization *****************************/ int shl_rvv_layer_norm_fp32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *gamma, struct csinn_tensor *beta, @@ -769,6 +1028,7 @@ void 
shl_rvv_matmul_reorder_weight_fp16(struct csinn_tensor *mat1, const int K_B const int N_BLK); void shl_rvv_matmul_reorder_weight_fp16_w_int8(struct csinn_tensor *mat1, const int K_BLK, const int N_BLK); +void shl_rvv_matmul_reorder_weight_int8(struct csinn_tensor *mat1); int shl_rvv_matmul_block_fp32(struct csinn_tensor *mat0, struct csinn_tensor *mat1, struct csinn_tensor *output, struct csinn_matmul_params *params, @@ -780,6 +1040,25 @@ int shl_rvv_matmul_block_fp16_w_int8(struct csinn_tensor *mat0, struct csinn_ten struct csinn_tensor *output, struct csinn_matmul_params *params, const int M_BLK, const int K_BLK, const int N_BLK); +int shl_rvv_matmul_common_int8(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, struct csinn_matmul_params *params, + void (*reorder_mat0)(int8_t *, int8_t *, int, int, int), + void (*reorder_mat1)(int8_t *, int8_t *, int, int, int), + void (*matmul)(int8_t *, const int8_t *, const int8_t *, int, int, + int, int, int32_t, int32_t, int32_t, int32_t, + int32_t)); + +void shl_rvv_matmul_reorder_mat0_n4_int8(int8_t *src, int8_t *dst, int m, int k, int lda); +void shl_rvv_matmul_reorder_mat1_zpackn_int8(int8_t *src, int8_t *dst, int k, int n, int ldb); +void shl_rvv_matmul_4xpackn_int8(int8_t *dst, const int8_t *sa, const int8_t *sb, int m, int k, + int n, int ldc, int32_t z1, int32_t z2, int32_t z3, int32_t mult, + int32_t shift); + +void shl_rvv_matmul_reorder_mat0_n8z4_int8_dot(int8_t *src, int8_t *dst, int m, int k, int lda); +void shl_rvv_matmul_reorder_mat1_zmf2n4_int8_dot(int8_t *src, int8_t *dst, int k, int n, int ldb); +void shl_rvv_matmul_8xmf2_int8_dot(int8_t *dst, const int8_t *sa, const int8_t *sb, int m, int k, + int n, int ldc, int32_t z1, int32_t z2, int32_t z3, int32_t mult, + int32_t shift); int shl_rvv_matmul_fp32(struct csinn_tensor *mat0, struct csinn_tensor *mat1, struct csinn_tensor *output, struct csinn_matmul_params *params); @@ -832,6 +1111,7 @@ void 
shl_rvv_reorder_input_packnto1_int8(const int8_t *src, int8_t *dst, int inc void shl_rvv_saturated_int8(int32_t *src, int8_t *dst, int32_t out_zp, int size); +void shl_rvv_requantize_fp16(__fp16 *src, __fp16 scale, int size); void shl_rvv_sidcso_op_requantize_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel); void shl_rvv_siso_op_requantize_fp16(struct csinn_tensor *input, struct csinn_tensor *output); @@ -841,6 +1121,7 @@ void shl_rvv_diso_op_requantize_fp16(struct csinn_tensor *input0, struct csinn_t void shl_rvv_requantize(int32_t *src, int32_t multiplier, int32_t shift, int channel_size); void shl_rvv_dequantize_i8_to_f16(int8_t *src, __fp16 *dst, int size, int32_t zp, float scale); +vfloat16m2_t shl_rvv_vdeq_vv_f16m2(vint8m1_t _i8, vint8m1_t _z, vfloat16m2_t _s, int vl); void shl_rvv_reorder_kernel_n8_fp16_w_int8(int8_t *a, int8_t *sa, int m, int k, int ldx); @@ -852,6 +1133,19 @@ void shl_rvv_int8_to_int4(int8_t *src, int8_t *dst, int size); void shl_rvv_int4_trans_int8(int8_t *src, int8_t *dst, int size); void shl_rvv_saturated_int4(int32_t *src, int8_t *dst, int32_t out_zp, int size); +int shl_rvv_tensor_data_convert(struct csinn_tensor *src, struct csinn_tensor *dst); +void shl_rvv_u8_to_i16(const uint8_t *input, int16_t *output, int32_t z1, float *s1, int32_t z2, + float *s2, uint32_t length); +void shl_rvv_i16_to_u8(const int16_t *input, uint8_t *output, int32_t z1, float *s1, int32_t z2, + float *s2, uint32_t length); +void shl_rvv_u8_to_f32(const uint8_t *input, float *output, int32_t offset, float *scale, + uint32_t length); +void shl_rvv_f32_to_u8(const float *input, uint8_t *output, int32_t offset, float *scale, + uint32_t length); +void shl_rvv_i8_to_f32(const int8_t *input, float *output, int32_t offset, float *scale, + uint32_t length); +void shl_rvv_f32_to_i8(const float *input, int8_t *output, int32_t offset, float *scale, + uint32_t length); void shl_rvv_i16_to_f32(const int16_t *input, float *output, 
int32_t offset, float *scale, uint32_t length); void shl_rvv_f32_to_i16(const float *input, int16_t *output, int32_t offset, float *scale, @@ -865,6 +1159,10 @@ void shl_rvv_f32_to_i64(const float *input, int64_t *output, uint32_t length); void shl_rvv_f16_to_f32(const __fp16 *input, float *output, float *scale, uint32_t length); void shl_rvv_f32_to_f16(const float *input, __fp16 *output, float *scale, uint32_t length); +struct csinn_tensor *shl_rvv_tensor_transform_f32(struct csinn_tensor *input); +int shl_rvv_siso_callback_base(struct csinn_tensor *input, struct csinn_tensor *output, + void *params, void *cb); + int shl_rvv_data_convert_int8_to_int4(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_siso_params *params); int shl_rvv_data_convert_int4_to_int8(struct csinn_tensor *input, struct csinn_tensor *output, @@ -924,6 +1222,13 @@ int shl_rvv_transpose_get_in_index(int32_t *dim, int32_t *idx, int32_t dim_count int shl_rvv_transpose_get_out_index(int32_t *dim, int32_t *idx, int32_t *permute, int32_t dim_count); +int shl_rvv_binary_op_broadcast_fp32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, void *binary_op_callback[]); +int shl_rvv_binary_op_broadcast_fp16(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, void *binary_op_callback[]); +int shl_rvv_binary_op_broadcast_int8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, void *binary_op_callback[]); + #ifdef SHL_USE_DOT_INT4 int shl_rvv_conv2d_init_int4(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, struct csinn_tensor *bias, diff --git a/include/shl_tvmgen.h b/include/backend/tvmgen/shl_tvmgen.h similarity index 98% rename from include/shl_tvmgen.h rename to include/backend/tvmgen/shl_tvmgen.h index b9f99b82..20e51de5 100644 --- a/include/shl_tvmgen.h +++ b/include/backend/tvmgen/shl_tvmgen.h @@ -24,8 +24,8 @@ #include #include 
-#include "csi_nn.h" #include "shl_node.h" +#include "shl_utils.h" struct shl_tvmgen_name_func { char *name; diff --git a/include/csi_nn.h b/include/csinn/csi_nn.h similarity index 99% rename from include/csi_nn.h rename to include/csinn/csi_nn.h index 7fa86cc7..2b40a8b7 100644 --- a/include/csi_nn.h +++ b/include/csinn/csi_nn.h @@ -30,8 +30,6 @@ #include "csinn_data_structure.h" #include "csinn_runtime.h" -#include "shl_debug.h" -#include "shl_memory.h" #ifdef __cplusplus extern "C" { diff --git a/include/csinn_data_structure.h b/include/csinn/csinn_data_structure.h similarity index 99% rename from include/csinn_data_structure.h rename to include/csinn/csinn_data_structure.h index 24982ab9..8caa539f 100644 --- a/include/csinn_data_structure.h +++ b/include/csinn/csinn_data_structure.h @@ -102,6 +102,7 @@ enum csinn_api_enum { CSINN_RVV, /**< RISC-V V extension general platform */ CSINN_RVM, /**< RISC-V Matrix extension general platform */ CSINN_E907, /**< E907 CPU platform */ + CSINN_C920V2, /**< C920V2 CPU platform */ CSINN_API_SIZE, }; @@ -419,6 +420,9 @@ enum csinn_layout_enum { // for 6D shape CSINN_LAYOUT_NLCDHW, /**< NCHW input and output, 6 dimensions */ + + // for deconv2d weight shape + CSINN_LAYOUT_IOHW, /**< NCHW input and output, 4 dimension */ }; /** CSI-NN return type */ @@ -463,6 +467,13 @@ enum csinn_debug_enum { CSINN_DEBUG_LEVEL_FATAL, /**< program crash */ }; +/** CSI-NN broadcast callback type */ +enum csinn_broadcast_type_enum { + CSINN_BROADCAST_VV = 0, /**< Vector-vector */ + CSINN_BROADCAST_VS, /**< Vector-scalar */ + CSINN_BROADCAST_SV, /**< Scalar-vector */ +}; + /** CSI-NN quantization information */ struct csinn_quant_info { int32_t zero_point; /**< Zero point value */ diff --git a/include/csinn_runtime.h b/include/csinn/csinn_runtime.h similarity index 100% rename from include/csinn_runtime.h rename to include/csinn/csinn_runtime.h diff --git a/include/shl_gref.h b/include/graph/shl_gref.h similarity index 99% rename from 
include/shl_gref.h rename to include/graph/shl_gref.h index 6f06d684..17478b11 100644 --- a/include/shl_gref.h +++ b/include/graph/shl_gref.h @@ -1024,6 +1024,7 @@ int shl_gref_segment_infer_shape(struct csinn_tensor *input0, struct csinn_tenso struct csinn_tensor *output, struct csinn_segment_params *params); int shl_gref_stride_infer_shape(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_reduce_params *params); +void shl_tensor_try_nc1xc0_to_ndarray_shape(struct csinn_tensor *t); int shl_gref_call_layer_func(void *fn, struct shl_node *node); struct csinn_callback *shl_gref_best_callback(struct shl_node *node); diff --git a/include/shl_node.h b/include/graph/shl_node.h similarity index 100% rename from include/shl_node.h rename to include/graph/shl_node.h diff --git a/include/shl_debug.h b/include/shl_debug.h index ecf4206c..2c08bd78 100644 --- a/include/shl_debug.h +++ b/include/shl_debug.h @@ -17,8 +17,8 @@ */ #ifndef INCLUDE_SHL_DEBUG_H_ #define INCLUDE_SHL_DEBUG_H_ -#include "csi_nn.h" -#include "shl_node.h" +#include "csinn/csi_nn.h" +#include "graph/shl_node.h" enum shl_debug_enum { SHL_DEBUG_LEVEL_DEBUG = -2, diff --git a/include/shl_public/shl_c906.h b/include/shl_public/shl_c906.h new file mode 100644 index 00000000..39443769 --- /dev/null +++ b/include/shl_public/shl_c906.h @@ -0,0 +1,66 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef INCLUDE_SHL_C906_H_ +#define INCLUDE_SHL_C906_H_ + +#include "csi_nn.h" + +void shl_c906_reset_fcsr(); +int shl_c906_get_fcsr(); + +/* hardware performance */ +struct shl_c906_hpm { + size_t inst; + size_t cycle; + size_t l1_icache_access; + size_t l1_icache_miss; + size_t store_inst; + size_t l1_dcache_raccess; + size_t l1_dcache_rmiss; + size_t l1_dcache_waccess; + size_t l1_dcache_wmiss; +}; + +uint64_t shl_c906_get_inst(); +uint64_t shl_c906_get_cycle(); +uint64_t shl_c906_get_l1_icache_access(); +uint64_t shl_c906_get_l1_icache_miss(); +uint64_t shl_c906_get_cb_miss(); +uint64_t shl_c906_get_cb_inst(); +uint64_t shl_c906_get_store_inst(); +uint64_t shl_c906_get_l1_dcache_raccess(); +uint64_t shl_c906_get_l1_dcache_rmiss(); +uint64_t shl_c906_get_l1_dcache_waccess(); +uint64_t shl_c906_get_l1_dcache_wmiss(); + +struct shl_c906_hpm shl_c906_get_hw_perf(); + +int shl_c906_reduce_sum_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +void shl_c906_u8_to_f32(const uint8_t *input, float *output, int32_t offset, float *scale, + uint32_t length); +void shl_c906_i8_to_f32(const int8_t *input, float *output, int32_t offset, float *scale, + uint32_t length); +void shl_c906_f32_to_u8(const float *input, uint8_t *output, int32_t offset, float *scale, + uint32_t length); +void shl_c906_f32_to_i8(const float *input, int8_t *output, int32_t offset, float *scale, + uint32_t length); + +#endif // INCLUDE_SHL_C906_H_ diff --git a/include/shl_public/shl_c920.h b/include/shl_public/shl_c920.h new file mode 100644 index 00000000..260e66a8 --- /dev/null +++ b/include/shl_public/shl_c920.h @@ -0,0 +1,50 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef INCLUDE_SHL_C920_H_ +#define INCLUDE_SHL_C920_H_ + +#include "csi_nn.h" +#include "shl_utils.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void shl_c920_u8_to_f32(const uint8_t *input, float *output, int32_t offset, float *scale, + uint32_t length); +void shl_c920_i8_to_f32(const int8_t *input, float *output, int32_t offset, float *scale, + uint32_t length); +void shl_c920_f32_to_u8(const float *input, uint8_t *output, int32_t offset, float *scale, + uint32_t length); +void shl_c920_f32_to_i8(const float *input, int8_t *output, int32_t offset, float *scale, + uint32_t length); + +void *shl_c920_f32_to_input_dtype(uint32_t index, float *data, struct csinn_session *sess); +float *shl_c920_output_to_f32_dtype(uint32_t index, void *data, struct csinn_session *sess); + +int shl_c920_detect_yolov5_postprocess(struct csinn_tensor **input_tensors, + struct shl_yolov5_box *out, + struct shl_yolov5_params *params); +int shl_c920_yolox_preprocess(struct csinn_tensor *input, struct csinn_tensor *output); + +#ifdef __cplusplus +} +#endif + +#endif // INCLUDE_SHL_C920_H_ diff --git a/include/shl_public/shl_pnna.h b/include/shl_public/shl_pnna.h new file mode 100644 index 00000000..fa39d1e8 --- /dev/null +++ b/include/shl_public/shl_pnna.h @@ -0,0 +1,27 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef INCLUDE_SHL_PNNA_H_ +#define INCLUDE_SHL_PNNA_H_ +#include "../csinn/csi_nn.h" +#include "../shl_utils.h" + +int shl_pnna_set_input_strides(struct csinn_session *sess, int input_byte_size, int input_fix_h, + int input_fix_w); + +#endif // INCLUDE_SHL_PNNA_H_ diff --git a/include/shl_public/shl_ref.h b/include/shl_public/shl_ref.h new file mode 100644 index 00000000..fc424539 --- /dev/null +++ b/include/shl_public/shl_ref.h @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef INCLUDE_SHL_REF_H_ +#define INCLUDE_SHL_REF_H_ + +#include "../csinn/csi_nn.h" +#include "../shl_utils.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void shl_ref_nn_init(struct csinn_tensor *input, struct csinn_tensor *output); +void shl_ref_nn_deinit(struct csinn_tensor *input, struct csinn_tensor *output); +struct csinn_tensor *shl_ref_alloc_float_tensor(struct csinn_tensor *src); +void shl_ref_free_float_tensor(struct csinn_tensor *src); +struct csinn_tensor *shl_ref_convert_float_tensor(struct csinn_tensor *src); +void shl_ref_conv_free_float_tensor(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias); +struct csinn_tensor *shl_ref_tensor_transform_f32(struct csinn_tensor *input); +struct csinn_tensor *shl_ref_tensor_transform_int64(struct csinn_tensor *input); +int shl_ref_tensor_transform_free_f32(struct csinn_tensor *input); +int shl_ref_tensor_transform_free_int64(struct csinn_tensor *input); +uint8_t *shl_ref_f32_to_input_dtype(uint32_t index, float *data, struct csinn_session *sess); + +#ifdef __cplusplus +} +#endif + +#endif // INCLUDE_SHL_REF_H_ diff --git a/include/shl_utils.h b/include/shl_utils.h index b3e3cdc6..f78b27e4 100644 --- a/include/shl_utils.h +++ b/include/shl_utils.h @@ -29,7 +29,9 @@ #if (!defined SHL_BUILD_RTOS) #include #endif -#include "csinn_data_structure.h" +#include "csinn/csinn_data_structure.h" +#include "shl_debug.h" +#include "shl_memory.h" #ifdef SHL_MCONF_CONFIG #include "mconf_config.h" #endif @@ -105,7 +107,7 @@ struct shl_binary_model_section_info { struct shl_bm_sections sections[127]; }; -char *shl_bm_header_str(); +void shl_bm_header_str(char *buffer); void shl_dump_bm_header(FILE *f); void shl_dump_bm_section_info(FILE *f, struct shl_binary_model_section_info *info); @@ -113,9 +115,12 @@ int shl_dump_bm_graph_info_section(FILE *f, struct csinn_session *sess); void shl_bm_session_load(struct csinn_session *dest, struct csinn_session 
*src); int shl_dump_bm_graph_struct_section(FILE *f, struct shl_ref_graph *graph); void shl_bm_graph_struct_load(struct shl_ref_graph *dest, struct shl_ref_graph *src); - bool shl_is_first_layer_input(struct csinn_tensor *input, struct csinn_session *sess); +/** Export model */ +void shl_export_model_print(struct csinn_session *sess); +int shl_export_model_json(struct csinn_session *sess, char *path); + /** YOLOv5 detect box */ struct shl_yolov5_box { int label; /**< Object label */ diff --git a/module/json/json.hpp b/module/json/json.hpp new file mode 100644 index 00000000..a70aaf8c --- /dev/null +++ b/module/json/json.hpp @@ -0,0 +1,25447 @@ +/* + __ _____ _____ _____ + __| | __| | | | JSON for Modern C++ +| | |__ | | | | | | version 3.9.1 +|_____|_____|_____|_|___| https://github.com/nlohmann/json + +Licensed under the MIT License . +SPDX-License-Identifier: MIT +Copyright (c) 2013-2019 Niels Lohmann . + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +#ifndef INCLUDE_NLOHMANN_JSON_HPP_ +#define INCLUDE_NLOHMANN_JSON_HPP_ + +#define NLOHMANN_JSON_VERSION_MAJOR 3 +#define NLOHMANN_JSON_VERSION_MINOR 9 +#define NLOHMANN_JSON_VERSION_PATCH 1 + +#include // all_of, find, for_each +#include // nullptr_t, ptrdiff_t, size_t +#include // hash, less +#include // initializer_list +#include // istream, ostream +#include // random_access_iterator_tag +#include // unique_ptr +#include // accumulate +#include // string, stoi, to_string +#include // declval, forward, move, pair, swap +#include // vector + +// #include + + +#include + +// #include + + +#include // transform +#include // array +#include // forward_list +#include // inserter, front_inserter, end +#include // map +#include // string +#include // tuple, make_tuple +#include // is_arithmetic, is_same, is_enum, underlying_type, is_convertible +#include // unordered_map +#include // pair, declval +#include // valarray + +// #include + + +#include // exception +#include // runtime_error +#include // to_string + +// #include + + +#include // size_t + +namespace nlohmann +{ +namespace detail +{ +/// struct to capture the start position of the current token +struct position_t +{ + /// the total number of characters read + std::size_t chars_read_total = 0; + /// the number of characters read in the current line + std::size_t chars_read_current_line = 0; + /// the number of lines read + std::size_t lines_read = 0; + + /// conversion to size_t to preserve SAX interface + constexpr operator size_t() const + { + return chars_read_total; + } +}; + +} // namespace detail +} // namespace nlohmann + +// #include + + +#include // pair +// #include +/* Hedley - https://nemequ.github.io/hedley + * Created by Evan Nemerson + * + * To the extent possible under law, the author(s) have dedicated all + * copyright and related and neighboring rights to this software to + * the public domain worldwide. This software is distributed without + * any warranty. + * + * For details, see . 
+ * SPDX-License-Identifier: CC0-1.0 + */ + +#if !defined(JSON_HEDLEY_VERSION) || (JSON_HEDLEY_VERSION < 13) +#if defined(JSON_HEDLEY_VERSION) + #undef JSON_HEDLEY_VERSION +#endif +#define JSON_HEDLEY_VERSION 13 + +#if defined(JSON_HEDLEY_STRINGIFY_EX) + #undef JSON_HEDLEY_STRINGIFY_EX +#endif +#define JSON_HEDLEY_STRINGIFY_EX(x) #x + +#if defined(JSON_HEDLEY_STRINGIFY) + #undef JSON_HEDLEY_STRINGIFY +#endif +#define JSON_HEDLEY_STRINGIFY(x) JSON_HEDLEY_STRINGIFY_EX(x) + +#if defined(JSON_HEDLEY_CONCAT_EX) + #undef JSON_HEDLEY_CONCAT_EX +#endif +#define JSON_HEDLEY_CONCAT_EX(a,b) a##b + +#if defined(JSON_HEDLEY_CONCAT) + #undef JSON_HEDLEY_CONCAT +#endif +#define JSON_HEDLEY_CONCAT(a,b) JSON_HEDLEY_CONCAT_EX(a,b) + +#if defined(JSON_HEDLEY_CONCAT3_EX) + #undef JSON_HEDLEY_CONCAT3_EX +#endif +#define JSON_HEDLEY_CONCAT3_EX(a,b,c) a##b##c + +#if defined(JSON_HEDLEY_CONCAT3) + #undef JSON_HEDLEY_CONCAT3 +#endif +#define JSON_HEDLEY_CONCAT3(a,b,c) JSON_HEDLEY_CONCAT3_EX(a,b,c) + +#if defined(JSON_HEDLEY_VERSION_ENCODE) + #undef JSON_HEDLEY_VERSION_ENCODE +#endif +#define JSON_HEDLEY_VERSION_ENCODE(major,minor,revision) (((major) * 1000000) + ((minor) * 1000) + (revision)) + +#if defined(JSON_HEDLEY_VERSION_DECODE_MAJOR) + #undef JSON_HEDLEY_VERSION_DECODE_MAJOR +#endif +#define JSON_HEDLEY_VERSION_DECODE_MAJOR(version) ((version) / 1000000) + +#if defined(JSON_HEDLEY_VERSION_DECODE_MINOR) + #undef JSON_HEDLEY_VERSION_DECODE_MINOR +#endif +#define JSON_HEDLEY_VERSION_DECODE_MINOR(version) (((version) % 1000000) / 1000) + +#if defined(JSON_HEDLEY_VERSION_DECODE_REVISION) + #undef JSON_HEDLEY_VERSION_DECODE_REVISION +#endif +#define JSON_HEDLEY_VERSION_DECODE_REVISION(version) ((version) % 1000) + +#if defined(JSON_HEDLEY_GNUC_VERSION) + #undef JSON_HEDLEY_GNUC_VERSION +#endif +#if defined(__GNUC__) && defined(__GNUC_PATCHLEVEL__) + #define JSON_HEDLEY_GNUC_VERSION JSON_HEDLEY_VERSION_ENCODE(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__) +#elif defined(__GNUC__) + #define 
JSON_HEDLEY_GNUC_VERSION JSON_HEDLEY_VERSION_ENCODE(__GNUC__, __GNUC_MINOR__, 0) +#endif + +#if defined(JSON_HEDLEY_GNUC_VERSION_CHECK) + #undef JSON_HEDLEY_GNUC_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_GNUC_VERSION) + #define JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_GNUC_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_MSVC_VERSION) + #undef JSON_HEDLEY_MSVC_VERSION +#endif +#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 140000000) + #define JSON_HEDLEY_MSVC_VERSION JSON_HEDLEY_VERSION_ENCODE(_MSC_FULL_VER / 10000000, (_MSC_FULL_VER % 10000000) / 100000, (_MSC_FULL_VER % 100000) / 100) +#elif defined(_MSC_FULL_VER) + #define JSON_HEDLEY_MSVC_VERSION JSON_HEDLEY_VERSION_ENCODE(_MSC_FULL_VER / 1000000, (_MSC_FULL_VER % 1000000) / 10000, (_MSC_FULL_VER % 10000) / 10) +#elif defined(_MSC_VER) + #define JSON_HEDLEY_MSVC_VERSION JSON_HEDLEY_VERSION_ENCODE(_MSC_VER / 100, _MSC_VER % 100, 0) +#endif + +#if defined(JSON_HEDLEY_MSVC_VERSION_CHECK) + #undef JSON_HEDLEY_MSVC_VERSION_CHECK +#endif +#if !defined(_MSC_VER) + #define JSON_HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (0) +#elif defined(_MSC_VER) && (_MSC_VER >= 1400) + #define JSON_HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (_MSC_FULL_VER >= ((major * 10000000) + (minor * 100000) + (patch))) +#elif defined(_MSC_VER) && (_MSC_VER >= 1200) + #define JSON_HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (_MSC_FULL_VER >= ((major * 1000000) + (minor * 10000) + (patch))) +#else + #define JSON_HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (_MSC_VER >= ((major * 100) + (minor))) +#endif + +#if defined(JSON_HEDLEY_INTEL_VERSION) + #undef JSON_HEDLEY_INTEL_VERSION +#endif +#if defined(__INTEL_COMPILER) && defined(__INTEL_COMPILER_UPDATE) + #define JSON_HEDLEY_INTEL_VERSION JSON_HEDLEY_VERSION_ENCODE(__INTEL_COMPILER / 100, __INTEL_COMPILER % 100, __INTEL_COMPILER_UPDATE) +#elif 
defined(__INTEL_COMPILER) + #define JSON_HEDLEY_INTEL_VERSION JSON_HEDLEY_VERSION_ENCODE(__INTEL_COMPILER / 100, __INTEL_COMPILER % 100, 0) +#endif + +#if defined(JSON_HEDLEY_INTEL_VERSION_CHECK) + #undef JSON_HEDLEY_INTEL_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_INTEL_VERSION) + #define JSON_HEDLEY_INTEL_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_INTEL_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_INTEL_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_PGI_VERSION) + #undef JSON_HEDLEY_PGI_VERSION +#endif +#if defined(__PGI) && defined(__PGIC__) && defined(__PGIC_MINOR__) && defined(__PGIC_PATCHLEVEL__) + #define JSON_HEDLEY_PGI_VERSION JSON_HEDLEY_VERSION_ENCODE(__PGIC__, __PGIC_MINOR__, __PGIC_PATCHLEVEL__) +#endif + +#if defined(JSON_HEDLEY_PGI_VERSION_CHECK) + #undef JSON_HEDLEY_PGI_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_PGI_VERSION) + #define JSON_HEDLEY_PGI_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_PGI_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_PGI_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_SUNPRO_VERSION) + #undef JSON_HEDLEY_SUNPRO_VERSION +#endif +#if defined(__SUNPRO_C) && (__SUNPRO_C > 0x1000) + #define JSON_HEDLEY_SUNPRO_VERSION JSON_HEDLEY_VERSION_ENCODE((((__SUNPRO_C >> 16) & 0xf) * 10) + ((__SUNPRO_C >> 12) & 0xf), (((__SUNPRO_C >> 8) & 0xf) * 10) + ((__SUNPRO_C >> 4) & 0xf), (__SUNPRO_C & 0xf) * 10) +#elif defined(__SUNPRO_C) + #define JSON_HEDLEY_SUNPRO_VERSION JSON_HEDLEY_VERSION_ENCODE((__SUNPRO_C >> 8) & 0xf, (__SUNPRO_C >> 4) & 0xf, (__SUNPRO_C) & 0xf) +#elif defined(__SUNPRO_CC) && (__SUNPRO_CC > 0x1000) + #define JSON_HEDLEY_SUNPRO_VERSION JSON_HEDLEY_VERSION_ENCODE((((__SUNPRO_CC >> 16) & 0xf) * 10) + ((__SUNPRO_CC >> 12) & 0xf), (((__SUNPRO_CC >> 8) & 0xf) * 10) + ((__SUNPRO_CC >> 4) & 0xf), (__SUNPRO_CC & 0xf) * 10) +#elif defined(__SUNPRO_CC) + #define JSON_HEDLEY_SUNPRO_VERSION 
JSON_HEDLEY_VERSION_ENCODE((__SUNPRO_CC >> 8) & 0xf, (__SUNPRO_CC >> 4) & 0xf, (__SUNPRO_CC) & 0xf) +#endif + +#if defined(JSON_HEDLEY_SUNPRO_VERSION_CHECK) + #undef JSON_HEDLEY_SUNPRO_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_SUNPRO_VERSION) + #define JSON_HEDLEY_SUNPRO_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_SUNPRO_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_SUNPRO_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_EMSCRIPTEN_VERSION) + #undef JSON_HEDLEY_EMSCRIPTEN_VERSION +#endif +#if defined(__EMSCRIPTEN__) + #define JSON_HEDLEY_EMSCRIPTEN_VERSION JSON_HEDLEY_VERSION_ENCODE(__EMSCRIPTEN_major__, __EMSCRIPTEN_minor__, __EMSCRIPTEN_tiny__) +#endif + +#if defined(JSON_HEDLEY_EMSCRIPTEN_VERSION_CHECK) + #undef JSON_HEDLEY_EMSCRIPTEN_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_EMSCRIPTEN_VERSION) + #define JSON_HEDLEY_EMSCRIPTEN_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_EMSCRIPTEN_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_EMSCRIPTEN_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_ARM_VERSION) + #undef JSON_HEDLEY_ARM_VERSION +#endif +#if defined(__CC_ARM) && defined(__ARMCOMPILER_VERSION) + #define JSON_HEDLEY_ARM_VERSION JSON_HEDLEY_VERSION_ENCODE(__ARMCOMPILER_VERSION / 1000000, (__ARMCOMPILER_VERSION % 1000000) / 10000, (__ARMCOMPILER_VERSION % 10000) / 100) +#elif defined(__CC_ARM) && defined(__ARMCC_VERSION) + #define JSON_HEDLEY_ARM_VERSION JSON_HEDLEY_VERSION_ENCODE(__ARMCC_VERSION / 1000000, (__ARMCC_VERSION % 1000000) / 10000, (__ARMCC_VERSION % 10000) / 100) +#endif + +#if defined(JSON_HEDLEY_ARM_VERSION_CHECK) + #undef JSON_HEDLEY_ARM_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_ARM_VERSION) + #define JSON_HEDLEY_ARM_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_ARM_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_ARM_VERSION_CHECK(major,minor,patch) (0) 
+#endif + +#if defined(JSON_HEDLEY_IBM_VERSION) + #undef JSON_HEDLEY_IBM_VERSION +#endif +#if defined(__ibmxl__) + #define JSON_HEDLEY_IBM_VERSION JSON_HEDLEY_VERSION_ENCODE(__ibmxl_version__, __ibmxl_release__, __ibmxl_modification__) +#elif defined(__xlC__) && defined(__xlC_ver__) + #define JSON_HEDLEY_IBM_VERSION JSON_HEDLEY_VERSION_ENCODE(__xlC__ >> 8, __xlC__ & 0xff, (__xlC_ver__ >> 8) & 0xff) +#elif defined(__xlC__) + #define JSON_HEDLEY_IBM_VERSION JSON_HEDLEY_VERSION_ENCODE(__xlC__ >> 8, __xlC__ & 0xff, 0) +#endif + +#if defined(JSON_HEDLEY_IBM_VERSION_CHECK) + #undef JSON_HEDLEY_IBM_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_IBM_VERSION) + #define JSON_HEDLEY_IBM_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_IBM_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_IBM_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_TI_VERSION) + #undef JSON_HEDLEY_TI_VERSION +#endif +#if \ + defined(__TI_COMPILER_VERSION__) && \ + ( \ + defined(__TMS470__) || defined(__TI_ARM__) || \ + defined(__MSP430__) || \ + defined(__TMS320C2000__) \ + ) +#if (__TI_COMPILER_VERSION__ >= 16000000) + #define JSON_HEDLEY_TI_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#endif +#endif + +#if defined(JSON_HEDLEY_TI_VERSION_CHECK) + #undef JSON_HEDLEY_TI_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_TI_VERSION) + #define JSON_HEDLEY_TI_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_TI_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_TI_CL2000_VERSION) + #undef JSON_HEDLEY_TI_CL2000_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) && defined(__TMS320C2000__) + #define JSON_HEDLEY_TI_CL2000_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, 
(__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(JSON_HEDLEY_TI_CL2000_VERSION_CHECK) + #undef JSON_HEDLEY_TI_CL2000_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_TI_CL2000_VERSION) + #define JSON_HEDLEY_TI_CL2000_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_CL2000_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_TI_CL2000_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_TI_CL430_VERSION) + #undef JSON_HEDLEY_TI_CL430_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) && defined(__MSP430__) + #define JSON_HEDLEY_TI_CL430_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(JSON_HEDLEY_TI_CL430_VERSION_CHECK) + #undef JSON_HEDLEY_TI_CL430_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_TI_CL430_VERSION) + #define JSON_HEDLEY_TI_CL430_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_CL430_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_TI_CL430_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_TI_ARMCL_VERSION) + #undef JSON_HEDLEY_TI_ARMCL_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) && (defined(__TMS470__) || defined(__TI_ARM__)) + #define JSON_HEDLEY_TI_ARMCL_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(JSON_HEDLEY_TI_ARMCL_VERSION_CHECK) + #undef JSON_HEDLEY_TI_ARMCL_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_TI_ARMCL_VERSION) + #define JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_ARMCL_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_TI_CL6X_VERSION) + #undef JSON_HEDLEY_TI_CL6X_VERSION +#endif +#if 
defined(__TI_COMPILER_VERSION__) && defined(__TMS320C6X__) + #define JSON_HEDLEY_TI_CL6X_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(JSON_HEDLEY_TI_CL6X_VERSION_CHECK) + #undef JSON_HEDLEY_TI_CL6X_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_TI_CL6X_VERSION) + #define JSON_HEDLEY_TI_CL6X_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_CL6X_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_TI_CL6X_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_TI_CL7X_VERSION) + #undef JSON_HEDLEY_TI_CL7X_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) && defined(__C7000__) + #define JSON_HEDLEY_TI_CL7X_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(JSON_HEDLEY_TI_CL7X_VERSION_CHECK) + #undef JSON_HEDLEY_TI_CL7X_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_TI_CL7X_VERSION) + #define JSON_HEDLEY_TI_CL7X_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_CL7X_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_TI_CL7X_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_TI_CLPRU_VERSION) + #undef JSON_HEDLEY_TI_CLPRU_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) && defined(__PRU__) + #define JSON_HEDLEY_TI_CLPRU_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(JSON_HEDLEY_TI_CLPRU_VERSION_CHECK) + #undef JSON_HEDLEY_TI_CLPRU_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_TI_CLPRU_VERSION) + #define JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_CLPRU_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define 
JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_CRAY_VERSION) + #undef JSON_HEDLEY_CRAY_VERSION +#endif +#if defined(_CRAYC) + #if defined(_RELEASE_PATCHLEVEL) + #define JSON_HEDLEY_CRAY_VERSION JSON_HEDLEY_VERSION_ENCODE(_RELEASE_MAJOR, _RELEASE_MINOR, _RELEASE_PATCHLEVEL) + #else + #define JSON_HEDLEY_CRAY_VERSION JSON_HEDLEY_VERSION_ENCODE(_RELEASE_MAJOR, _RELEASE_MINOR, 0) + #endif +#endif + +#if defined(JSON_HEDLEY_CRAY_VERSION_CHECK) + #undef JSON_HEDLEY_CRAY_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_CRAY_VERSION) + #define JSON_HEDLEY_CRAY_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_CRAY_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_CRAY_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_IAR_VERSION) + #undef JSON_HEDLEY_IAR_VERSION +#endif +#if defined(__IAR_SYSTEMS_ICC__) + #if __VER__ > 1000 + #define JSON_HEDLEY_IAR_VERSION JSON_HEDLEY_VERSION_ENCODE((__VER__ / 1000000), ((__VER__ / 1000) % 1000), (__VER__ % 1000)) + #else + #define JSON_HEDLEY_IAR_VERSION JSON_HEDLEY_VERSION_ENCODE(__VER__ / 100, __VER__ % 100, 0) + #endif +#endif + +#if defined(JSON_HEDLEY_IAR_VERSION_CHECK) + #undef JSON_HEDLEY_IAR_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_IAR_VERSION) + #define JSON_HEDLEY_IAR_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_IAR_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_IAR_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_TINYC_VERSION) + #undef JSON_HEDLEY_TINYC_VERSION +#endif +#if defined(__TINYC__) + #define JSON_HEDLEY_TINYC_VERSION JSON_HEDLEY_VERSION_ENCODE(__TINYC__ / 1000, (__TINYC__ / 100) % 10, __TINYC__ % 100) +#endif + +#if defined(JSON_HEDLEY_TINYC_VERSION_CHECK) + #undef JSON_HEDLEY_TINYC_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_TINYC_VERSION) + #define JSON_HEDLEY_TINYC_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TINYC_VERSION >= 
JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_TINYC_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_DMC_VERSION) + #undef JSON_HEDLEY_DMC_VERSION +#endif +#if defined(__DMC__) + #define JSON_HEDLEY_DMC_VERSION JSON_HEDLEY_VERSION_ENCODE(__DMC__ >> 8, (__DMC__ >> 4) & 0xf, __DMC__ & 0xf) +#endif + +#if defined(JSON_HEDLEY_DMC_VERSION_CHECK) + #undef JSON_HEDLEY_DMC_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_DMC_VERSION) + #define JSON_HEDLEY_DMC_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_DMC_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_DMC_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_COMPCERT_VERSION) + #undef JSON_HEDLEY_COMPCERT_VERSION +#endif +#if defined(__COMPCERT_VERSION__) + #define JSON_HEDLEY_COMPCERT_VERSION JSON_HEDLEY_VERSION_ENCODE(__COMPCERT_VERSION__ / 10000, (__COMPCERT_VERSION__ / 100) % 100, __COMPCERT_VERSION__ % 100) +#endif + +#if defined(JSON_HEDLEY_COMPCERT_VERSION_CHECK) + #undef JSON_HEDLEY_COMPCERT_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_COMPCERT_VERSION) + #define JSON_HEDLEY_COMPCERT_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_COMPCERT_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_COMPCERT_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_PELLES_VERSION) + #undef JSON_HEDLEY_PELLES_VERSION +#endif +#if defined(__POCC__) + #define JSON_HEDLEY_PELLES_VERSION JSON_HEDLEY_VERSION_ENCODE(__POCC__ / 100, __POCC__ % 100, 0) +#endif + +#if defined(JSON_HEDLEY_PELLES_VERSION_CHECK) + #undef JSON_HEDLEY_PELLES_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_PELLES_VERSION) + #define JSON_HEDLEY_PELLES_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_PELLES_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_PELLES_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_GCC_VERSION) + #undef 
JSON_HEDLEY_GCC_VERSION +#endif +#if \ + defined(JSON_HEDLEY_GNUC_VERSION) && \ + !defined(__clang__) && \ + !defined(JSON_HEDLEY_INTEL_VERSION) && \ + !defined(JSON_HEDLEY_PGI_VERSION) && \ + !defined(JSON_HEDLEY_ARM_VERSION) && \ + !defined(JSON_HEDLEY_TI_VERSION) && \ + !defined(JSON_HEDLEY_TI_ARMCL_VERSION) && \ + !defined(JSON_HEDLEY_TI_CL430_VERSION) && \ + !defined(JSON_HEDLEY_TI_CL2000_VERSION) && \ + !defined(JSON_HEDLEY_TI_CL6X_VERSION) && \ + !defined(JSON_HEDLEY_TI_CL7X_VERSION) && \ + !defined(JSON_HEDLEY_TI_CLPRU_VERSION) && \ + !defined(__COMPCERT__) + #define JSON_HEDLEY_GCC_VERSION JSON_HEDLEY_GNUC_VERSION +#endif + +#if defined(JSON_HEDLEY_GCC_VERSION_CHECK) + #undef JSON_HEDLEY_GCC_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_GCC_VERSION) + #define JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_GCC_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_HAS_ATTRIBUTE) + #undef JSON_HEDLEY_HAS_ATTRIBUTE +#endif +#if defined(__has_attribute) + #define JSON_HEDLEY_HAS_ATTRIBUTE(attribute) __has_attribute(attribute) +#else + #define JSON_HEDLEY_HAS_ATTRIBUTE(attribute) (0) +#endif + +#if defined(JSON_HEDLEY_GNUC_HAS_ATTRIBUTE) + #undef JSON_HEDLEY_GNUC_HAS_ATTRIBUTE +#endif +#if defined(__has_attribute) + #define JSON_HEDLEY_GNUC_HAS_ATTRIBUTE(attribute,major,minor,patch) __has_attribute(attribute) +#else + #define JSON_HEDLEY_GNUC_HAS_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_GCC_HAS_ATTRIBUTE) + #undef JSON_HEDLEY_GCC_HAS_ATTRIBUTE +#endif +#if defined(__has_attribute) + #define JSON_HEDLEY_GCC_HAS_ATTRIBUTE(attribute,major,minor,patch) __has_attribute(attribute) +#else + #define JSON_HEDLEY_GCC_HAS_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_HAS_CPP_ATTRIBUTE) + 
#undef JSON_HEDLEY_HAS_CPP_ATTRIBUTE +#endif +#if \ + defined(__has_cpp_attribute) && \ + defined(__cplusplus) && \ + (!defined(JSON_HEDLEY_SUNPRO_VERSION) || JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,15,0)) + #define JSON_HEDLEY_HAS_CPP_ATTRIBUTE(attribute) __has_cpp_attribute(attribute) +#else + #define JSON_HEDLEY_HAS_CPP_ATTRIBUTE(attribute) (0) +#endif + +#if defined(JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS) + #undef JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS +#endif +#if !defined(__cplusplus) || !defined(__has_cpp_attribute) + #define JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) (0) +#elif \ + !defined(JSON_HEDLEY_PGI_VERSION) && \ + !defined(JSON_HEDLEY_IAR_VERSION) && \ + (!defined(JSON_HEDLEY_SUNPRO_VERSION) || JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,15,0)) && \ + (!defined(JSON_HEDLEY_MSVC_VERSION) || JSON_HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + #define JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) JSON_HEDLEY_HAS_CPP_ATTRIBUTE(ns::attribute) +#else + #define JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) (0) +#endif + +#if defined(JSON_HEDLEY_GNUC_HAS_CPP_ATTRIBUTE) + #undef JSON_HEDLEY_GNUC_HAS_CPP_ATTRIBUTE +#endif +#if defined(__has_cpp_attribute) && defined(__cplusplus) + #define JSON_HEDLEY_GNUC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) __has_cpp_attribute(attribute) +#else + #define JSON_HEDLEY_GNUC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_GCC_HAS_CPP_ATTRIBUTE) + #undef JSON_HEDLEY_GCC_HAS_CPP_ATTRIBUTE +#endif +#if defined(__has_cpp_attribute) && defined(__cplusplus) + #define JSON_HEDLEY_GCC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) __has_cpp_attribute(attribute) +#else + #define JSON_HEDLEY_GCC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_HAS_BUILTIN) + #undef JSON_HEDLEY_HAS_BUILTIN +#endif +#if defined(__has_builtin) + #define JSON_HEDLEY_HAS_BUILTIN(builtin) __has_builtin(builtin) 
+#else + #define JSON_HEDLEY_HAS_BUILTIN(builtin) (0) +#endif + +#if defined(JSON_HEDLEY_GNUC_HAS_BUILTIN) + #undef JSON_HEDLEY_GNUC_HAS_BUILTIN +#endif +#if defined(__has_builtin) + #define JSON_HEDLEY_GNUC_HAS_BUILTIN(builtin,major,minor,patch) __has_builtin(builtin) +#else + #define JSON_HEDLEY_GNUC_HAS_BUILTIN(builtin,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_GCC_HAS_BUILTIN) + #undef JSON_HEDLEY_GCC_HAS_BUILTIN +#endif +#if defined(__has_builtin) + #define JSON_HEDLEY_GCC_HAS_BUILTIN(builtin,major,minor,patch) __has_builtin(builtin) +#else + #define JSON_HEDLEY_GCC_HAS_BUILTIN(builtin,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_HAS_FEATURE) + #undef JSON_HEDLEY_HAS_FEATURE +#endif +#if defined(__has_feature) + #define JSON_HEDLEY_HAS_FEATURE(feature) __has_feature(feature) +#else + #define JSON_HEDLEY_HAS_FEATURE(feature) (0) +#endif + +#if defined(JSON_HEDLEY_GNUC_HAS_FEATURE) + #undef JSON_HEDLEY_GNUC_HAS_FEATURE +#endif +#if defined(__has_feature) + #define JSON_HEDLEY_GNUC_HAS_FEATURE(feature,major,minor,patch) __has_feature(feature) +#else + #define JSON_HEDLEY_GNUC_HAS_FEATURE(feature,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_GCC_HAS_FEATURE) + #undef JSON_HEDLEY_GCC_HAS_FEATURE +#endif +#if defined(__has_feature) + #define JSON_HEDLEY_GCC_HAS_FEATURE(feature,major,minor,patch) __has_feature(feature) +#else + #define JSON_HEDLEY_GCC_HAS_FEATURE(feature,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_HAS_EXTENSION) + #undef JSON_HEDLEY_HAS_EXTENSION +#endif +#if defined(__has_extension) + #define JSON_HEDLEY_HAS_EXTENSION(extension) __has_extension(extension) +#else + #define JSON_HEDLEY_HAS_EXTENSION(extension) (0) +#endif + +#if defined(JSON_HEDLEY_GNUC_HAS_EXTENSION) + #undef JSON_HEDLEY_GNUC_HAS_EXTENSION +#endif +#if 
defined(__has_extension) + #define JSON_HEDLEY_GNUC_HAS_EXTENSION(extension,major,minor,patch) __has_extension(extension) +#else + #define JSON_HEDLEY_GNUC_HAS_EXTENSION(extension,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_GCC_HAS_EXTENSION) + #undef JSON_HEDLEY_GCC_HAS_EXTENSION +#endif +#if defined(__has_extension) + #define JSON_HEDLEY_GCC_HAS_EXTENSION(extension,major,minor,patch) __has_extension(extension) +#else + #define JSON_HEDLEY_GCC_HAS_EXTENSION(extension,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE) + #undef JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE +#endif +#if defined(__has_declspec_attribute) + #define JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute) __has_declspec_attribute(attribute) +#else + #define JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute) (0) +#endif + +#if defined(JSON_HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE) + #undef JSON_HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE +#endif +#if defined(__has_declspec_attribute) + #define JSON_HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) __has_declspec_attribute(attribute) +#else + #define JSON_HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE) + #undef JSON_HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE +#endif +#if defined(__has_declspec_attribute) + #define JSON_HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) __has_declspec_attribute(attribute) +#else + #define JSON_HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_HAS_WARNING) + #undef JSON_HEDLEY_HAS_WARNING +#endif +#if defined(__has_warning) + #define JSON_HEDLEY_HAS_WARNING(warning) __has_warning(warning) +#else + #define JSON_HEDLEY_HAS_WARNING(warning) (0) +#endif + +#if 
defined(JSON_HEDLEY_GNUC_HAS_WARNING) + #undef JSON_HEDLEY_GNUC_HAS_WARNING +#endif +#if defined(__has_warning) + #define JSON_HEDLEY_GNUC_HAS_WARNING(warning,major,minor,patch) __has_warning(warning) +#else + #define JSON_HEDLEY_GNUC_HAS_WARNING(warning,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_GCC_HAS_WARNING) + #undef JSON_HEDLEY_GCC_HAS_WARNING +#endif +#if defined(__has_warning) + #define JSON_HEDLEY_GCC_HAS_WARNING(warning,major,minor,patch) __has_warning(warning) +#else + #define JSON_HEDLEY_GCC_HAS_WARNING(warning,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +/* JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_ is for + HEDLEY INTERNAL USE ONLY. API subject to change without notice. */ +#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_) + #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_ +#endif +#if defined(__cplusplus) +# if JSON_HEDLEY_HAS_WARNING("-Wc++98-compat") +# if JSON_HEDLEY_HAS_WARNING("-Wc++17-extensions") +# define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr) \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wc++98-compat\"") \ + _Pragma("clang diagnostic ignored \"-Wc++17-extensions\"") \ + xpr \ + JSON_HEDLEY_DIAGNOSTIC_POP +# else +# define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr) \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wc++98-compat\"") \ + xpr \ + JSON_HEDLEY_DIAGNOSTIC_POP +# endif +# endif +#endif +#if !defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(x) x +#endif + +#if defined(JSON_HEDLEY_CONST_CAST) + #undef JSON_HEDLEY_CONST_CAST +#endif +#if defined(__cplusplus) +# define JSON_HEDLEY_CONST_CAST(T, expr) (const_cast<T>(expr)) +#elif \ + JSON_HEDLEY_HAS_WARNING("-Wcast-qual") || \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,6,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) +# define 
JSON_HEDLEY_CONST_CAST(T, expr) (__extension__ ({ \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL \ + ((T) (expr)); \ + JSON_HEDLEY_DIAGNOSTIC_POP \ + })) +#else +# define JSON_HEDLEY_CONST_CAST(T, expr) ((T) (expr)) +#endif + +#if defined(JSON_HEDLEY_REINTERPRET_CAST) + #undef JSON_HEDLEY_REINTERPRET_CAST +#endif +#if defined(__cplusplus) + #define JSON_HEDLEY_REINTERPRET_CAST(T, expr) (reinterpret_cast<T>(expr)) +#else + #define JSON_HEDLEY_REINTERPRET_CAST(T, expr) ((T) (expr)) +#endif + +#if defined(JSON_HEDLEY_STATIC_CAST) + #undef JSON_HEDLEY_STATIC_CAST +#endif +#if defined(__cplusplus) + #define JSON_HEDLEY_STATIC_CAST(T, expr) (static_cast<T>(expr)) +#else + #define JSON_HEDLEY_STATIC_CAST(T, expr) ((T) (expr)) +#endif + +#if defined(JSON_HEDLEY_CPP_CAST) + #undef JSON_HEDLEY_CPP_CAST +#endif +#if defined(__cplusplus) +# if JSON_HEDLEY_HAS_WARNING("-Wold-style-cast") +# define JSON_HEDLEY_CPP_CAST(T, expr) \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wold-style-cast\"") \ + ((T) (expr)) \ + JSON_HEDLEY_DIAGNOSTIC_POP +# elif JSON_HEDLEY_IAR_VERSION_CHECK(8,3,0) +# define JSON_HEDLEY_CPP_CAST(T, expr) \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("diag_suppress=Pe137") \ + JSON_HEDLEY_DIAGNOSTIC_POP \ +# else +# define JSON_HEDLEY_CPP_CAST(T, expr) ((T) (expr)) +# endif +#else +# define JSON_HEDLEY_CPP_CAST(T, expr) (expr) +#endif + +#if \ + (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) || \ + defined(__clang__) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,0,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) || \ + JSON_HEDLEY_PGI_VERSION_CHECK(18,4,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,7,0) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(2,0,1) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,1,0) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,0,0) || \ + 
JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + JSON_HEDLEY_CRAY_VERSION_CHECK(5,0,0) || \ + JSON_HEDLEY_TINYC_VERSION_CHECK(0,9,17) || \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(8,0,0) || \ + (JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) && defined(__C99_PRAGMA_OPERATOR)) + #define JSON_HEDLEY_PRAGMA(value) _Pragma(#value) +#elif JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0) + #define JSON_HEDLEY_PRAGMA(value) __pragma(value) +#else + #define JSON_HEDLEY_PRAGMA(value) +#endif + +#if defined(JSON_HEDLEY_DIAGNOSTIC_PUSH) + #undef JSON_HEDLEY_DIAGNOSTIC_PUSH +#endif +#if defined(JSON_HEDLEY_DIAGNOSTIC_POP) + #undef JSON_HEDLEY_DIAGNOSTIC_POP +#endif +#if defined(__clang__) + #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("clang diagnostic push") + #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("clang diagnostic pop") +#elif JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("warning(push)") + #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("warning(pop)") +#elif JSON_HEDLEY_GCC_VERSION_CHECK(4,6,0) + #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("GCC diagnostic push") + #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("GCC diagnostic pop") +#elif JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_PUSH __pragma(warning(push)) + #define JSON_HEDLEY_DIAGNOSTIC_POP __pragma(warning(pop)) +#elif JSON_HEDLEY_ARM_VERSION_CHECK(5,6,0) + #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("push") + #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("pop") +#elif \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,4,0) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,1,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) + #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("diag_push") + #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("diag_pop") +#elif JSON_HEDLEY_PELLES_VERSION_CHECK(2,90,0) + #define 
JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("warning(push)") + #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("warning(pop)") +#else + #define JSON_HEDLEY_DIAGNOSTIC_PUSH + #define JSON_HEDLEY_DIAGNOSTIC_POP +#endif + +#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED) + #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED +#endif +#if JSON_HEDLEY_HAS_WARNING("-Wdeprecated-declarations") + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") +#elif JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("warning(disable:1478 1786)") +#elif JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1215,1444") +#elif JSON_HEDLEY_GCC_VERSION_CHECK(4,3,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") +#elif JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED __pragma(warning(disable:4996)) +#elif \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1291,1718") +#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) && !defined(__cplusplus) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED 
_Pragma("error_messages(off,E_DEPRECATED_ATT,E_DEPRECATED_ATT_MESS)") +#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) && defined(__cplusplus) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("error_messages(off,symdeprecated,symdeprecated2)") +#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress=Pe1444,Pe1215") +#elif JSON_HEDLEY_PELLES_VERSION_CHECK(2,90,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("warn(disable:2241)") +#else + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED +#endif + +#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS) + #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS +#endif +#if JSON_HEDLEY_HAS_WARNING("-Wunknown-pragmas") + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("clang diagnostic ignored \"-Wunknown-pragmas\"") +#elif JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("warning(disable:161)") +#elif JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 1675") +#elif JSON_HEDLEY_GCC_VERSION_CHECK(4,3,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("GCC diagnostic ignored \"-Wunknown-pragmas\"") +#elif JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS __pragma(warning(disable:4068)) +#elif \ + JSON_HEDLEY_TI_VERSION_CHECK(16,9,0) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,3,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 163") +#elif JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 163") +#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress=Pe161") +#else + 
#define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS +#endif + +#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES) + #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES +#endif +#if JSON_HEDLEY_HAS_WARNING("-Wunknown-attributes") + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("clang diagnostic ignored \"-Wunknown-attributes\"") +#elif JSON_HEDLEY_GCC_VERSION_CHECK(4,6,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") +#elif JSON_HEDLEY_INTEL_VERSION_CHECK(17,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("warning(disable:1292)") +#elif JSON_HEDLEY_MSVC_VERSION_CHECK(19,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES __pragma(warning(disable:5030)) +#elif JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1097") +#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,14,0) && defined(__cplusplus) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("error_messages(off,attrskipunsup)") +#elif \ + JSON_HEDLEY_TI_VERSION_CHECK(18,1,0) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,3,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1173") +#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress=Pe1097") +#else + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES +#endif + +#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL) + #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL +#endif +#if JSON_HEDLEY_HAS_WARNING("-Wcast-qual") + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL _Pragma("clang diagnostic ignored \"-Wcast-qual\"") +#elif JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL 
_Pragma("warning(disable:2203 2331)") +#elif JSON_HEDLEY_GCC_VERSION_CHECK(3,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL _Pragma("GCC diagnostic ignored \"-Wcast-qual\"") +#else + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL +#endif + +#if defined(JSON_HEDLEY_DEPRECATED) + #undef JSON_HEDLEY_DEPRECATED +#endif +#if defined(JSON_HEDLEY_DEPRECATED_FOR) + #undef JSON_HEDLEY_DEPRECATED_FOR +#endif +#if JSON_HEDLEY_MSVC_VERSION_CHECK(14,0,0) + #define JSON_HEDLEY_DEPRECATED(since) __declspec(deprecated("Since " # since)) + #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) __declspec(deprecated("Since " #since "; use " #replacement)) +#elif defined(__cplusplus) && (__cplusplus >= 201402L) + #define JSON_HEDLEY_DEPRECATED(since) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[deprecated("Since " #since)]]) + #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[deprecated("Since " #since "; use " #replacement)]]) +#elif \ + JSON_HEDLEY_HAS_EXTENSION(attribute_deprecated_with_message) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,5,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(5,6,0) || \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) || \ + JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(18,1,0) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(18,1,0) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,3,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,3,0) + #define JSON_HEDLEY_DEPRECATED(since) __attribute__((__deprecated__("Since " #since))) + #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) __attribute__((__deprecated__("Since " #since "; use " #replacement))) +#elif \ + JSON_HEDLEY_HAS_ATTRIBUTE(deprecated) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,1,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && 
defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) + #define JSON_HEDLEY_DEPRECATED(since) __attribute__((__deprecated__)) + #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) __attribute__((__deprecated__)) +#elif \ + JSON_HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \ + JSON_HEDLEY_PELLES_VERSION_CHECK(6,50,0) + #define JSON_HEDLEY_DEPRECATED(since) __declspec(deprecated) + #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) __declspec(deprecated) +#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) + #define JSON_HEDLEY_DEPRECATED(since) _Pragma("deprecated") + #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) _Pragma("deprecated") +#else + #define JSON_HEDLEY_DEPRECATED(since) + #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) +#endif + +#if defined(JSON_HEDLEY_UNAVAILABLE) + #undef JSON_HEDLEY_UNAVAILABLE +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(warning) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,3,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) + #define JSON_HEDLEY_UNAVAILABLE(available_since) __attribute__((__warning__("Not available until " #available_since))) +#else + #define JSON_HEDLEY_UNAVAILABLE(available_since) +#endif + +#if defined(JSON_HEDLEY_WARN_UNUSED_RESULT) + #undef JSON_HEDLEY_WARN_UNUSED_RESULT +#endif +#if defined(JSON_HEDLEY_WARN_UNUSED_RESULT_MSG) + #undef JSON_HEDLEY_WARN_UNUSED_RESULT_MSG +#endif +#if (JSON_HEDLEY_HAS_CPP_ATTRIBUTE(nodiscard) >= 201907L) + #define 
JSON_HEDLEY_WARN_UNUSED_RESULT JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]]) + #define JSON_HEDLEY_WARN_UNUSED_RESULT_MSG(msg) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard(msg)]]) +#elif JSON_HEDLEY_HAS_CPP_ATTRIBUTE(nodiscard) + #define JSON_HEDLEY_WARN_UNUSED_RESULT JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]]) + #define JSON_HEDLEY_WARN_UNUSED_RESULT_MSG(msg) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]]) +#elif \ + JSON_HEDLEY_HAS_ATTRIBUTE(warn_unused_result) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,4,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + (JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,15,0) && defined(__cplusplus)) || \ + JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) + #define JSON_HEDLEY_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__)) + #define JSON_HEDLEY_WARN_UNUSED_RESULT_MSG(msg) __attribute__((__warn_unused_result__)) +#elif defined(_Check_return_) /* SAL */ + #define JSON_HEDLEY_WARN_UNUSED_RESULT _Check_return_ + #define JSON_HEDLEY_WARN_UNUSED_RESULT_MSG(msg) _Check_return_ +#else + #define JSON_HEDLEY_WARN_UNUSED_RESULT + #define JSON_HEDLEY_WARN_UNUSED_RESULT_MSG(msg) +#endif + +#if defined(JSON_HEDLEY_SENTINEL) + #undef JSON_HEDLEY_SENTINEL +#endif +#if \ + 
JSON_HEDLEY_HAS_ATTRIBUTE(sentinel) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,0,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(5,4,0) + #define JSON_HEDLEY_SENTINEL(position) __attribute__((__sentinel__(position))) +#else + #define JSON_HEDLEY_SENTINEL(position) +#endif + +#if defined(JSON_HEDLEY_NO_RETURN) + #undef JSON_HEDLEY_NO_RETURN +#endif +#if JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) + #define JSON_HEDLEY_NO_RETURN __noreturn +#elif JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) + #define JSON_HEDLEY_NO_RETURN __attribute__((__noreturn__)) +#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L + #define JSON_HEDLEY_NO_RETURN _Noreturn +#elif defined(__cplusplus) && (__cplusplus >= 201103L) + #define JSON_HEDLEY_NO_RETURN JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[noreturn]]) +#elif \ + JSON_HEDLEY_HAS_ATTRIBUTE(noreturn) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,2,0) || \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) + #define JSON_HEDLEY_NO_RETURN __attribute__((__noreturn__)) +#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) + #define JSON_HEDLEY_NO_RETURN _Pragma("does_not_return") +#elif JSON_HEDLEY_MSVC_VERSION_CHECK(13,10,0) + #define 
JSON_HEDLEY_NO_RETURN __declspec(noreturn) +#elif JSON_HEDLEY_TI_CL6X_VERSION_CHECK(6,0,0) && defined(__cplusplus) + #define JSON_HEDLEY_NO_RETURN _Pragma("FUNC_NEVER_RETURNS;") +#elif JSON_HEDLEY_COMPCERT_VERSION_CHECK(3,2,0) + #define JSON_HEDLEY_NO_RETURN __attribute((noreturn)) +#elif JSON_HEDLEY_PELLES_VERSION_CHECK(9,0,0) + #define JSON_HEDLEY_NO_RETURN __declspec(noreturn) +#else + #define JSON_HEDLEY_NO_RETURN +#endif + +#if defined(JSON_HEDLEY_NO_ESCAPE) + #undef JSON_HEDLEY_NO_ESCAPE +#endif +#if JSON_HEDLEY_HAS_ATTRIBUTE(noescape) + #define JSON_HEDLEY_NO_ESCAPE __attribute__((__noescape__)) +#else + #define JSON_HEDLEY_NO_ESCAPE +#endif + +#if defined(JSON_HEDLEY_UNREACHABLE) + #undef JSON_HEDLEY_UNREACHABLE +#endif +#if defined(JSON_HEDLEY_UNREACHABLE_RETURN) + #undef JSON_HEDLEY_UNREACHABLE_RETURN +#endif +#if defined(JSON_HEDLEY_ASSUME) + #undef JSON_HEDLEY_ASSUME +#endif +#if \ + JSON_HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) + #define JSON_HEDLEY_ASSUME(expr) __assume(expr) +#elif JSON_HEDLEY_HAS_BUILTIN(__builtin_assume) + #define JSON_HEDLEY_ASSUME(expr) __builtin_assume(expr) +#elif \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,2,0) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(4,0,0) + #if defined(__cplusplus) + #define JSON_HEDLEY_ASSUME(expr) std::_nassert(expr) + #else + #define JSON_HEDLEY_ASSUME(expr) _nassert(expr) + #endif +#endif +#if \ + (JSON_HEDLEY_HAS_BUILTIN(__builtin_unreachable) && (!defined(JSON_HEDLEY_ARM_VERSION))) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,5,0) || \ + JSON_HEDLEY_PGI_VERSION_CHECK(18,10,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(13,1,5) + #define JSON_HEDLEY_UNREACHABLE() __builtin_unreachable() +#elif defined(JSON_HEDLEY_ASSUME) + #define JSON_HEDLEY_UNREACHABLE() JSON_HEDLEY_ASSUME(0) +#endif +#if !defined(JSON_HEDLEY_ASSUME) + #if defined(JSON_HEDLEY_UNREACHABLE) + #define JSON_HEDLEY_ASSUME(expr) JSON_HEDLEY_STATIC_CAST(void, ((expr) ? 
1 : (JSON_HEDLEY_UNREACHABLE(), 1))) + #else + #define JSON_HEDLEY_ASSUME(expr) JSON_HEDLEY_STATIC_CAST(void, expr) + #endif +#endif +#if defined(JSON_HEDLEY_UNREACHABLE) + #if \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,2,0) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(4,0,0) + #define JSON_HEDLEY_UNREACHABLE_RETURN(value) return (JSON_HEDLEY_STATIC_CAST(void, JSON_HEDLEY_ASSUME(0)), (value)) + #else + #define JSON_HEDLEY_UNREACHABLE_RETURN(value) JSON_HEDLEY_UNREACHABLE() + #endif +#else + #define JSON_HEDLEY_UNREACHABLE_RETURN(value) return (value) +#endif +#if !defined(JSON_HEDLEY_UNREACHABLE) + #define JSON_HEDLEY_UNREACHABLE() JSON_HEDLEY_ASSUME(0) +#endif + +JSON_HEDLEY_DIAGNOSTIC_PUSH +#if JSON_HEDLEY_HAS_WARNING("-Wpedantic") + #pragma clang diagnostic ignored "-Wpedantic" +#endif +#if JSON_HEDLEY_HAS_WARNING("-Wc++98-compat-pedantic") && defined(__cplusplus) + #pragma clang diagnostic ignored "-Wc++98-compat-pedantic" +#endif +#if JSON_HEDLEY_GCC_HAS_WARNING("-Wvariadic-macros",4,0,0) + #if defined(__clang__) + #pragma clang diagnostic ignored "-Wvariadic-macros" + #elif defined(JSON_HEDLEY_GCC_VERSION) + #pragma GCC diagnostic ignored "-Wvariadic-macros" + #endif +#endif +#if defined(JSON_HEDLEY_NON_NULL) + #undef JSON_HEDLEY_NON_NULL +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(nonnull) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,3,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) + #define JSON_HEDLEY_NON_NULL(...) __attribute__((__nonnull__(__VA_ARGS__))) +#else + #define JSON_HEDLEY_NON_NULL(...) 
+#endif +JSON_HEDLEY_DIAGNOSTIC_POP + +#if defined(JSON_HEDLEY_PRINTF_FORMAT) + #undef JSON_HEDLEY_PRINTF_FORMAT +#endif +#if defined(__MINGW32__) && JSON_HEDLEY_GCC_HAS_ATTRIBUTE(format,4,4,0) && !defined(__USE_MINGW_ANSI_STDIO) + #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __attribute__((__format__(ms_printf, string_idx, first_to_check))) +#elif defined(__MINGW32__) && JSON_HEDLEY_GCC_HAS_ATTRIBUTE(format,4,4,0) && defined(__USE_MINGW_ANSI_STDIO) + #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __attribute__((__format__(gnu_printf, string_idx, first_to_check))) +#elif \ + JSON_HEDLEY_HAS_ATTRIBUTE(format) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,1,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(5,6,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) + #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __attribute__((__format__(__printf__, string_idx, first_to_check))) +#elif JSON_HEDLEY_PELLES_VERSION_CHECK(6,0,0) + #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __declspec(vaformat(printf,string_idx,first_to_check)) +#else + #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) +#endif + +#if defined(JSON_HEDLEY_CONSTEXPR) + #undef JSON_HEDLEY_CONSTEXPR +#endif +#if 
defined(__cplusplus) + #if __cplusplus >= 201103L + #define JSON_HEDLEY_CONSTEXPR JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(constexpr) + #endif +#endif +#if !defined(JSON_HEDLEY_CONSTEXPR) + #define JSON_HEDLEY_CONSTEXPR +#endif + +#if defined(JSON_HEDLEY_PREDICT) + #undef JSON_HEDLEY_PREDICT +#endif +#if defined(JSON_HEDLEY_LIKELY) + #undef JSON_HEDLEY_LIKELY +#endif +#if defined(JSON_HEDLEY_UNLIKELY) + #undef JSON_HEDLEY_UNLIKELY +#endif +#if defined(JSON_HEDLEY_UNPREDICTABLE) + #undef JSON_HEDLEY_UNPREDICTABLE +#endif +#if JSON_HEDLEY_HAS_BUILTIN(__builtin_unpredictable) + #define JSON_HEDLEY_UNPREDICTABLE(expr) __builtin_unpredictable((expr)) +#endif +#if \ + JSON_HEDLEY_HAS_BUILTIN(__builtin_expect_with_probability) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(9,0,0) +# define JSON_HEDLEY_PREDICT(expr, value, probability) __builtin_expect_with_probability( (expr), (value), (probability)) +# define JSON_HEDLEY_PREDICT_TRUE(expr, probability) __builtin_expect_with_probability(!!(expr), 1 , (probability)) +# define JSON_HEDLEY_PREDICT_FALSE(expr, probability) __builtin_expect_with_probability(!!(expr), 0 , (probability)) +# define JSON_HEDLEY_LIKELY(expr) __builtin_expect (!!(expr), 1 ) +# define JSON_HEDLEY_UNLIKELY(expr) __builtin_expect (!!(expr), 0 ) +#elif \ + JSON_HEDLEY_HAS_BUILTIN(__builtin_expect) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,0,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + (JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,15,0) && defined(__cplusplus)) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,7,0) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(3,1,0) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,1,0) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + JSON_HEDLEY_TINYC_VERSION_CHECK(0,9,27) || \ + 
JSON_HEDLEY_CRAY_VERSION_CHECK(8,1,0) +# define JSON_HEDLEY_PREDICT(expr, expected, probability) \ + (((probability) >= 0.9) ? __builtin_expect((expr), (expected)) : (JSON_HEDLEY_STATIC_CAST(void, expected), (expr))) +# define JSON_HEDLEY_PREDICT_TRUE(expr, probability) \ + (__extension__ ({ \ + double hedley_probability_ = (probability); \ + ((hedley_probability_ >= 0.9) ? __builtin_expect(!!(expr), 1) : ((hedley_probability_ <= 0.1) ? __builtin_expect(!!(expr), 0) : !!(expr))); \ + })) +# define JSON_HEDLEY_PREDICT_FALSE(expr, probability) \ + (__extension__ ({ \ + double hedley_probability_ = (probability); \ + ((hedley_probability_ >= 0.9) ? __builtin_expect(!!(expr), 0) : ((hedley_probability_ <= 0.1) ? __builtin_expect(!!(expr), 1) : !!(expr))); \ + })) +# define JSON_HEDLEY_LIKELY(expr) __builtin_expect(!!(expr), 1) +# define JSON_HEDLEY_UNLIKELY(expr) __builtin_expect(!!(expr), 0) +#else +# define JSON_HEDLEY_PREDICT(expr, expected, probability) (JSON_HEDLEY_STATIC_CAST(void, expected), (expr)) +# define JSON_HEDLEY_PREDICT_TRUE(expr, probability) (!!(expr)) +# define JSON_HEDLEY_PREDICT_FALSE(expr, probability) (!!(expr)) +# define JSON_HEDLEY_LIKELY(expr) (!!(expr)) +# define JSON_HEDLEY_UNLIKELY(expr) (!!(expr)) +#endif +#if !defined(JSON_HEDLEY_UNPREDICTABLE) + #define JSON_HEDLEY_UNPREDICTABLE(expr) JSON_HEDLEY_PREDICT(expr, 1, 0.5) +#endif + +#if defined(JSON_HEDLEY_MALLOC) + #undef JSON_HEDLEY_MALLOC +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(malloc) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,1,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(12,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || 
\ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) + #define JSON_HEDLEY_MALLOC __attribute__((__malloc__)) +#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) + #define JSON_HEDLEY_MALLOC _Pragma("returns_new_memory") +#elif JSON_HEDLEY_MSVC_VERSION_CHECK(14, 0, 0) + #define JSON_HEDLEY_MALLOC __declspec(restrict) +#else + #define JSON_HEDLEY_MALLOC +#endif + +#if defined(JSON_HEDLEY_PURE) + #undef JSON_HEDLEY_PURE +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(pure) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(2,96,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) +# define JSON_HEDLEY_PURE __attribute__((__pure__)) +#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) +# define JSON_HEDLEY_PURE _Pragma("does_not_write_global_data") +#elif 
defined(__cplusplus) && \ + ( \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(2,0,1) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(4,0,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) \ + ) +# define JSON_HEDLEY_PURE _Pragma("FUNC_IS_PURE;") +#else +# define JSON_HEDLEY_PURE +#endif + +#if defined(JSON_HEDLEY_CONST) + #undef JSON_HEDLEY_CONST +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(const) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(2,5,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) + #define JSON_HEDLEY_CONST __attribute__((__const__)) +#elif \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) + #define JSON_HEDLEY_CONST _Pragma("no_side_effect") +#else + #define JSON_HEDLEY_CONST JSON_HEDLEY_PURE +#endif + +#if defined(JSON_HEDLEY_RESTRICT) + #undef JSON_HEDLEY_RESTRICT +#endif +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) && !defined(__cplusplus) + #define JSON_HEDLEY_RESTRICT restrict +#elif \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,1,0) || \ + JSON_HEDLEY_MSVC_VERSION_CHECK(14,0,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + 
JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,2,4) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,1,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + (JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,14,0) && defined(__cplusplus)) || \ + JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) || \ + defined(__clang__) + #define JSON_HEDLEY_RESTRICT __restrict +#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,3,0) && !defined(__cplusplus) + #define JSON_HEDLEY_RESTRICT _Restrict +#else + #define JSON_HEDLEY_RESTRICT +#endif + +#if defined(JSON_HEDLEY_INLINE) + #undef JSON_HEDLEY_INLINE +#endif +#if \ + (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) || \ + (defined(__cplusplus) && (__cplusplus >= 199711L)) + #define JSON_HEDLEY_INLINE inline +#elif \ + defined(JSON_HEDLEY_GCC_VERSION) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(6,2,0) + #define JSON_HEDLEY_INLINE __inline__ +#elif \ + JSON_HEDLEY_MSVC_VERSION_CHECK(12,0,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,1,0) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(3,1,0) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,2,0) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) + #define JSON_HEDLEY_INLINE __inline +#else + #define JSON_HEDLEY_INLINE +#endif + +#if defined(JSON_HEDLEY_ALWAYS_INLINE) + #undef JSON_HEDLEY_ALWAYS_INLINE +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(always_inline) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,0,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + 
JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) +# define JSON_HEDLEY_ALWAYS_INLINE __attribute__((__always_inline__)) JSON_HEDLEY_INLINE +#elif JSON_HEDLEY_MSVC_VERSION_CHECK(12,0,0) +# define JSON_HEDLEY_ALWAYS_INLINE __forceinline +#elif defined(__cplusplus) && \ + ( \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) \ + ) +# define JSON_HEDLEY_ALWAYS_INLINE _Pragma("FUNC_ALWAYS_INLINE;") +#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) +# define JSON_HEDLEY_ALWAYS_INLINE _Pragma("inline=forced") +#else +# define JSON_HEDLEY_ALWAYS_INLINE JSON_HEDLEY_INLINE +#endif + +#if defined(JSON_HEDLEY_NEVER_INLINE) + #undef JSON_HEDLEY_NEVER_INLINE +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(noinline) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,0,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + 
JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) + #define JSON_HEDLEY_NEVER_INLINE __attribute__((__noinline__)) +#elif JSON_HEDLEY_MSVC_VERSION_CHECK(13,10,0) + #define JSON_HEDLEY_NEVER_INLINE __declspec(noinline) +#elif JSON_HEDLEY_PGI_VERSION_CHECK(10,2,0) + #define JSON_HEDLEY_NEVER_INLINE _Pragma("noinline") +#elif JSON_HEDLEY_TI_CL6X_VERSION_CHECK(6,0,0) && defined(__cplusplus) + #define JSON_HEDLEY_NEVER_INLINE _Pragma("FUNC_CANNOT_INLINE;") +#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) + #define JSON_HEDLEY_NEVER_INLINE _Pragma("inline=never") +#elif JSON_HEDLEY_COMPCERT_VERSION_CHECK(3,2,0) + #define JSON_HEDLEY_NEVER_INLINE __attribute((noinline)) +#elif JSON_HEDLEY_PELLES_VERSION_CHECK(9,0,0) + #define JSON_HEDLEY_NEVER_INLINE __declspec(noinline) +#else + #define JSON_HEDLEY_NEVER_INLINE +#endif + +#if defined(JSON_HEDLEY_PRIVATE) + #undef JSON_HEDLEY_PRIVATE +#endif +#if defined(JSON_HEDLEY_PUBLIC) + #undef JSON_HEDLEY_PUBLIC +#endif +#if defined(JSON_HEDLEY_IMPORT) + #undef JSON_HEDLEY_IMPORT +#endif +#if defined(_WIN32) || defined(__CYGWIN__) +# define JSON_HEDLEY_PRIVATE +# define JSON_HEDLEY_PUBLIC __declspec(dllexport) +# define JSON_HEDLEY_IMPORT __declspec(dllimport) +#else +# if \ + JSON_HEDLEY_HAS_ATTRIBUTE(visibility) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,3,0) || \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(13,1,0) || \ + ( \ + defined(__TI_EABI__) && \ + ( \ + (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + 
JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) \ + ) \ + ) +# define JSON_HEDLEY_PRIVATE __attribute__((__visibility__("hidden"))) +# define JSON_HEDLEY_PUBLIC __attribute__((__visibility__("default"))) +# else +# define JSON_HEDLEY_PRIVATE +# define JSON_HEDLEY_PUBLIC +# endif +# define JSON_HEDLEY_IMPORT extern +#endif + +#if defined(JSON_HEDLEY_NO_THROW) + #undef JSON_HEDLEY_NO_THROW +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(nothrow) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,3,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) + #define JSON_HEDLEY_NO_THROW __attribute__((__nothrow__)) +#elif \ + JSON_HEDLEY_MSVC_VERSION_CHECK(13,1,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) + #define JSON_HEDLEY_NO_THROW __declspec(nothrow) +#else + #define JSON_HEDLEY_NO_THROW +#endif + +#if defined(JSON_HEDLEY_FALL_THROUGH) + #undef JSON_HEDLEY_FALL_THROUGH +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(fallthrough) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(7,0,0) + #define JSON_HEDLEY_FALL_THROUGH __attribute__((__fallthrough__)) +#elif JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS(clang,fallthrough) + #define JSON_HEDLEY_FALL_THROUGH JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[clang::fallthrough]]) +#elif JSON_HEDLEY_HAS_CPP_ATTRIBUTE(fallthrough) + #define JSON_HEDLEY_FALL_THROUGH JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[fallthrough]]) +#elif defined(__fallthrough) /* SAL */ + #define JSON_HEDLEY_FALL_THROUGH __fallthrough +#else + #define JSON_HEDLEY_FALL_THROUGH +#endif + +#if defined(JSON_HEDLEY_RETURNS_NON_NULL) + #undef JSON_HEDLEY_RETURNS_NON_NULL +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(returns_nonnull) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,9,0) + #define JSON_HEDLEY_RETURNS_NON_NULL __attribute__((__returns_nonnull__)) +#elif defined(_Ret_notnull_) /* SAL */ + #define JSON_HEDLEY_RETURNS_NON_NULL _Ret_notnull_ +#else + #define JSON_HEDLEY_RETURNS_NON_NULL +#endif + +#if defined(JSON_HEDLEY_ARRAY_PARAM) + #undef JSON_HEDLEY_ARRAY_PARAM +#endif +#if \ + 
defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) && \ + !defined(__STDC_NO_VLA__) && \ + !defined(__cplusplus) && \ + !defined(JSON_HEDLEY_PGI_VERSION) && \ + !defined(JSON_HEDLEY_TINYC_VERSION) + #define JSON_HEDLEY_ARRAY_PARAM(name) (name) +#else + #define JSON_HEDLEY_ARRAY_PARAM(name) +#endif + +#if defined(JSON_HEDLEY_IS_CONSTANT) + #undef JSON_HEDLEY_IS_CONSTANT +#endif +#if defined(JSON_HEDLEY_REQUIRE_CONSTEXPR) + #undef JSON_HEDLEY_REQUIRE_CONSTEXPR +#endif +/* JSON_HEDLEY_IS_CONSTEXPR_ is for + HEDLEY INTERNAL USE ONLY. API subject to change without notice. */ +#if defined(JSON_HEDLEY_IS_CONSTEXPR_) + #undef JSON_HEDLEY_IS_CONSTEXPR_ +#endif +#if \ + JSON_HEDLEY_HAS_BUILTIN(__builtin_constant_p) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,4,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_TINYC_VERSION_CHECK(0,9,19) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(13,1,0) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \ + (JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) && !defined(__cplusplus)) || \ + JSON_HEDLEY_CRAY_VERSION_CHECK(8,1,0) + #define JSON_HEDLEY_IS_CONSTANT(expr) __builtin_constant_p(expr) +#endif +#if !defined(__cplusplus) +# if \ + JSON_HEDLEY_HAS_BUILTIN(__builtin_types_compatible_p) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,4,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(13,1,0) || \ + JSON_HEDLEY_CRAY_VERSION_CHECK(8,1,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(5,4,0) || \ + JSON_HEDLEY_TINYC_VERSION_CHECK(0,9,24) +#if defined(__INTPTR_TYPE__) + #define JSON_HEDLEY_IS_CONSTEXPR_(expr) __builtin_types_compatible_p(__typeof__((1 ? (void*) ((__INTPTR_TYPE__) ((expr) * 0)) : (int*) 0)), int*) +#else + #include <stdint.h> + #define JSON_HEDLEY_IS_CONSTEXPR_(expr) __builtin_types_compatible_p(__typeof__((1 ? 
(void*) ((intptr_t) ((expr) * 0)) : (int*) 0)), int*) +#endif +# elif \ + ( \ + defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \ + !defined(JSON_HEDLEY_SUNPRO_VERSION) && \ + !defined(JSON_HEDLEY_PGI_VERSION) && \ + !defined(JSON_HEDLEY_IAR_VERSION)) || \ + JSON_HEDLEY_HAS_EXTENSION(c_generic_selections) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,9,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(17,0,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(12,1,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(5,3,0) +#if defined(__INTPTR_TYPE__) + #define JSON_HEDLEY_IS_CONSTEXPR_(expr) _Generic((1 ? (void*) ((__INTPTR_TYPE__) ((expr) * 0)) : (int*) 0), int*: 1, void*: 0) +#else + #include + #define JSON_HEDLEY_IS_CONSTEXPR_(expr) _Generic((1 ? (void*) ((intptr_t) * 0) : (int*) 0), int*: 1, void*: 0) +#endif +# elif \ + defined(JSON_HEDLEY_GCC_VERSION) || \ + defined(JSON_HEDLEY_INTEL_VERSION) || \ + defined(JSON_HEDLEY_TINYC_VERSION) || \ + defined(JSON_HEDLEY_TI_ARMCL_VERSION) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(18,12,0) || \ + defined(JSON_HEDLEY_TI_CL2000_VERSION) || \ + defined(JSON_HEDLEY_TI_CL6X_VERSION) || \ + defined(JSON_HEDLEY_TI_CL7X_VERSION) || \ + defined(JSON_HEDLEY_TI_CLPRU_VERSION) || \ + defined(__clang__) +# define JSON_HEDLEY_IS_CONSTEXPR_(expr) ( \ + sizeof(void) != \ + sizeof(*( \ + 1 ? \ + ((void*) ((expr) * 0L) ) : \ +((struct { char v[sizeof(void) * 2]; } *) 1) \ + ) \ + ) \ + ) +# endif +#endif +#if defined(JSON_HEDLEY_IS_CONSTEXPR_) + #if !defined(JSON_HEDLEY_IS_CONSTANT) + #define JSON_HEDLEY_IS_CONSTANT(expr) JSON_HEDLEY_IS_CONSTEXPR_(expr) + #endif + #define JSON_HEDLEY_REQUIRE_CONSTEXPR(expr) (JSON_HEDLEY_IS_CONSTEXPR_(expr) ? 
(expr) : (-1)) +#else + #if !defined(JSON_HEDLEY_IS_CONSTANT) + #define JSON_HEDLEY_IS_CONSTANT(expr) (0) + #endif + #define JSON_HEDLEY_REQUIRE_CONSTEXPR(expr) (expr) +#endif + +#if defined(JSON_HEDLEY_BEGIN_C_DECLS) + #undef JSON_HEDLEY_BEGIN_C_DECLS +#endif +#if defined(JSON_HEDLEY_END_C_DECLS) + #undef JSON_HEDLEY_END_C_DECLS +#endif +#if defined(JSON_HEDLEY_C_DECL) + #undef JSON_HEDLEY_C_DECL +#endif +#if defined(__cplusplus) + #define JSON_HEDLEY_BEGIN_C_DECLS extern "C" { + #define JSON_HEDLEY_END_C_DECLS } + #define JSON_HEDLEY_C_DECL extern "C" +#else + #define JSON_HEDLEY_BEGIN_C_DECLS + #define JSON_HEDLEY_END_C_DECLS + #define JSON_HEDLEY_C_DECL +#endif + +#if defined(JSON_HEDLEY_STATIC_ASSERT) + #undef JSON_HEDLEY_STATIC_ASSERT +#endif +#if \ + !defined(__cplusplus) && ( \ + (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || \ + JSON_HEDLEY_HAS_FEATURE(c_static_assert) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(6,0,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + defined(_Static_assert) \ + ) +# define JSON_HEDLEY_STATIC_ASSERT(expr, message) _Static_assert(expr, message) +#elif \ + (defined(__cplusplus) && (__cplusplus >= 201103L)) || \ + JSON_HEDLEY_MSVC_VERSION_CHECK(16,0,0) +# define JSON_HEDLEY_STATIC_ASSERT(expr, message) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(static_assert(expr, message)) +#else +# define JSON_HEDLEY_STATIC_ASSERT(expr, message) +#endif + +#if defined(JSON_HEDLEY_NULL) + #undef JSON_HEDLEY_NULL +#endif +#if defined(__cplusplus) + #if __cplusplus >= 201103L + #define JSON_HEDLEY_NULL JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(nullptr) + #elif defined(NULL) + #define JSON_HEDLEY_NULL NULL + #else + #define JSON_HEDLEY_NULL JSON_HEDLEY_STATIC_CAST(void*, 0) + #endif +#elif defined(NULL) + #define JSON_HEDLEY_NULL NULL +#else + #define JSON_HEDLEY_NULL ((void*) 0) +#endif + +#if defined(JSON_HEDLEY_MESSAGE) + #undef JSON_HEDLEY_MESSAGE +#endif +#if JSON_HEDLEY_HAS_WARNING("-Wunknown-pragmas") +# 
define JSON_HEDLEY_MESSAGE(msg) \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS \ + JSON_HEDLEY_PRAGMA(message msg) \ + JSON_HEDLEY_DIAGNOSTIC_POP +#elif \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,4,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) +# define JSON_HEDLEY_MESSAGE(msg) JSON_HEDLEY_PRAGMA(message msg) +#elif JSON_HEDLEY_CRAY_VERSION_CHECK(5,0,0) +# define JSON_HEDLEY_MESSAGE(msg) JSON_HEDLEY_PRAGMA(_CRI message msg) +#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) +# define JSON_HEDLEY_MESSAGE(msg) JSON_HEDLEY_PRAGMA(message(msg)) +#elif JSON_HEDLEY_PELLES_VERSION_CHECK(2,0,0) +# define JSON_HEDLEY_MESSAGE(msg) JSON_HEDLEY_PRAGMA(message(msg)) +#else +# define JSON_HEDLEY_MESSAGE(msg) +#endif + +#if defined(JSON_HEDLEY_WARNING) + #undef JSON_HEDLEY_WARNING +#endif +#if JSON_HEDLEY_HAS_WARNING("-Wunknown-pragmas") +# define JSON_HEDLEY_WARNING(msg) \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS \ + JSON_HEDLEY_PRAGMA(clang warning msg) \ + JSON_HEDLEY_DIAGNOSTIC_POP +#elif \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,8,0) || \ + JSON_HEDLEY_PGI_VERSION_CHECK(18,4,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) +# define JSON_HEDLEY_WARNING(msg) JSON_HEDLEY_PRAGMA(GCC warning msg) +#elif JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0) +# define JSON_HEDLEY_WARNING(msg) JSON_HEDLEY_PRAGMA(message(msg)) +#else +# define JSON_HEDLEY_WARNING(msg) JSON_HEDLEY_MESSAGE(msg) +#endif + +#if defined(JSON_HEDLEY_REQUIRE) + #undef JSON_HEDLEY_REQUIRE +#endif +#if defined(JSON_HEDLEY_REQUIRE_MSG) + #undef JSON_HEDLEY_REQUIRE_MSG +#endif +#if JSON_HEDLEY_HAS_ATTRIBUTE(diagnose_if) +# if JSON_HEDLEY_HAS_WARNING("-Wgcc-compat") +# define JSON_HEDLEY_REQUIRE(expr) \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wgcc-compat\"") \ + __attribute__((diagnose_if(!(expr), #expr, "error"))) \ + JSON_HEDLEY_DIAGNOSTIC_POP +# define JSON_HEDLEY_REQUIRE_MSG(expr,msg) \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + 
_Pragma("clang diagnostic ignored \"-Wgcc-compat\"") \ + __attribute__((diagnose_if(!(expr), msg, "error"))) \ + JSON_HEDLEY_DIAGNOSTIC_POP +# else +# define JSON_HEDLEY_REQUIRE(expr) __attribute__((diagnose_if(!(expr), #expr, "error"))) +# define JSON_HEDLEY_REQUIRE_MSG(expr,msg) __attribute__((diagnose_if(!(expr), msg, "error"))) +# endif +#else +# define JSON_HEDLEY_REQUIRE(expr) +# define JSON_HEDLEY_REQUIRE_MSG(expr,msg) +#endif + +#if defined(JSON_HEDLEY_FLAGS) + #undef JSON_HEDLEY_FLAGS +#endif +#if JSON_HEDLEY_HAS_ATTRIBUTE(flag_enum) + #define JSON_HEDLEY_FLAGS __attribute__((__flag_enum__)) +#endif + +#if defined(JSON_HEDLEY_FLAGS_CAST) + #undef JSON_HEDLEY_FLAGS_CAST +#endif +#if JSON_HEDLEY_INTEL_VERSION_CHECK(19,0,0) +# define JSON_HEDLEY_FLAGS_CAST(T, expr) (__extension__ ({ \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("warning(disable:188)") \ + ((T) (expr)); \ + JSON_HEDLEY_DIAGNOSTIC_POP \ + })) +#else +# define JSON_HEDLEY_FLAGS_CAST(T, expr) JSON_HEDLEY_STATIC_CAST(T, expr) +#endif + +#if defined(JSON_HEDLEY_EMPTY_BASES) + #undef JSON_HEDLEY_EMPTY_BASES +#endif +#if JSON_HEDLEY_MSVC_VERSION_CHECK(19,0,23918) && !JSON_HEDLEY_MSVC_VERSION_CHECK(20,0,0) + #define JSON_HEDLEY_EMPTY_BASES __declspec(empty_bases) +#else + #define JSON_HEDLEY_EMPTY_BASES +#endif + +/* Remaining macros are deprecated. 
*/ + +#if defined(JSON_HEDLEY_GCC_NOT_CLANG_VERSION_CHECK) + #undef JSON_HEDLEY_GCC_NOT_CLANG_VERSION_CHECK +#endif +#if defined(__clang__) + #define JSON_HEDLEY_GCC_NOT_CLANG_VERSION_CHECK(major,minor,patch) (0) +#else + #define JSON_HEDLEY_GCC_NOT_CLANG_VERSION_CHECK(major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_CLANG_HAS_ATTRIBUTE) + #undef JSON_HEDLEY_CLANG_HAS_ATTRIBUTE +#endif +#define JSON_HEDLEY_CLANG_HAS_ATTRIBUTE(attribute) JSON_HEDLEY_HAS_ATTRIBUTE(attribute) + +#if defined(JSON_HEDLEY_CLANG_HAS_CPP_ATTRIBUTE) + #undef JSON_HEDLEY_CLANG_HAS_CPP_ATTRIBUTE +#endif +#define JSON_HEDLEY_CLANG_HAS_CPP_ATTRIBUTE(attribute) JSON_HEDLEY_HAS_CPP_ATTRIBUTE(attribute) + +#if defined(JSON_HEDLEY_CLANG_HAS_BUILTIN) + #undef JSON_HEDLEY_CLANG_HAS_BUILTIN +#endif +#define JSON_HEDLEY_CLANG_HAS_BUILTIN(builtin) JSON_HEDLEY_HAS_BUILTIN(builtin) + +#if defined(JSON_HEDLEY_CLANG_HAS_FEATURE) + #undef JSON_HEDLEY_CLANG_HAS_FEATURE +#endif +#define JSON_HEDLEY_CLANG_HAS_FEATURE(feature) JSON_HEDLEY_HAS_FEATURE(feature) + +#if defined(JSON_HEDLEY_CLANG_HAS_EXTENSION) + #undef JSON_HEDLEY_CLANG_HAS_EXTENSION +#endif +#define JSON_HEDLEY_CLANG_HAS_EXTENSION(extension) JSON_HEDLEY_HAS_EXTENSION(extension) + +#if defined(JSON_HEDLEY_CLANG_HAS_DECLSPEC_DECLSPEC_ATTRIBUTE) + #undef JSON_HEDLEY_CLANG_HAS_DECLSPEC_DECLSPEC_ATTRIBUTE +#endif +#define JSON_HEDLEY_CLANG_HAS_DECLSPEC_ATTRIBUTE(attribute) JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute) + +#if defined(JSON_HEDLEY_CLANG_HAS_WARNING) + #undef JSON_HEDLEY_CLANG_HAS_WARNING +#endif +#define JSON_HEDLEY_CLANG_HAS_WARNING(warning) JSON_HEDLEY_HAS_WARNING(warning) + +#endif /* !defined(JSON_HEDLEY_VERSION) || (JSON_HEDLEY_VERSION < X) */ + + +// This file contains all internal macro definitions +// You MUST include macro_unscope.hpp at the end of json.hpp to undef all of them + +// exclude unsupported compilers +#if !defined(JSON_SKIP_UNSUPPORTED_COMPILER_CHECK) + #if 
defined(__clang__) + #if (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__) < 30400 + #error "unsupported Clang version - see https://github.com/nlohmann/json#supported-compilers" + #endif + #elif defined(__GNUC__) && !(defined(__ICC) || defined(__INTEL_COMPILER)) + #if (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) < 40800 + #error "unsupported GCC version - see https://github.com/nlohmann/json#supported-compilers" + #endif + #endif +#endif + +// C++ language standard detection +#if (defined(__cplusplus) && __cplusplus >= 202002L) || (defined(_MSVC_LANG) && _MSVC_LANG >= 202002L) + #define JSON_HAS_CPP_20 + #define JSON_HAS_CPP_17 + #define JSON_HAS_CPP_14 +#elif (defined(__cplusplus) && __cplusplus >= 201703L) || (defined(_HAS_CXX17) && _HAS_CXX17 == 1) // fix for issue #464 + #define JSON_HAS_CPP_17 + #define JSON_HAS_CPP_14 +#elif (defined(__cplusplus) && __cplusplus >= 201402L) || (defined(_HAS_CXX14) && _HAS_CXX14 == 1) + #define JSON_HAS_CPP_14 +#endif + +// disable float-equal warnings on GCC/clang +#if defined(__clang__) || defined(__GNUC__) || defined(__GNUG__) + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wfloat-equal" +#endif + +// disable documentation warnings on clang +#if defined(__clang__) + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wdocumentation" +#endif + +// allow to disable exceptions +#if (defined(__cpp_exceptions) || defined(__EXCEPTIONS) || defined(_CPPUNWIND)) && !defined(JSON_NOEXCEPTION) + #define JSON_THROW(exception) throw exception + #define JSON_TRY try + #define JSON_CATCH(exception) catch(exception) + #define JSON_INTERNAL_CATCH(exception) catch(exception) +#else + #include + #define JSON_THROW(exception) std::abort() + #define JSON_TRY if(true) + #define JSON_CATCH(exception) if(false) + #define JSON_INTERNAL_CATCH(exception) if(false) +#endif + +// override exception macros +#if defined(JSON_THROW_USER) + #undef JSON_THROW + #define JSON_THROW 
JSON_THROW_USER +#endif +#if defined(JSON_TRY_USER) + #undef JSON_TRY + #define JSON_TRY JSON_TRY_USER +#endif +#if defined(JSON_CATCH_USER) + #undef JSON_CATCH + #define JSON_CATCH JSON_CATCH_USER + #undef JSON_INTERNAL_CATCH + #define JSON_INTERNAL_CATCH JSON_CATCH_USER +#endif +#if defined(JSON_INTERNAL_CATCH_USER) + #undef JSON_INTERNAL_CATCH + #define JSON_INTERNAL_CATCH JSON_INTERNAL_CATCH_USER +#endif + +// allow to override assert +#if !defined(JSON_ASSERT) + #include // assert + #define JSON_ASSERT(x) assert(x) +#endif + +/*! +@brief macro to briefly define a mapping between an enum and JSON +@def NLOHMANN_JSON_SERIALIZE_ENUM +@since version 3.4.0 +*/ +#define NLOHMANN_JSON_SERIALIZE_ENUM(ENUM_TYPE, ...) \ + template \ + inline void to_json(BasicJsonType& j, const ENUM_TYPE& e) \ + { \ + static_assert(std::is_enum::value, #ENUM_TYPE " must be an enum!"); \ + static const std::pair m[] = __VA_ARGS__; \ + auto it = std::find_if(std::begin(m), std::end(m), \ + [e](const std::pair& ej_pair) -> bool \ + { \ + return ej_pair.first == e; \ + }); \ + j = ((it != std::end(m)) ? it : std::begin(m))->second; \ + } \ + template \ + inline void from_json(const BasicJsonType& j, ENUM_TYPE& e) \ + { \ + static_assert(std::is_enum::value, #ENUM_TYPE " must be an enum!"); \ + static const std::pair m[] = __VA_ARGS__; \ + auto it = std::find_if(std::begin(m), std::end(m), \ + [&j](const std::pair& ej_pair) -> bool \ + { \ + return ej_pair.second == j; \ + }); \ + e = ((it != std::end(m)) ? it : std::begin(m))->first; \ + } + +// Ugly macros to avoid uglier copy-paste when specializing basic_json. They +// may be removed in the future once the class is split. 
+ +#define NLOHMANN_BASIC_JSON_TPL_DECLARATION \ + template class ObjectType, \ + template class ArrayType, \ + class StringType, class BooleanType, class NumberIntegerType, \ + class NumberUnsignedType, class NumberFloatType, \ + template class AllocatorType, \ + template class JSONSerializer, \ + class BinaryType> + +#define NLOHMANN_BASIC_JSON_TPL \ + basic_json + +// Macros to simplify conversion from/to types + +#define NLOHMANN_JSON_EXPAND( x ) x +#define NLOHMANN_JSON_GET_MACRO(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44, _45, _46, _47, _48, _49, _50, _51, _52, _53, _54, _55, _56, _57, _58, _59, _60, _61, _62, _63, _64, NAME,...) NAME +#define NLOHMANN_JSON_PASTE(...) NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_GET_MACRO(__VA_ARGS__, \ + NLOHMANN_JSON_PASTE64, \ + NLOHMANN_JSON_PASTE63, \ + NLOHMANN_JSON_PASTE62, \ + NLOHMANN_JSON_PASTE61, \ + NLOHMANN_JSON_PASTE60, \ + NLOHMANN_JSON_PASTE59, \ + NLOHMANN_JSON_PASTE58, \ + NLOHMANN_JSON_PASTE57, \ + NLOHMANN_JSON_PASTE56, \ + NLOHMANN_JSON_PASTE55, \ + NLOHMANN_JSON_PASTE54, \ + NLOHMANN_JSON_PASTE53, \ + NLOHMANN_JSON_PASTE52, \ + NLOHMANN_JSON_PASTE51, \ + NLOHMANN_JSON_PASTE50, \ + NLOHMANN_JSON_PASTE49, \ + NLOHMANN_JSON_PASTE48, \ + NLOHMANN_JSON_PASTE47, \ + NLOHMANN_JSON_PASTE46, \ + NLOHMANN_JSON_PASTE45, \ + NLOHMANN_JSON_PASTE44, \ + NLOHMANN_JSON_PASTE43, \ + NLOHMANN_JSON_PASTE42, \ + NLOHMANN_JSON_PASTE41, \ + NLOHMANN_JSON_PASTE40, \ + NLOHMANN_JSON_PASTE39, \ + NLOHMANN_JSON_PASTE38, \ + NLOHMANN_JSON_PASTE37, \ + NLOHMANN_JSON_PASTE36, \ + NLOHMANN_JSON_PASTE35, \ + NLOHMANN_JSON_PASTE34, \ + NLOHMANN_JSON_PASTE33, \ + NLOHMANN_JSON_PASTE32, \ + NLOHMANN_JSON_PASTE31, \ + NLOHMANN_JSON_PASTE30, \ + NLOHMANN_JSON_PASTE29, \ + NLOHMANN_JSON_PASTE28, \ + NLOHMANN_JSON_PASTE27, \ + NLOHMANN_JSON_PASTE26, \ + NLOHMANN_JSON_PASTE25, \ + 
NLOHMANN_JSON_PASTE24, \ + NLOHMANN_JSON_PASTE23, \ + NLOHMANN_JSON_PASTE22, \ + NLOHMANN_JSON_PASTE21, \ + NLOHMANN_JSON_PASTE20, \ + NLOHMANN_JSON_PASTE19, \ + NLOHMANN_JSON_PASTE18, \ + NLOHMANN_JSON_PASTE17, \ + NLOHMANN_JSON_PASTE16, \ + NLOHMANN_JSON_PASTE15, \ + NLOHMANN_JSON_PASTE14, \ + NLOHMANN_JSON_PASTE13, \ + NLOHMANN_JSON_PASTE12, \ + NLOHMANN_JSON_PASTE11, \ + NLOHMANN_JSON_PASTE10, \ + NLOHMANN_JSON_PASTE9, \ + NLOHMANN_JSON_PASTE8, \ + NLOHMANN_JSON_PASTE7, \ + NLOHMANN_JSON_PASTE6, \ + NLOHMANN_JSON_PASTE5, \ + NLOHMANN_JSON_PASTE4, \ + NLOHMANN_JSON_PASTE3, \ + NLOHMANN_JSON_PASTE2, \ + NLOHMANN_JSON_PASTE1)(__VA_ARGS__)) +#define NLOHMANN_JSON_PASTE2(func, v1) func(v1) +#define NLOHMANN_JSON_PASTE3(func, v1, v2) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE2(func, v2) +#define NLOHMANN_JSON_PASTE4(func, v1, v2, v3) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE3(func, v2, v3) +#define NLOHMANN_JSON_PASTE5(func, v1, v2, v3, v4) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE4(func, v2, v3, v4) +#define NLOHMANN_JSON_PASTE6(func, v1, v2, v3, v4, v5) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE5(func, v2, v3, v4, v5) +#define NLOHMANN_JSON_PASTE7(func, v1, v2, v3, v4, v5, v6) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE6(func, v2, v3, v4, v5, v6) +#define NLOHMANN_JSON_PASTE8(func, v1, v2, v3, v4, v5, v6, v7) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE7(func, v2, v3, v4, v5, v6, v7) +#define NLOHMANN_JSON_PASTE9(func, v1, v2, v3, v4, v5, v6, v7, v8) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE8(func, v2, v3, v4, v5, v6, v7, v8) +#define NLOHMANN_JSON_PASTE10(func, v1, v2, v3, v4, v5, v6, v7, v8, v9) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE9(func, v2, v3, v4, v5, v6, v7, v8, v9) +#define NLOHMANN_JSON_PASTE11(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE10(func, v2, v3, v4, v5, v6, v7, v8, v9, v10) +#define NLOHMANN_JSON_PASTE12(func, v1, v2, v3, v4, 
v5, v6, v7, v8, v9, v10, v11) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE11(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11) +#define NLOHMANN_JSON_PASTE13(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE12(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12) +#define NLOHMANN_JSON_PASTE14(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE13(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13) +#define NLOHMANN_JSON_PASTE15(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE14(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14) +#define NLOHMANN_JSON_PASTE16(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE15(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15) +#define NLOHMANN_JSON_PASTE17(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE16(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16) +#define NLOHMANN_JSON_PASTE18(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE17(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17) +#define NLOHMANN_JSON_PASTE19(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE18(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18) +#define NLOHMANN_JSON_PASTE20(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE19(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19) +#define NLOHMANN_JSON_PASTE21(func, 
v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE20(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20) +#define NLOHMANN_JSON_PASTE22(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE21(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21) +#define NLOHMANN_JSON_PASTE23(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE22(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22) +#define NLOHMANN_JSON_PASTE24(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE23(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23) +#define NLOHMANN_JSON_PASTE25(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE24(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24) +#define NLOHMANN_JSON_PASTE26(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE25(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25) +#define NLOHMANN_JSON_PASTE27(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE26(func, v2, v3, v4, v5, v6, v7, 
v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26) +#define NLOHMANN_JSON_PASTE28(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE27(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27) +#define NLOHMANN_JSON_PASTE29(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE28(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28) +#define NLOHMANN_JSON_PASTE30(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE29(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29) +#define NLOHMANN_JSON_PASTE31(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE30(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30) +#define NLOHMANN_JSON_PASTE32(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE31(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31) +#define NLOHMANN_JSON_PASTE33(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, 
v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE32(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32) +#define NLOHMANN_JSON_PASTE34(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE33(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33) +#define NLOHMANN_JSON_PASTE35(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE34(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34) +#define NLOHMANN_JSON_PASTE36(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE35(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35) +#define NLOHMANN_JSON_PASTE37(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE36(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36) +#define 
NLOHMANN_JSON_PASTE38(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE37(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37) +#define NLOHMANN_JSON_PASTE39(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE38(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38) +#define NLOHMANN_JSON_PASTE40(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE39(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39) +#define NLOHMANN_JSON_PASTE41(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE40(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40) +#define NLOHMANN_JSON_PASTE42(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, 
v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE41(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41) +#define NLOHMANN_JSON_PASTE43(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE42(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42) +#define NLOHMANN_JSON_PASTE44(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE43(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43) +#define NLOHMANN_JSON_PASTE45(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE44(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44) +#define NLOHMANN_JSON_PASTE46(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, 
v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE45(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45) +#define NLOHMANN_JSON_PASTE47(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE46(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46) +#define NLOHMANN_JSON_PASTE48(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE47(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47) +#define NLOHMANN_JSON_PASTE49(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE48(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48) +#define 
NLOHMANN_JSON_PASTE50(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE49(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49) +#define NLOHMANN_JSON_PASTE51(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE50(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50) +#define NLOHMANN_JSON_PASTE52(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE51(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51) +#define NLOHMANN_JSON_PASTE53(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, 
v52) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE52(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52) +#define NLOHMANN_JSON_PASTE54(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE53(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53) +#define NLOHMANN_JSON_PASTE55(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE54(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54) +#define NLOHMANN_JSON_PASTE56(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE55(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, 
v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55) +#define NLOHMANN_JSON_PASTE57(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE56(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56) +#define NLOHMANN_JSON_PASTE58(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE57(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57) +#define NLOHMANN_JSON_PASTE59(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE58(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, 
v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58) +#define NLOHMANN_JSON_PASTE60(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE59(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59) +#define NLOHMANN_JSON_PASTE61(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE60(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60) +#define NLOHMANN_JSON_PASTE62(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE61(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, 
v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61) +#define NLOHMANN_JSON_PASTE63(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE62(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62) +#define NLOHMANN_JSON_PASTE64(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE63(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63) + +#define NLOHMANN_JSON_TO(v1) nlohmann_json_j[#v1] = nlohmann_json_t.v1; +#define NLOHMANN_JSON_FROM(v1) nlohmann_json_j.at(#v1).get_to(nlohmann_json_t.v1); + +/*! +@brief macro +@def NLOHMANN_DEFINE_TYPE_INTRUSIVE +@since version 3.9.0 +*/ +#define NLOHMANN_DEFINE_TYPE_INTRUSIVE(Type, ...) 
\ + friend void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \ + friend void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM, __VA_ARGS__)) } + +/*! +@brief macro +@def NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE +@since version 3.9.0 +*/ +#define NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Type, ...) \ + inline void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \ + inline void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM, __VA_ARGS__)) } + +#ifndef JSON_USE_IMPLICIT_CONVERSIONS + #define JSON_USE_IMPLICIT_CONVERSIONS 1 +#endif + +#if JSON_USE_IMPLICIT_CONVERSIONS + #define JSON_EXPLICIT +#else + #define JSON_EXPLICIT explicit +#endif + + +namespace nlohmann +{ +namespace detail +{ +//////////////// +// exceptions // +//////////////// + +/*! +@brief general exception of the @ref basic_json class + +This class is an extension of `std::exception` objects with a member @a id for +exception ids. It is used as the base class for all exceptions thrown by the +@ref basic_json class. This class can hence be used as "wildcard" to catch +exceptions. + +Subclasses: +- @ref parse_error for exceptions indicating a parse error +- @ref invalid_iterator for exceptions indicating errors with iterators +- @ref type_error for exceptions indicating executing a member function with + a wrong type +- @ref out_of_range for exceptions indicating access out of the defined range +- @ref other_error for exceptions indicating other library errors + +@internal +@note To have nothrow-copy-constructible exceptions, we internally use + `std::runtime_error` which can cope with arbitrary-length error messages. 
+ Intermediate strings are built with static functions and then passed to + the actual constructor. +@endinternal + +@liveexample{The following code shows how arbitrary library exceptions can be +caught.,exception} + +@since version 3.0.0 +*/ +class exception : public std::exception +{ + public: + /// returns the explanatory string + JSON_HEDLEY_RETURNS_NON_NULL + const char* what() const noexcept override + { + return m.what(); + } + + /// the id of the exception + const int id; + + protected: + JSON_HEDLEY_NON_NULL(3) + exception(int id_, const char* what_arg) : id(id_), m(what_arg) {} + + static std::string name(const std::string& ename, int id_) + { + return "[json.exception." + ename + "." + std::to_string(id_) + "] "; + } + + private: + /// an exception object as storage for error messages + std::runtime_error m; +}; + +/*! +@brief exception indicating a parse error + +This exception is thrown by the library when a parse error occurs. Parse errors +can occur during the deserialization of JSON text, CBOR, MessagePack, as well +as when using JSON Patch. + +Member @a byte holds the byte index of the last read character in the input +file. + +Exceptions have ids 1xx. + +name / id | example message | description +------------------------------ | --------------- | ------------------------- +json.exception.parse_error.101 | parse error at 2: unexpected end of input; expected string literal | This error indicates a syntax error while deserializing a JSON text. The error message describes that an unexpected token (character) was encountered, and the member @a byte indicates the error position. +json.exception.parse_error.102 | parse error at 14: missing or wrong low surrogate | JSON uses the `\uxxxx` format to describe Unicode characters. Code points above above 0xFFFF are split into two `\uxxxx` entries ("surrogate pairs"). This error indicates that the surrogate pair is incomplete or contains an invalid code point. 
+json.exception.parse_error.103 | parse error: code points above 0x10FFFF are invalid | Unicode supports code points up to 0x10FFFF. Code points above 0x10FFFF are invalid. +json.exception.parse_error.104 | parse error: JSON patch must be an array of objects | [RFC 6902](https://tools.ietf.org/html/rfc6902) requires a JSON Patch document to be a JSON document that represents an array of objects. +json.exception.parse_error.105 | parse error: operation must have string member 'op' | An operation of a JSON Patch document must contain exactly one "op" member, whose value indicates the operation to perform. Its value must be one of "add", "remove", "replace", "move", "copy", or "test"; other values are errors. +json.exception.parse_error.106 | parse error: array index '01' must not begin with '0' | An array index in a JSON Pointer ([RFC 6901](https://tools.ietf.org/html/rfc6901)) may be `0` or any number without a leading `0`. +json.exception.parse_error.107 | parse error: JSON pointer must be empty or begin with '/' - was: 'foo' | A JSON Pointer must be a Unicode string containing a sequence of zero or more reference tokens, each prefixed by a `/` character. +json.exception.parse_error.108 | parse error: escape character '~' must be followed with '0' or '1' | In a JSON Pointer, only `~0` and `~1` are valid escape sequences. +json.exception.parse_error.109 | parse error: array index 'one' is not a number | A JSON Pointer array index must be a number. +json.exception.parse_error.110 | parse error at 1: cannot read 2 bytes from vector | When parsing CBOR or MessagePack, the byte vector ends before the complete value has been read. +json.exception.parse_error.112 | parse error at 1: error reading CBOR; last byte: 0xF8 | Not all types of CBOR or MessagePack are supported. This exception occurs if an unsupported byte was read. 
+json.exception.parse_error.113 | parse error at 2: expected a CBOR string; last byte: 0x98 | While parsing a map key, a value that is not a string has been read. +json.exception.parse_error.114 | parse error: Unsupported BSON record type 0x0F | The parsing of the corresponding BSON record type is not implemented (yet). +json.exception.parse_error.115 | parse error at byte 5: syntax error while parsing UBJSON high-precision number: invalid number text: 1A | A UBJSON high-precision number could not be parsed. + +@note For an input with n bytes, 1 is the index of the first character and n+1 + is the index of the terminating null byte or the end of file. This also + holds true when reading a byte vector (CBOR or MessagePack). + +@liveexample{The following code shows how a `parse_error` exception can be +caught.,parse_error} + +@sa - @ref exception for the base class of the library exceptions +@sa - @ref invalid_iterator for exceptions indicating errors with iterators +@sa - @ref type_error for exceptions indicating executing a member function with + a wrong type +@sa - @ref out_of_range for exceptions indicating access out of the defined range +@sa - @ref other_error for exceptions indicating other library errors + +@since version 3.0.0 +*/ +class parse_error : public exception +{ + public: + /*! 
+ @brief create a parse error exception + @param[in] id_ the id of the exception + @param[in] pos the position where the error occurred (or with + chars_read_total=0 if the position cannot be + determined) + @param[in] what_arg the explanatory string + @return parse_error object + */ + static parse_error create(int id_, const position_t& pos, const std::string& what_arg) + { + std::string w = exception::name("parse_error", id_) + "parse error" + + position_string(pos) + ": " + what_arg; + return parse_error(id_, pos.chars_read_total, w.c_str()); + } + + static parse_error create(int id_, std::size_t byte_, const std::string& what_arg) + { + std::string w = exception::name("parse_error", id_) + "parse error" + + (byte_ != 0 ? (" at byte " + std::to_string(byte_)) : "") + + ": " + what_arg; + return parse_error(id_, byte_, w.c_str()); + } + + /*! + @brief byte index of the parse error + + The byte index of the last read character in the input file. + + @note For an input with n bytes, 1 is the index of the first character and + n+1 is the index of the terminating null byte or the end of file. + This also holds true when reading a byte vector (CBOR or MessagePack). + */ + const std::size_t byte; + + private: + parse_error(int id_, std::size_t byte_, const char* what_arg) + : exception(id_, what_arg), byte(byte_) {} + + static std::string position_string(const position_t& pos) + { + return " at line " + std::to_string(pos.lines_read + 1) + + ", column " + std::to_string(pos.chars_read_current_line); + } +}; + +/*! +@brief exception indicating errors with iterators + +This exception is thrown if iterators passed to a library function do not match +the expected semantics. + +Exceptions have ids 2xx. 
+ +name / id | example message | description +----------------------------------- | --------------- | ------------------------- +json.exception.invalid_iterator.201 | iterators are not compatible | The iterators passed to constructor @ref basic_json(InputIT first, InputIT last) are not compatible, meaning they do not belong to the same container. Therefore, the range (@a first, @a last) is invalid. +json.exception.invalid_iterator.202 | iterator does not fit current value | In an erase or insert function, the passed iterator @a pos does not belong to the JSON value for which the function was called. It hence does not define a valid position for the deletion/insertion. +json.exception.invalid_iterator.203 | iterators do not fit current value | Either iterator passed to function @ref erase(IteratorType first, IteratorType last) does not belong to the JSON value from which values shall be erased. It hence does not define a valid range to delete values from. +json.exception.invalid_iterator.204 | iterators out of range | When an iterator range for a primitive type (number, boolean, or string) is passed to a constructor or an erase function, this range has to be exactly (@ref begin(), @ref end()), because this is the only way the single stored value is expressed. All other ranges are invalid. +json.exception.invalid_iterator.205 | iterator out of range | When an iterator for a primitive type (number, boolean, or string) is passed to an erase function, the iterator has to be the @ref begin() iterator, because it is the only way to address the stored value. All other iterators are invalid. +json.exception.invalid_iterator.206 | cannot construct with iterators from null | The iterators passed to constructor @ref basic_json(InputIT first, InputIT last) belong to a JSON null value and hence to not define a valid range. 
+json.exception.invalid_iterator.207 | cannot use key() for non-object iterators | The key() member function can only be used on iterators belonging to a JSON object, because other types do not have a concept of a key. +json.exception.invalid_iterator.208 | cannot use operator[] for object iterators | The operator[] to specify a concrete offset cannot be used on iterators belonging to a JSON object, because JSON objects are unordered. +json.exception.invalid_iterator.209 | cannot use offsets with object iterators | The offset operators (+, -, +=, -=) cannot be used on iterators belonging to a JSON object, because JSON objects are unordered. +json.exception.invalid_iterator.210 | iterators do not fit | The iterator range passed to the insert function are not compatible, meaning they do not belong to the same container. Therefore, the range (@a first, @a last) is invalid. +json.exception.invalid_iterator.211 | passed iterators may not belong to container | The iterator range passed to the insert function must not be a subrange of the container to insert to. +json.exception.invalid_iterator.212 | cannot compare iterators of different containers | When two iterators are compared, they must belong to the same container. +json.exception.invalid_iterator.213 | cannot compare order of object iterators | The order of object iterators cannot be compared, because JSON objects are unordered. +json.exception.invalid_iterator.214 | cannot get value | Cannot get value for iterator: Either the iterator belongs to a null value or it is an iterator to a primitive type (number, boolean, or string), but the iterator is different to @ref begin(). 
+ +@liveexample{The following code shows how an `invalid_iterator` exception can be +caught.,invalid_iterator} + +@sa - @ref exception for the base class of the library exceptions +@sa - @ref parse_error for exceptions indicating a parse error +@sa - @ref type_error for exceptions indicating executing a member function with + a wrong type +@sa - @ref out_of_range for exceptions indicating access out of the defined range +@sa - @ref other_error for exceptions indicating other library errors + +@since version 3.0.0 +*/ +class invalid_iterator : public exception +{ + public: + static invalid_iterator create(int id_, const std::string& what_arg) + { + std::string w = exception::name("invalid_iterator", id_) + what_arg; + return invalid_iterator(id_, w.c_str()); + } + + private: + JSON_HEDLEY_NON_NULL(3) + invalid_iterator(int id_, const char* what_arg) + : exception(id_, what_arg) {} +}; + +/*! +@brief exception indicating executing a member function with a wrong type + +This exception is thrown in case of a type error; that is, a library function is +executed on a JSON value whose type does not match the expected semantics. + +Exceptions have ids 3xx. + +name / id | example message | description +----------------------------- | --------------- | ------------------------- +json.exception.type_error.301 | cannot create object from initializer list | To create an object from an initializer list, the initializer list must consist only of a list of pairs whose first element is a string. When this constraint is violated, an array is created instead. +json.exception.type_error.302 | type must be object, but is array | During implicit or explicit value conversion, the JSON type must be compatible to the target type. For instance, a JSON string can only be converted into string types, but not into numbers or boolean types. 
+json.exception.type_error.303 | incompatible ReferenceType for get_ref, actual type is object | To retrieve a reference to a value stored in a @ref basic_json object with @ref get_ref, the type of the reference must match the value type. For instance, for a JSON array, the @a ReferenceType must be @ref array_t &. +json.exception.type_error.304 | cannot use at() with string | The @ref at() member functions can only be executed for certain JSON types. +json.exception.type_error.305 | cannot use operator[] with string | The @ref operator[] member functions can only be executed for certain JSON types. +json.exception.type_error.306 | cannot use value() with string | The @ref value() member functions can only be executed for certain JSON types. +json.exception.type_error.307 | cannot use erase() with string | The @ref erase() member functions can only be executed for certain JSON types. +json.exception.type_error.308 | cannot use push_back() with string | The @ref push_back() and @ref operator+= member functions can only be executed for certain JSON types. +json.exception.type_error.309 | cannot use insert() with | The @ref insert() member functions can only be executed for certain JSON types. +json.exception.type_error.310 | cannot use swap() with number | The @ref swap() member functions can only be executed for certain JSON types. +json.exception.type_error.311 | cannot use emplace_back() with string | The @ref emplace_back() member function can only be executed for certain JSON types. +json.exception.type_error.312 | cannot use update() with string | The @ref update() member functions can only be executed for certain JSON types. +json.exception.type_error.313 | invalid value to unflatten | The @ref unflatten function converts an object whose keys are JSON Pointers back into an arbitrary nested JSON value. The JSON Pointers must not overlap, because then the resulting value would not be well defined. 
+json.exception.type_error.314 | only objects can be unflattened | The @ref unflatten function only works for an object whose keys are JSON Pointers. +json.exception.type_error.315 | values in object must be primitive | The @ref unflatten function only works for an object whose keys are JSON Pointers and whose values are primitive. +json.exception.type_error.316 | invalid UTF-8 byte at index 10: 0x7E | The @ref dump function only works with UTF-8 encoded strings; that is, if you assign a `std::string` to a JSON value, make sure it is UTF-8 encoded. | +json.exception.type_error.317 | JSON value cannot be serialized to requested format | The dynamic type of the object cannot be represented in the requested serialization format (e.g. a raw `true` or `null` JSON object cannot be serialized to BSON) | + +@liveexample{The following code shows how a `type_error` exception can be +caught.,type_error} + +@sa - @ref exception for the base class of the library exceptions +@sa - @ref parse_error for exceptions indicating a parse error +@sa - @ref invalid_iterator for exceptions indicating errors with iterators +@sa - @ref out_of_range for exceptions indicating access out of the defined range +@sa - @ref other_error for exceptions indicating other library errors + +@since version 3.0.0 +*/ +class type_error : public exception +{ + public: + static type_error create(int id_, const std::string& what_arg) + { + std::string w = exception::name("type_error", id_) + what_arg; + return type_error(id_, w.c_str()); + } + + private: + JSON_HEDLEY_NON_NULL(3) + type_error(int id_, const char* what_arg) : exception(id_, what_arg) {} +}; + +/*! +@brief exception indicating access out of the defined range + +This exception is thrown in case a library function is called on an input +parameter that exceeds the expected range, for instance in case of array +indices or nonexisting object keys. + +Exceptions have ids 4xx. 
+ +name / id | example message | description +------------------------------- | --------------- | ------------------------- +json.exception.out_of_range.401 | array index 3 is out of range | The provided array index @a i is larger than @a size-1. +json.exception.out_of_range.402 | array index '-' (3) is out of range | The special array index `-` in a JSON Pointer never describes a valid element of the array, but the index past the end. That is, it can only be used to add elements at this position, but not to read it. +json.exception.out_of_range.403 | key 'foo' not found | The provided key was not found in the JSON object. +json.exception.out_of_range.404 | unresolved reference token 'foo' | A reference token in a JSON Pointer could not be resolved. +json.exception.out_of_range.405 | JSON pointer has no parent | The JSON Patch operations 'remove' and 'add' can not be applied to the root element of the JSON value. +json.exception.out_of_range.406 | number overflow parsing '10E1000' | A parsed number could not be stored as without changing it to NaN or INF. +json.exception.out_of_range.407 | number overflow serializing '9223372036854775808' | UBJSON and BSON only support integer numbers up to 9223372036854775807. (until version 3.8.0) | +json.exception.out_of_range.408 | excessive array size: 8658170730974374167 | The size (following `#`) of an UBJSON array or object exceeds the maximal capacity. 
| +json.exception.out_of_range.409 | BSON key cannot contain code point U+0000 (at byte 2) | Key identifiers to be serialized to BSON cannot contain code point U+0000, since the key is stored as zero-terminated c-string | + +@liveexample{The following code shows how an `out_of_range` exception can be +caught.,out_of_range} + +@sa - @ref exception for the base class of the library exceptions +@sa - @ref parse_error for exceptions indicating a parse error +@sa - @ref invalid_iterator for exceptions indicating errors with iterators +@sa - @ref type_error for exceptions indicating executing a member function with + a wrong type +@sa - @ref other_error for exceptions indicating other library errors + +@since version 3.0.0 +*/ +class out_of_range : public exception +{ + public: + static out_of_range create(int id_, const std::string& what_arg) + { + std::string w = exception::name("out_of_range", id_) + what_arg; + return out_of_range(id_, w.c_str()); + } + + private: + JSON_HEDLEY_NON_NULL(3) + out_of_range(int id_, const char* what_arg) : exception(id_, what_arg) {} +}; + +/*! +@brief exception indicating other library errors + +This exception is thrown in case of errors that cannot be classified with the +other exception types. + +Exceptions have ids 5xx. + +name / id | example message | description +------------------------------ | --------------- | ------------------------- +json.exception.other_error.501 | unsuccessful: {"op":"test","path":"/baz", "value":"bar"} | A JSON Patch operation 'test' failed. The unsuccessful operation is also printed. 
+ +@sa - @ref exception for the base class of the library exceptions +@sa - @ref parse_error for exceptions indicating a parse error +@sa - @ref invalid_iterator for exceptions indicating errors with iterators +@sa - @ref type_error for exceptions indicating executing a member function with + a wrong type +@sa - @ref out_of_range for exceptions indicating access out of the defined range + +@liveexample{The following code shows how an `other_error` exception can be +caught.,other_error} + +@since version 3.0.0 +*/ +class other_error : public exception +{ + public: + static other_error create(int id_, const std::string& what_arg) + { + std::string w = exception::name("other_error", id_) + what_arg; + return other_error(id_, w.c_str()); + } + + private: + JSON_HEDLEY_NON_NULL(3) + other_error(int id_, const char* what_arg) : exception(id_, what_arg) {} +}; +} // namespace detail +} // namespace nlohmann + +// #include + +// #include + + +#include // size_t +#include // conditional, enable_if, false_type, integral_constant, is_constructible, is_integral, is_same, remove_cv, remove_reference, true_type + +namespace nlohmann +{ +namespace detail +{ +// alias templates to reduce boilerplate +template +using enable_if_t = typename std::enable_if::type; + +template +using uncvref_t = typename std::remove_cv::type>::type; + +// implementation of C++14 index_sequence and affiliates +// source: https://stackoverflow.com/a/32223343 +template +struct index_sequence +{ + using type = index_sequence; + using value_type = std::size_t; + static constexpr std::size_t size() noexcept + { + return sizeof...(Ints); + } +}; + +template +struct merge_and_renumber; + +template +struct merge_and_renumber, index_sequence> + : index_sequence < I1..., (sizeof...(I1) + I2)... 
> {}; + +template +struct make_index_sequence + : merge_and_renumber < typename make_index_sequence < N / 2 >::type, + typename make_index_sequence < N - N / 2 >::type > {}; + +template<> struct make_index_sequence<0> : index_sequence<> {}; +template<> struct make_index_sequence<1> : index_sequence<0> {}; + +template +using index_sequence_for = make_index_sequence; + +// dispatch utility (taken from ranges-v3) +template struct priority_tag : priority_tag < N - 1 > {}; +template<> struct priority_tag<0> {}; + +// taken from ranges-v3 +template +struct static_const +{ + static constexpr T value{}; +}; + +template +constexpr T static_const::value; +} // namespace detail +} // namespace nlohmann + +// #include + + +#include // numeric_limits +#include // false_type, is_constructible, is_integral, is_same, true_type +#include // declval + +// #include + + +#include // random_access_iterator_tag + +// #include + + +namespace nlohmann +{ +namespace detail +{ +template struct make_void +{ + using type = void; +}; +template using void_t = typename make_void::type; +} // namespace detail +} // namespace nlohmann + +// #include + + +namespace nlohmann +{ +namespace detail +{ +template +struct iterator_types {}; + +template +struct iterator_types < + It, + void_t> +{ + using difference_type = typename It::difference_type; + using value_type = typename It::value_type; + using pointer = typename It::pointer; + using reference = typename It::reference; + using iterator_category = typename It::iterator_category; +}; + +// This is required as some compilers implement std::iterator_traits in a way that +// doesn't work with SFINAE. See https://github.com/nlohmann/json/issues/1341. 
+template +struct iterator_traits +{ +}; + +template +struct iterator_traits < T, enable_if_t < !std::is_pointer::value >> + : iterator_types +{ +}; + +template +struct iterator_traits::value>> +{ + using iterator_category = std::random_access_iterator_tag; + using value_type = T; + using difference_type = ptrdiff_t; + using pointer = T*; + using reference = T&; +}; +} // namespace detail +} // namespace nlohmann + +// #include + +// #include + +// #include + + +#include + +// #include + + +// https://en.cppreference.com/w/cpp/experimental/is_detected +namespace nlohmann +{ +namespace detail +{ +struct nonesuch +{ + nonesuch() = delete; + ~nonesuch() = delete; + nonesuch(nonesuch const&) = delete; + nonesuch(nonesuch const&&) = delete; + void operator=(nonesuch const&) = delete; + void operator=(nonesuch&&) = delete; +}; + +template class Op, + class... Args> +struct detector +{ + using value_t = std::false_type; + using type = Default; +}; + +template class Op, class... Args> +struct detector>, Op, Args...> +{ + using value_t = std::true_type; + using type = Op; +}; + +template class Op, class... Args> +using is_detected = typename detector::value_t; + +template class Op, class... Args> +using detected_t = typename detector::type; + +template class Op, class... Args> +using detected_or = detector; + +template class Op, class... Args> +using detected_or_t = typename detected_or::type; + +template class Op, class... Args> +using is_detected_exact = std::is_same>; + +template class Op, class... Args> +using is_detected_convertible = + std::is_convertible, To>; +} // namespace detail +} // namespace nlohmann + +// #include +#ifndef INCLUDE_NLOHMANN_JSON_FWD_HPP_ +#define INCLUDE_NLOHMANN_JSON_FWD_HPP_ + +#include // int64_t, uint64_t +#include // map +#include // allocator +#include // string +#include // vector + +/*! +@brief namespace for Niels Lohmann +@see https://github.com/nlohmann +@since version 1.0.0 +*/ +namespace nlohmann +{ +/*! 
+@brief default JSONSerializer template argument + +This serializer ignores the template arguments and uses ADL +([argument-dependent lookup](https://en.cppreference.com/w/cpp/language/adl)) +for serialization. +*/ +template +struct adl_serializer; + +template class ObjectType = + std::map, + template class ArrayType = std::vector, + class StringType = std::string, class BooleanType = bool, + class NumberIntegerType = std::int64_t, + class NumberUnsignedType = std::uint64_t, + class NumberFloatType = double, + template class AllocatorType = std::allocator, + template class JSONSerializer = + adl_serializer, + class BinaryType = std::vector> +class basic_json; + +/*! +@brief JSON Pointer + +A JSON pointer defines a string syntax for identifying a specific value +within a JSON document. It can be used with functions `at` and +`operator[]`. Furthermore, JSON pointers are the base for JSON patches. + +@sa [RFC 6901](https://tools.ietf.org/html/rfc6901) + +@since version 2.0.0 +*/ +template +class json_pointer; + +/*! +@brief default JSON class + +This type is the default specialization of the @ref basic_json class which +uses the standard template types. + +@since version 1.0.0 +*/ +using json = basic_json<>; + +template +struct ordered_map; + +/*! +@brief ordered JSON class + +This type preserves the insertion order of object keys. + +@since version 3.9.0 +*/ +using ordered_json = basic_json; + +} // namespace nlohmann + +#endif // INCLUDE_NLOHMANN_JSON_FWD_HPP_ + + +namespace nlohmann +{ +/*! +@brief detail namespace with internal helper functions + +This namespace collects functions that should not be exposed, +implementations of some @ref basic_json methods, and meta-programming helpers. + +@since version 2.1.0 +*/ +namespace detail +{ +///////////// +// helpers // +///////////// + +// Note to maintainers: +// +// Every trait in this file expects a non CV-qualified type. +// The only exceptions are in the 'aliases for detected' section +// (i.e. 
those of the form: decltype(T::member_function(std::declval()))) +// +// In this case, T has to be properly CV-qualified to constraint the function arguments +// (e.g. to_json(BasicJsonType&, const T&)) + +template struct is_basic_json : std::false_type {}; + +NLOHMANN_BASIC_JSON_TPL_DECLARATION +struct is_basic_json : std::true_type {}; + +////////////////////// +// json_ref helpers // +////////////////////// + +template +class json_ref; + +template +struct is_json_ref : std::false_type {}; + +template +struct is_json_ref> : std::true_type {}; + +////////////////////////// +// aliases for detected // +////////////////////////// + +template +using mapped_type_t = typename T::mapped_type; + +template +using key_type_t = typename T::key_type; + +template +using value_type_t = typename T::value_type; + +template +using difference_type_t = typename T::difference_type; + +template +using pointer_t = typename T::pointer; + +template +using reference_t = typename T::reference; + +template +using iterator_category_t = typename T::iterator_category; + +template +using iterator_t = typename T::iterator; + +template +using to_json_function = decltype(T::to_json(std::declval()...)); + +template +using from_json_function = decltype(T::from_json(std::declval()...)); + +template +using get_template_function = decltype(std::declval().template get()); + +// trait checking if JSONSerializer::from_json(json const&, udt&) exists +template +struct has_from_json : std::false_type {}; + +// trait checking if j.get is valid +// use this trait instead of std::is_constructible or std::is_convertible, +// both rely on, or make use of implicit conversions, and thus fail when T +// has several constructors/operator= (see https://github.com/nlohmann/json/issues/958) +template +struct is_getable +{ + static constexpr bool value = is_detected::value; +}; + +template +struct has_from_json < BasicJsonType, T, + enable_if_t < !is_basic_json::value >> +{ + using serializer = typename 
BasicJsonType::template json_serializer; + + static constexpr bool value = + is_detected_exact::value; +}; + +// This trait checks if JSONSerializer::from_json(json const&) exists +// this overload is used for non-default-constructible user-defined-types +template +struct has_non_default_from_json : std::false_type {}; + +template +struct has_non_default_from_json < BasicJsonType, T, enable_if_t < !is_basic_json::value >> +{ + using serializer = typename BasicJsonType::template json_serializer; + + static constexpr bool value = + is_detected_exact::value; +}; + +// This trait checks if BasicJsonType::json_serializer::to_json exists +// Do not evaluate the trait when T is a basic_json type, to avoid template instantiation infinite recursion. +template +struct has_to_json : std::false_type {}; + +template +struct has_to_json < BasicJsonType, T, enable_if_t < !is_basic_json::value >> +{ + using serializer = typename BasicJsonType::template json_serializer; + + static constexpr bool value = + is_detected_exact::value; +}; + + +/////////////////// +// is_ functions // +/////////////////// + +template +struct is_iterator_traits : std::false_type {}; + +template +struct is_iterator_traits> +{ + private: + using traits = iterator_traits; + + public: + static constexpr auto value = + is_detected::value && + is_detected::value && + is_detected::value && + is_detected::value && + is_detected::value; +}; + +// source: https://stackoverflow.com/a/37193089/4116453 + +template +struct is_complete_type : std::false_type {}; + +template +struct is_complete_type : std::true_type {}; + +template +struct is_compatible_object_type_impl : std::false_type {}; + +template +struct is_compatible_object_type_impl < + BasicJsonType, CompatibleObjectType, + enable_if_t < is_detected::value&& + is_detected::value >> +{ + + using object_t = typename BasicJsonType::object_t; + + // macOS's is_constructible does not play well with nonesuch... 
+ static constexpr bool value = + std::is_constructible::value && + std::is_constructible::value; +}; + +template +struct is_compatible_object_type + : is_compatible_object_type_impl {}; + +template +struct is_constructible_object_type_impl : std::false_type {}; + +template +struct is_constructible_object_type_impl < + BasicJsonType, ConstructibleObjectType, + enable_if_t < is_detected::value&& + is_detected::value >> +{ + using object_t = typename BasicJsonType::object_t; + + static constexpr bool value = + (std::is_default_constructible::value && + (std::is_move_assignable::value || + std::is_copy_assignable::value) && + (std::is_constructible::value && + std::is_same < + typename object_t::mapped_type, + typename ConstructibleObjectType::mapped_type >::value)) || + (has_from_json::value || + has_non_default_from_json < + BasicJsonType, + typename ConstructibleObjectType::mapped_type >::value); +}; + +template +struct is_constructible_object_type + : is_constructible_object_type_impl {}; + +template +struct is_compatible_string_type_impl : std::false_type {}; + +template +struct is_compatible_string_type_impl < + BasicJsonType, CompatibleStringType, + enable_if_t::value >> +{ + static constexpr auto value = + std::is_constructible::value; +}; + +template +struct is_compatible_string_type + : is_compatible_string_type_impl {}; + +template +struct is_constructible_string_type_impl : std::false_type {}; + +template +struct is_constructible_string_type_impl < + BasicJsonType, ConstructibleStringType, + enable_if_t::value >> +{ + static constexpr auto value = + std::is_constructible::value; +}; + +template +struct is_constructible_string_type + : is_constructible_string_type_impl {}; + +template +struct is_compatible_array_type_impl : std::false_type {}; + +template +struct is_compatible_array_type_impl < + BasicJsonType, CompatibleArrayType, + enable_if_t < is_detected::value&& + is_detected::value&& +// This is needed because json_reverse_iterator has a ::iterator 
type... +// Therefore it is detected as a CompatibleArrayType. +// The real fix would be to have an Iterable concept. + !is_iterator_traits < + iterator_traits>::value >> +{ + static constexpr bool value = + std::is_constructible::value; +}; + +template +struct is_compatible_array_type + : is_compatible_array_type_impl {}; + +template +struct is_constructible_array_type_impl : std::false_type {}; + +template +struct is_constructible_array_type_impl < + BasicJsonType, ConstructibleArrayType, + enable_if_t::value >> + : std::true_type {}; + +template +struct is_constructible_array_type_impl < + BasicJsonType, ConstructibleArrayType, + enable_if_t < !std::is_same::value&& + std::is_default_constructible::value&& +(std::is_move_assignable::value || + std::is_copy_assignable::value)&& +is_detected::value&& +is_detected::value&& +is_complete_type < +detected_t>::value >> +{ + static constexpr bool value = + // This is needed because json_reverse_iterator has a ::iterator type, + // furthermore, std::back_insert_iterator (and other iterators) have a + // base class `iterator`... Therefore it is detected as a + // ConstructibleArrayType. The real fix would be to have an Iterable + // concept. + !is_iterator_traits>::value && + + (std::is_same::value || + has_from_json::value || + has_non_default_from_json < + BasicJsonType, typename ConstructibleArrayType::value_type >::value); +}; + +template +struct is_constructible_array_type + : is_constructible_array_type_impl {}; + +template +struct is_compatible_integer_type_impl : std::false_type {}; + +template +struct is_compatible_integer_type_impl < + RealIntegerType, CompatibleNumberIntegerType, + enable_if_t < std::is_integral::value&& + std::is_integral::value&& + !std::is_same::value >> +{ + // is there an assert somewhere on overflows? 
+ using RealLimits = std::numeric_limits; + using CompatibleLimits = std::numeric_limits; + + static constexpr auto value = + std::is_constructible::value && + CompatibleLimits::is_integer && + RealLimits::is_signed == CompatibleLimits::is_signed; +}; + +template +struct is_compatible_integer_type + : is_compatible_integer_type_impl {}; + +template +struct is_compatible_type_impl: std::false_type {}; + +template +struct is_compatible_type_impl < + BasicJsonType, CompatibleType, + enable_if_t::value >> +{ + static constexpr bool value = + has_to_json::value; +}; + +template +struct is_compatible_type + : is_compatible_type_impl {}; + +// https://en.cppreference.com/w/cpp/types/conjunction +template struct conjunction : std::true_type { }; +template struct conjunction : B1 { }; +template +struct conjunction +: std::conditional, B1>::type {}; + +template +struct is_constructible_tuple : std::false_type {}; + +template +struct is_constructible_tuple> : conjunction...> {}; +} // namespace detail +} // namespace nlohmann + +// #include + + +#include // array +#include // size_t +#include // uint8_t +#include // string + +namespace nlohmann +{ +namespace detail +{ +/////////////////////////// +// JSON type enumeration // +/////////////////////////// + +/*! +@brief the JSON type enumeration + +This enumeration collects the different JSON types. It is internally used to +distinguish the stored values, and the functions @ref basic_json::is_null(), +@ref basic_json::is_object(), @ref basic_json::is_array(), +@ref basic_json::is_string(), @ref basic_json::is_boolean(), +@ref basic_json::is_number() (with @ref basic_json::is_number_integer(), +@ref basic_json::is_number_unsigned(), and @ref basic_json::is_number_float()), +@ref basic_json::is_discarded(), @ref basic_json::is_primitive(), and +@ref basic_json::is_structured() rely on it. 
+ +@note There are three enumeration entries (number_integer, number_unsigned, and +number_float), because the library distinguishes these three types for numbers: +@ref basic_json::number_unsigned_t is used for unsigned integers, +@ref basic_json::number_integer_t is used for signed integers, and +@ref basic_json::number_float_t is used for floating-point numbers or to +approximate integers which do not fit in the limits of their respective type. + +@sa @ref basic_json::basic_json(const value_t value_type) -- create a JSON +value with the default value for a given type + +@since version 1.0.0 +*/ +enum class value_t : std::uint8_t +{ + null, ///< null value + object, ///< object (unordered set of name/value pairs) + array, ///< array (ordered collection of values) + string, ///< string value + boolean, ///< boolean value + number_integer, ///< number value (signed integer) + number_unsigned, ///< number value (unsigned integer) + number_float, ///< number value (floating-point) + binary, ///< binary array (ordered collection of bytes) + discarded ///< discarded by the parser callback function +}; + +/*! +@brief comparison operator for JSON types + +Returns an ordering that is similar to Python: +- order: null < boolean < number < object < array < string < binary +- furthermore, each type is not smaller than itself +- discarded values are not comparable +- binary is represented as a b"" string in python and directly comparable to a + string; however, making a binary array directly comparable with a string would + be surprising behavior in a JSON file. 
+ +@since version 1.0.0 +*/ +inline bool operator<(const value_t lhs, const value_t rhs) noexcept +{ + static constexpr std::array order = {{ + 0 /* null */, 3 /* object */, 4 /* array */, 5 /* string */, + 1 /* boolean */, 2 /* integer */, 2 /* unsigned */, 2 /* float */, + 6 /* binary */ + } + }; + + const auto l_index = static_cast(lhs); + const auto r_index = static_cast(rhs); + return l_index < order.size() && r_index < order.size() && order[l_index] < order[r_index]; +} +} // namespace detail +} // namespace nlohmann + + +namespace nlohmann +{ +namespace detail +{ +template +void from_json(const BasicJsonType& j, typename std::nullptr_t& n) +{ + if (JSON_HEDLEY_UNLIKELY(!j.is_null())) + { + JSON_THROW(type_error::create(302, "type must be null, but is " + std::string(j.type_name()))); + } + n = nullptr; +} + +// overloads for basic_json template parameters +template < typename BasicJsonType, typename ArithmeticType, + enable_if_t < std::is_arithmetic::value&& + !std::is_same::value, + int > = 0 > +void get_arithmetic_value(const BasicJsonType& j, ArithmeticType& val) +{ + switch (static_cast(j)) + { + case value_t::number_unsigned: + { + val = static_cast(*j.template get_ptr()); + break; + } + case value_t::number_integer: + { + val = static_cast(*j.template get_ptr()); + break; + } + case value_t::number_float: + { + val = static_cast(*j.template get_ptr()); + break; + } + + default: + JSON_THROW(type_error::create(302, "type must be number, but is " + std::string(j.type_name()))); + } +} + +template +void from_json(const BasicJsonType& j, typename BasicJsonType::boolean_t& b) +{ + if (JSON_HEDLEY_UNLIKELY(!j.is_boolean())) + { + JSON_THROW(type_error::create(302, "type must be boolean, but is " + std::string(j.type_name()))); + } + b = *j.template get_ptr(); +} + +template +void from_json(const BasicJsonType& j, typename BasicJsonType::string_t& s) +{ + if (JSON_HEDLEY_UNLIKELY(!j.is_string())) + { + JSON_THROW(type_error::create(302, "type must be string, 
but is " + std::string(j.type_name()))); + } + s = *j.template get_ptr(); +} + +template < + typename BasicJsonType, typename ConstructibleStringType, + enable_if_t < + is_constructible_string_type::value&& + !std::is_same::value, + int > = 0 > +void from_json(const BasicJsonType& j, ConstructibleStringType& s) +{ + if (JSON_HEDLEY_UNLIKELY(!j.is_string())) + { + JSON_THROW(type_error::create(302, "type must be string, but is " + std::string(j.type_name()))); + } + + s = *j.template get_ptr(); +} + +template +void from_json(const BasicJsonType& j, typename BasicJsonType::number_float_t& val) +{ + get_arithmetic_value(j, val); +} + +template +void from_json(const BasicJsonType& j, typename BasicJsonType::number_unsigned_t& val) +{ + get_arithmetic_value(j, val); +} + +template +void from_json(const BasicJsonType& j, typename BasicJsonType::number_integer_t& val) +{ + get_arithmetic_value(j, val); +} + +template::value, int> = 0> +void from_json(const BasicJsonType& j, EnumType& e) +{ + typename std::underlying_type::type val; + get_arithmetic_value(j, val); + e = static_cast(val); +} + +// forward_list doesn't have an insert method +template::value, int> = 0> +void from_json(const BasicJsonType& j, std::forward_list& l) +{ + if (JSON_HEDLEY_UNLIKELY(!j.is_array())) + { + JSON_THROW(type_error::create(302, "type must be array, but is " + std::string(j.type_name()))); + } + l.clear(); + std::transform(j.rbegin(), j.rend(), + std::front_inserter(l), [](const BasicJsonType & i) + { + return i.template get(); + }); +} + +// valarray doesn't have an insert method +template::value, int> = 0> +void from_json(const BasicJsonType& j, std::valarray& l) +{ + if (JSON_HEDLEY_UNLIKELY(!j.is_array())) + { + JSON_THROW(type_error::create(302, "type must be array, but is " + std::string(j.type_name()))); + } + l.resize(j.size()); + std::transform(j.begin(), j.end(), std::begin(l), + [](const BasicJsonType & elem) + { + return elem.template get(); + }); +} + +template +auto 
from_json(const BasicJsonType& j, T (&arr)[N]) +-> decltype(j.template get(), void()) +{ + for (std::size_t i = 0; i < N; ++i) + { + arr[i] = j.at(i).template get(); + } +} + +template +void from_json_array_impl(const BasicJsonType& j, typename BasicJsonType::array_t& arr, priority_tag<3> /*unused*/) +{ + arr = *j.template get_ptr(); +} + +template +auto from_json_array_impl(const BasicJsonType& j, std::array& arr, + priority_tag<2> /*unused*/) +-> decltype(j.template get(), void()) +{ + for (std::size_t i = 0; i < N; ++i) + { + arr[i] = j.at(i).template get(); + } +} + +template +auto from_json_array_impl(const BasicJsonType& j, ConstructibleArrayType& arr, priority_tag<1> /*unused*/) +-> decltype( + arr.reserve(std::declval()), + j.template get(), + void()) +{ + using std::end; + + ConstructibleArrayType ret; + ret.reserve(j.size()); + std::transform(j.begin(), j.end(), + std::inserter(ret, end(ret)), [](const BasicJsonType & i) + { + // get() returns *this, this won't call a from_json + // method when value_type is BasicJsonType + return i.template get(); + }); + arr = std::move(ret); +} + +template +void from_json_array_impl(const BasicJsonType& j, ConstructibleArrayType& arr, + priority_tag<0> /*unused*/) +{ + using std::end; + + ConstructibleArrayType ret; + std::transform( + j.begin(), j.end(), std::inserter(ret, end(ret)), + [](const BasicJsonType & i) + { + // get() returns *this, this won't call a from_json + // method when value_type is BasicJsonType + return i.template get(); + }); + arr = std::move(ret); +} + +template < typename BasicJsonType, typename ConstructibleArrayType, + enable_if_t < + is_constructible_array_type::value&& + !is_constructible_object_type::value&& + !is_constructible_string_type::value&& + !std::is_same::value&& + !is_basic_json::value, + int > = 0 > +auto from_json(const BasicJsonType& j, ConstructibleArrayType& arr) +-> decltype(from_json_array_impl(j, arr, priority_tag<3> {}), +j.template get(), +void()) +{ + if 
(JSON_HEDLEY_UNLIKELY(!j.is_array())) + { + JSON_THROW(type_error::create(302, "type must be array, but is " + + std::string(j.type_name()))); + } + + from_json_array_impl(j, arr, priority_tag<3> {}); +} + +template +void from_json(const BasicJsonType& j, typename BasicJsonType::binary_t& bin) +{ + if (JSON_HEDLEY_UNLIKELY(!j.is_binary())) + { + JSON_THROW(type_error::create(302, "type must be binary, but is " + std::string(j.type_name()))); + } + + bin = *j.template get_ptr(); +} + +template::value, int> = 0> +void from_json(const BasicJsonType& j, ConstructibleObjectType& obj) +{ + if (JSON_HEDLEY_UNLIKELY(!j.is_object())) + { + JSON_THROW(type_error::create(302, "type must be object, but is " + std::string(j.type_name()))); + } + + ConstructibleObjectType ret; + auto inner_object = j.template get_ptr(); + using value_type = typename ConstructibleObjectType::value_type; + std::transform( + inner_object->begin(), inner_object->end(), + std::inserter(ret, ret.begin()), + [](typename BasicJsonType::object_t::value_type const & p) + { + return value_type(p.first, p.second.template get()); + }); + obj = std::move(ret); +} + +// overload for arithmetic types, not chosen for basic_json template arguments +// (BooleanType, etc..); note: Is it really necessary to provide explicit +// overloads for boolean_t etc. in case of a custom BooleanType which is not +// an arithmetic type? 
+template < typename BasicJsonType, typename ArithmeticType, + enable_if_t < + std::is_arithmetic::value&& + !std::is_same::value&& + !std::is_same::value&& + !std::is_same::value&& + !std::is_same::value, + int > = 0 > +void from_json(const BasicJsonType& j, ArithmeticType& val) +{ + switch (static_cast(j)) + { + case value_t::number_unsigned: + { + val = static_cast(*j.template get_ptr()); + break; + } + case value_t::number_integer: + { + val = static_cast(*j.template get_ptr()); + break; + } + case value_t::number_float: + { + val = static_cast(*j.template get_ptr()); + break; + } + case value_t::boolean: + { + val = static_cast(*j.template get_ptr()); + break; + } + + default: + JSON_THROW(type_error::create(302, "type must be number, but is " + std::string(j.type_name()))); + } +} + +template +void from_json(const BasicJsonType& j, std::pair& p) +{ + p = {j.at(0).template get(), j.at(1).template get()}; +} + +template +void from_json_tuple_impl(const BasicJsonType& j, Tuple& t, index_sequence /*unused*/) +{ + t = std::make_tuple(j.at(Idx).template get::type>()...); +} + +template +void from_json(const BasicJsonType& j, std::tuple& t) +{ + from_json_tuple_impl(j, t, index_sequence_for {}); +} + +template < typename BasicJsonType, typename Key, typename Value, typename Compare, typename Allocator, + typename = enable_if_t < !std::is_constructible < + typename BasicJsonType::string_t, Key >::value >> +void from_json(const BasicJsonType& j, std::map& m) +{ + if (JSON_HEDLEY_UNLIKELY(!j.is_array())) + { + JSON_THROW(type_error::create(302, "type must be array, but is " + std::string(j.type_name()))); + } + m.clear(); + for (const auto& p : j) + { + if (JSON_HEDLEY_UNLIKELY(!p.is_array())) + { + JSON_THROW(type_error::create(302, "type must be array, but is " + std::string(p.type_name()))); + } + m.emplace(p.at(0).template get(), p.at(1).template get()); + } +} + +template < typename BasicJsonType, typename Key, typename Value, typename Hash, typename KeyEqual, 
typename Allocator, + typename = enable_if_t < !std::is_constructible < + typename BasicJsonType::string_t, Key >::value >> +void from_json(const BasicJsonType& j, std::unordered_map& m) +{ + if (JSON_HEDLEY_UNLIKELY(!j.is_array())) + { + JSON_THROW(type_error::create(302, "type must be array, but is " + std::string(j.type_name()))); + } + m.clear(); + for (const auto& p : j) + { + if (JSON_HEDLEY_UNLIKELY(!p.is_array())) + { + JSON_THROW(type_error::create(302, "type must be array, but is " + std::string(p.type_name()))); + } + m.emplace(p.at(0).template get(), p.at(1).template get()); + } +} + +struct from_json_fn +{ + template + auto operator()(const BasicJsonType& j, T& val) const + noexcept(noexcept(from_json(j, val))) + -> decltype(from_json(j, val), void()) + { + return from_json(j, val); + } +}; +} // namespace detail + +/// namespace to hold default `from_json` function +/// to see why this is required: +/// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2015/n4381.html +namespace +{ +constexpr const auto& from_json = detail::static_const::value; +} // namespace +} // namespace nlohmann + +// #include + + +#include // copy +#include // begin, end +#include // string +#include // tuple, get +#include // is_same, is_constructible, is_floating_point, is_enum, underlying_type +#include // move, forward, declval, pair +#include // valarray +#include // vector + +// #include + + +#include // size_t +#include // input_iterator_tag +#include // string, to_string +#include // tuple_size, get, tuple_element + +// #include + +// #include + + +namespace nlohmann +{ +namespace detail +{ +template +void int_to_string( string_type& target, std::size_t value ) +{ + // For ADL + using std::to_string; + target = to_string(value); +} +template class iteration_proxy_value +{ + public: + using difference_type = std::ptrdiff_t; + using value_type = iteration_proxy_value; + using pointer = value_type * ; + using reference = value_type & ; + using iterator_category = 
std::input_iterator_tag; + using string_type = typename std::remove_cv< typename std::remove_reference().key() ) >::type >::type; + + private: + /// the iterator + IteratorType anchor; + /// an index for arrays (used to create key names) + std::size_t array_index = 0; + /// last stringified array index + mutable std::size_t array_index_last = 0; + /// a string representation of the array index + mutable string_type array_index_str = "0"; + /// an empty string (to return a reference for primitive values) + const string_type empty_str = ""; + + public: + explicit iteration_proxy_value(IteratorType it) noexcept : anchor(it) {} + + /// dereference operator (needed for range-based for) + iteration_proxy_value& operator*() + { + return *this; + } + + /// increment operator (needed for range-based for) + iteration_proxy_value& operator++() + { + ++anchor; + ++array_index; + + return *this; + } + + /// equality operator (needed for InputIterator) + bool operator==(const iteration_proxy_value& o) const + { + return anchor == o.anchor; + } + + /// inequality operator (needed for range-based for) + bool operator!=(const iteration_proxy_value& o) const + { + return anchor != o.anchor; + } + + /// return key of the iterator + const string_type& key() const + { + JSON_ASSERT(anchor.m_object != nullptr); + + switch (anchor.m_object->type()) + { + // use integer array index as key + case value_t::array: + { + if (array_index != array_index_last) + { + int_to_string( array_index_str, array_index ); + array_index_last = array_index; + } + return array_index_str; + } + + // use key from the object + case value_t::object: + return anchor.key(); + + // use an empty key for all primitive types + default: + return empty_str; + } + } + + /// return value of the iterator + typename IteratorType::reference value() const + { + return anchor.value(); + } +}; + +/// proxy class for the items() function +template class iteration_proxy +{ + private: + /// the container to iterate + typename 
IteratorType::reference container; + + public: + /// construct iteration proxy from a container + explicit iteration_proxy(typename IteratorType::reference cont) noexcept + : container(cont) {} + + /// return iterator begin (needed for range-based for) + iteration_proxy_value begin() noexcept + { + return iteration_proxy_value(container.begin()); + } + + /// return iterator end (needed for range-based for) + iteration_proxy_value end() noexcept + { + return iteration_proxy_value(container.end()); + } +}; +// Structured Bindings Support +// For further reference see https://blog.tartanllama.xyz/structured-bindings/ +// And see https://github.com/nlohmann/json/pull/1391 +template = 0> +auto get(const nlohmann::detail::iteration_proxy_value& i) -> decltype(i.key()) +{ + return i.key(); +} +// Structured Bindings Support +// For further reference see https://blog.tartanllama.xyz/structured-bindings/ +// And see https://github.com/nlohmann/json/pull/1391 +template = 0> +auto get(const nlohmann::detail::iteration_proxy_value& i) -> decltype(i.value()) +{ + return i.value(); +} +} // namespace detail +} // namespace nlohmann + +// The Addition to the STD Namespace is required to add +// Structured Bindings Support to the iteration_proxy_value class +// For further reference see https://blog.tartanllama.xyz/structured-bindings/ +// And see https://github.com/nlohmann/json/pull/1391 +namespace std +{ +#if defined(__clang__) + // Fix: https://github.com/nlohmann/json/issues/1401 + #pragma clang diagnostic push + #pragma clang diagnostic ignored "-Wmismatched-tags" +#endif +template +class tuple_size<::nlohmann::detail::iteration_proxy_value> + : public std::integral_constant {}; + +template +class tuple_element> +{ + public: + using type = decltype( + get(std::declval < + ::nlohmann::detail::iteration_proxy_value> ())); +}; +#if defined(__clang__) + #pragma clang diagnostic pop +#endif +} // namespace std + +// #include + +// #include + +// #include + + +namespace nlohmann 
+{ +namespace detail +{ +////////////////// +// constructors // +////////////////// + +template struct external_constructor; + +template<> +struct external_constructor +{ + template + static void construct(BasicJsonType& j, typename BasicJsonType::boolean_t b) noexcept + { + j.m_type = value_t::boolean; + j.m_value = b; + j.assert_invariant(); + } +}; + +template<> +struct external_constructor +{ + template + static void construct(BasicJsonType& j, const typename BasicJsonType::string_t& s) + { + j.m_type = value_t::string; + j.m_value = s; + j.assert_invariant(); + } + + template + static void construct(BasicJsonType& j, typename BasicJsonType::string_t&& s) + { + j.m_type = value_t::string; + j.m_value = std::move(s); + j.assert_invariant(); + } + + template < typename BasicJsonType, typename CompatibleStringType, + enable_if_t < !std::is_same::value, + int > = 0 > + static void construct(BasicJsonType& j, const CompatibleStringType& str) + { + j.m_type = value_t::string; + j.m_value.string = j.template create(str); + j.assert_invariant(); + } +}; + +template<> +struct external_constructor +{ + template + static void construct(BasicJsonType& j, const typename BasicJsonType::binary_t& b) + { + j.m_type = value_t::binary; + typename BasicJsonType::binary_t value{b}; + j.m_value = value; + j.assert_invariant(); + } + + template + static void construct(BasicJsonType& j, typename BasicJsonType::binary_t&& b) + { + j.m_type = value_t::binary; + typename BasicJsonType::binary_t value{std::move(b)}; + j.m_value = value; + j.assert_invariant(); + } +}; + +template<> +struct external_constructor +{ + template + static void construct(BasicJsonType& j, typename BasicJsonType::number_float_t val) noexcept + { + j.m_type = value_t::number_float; + j.m_value = val; + j.assert_invariant(); + } +}; + +template<> +struct external_constructor +{ + template + static void construct(BasicJsonType& j, typename BasicJsonType::number_unsigned_t val) noexcept + { + j.m_type = 
value_t::number_unsigned; + j.m_value = val; + j.assert_invariant(); + } +}; + +template<> +struct external_constructor +{ + template + static void construct(BasicJsonType& j, typename BasicJsonType::number_integer_t val) noexcept + { + j.m_type = value_t::number_integer; + j.m_value = val; + j.assert_invariant(); + } +}; + +template<> +struct external_constructor +{ + template + static void construct(BasicJsonType& j, const typename BasicJsonType::array_t& arr) + { + j.m_type = value_t::array; + j.m_value = arr; + j.assert_invariant(); + } + + template + static void construct(BasicJsonType& j, typename BasicJsonType::array_t&& arr) + { + j.m_type = value_t::array; + j.m_value = std::move(arr); + j.assert_invariant(); + } + + template < typename BasicJsonType, typename CompatibleArrayType, + enable_if_t < !std::is_same::value, + int > = 0 > + static void construct(BasicJsonType& j, const CompatibleArrayType& arr) + { + using std::begin; + using std::end; + j.m_type = value_t::array; + j.m_value.array = j.template create(begin(arr), end(arr)); + j.assert_invariant(); + } + + template + static void construct(BasicJsonType& j, const std::vector& arr) + { + j.m_type = value_t::array; + j.m_value = value_t::array; + j.m_value.array->reserve(arr.size()); + for (const bool x : arr) + { + j.m_value.array->push_back(x); + } + j.assert_invariant(); + } + + template::value, int> = 0> + static void construct(BasicJsonType& j, const std::valarray& arr) + { + j.m_type = value_t::array; + j.m_value = value_t::array; + j.m_value.array->resize(arr.size()); + if (arr.size() > 0) + { + std::copy(std::begin(arr), std::end(arr), j.m_value.array->begin()); + } + j.assert_invariant(); + } +}; + +template<> +struct external_constructor +{ + template + static void construct(BasicJsonType& j, const typename BasicJsonType::object_t& obj) + { + j.m_type = value_t::object; + j.m_value = obj; + j.assert_invariant(); + } + + template + static void construct(BasicJsonType& j, typename 
BasicJsonType::object_t&& obj) + { + j.m_type = value_t::object; + j.m_value = std::move(obj); + j.assert_invariant(); + } + + template < typename BasicJsonType, typename CompatibleObjectType, + enable_if_t < !std::is_same::value, int > = 0 > + static void construct(BasicJsonType& j, const CompatibleObjectType& obj) + { + using std::begin; + using std::end; + + j.m_type = value_t::object; + j.m_value.object = j.template create(begin(obj), end(obj)); + j.assert_invariant(); + } +}; + +///////////// +// to_json // +///////////// + +template::value, int> = 0> +void to_json(BasicJsonType& j, T b) noexcept +{ + external_constructor::construct(j, b); +} + +template::value, int> = 0> +void to_json(BasicJsonType& j, const CompatibleString& s) +{ + external_constructor::construct(j, s); +} + +template +void to_json(BasicJsonType& j, typename BasicJsonType::string_t&& s) +{ + external_constructor::construct(j, std::move(s)); +} + +template::value, int> = 0> +void to_json(BasicJsonType& j, FloatType val) noexcept +{ + external_constructor::construct(j, static_cast(val)); +} + +template::value, int> = 0> +void to_json(BasicJsonType& j, CompatibleNumberUnsignedType val) noexcept +{ + external_constructor::construct(j, static_cast(val)); +} + +template::value, int> = 0> +void to_json(BasicJsonType& j, CompatibleNumberIntegerType val) noexcept +{ + external_constructor::construct(j, static_cast(val)); +} + +template::value, int> = 0> +void to_json(BasicJsonType& j, EnumType e) noexcept +{ + using underlying_type = typename std::underlying_type::type; + external_constructor::construct(j, static_cast(e)); +} + +template +void to_json(BasicJsonType& j, const std::vector& e) +{ + external_constructor::construct(j, e); +} + +template < typename BasicJsonType, typename CompatibleArrayType, + enable_if_t < is_compatible_array_type::value&& + !is_compatible_object_type::value&& + !is_compatible_string_type::value&& + !std::is_same::value&& + !is_basic_json::value, + int > = 0 > +void 
to_json(BasicJsonType& j, const CompatibleArrayType& arr) +{ + external_constructor::construct(j, arr); +} + +template +void to_json(BasicJsonType& j, const typename BasicJsonType::binary_t& bin) +{ + external_constructor::construct(j, bin); +} + +template::value, int> = 0> +void to_json(BasicJsonType& j, const std::valarray& arr) +{ + external_constructor::construct(j, std::move(arr)); +} + +template +void to_json(BasicJsonType& j, typename BasicJsonType::array_t&& arr) +{ + external_constructor::construct(j, std::move(arr)); +} + +template < typename BasicJsonType, typename CompatibleObjectType, + enable_if_t < is_compatible_object_type::value&& !is_basic_json::value, int > = 0 > +void to_json(BasicJsonType& j, const CompatibleObjectType& obj) +{ + external_constructor::construct(j, obj); +} + +template +void to_json(BasicJsonType& j, typename BasicJsonType::object_t&& obj) +{ + external_constructor::construct(j, std::move(obj)); +} + +template < + typename BasicJsonType, typename T, std::size_t N, + enable_if_t < !std::is_constructible::value, + int > = 0 > +void to_json(BasicJsonType& j, const T(&arr)[N]) +{ + external_constructor::construct(j, arr); +} + +template < typename BasicJsonType, typename T1, typename T2, enable_if_t < std::is_constructible::value&& std::is_constructible::value, int > = 0 > +void to_json(BasicJsonType& j, const std::pair& p) +{ + j = { p.first, p.second }; +} + +// for https://github.com/nlohmann/json/pull/1134 +template>::value, int> = 0> +void to_json(BasicJsonType& j, const T& b) +{ + j = { {b.key(), b.value()} }; +} + +template +void to_json_tuple_impl(BasicJsonType& j, const Tuple& t, index_sequence /*unused*/) +{ + j = { std::get(t)... 
}; +} + +template::value, int > = 0> +void to_json(BasicJsonType& j, const T& t) +{ + to_json_tuple_impl(j, t, make_index_sequence::value> {}); +} + +struct to_json_fn +{ + template + auto operator()(BasicJsonType& j, T&& val) const noexcept(noexcept(to_json(j, std::forward(val)))) + -> decltype(to_json(j, std::forward(val)), void()) + { + return to_json(j, std::forward(val)); + } +}; +} // namespace detail + +/// namespace to hold default `to_json` function +namespace +{ +constexpr const auto& to_json = detail::static_const::value; +} // namespace +} // namespace nlohmann + + +namespace nlohmann +{ + +template +struct adl_serializer +{ + /*! + @brief convert a JSON value to any value type + + This function is usually called by the `get()` function of the + @ref basic_json class (either explicit or via conversion operators). + + @param[in] j JSON value to read from + @param[in,out] val value to write to + */ + template + static auto from_json(BasicJsonType&& j, ValueType& val) noexcept( + noexcept(::nlohmann::from_json(std::forward(j), val))) + -> decltype(::nlohmann::from_json(std::forward(j), val), void()) + { + ::nlohmann::from_json(std::forward(j), val); + } + + /*! + @brief convert any value type to a JSON value + + This function is usually called by the constructors of the @ref basic_json + class. + + @param[in,out] j JSON value to write to + @param[in] val value to read from + */ + template + static auto to_json(BasicJsonType& j, ValueType&& val) noexcept( + noexcept(::nlohmann::to_json(j, std::forward(val)))) + -> decltype(::nlohmann::to_json(j, std::forward(val)), void()) + { + ::nlohmann::to_json(j, std::forward(val)); + } +}; + +} // namespace nlohmann + +// #include + + +#include // uint8_t +#include // tie +#include // move + +namespace nlohmann +{ + +/*! +@brief an internal type for a backed binary type + +This type extends the template parameter @a BinaryType provided to `basic_json` +with a subtype used by BSON and MessagePack. 
This type exists so that the user +does not have to specify a type themselves with a specific naming scheme in +order to override the binary type. + +@tparam BinaryType container to store bytes (`std::vector` by + default) + +@since version 3.8.0 +*/ +template +class byte_container_with_subtype : public BinaryType +{ + public: + /// the type of the underlying container + using container_type = BinaryType; + + byte_container_with_subtype() noexcept(noexcept(container_type())) + : container_type() + {} + + byte_container_with_subtype(const container_type& b) noexcept(noexcept(container_type(b))) + : container_type(b) + {} + + byte_container_with_subtype(container_type&& b) noexcept(noexcept(container_type(std::move(b)))) + : container_type(std::move(b)) + {} + + byte_container_with_subtype(const container_type& b, std::uint8_t subtype) noexcept(noexcept(container_type(b))) + : container_type(b) + , m_subtype(subtype) + , m_has_subtype(true) + {} + + byte_container_with_subtype(container_type&& b, std::uint8_t subtype) noexcept(noexcept(container_type(std::move(b)))) + : container_type(std::move(b)) + , m_subtype(subtype) + , m_has_subtype(true) + {} + + bool operator==(const byte_container_with_subtype& rhs) const + { + return std::tie(static_cast(*this), m_subtype, m_has_subtype) == + std::tie(static_cast(rhs), rhs.m_subtype, rhs.m_has_subtype); + } + + bool operator!=(const byte_container_with_subtype& rhs) const + { + return !(rhs == *this); + } + + /*! + @brief sets the binary subtype + + Sets the binary subtype of the value, also flags a binary JSON value as + having a subtype, which has implications for serialization. + + @complexity Constant. + + @exceptionsafety No-throw guarantee: this member function never throws + exceptions. 
+ + @sa @ref subtype() -- return the binary subtype + @sa @ref clear_subtype() -- clears the binary subtype + @sa @ref has_subtype() -- returns whether or not the binary value has a + subtype + + @since version 3.8.0 + */ + void set_subtype(std::uint8_t subtype) noexcept + { + m_subtype = subtype; + m_has_subtype = true; + } + + /*! + @brief return the binary subtype + + Returns the numerical subtype of the value if it has a subtype. If it does + not have a subtype, this function will return size_t(-1) as a sentinel + value. + + @return the numerical subtype of the binary value + + @complexity Constant. + + @exceptionsafety No-throw guarantee: this member function never throws + exceptions. + + @sa @ref set_subtype() -- sets the binary subtype + @sa @ref clear_subtype() -- clears the binary subtype + @sa @ref has_subtype() -- returns whether or not the binary value has a + subtype + + @since version 3.8.0 + */ + constexpr std::uint8_t subtype() const noexcept + { + return m_subtype; + } + + /*! + @brief return whether the value has a subtype + + @return whether the value has a subtype + + @complexity Constant. + + @exceptionsafety No-throw guarantee: this member function never throws + exceptions. + + @sa @ref subtype() -- return the binary subtype + @sa @ref set_subtype() -- sets the binary subtype + @sa @ref clear_subtype() -- clears the binary subtype + + @since version 3.8.0 + */ + constexpr bool has_subtype() const noexcept + { + return m_has_subtype; + } + + /*! + @brief clears the binary subtype + + Clears the binary subtype and flags the value as not having a subtype, which + has implications for serialization; for instance MessagePack will prefer the + bin family over the ext family. + + @complexity Constant. + + @exceptionsafety No-throw guarantee: this member function never throws + exceptions. 
+ + @sa @ref subtype() -- return the binary subtype + @sa @ref set_subtype() -- sets the binary subtype + @sa @ref has_subtype() -- returns whether or not the binary value has a + subtype + + @since version 3.8.0 + */ + void clear_subtype() noexcept + { + m_subtype = 0; + m_has_subtype = false; + } + + private: + std::uint8_t m_subtype = 0; + bool m_has_subtype = false; +}; + +} // namespace nlohmann + +// #include + +// #include + +// #include + +// #include + + +#include // size_t, uint8_t +#include // hash + +namespace nlohmann +{ +namespace detail +{ + +// boost::hash_combine +inline std::size_t combine(std::size_t seed, std::size_t h) noexcept +{ + seed ^= h + 0x9e3779b9 + (seed << 6U) + (seed >> 2U); + return seed; +} + +/*! +@brief hash a JSON value + +The hash function tries to rely on std::hash where possible. Furthermore, the +type of the JSON value is taken into account to have different hash values for +null, 0, 0U, and false, etc. + +@tparam BasicJsonType basic_json specialization +@param j JSON value to hash +@return hash value of j +*/ +template +std::size_t hash(const BasicJsonType& j) +{ + using string_t = typename BasicJsonType::string_t; + using number_integer_t = typename BasicJsonType::number_integer_t; + using number_unsigned_t = typename BasicJsonType::number_unsigned_t; + using number_float_t = typename BasicJsonType::number_float_t; + + const auto type = static_cast(j.type()); + switch (j.type()) + { + case BasicJsonType::value_t::null: + case BasicJsonType::value_t::discarded: + { + return combine(type, 0); + } + + case BasicJsonType::value_t::object: + { + auto seed = combine(type, j.size()); + for (const auto& element : j.items()) + { + const auto h = std::hash {}(element.key()); + seed = combine(seed, h); + seed = combine(seed, hash(element.value())); + } + return seed; + } + + case BasicJsonType::value_t::array: + { + auto seed = combine(type, j.size()); + for (const auto& element : j) + { + seed = combine(seed, hash(element)); + } + 
return seed; + } + + case BasicJsonType::value_t::string: + { + const auto h = std::hash {}(j.template get_ref()); + return combine(type, h); + } + + case BasicJsonType::value_t::boolean: + { + const auto h = std::hash {}(j.template get()); + return combine(type, h); + } + + case BasicJsonType::value_t::number_integer: + { + const auto h = std::hash {}(j.template get()); + return combine(type, h); + } + + case nlohmann::detail::value_t::number_unsigned: + { + const auto h = std::hash {}(j.template get()); + return combine(type, h); + } + + case nlohmann::detail::value_t::number_float: + { + const auto h = std::hash {}(j.template get()); + return combine(type, h); + } + + case nlohmann::detail::value_t::binary: + { + auto seed = combine(type, j.get_binary().size()); + const auto h = std::hash {}(j.get_binary().has_subtype()); + seed = combine(seed, h); + seed = combine(seed, j.get_binary().subtype()); + for (const auto byte : j.get_binary()) + { + seed = combine(seed, std::hash {}(byte)); + } + return seed; + } + + default: // LCOV_EXCL_LINE + JSON_ASSERT(false); // LCOV_EXCL_LINE + } +} + +} // namespace detail +} // namespace nlohmann + +// #include + + +#include // generate_n +#include // array +#include // ldexp +#include // size_t +#include // uint8_t, uint16_t, uint32_t, uint64_t +#include // snprintf +#include // memcpy +#include // back_inserter +#include // numeric_limits +#include // char_traits, string +#include // make_pair, move + +// #include + +// #include + + +#include // array +#include // size_t +#include //FILE * +#include // strlen +#include // istream +#include // begin, end, iterator_traits, random_access_iterator_tag, distance, next +#include // shared_ptr, make_shared, addressof +#include // accumulate +#include // string, char_traits +#include // enable_if, is_base_of, is_pointer, is_integral, remove_pointer +#include // pair, declval + +// #include + +// #include + + +namespace nlohmann +{ +namespace detail +{ +/// the supported input 
formats +enum class input_format_t { json, cbor, msgpack, ubjson, bson }; + +//////////////////// +// input adapters // +//////////////////// + +/*! +Input adapter for stdio file access. This adapter read only 1 byte and do not use any + buffer. This adapter is a very low level adapter. +*/ +class file_input_adapter +{ + public: + using char_type = char; + + JSON_HEDLEY_NON_NULL(2) + explicit file_input_adapter(std::FILE* f) noexcept + : m_file(f) + {} + + // make class move-only + file_input_adapter(const file_input_adapter&) = delete; + file_input_adapter(file_input_adapter&&) = default; + file_input_adapter& operator=(const file_input_adapter&) = delete; + file_input_adapter& operator=(file_input_adapter&&) = delete; + + std::char_traits::int_type get_character() noexcept + { + return std::fgetc(m_file); + } + + private: + /// the file pointer to read from + std::FILE* m_file; +}; + + +/*! +Input adapter for a (caching) istream. Ignores a UFT Byte Order Mark at +beginning of input. Does not support changing the underlying std::streambuf +in mid-input. Maintains underlying std::istream and std::streambuf to support +subsequent use of standard std::istream operations to process any input +characters following those used in parsing the JSON input. Clears the +std::istream flags; any input errors (e.g., EOF) will be detected by the first +subsequent call for input from the std::istream. 
+*/ +class input_stream_adapter +{ + public: + using char_type = char; + + ~input_stream_adapter() + { + // clear stream flags; we use underlying streambuf I/O, do not + // maintain ifstream flags, except eof + if (is != nullptr) + { + is->clear(is->rdstate() & std::ios::eofbit); + } + } + + explicit input_stream_adapter(std::istream& i) + : is(&i), sb(i.rdbuf()) + {} + + // delete because of pointer members + input_stream_adapter(const input_stream_adapter&) = delete; + input_stream_adapter& operator=(input_stream_adapter&) = delete; + input_stream_adapter& operator=(input_stream_adapter&& rhs) = delete; + + input_stream_adapter(input_stream_adapter&& rhs) noexcept : is(rhs.is), sb(rhs.sb) + { + rhs.is = nullptr; + rhs.sb = nullptr; + } + + // std::istream/std::streambuf use std::char_traits::to_int_type, to + // ensure that std::char_traits::eof() and the character 0xFF do not + // end up as the same value, eg. 0xFFFFFFFF. + std::char_traits::int_type get_character() + { + auto res = sb->sbumpc(); + // set eof manually, as we don't use the istream interface. + if (JSON_HEDLEY_UNLIKELY(res == EOF)) + { + is->clear(is->rdstate() | std::ios::eofbit); + } + return res; + } + + private: + /// the associated input stream + std::istream* is = nullptr; + std::streambuf* sb = nullptr; +}; + +// General-purpose iterator-based adapter. It might not be as fast as +// theoretically possible for some containers, but it is extremely versatile. 
+template +class iterator_input_adapter +{ + public: + using char_type = typename std::iterator_traits::value_type; + + iterator_input_adapter(IteratorType first, IteratorType last) + : current(std::move(first)), end(std::move(last)) {} + + typename std::char_traits::int_type get_character() + { + if (JSON_HEDLEY_LIKELY(current != end)) + { + auto result = std::char_traits::to_int_type(*current); + std::advance(current, 1); + return result; + } + else + { + return std::char_traits::eof(); + } + } + + private: + IteratorType current; + IteratorType end; + + template + friend struct wide_string_input_helper; + + bool empty() const + { + return current == end; + } + +}; + + +template +struct wide_string_input_helper; + +template +struct wide_string_input_helper +{ + // UTF-32 + static void fill_buffer(BaseInputAdapter& input, + std::array::int_type, 4>& utf8_bytes, + size_t& utf8_bytes_index, + size_t& utf8_bytes_filled) + { + utf8_bytes_index = 0; + + if (JSON_HEDLEY_UNLIKELY(input.empty())) + { + utf8_bytes[0] = std::char_traits::eof(); + utf8_bytes_filled = 1; + } + else + { + // get the current character + const auto wc = input.get_character(); + + // UTF-32 to UTF-8 encoding + if (wc < 0x80) + { + utf8_bytes[0] = static_cast::int_type>(wc); + utf8_bytes_filled = 1; + } + else if (wc <= 0x7FF) + { + utf8_bytes[0] = static_cast::int_type>(0xC0u | ((static_cast(wc) >> 6u) & 0x1Fu)); + utf8_bytes[1] = static_cast::int_type>(0x80u | (static_cast(wc) & 0x3Fu)); + utf8_bytes_filled = 2; + } + else if (wc <= 0xFFFF) + { + utf8_bytes[0] = static_cast::int_type>(0xE0u | ((static_cast(wc) >> 12u) & 0x0Fu)); + utf8_bytes[1] = static_cast::int_type>(0x80u | ((static_cast(wc) >> 6u) & 0x3Fu)); + utf8_bytes[2] = static_cast::int_type>(0x80u | (static_cast(wc) & 0x3Fu)); + utf8_bytes_filled = 3; + } + else if (wc <= 0x10FFFF) + { + utf8_bytes[0] = static_cast::int_type>(0xF0u | ((static_cast(wc) >> 18u) & 0x07u)); + utf8_bytes[1] = static_cast::int_type>(0x80u | 
((static_cast(wc) >> 12u) & 0x3Fu)); + utf8_bytes[2] = static_cast::int_type>(0x80u | ((static_cast(wc) >> 6u) & 0x3Fu)); + utf8_bytes[3] = static_cast::int_type>(0x80u | (static_cast(wc) & 0x3Fu)); + utf8_bytes_filled = 4; + } + else + { + // unknown character + utf8_bytes[0] = static_cast::int_type>(wc); + utf8_bytes_filled = 1; + } + } + } +}; + +template +struct wide_string_input_helper +{ + // UTF-16 + static void fill_buffer(BaseInputAdapter& input, + std::array::int_type, 4>& utf8_bytes, + size_t& utf8_bytes_index, + size_t& utf8_bytes_filled) + { + utf8_bytes_index = 0; + + if (JSON_HEDLEY_UNLIKELY(input.empty())) + { + utf8_bytes[0] = std::char_traits::eof(); + utf8_bytes_filled = 1; + } + else + { + // get the current character + const auto wc = input.get_character(); + + // UTF-16 to UTF-8 encoding + if (wc < 0x80) + { + utf8_bytes[0] = static_cast::int_type>(wc); + utf8_bytes_filled = 1; + } + else if (wc <= 0x7FF) + { + utf8_bytes[0] = static_cast::int_type>(0xC0u | ((static_cast(wc) >> 6u))); + utf8_bytes[1] = static_cast::int_type>(0x80u | (static_cast(wc) & 0x3Fu)); + utf8_bytes_filled = 2; + } + else if (0xD800 > wc || wc >= 0xE000) + { + utf8_bytes[0] = static_cast::int_type>(0xE0u | ((static_cast(wc) >> 12u))); + utf8_bytes[1] = static_cast::int_type>(0x80u | ((static_cast(wc) >> 6u) & 0x3Fu)); + utf8_bytes[2] = static_cast::int_type>(0x80u | (static_cast(wc) & 0x3Fu)); + utf8_bytes_filled = 3; + } + else + { + if (JSON_HEDLEY_UNLIKELY(!input.empty())) + { + const auto wc2 = static_cast(input.get_character()); + const auto charcode = 0x10000u + (((static_cast(wc) & 0x3FFu) << 10u) | (wc2 & 0x3FFu)); + utf8_bytes[0] = static_cast::int_type>(0xF0u | (charcode >> 18u)); + utf8_bytes[1] = static_cast::int_type>(0x80u | ((charcode >> 12u) & 0x3Fu)); + utf8_bytes[2] = static_cast::int_type>(0x80u | ((charcode >> 6u) & 0x3Fu)); + utf8_bytes[3] = static_cast::int_type>(0x80u | (charcode & 0x3Fu)); + utf8_bytes_filled = 4; + } + else + { + utf8_bytes[0] = 
static_cast::int_type>(wc); + utf8_bytes_filled = 1; + } + } + } + } +}; + +// Wraps another input apdater to convert wide character types into individual bytes. +template +class wide_string_input_adapter +{ + public: + using char_type = char; + + wide_string_input_adapter(BaseInputAdapter base) + : base_adapter(base) {} + + typename std::char_traits::int_type get_character() noexcept + { + // check if buffer needs to be filled + if (utf8_bytes_index == utf8_bytes_filled) + { + fill_buffer(); + + JSON_ASSERT(utf8_bytes_filled > 0); + JSON_ASSERT(utf8_bytes_index == 0); + } + + // use buffer + JSON_ASSERT(utf8_bytes_filled > 0); + JSON_ASSERT(utf8_bytes_index < utf8_bytes_filled); + return utf8_bytes[utf8_bytes_index++]; + } + + private: + BaseInputAdapter base_adapter; + + template + void fill_buffer() + { + wide_string_input_helper::fill_buffer(base_adapter, utf8_bytes, utf8_bytes_index, utf8_bytes_filled); + } + + /// a buffer for UTF-8 bytes + std::array::int_type, 4> utf8_bytes = {{0, 0, 0, 0}}; + + /// index to the utf8_codes array for the next valid byte + std::size_t utf8_bytes_index = 0; + /// number of valid bytes in the utf8_codes array + std::size_t utf8_bytes_filled = 0; +}; + + +template +struct iterator_input_adapter_factory +{ + using iterator_type = IteratorType; + using char_type = typename std::iterator_traits::value_type; + using adapter_type = iterator_input_adapter; + + static adapter_type create(IteratorType first, IteratorType last) + { + return adapter_type(std::move(first), std::move(last)); + } +}; + +template +struct is_iterator_of_multibyte +{ + using value_type = typename std::iterator_traits::value_type; + enum + { + value = sizeof(value_type) > 1 + }; +}; + +template +struct iterator_input_adapter_factory::value>> +{ + using iterator_type = IteratorType; + using char_type = typename std::iterator_traits::value_type; + using base_adapter_type = iterator_input_adapter; + using adapter_type = wide_string_input_adapter; + + static 
adapter_type create(IteratorType first, IteratorType last) + { + return adapter_type(base_adapter_type(std::move(first), std::move(last))); + } +}; + +// General purpose iterator-based input +template +typename iterator_input_adapter_factory::adapter_type input_adapter(IteratorType first, IteratorType last) +{ + using factory_type = iterator_input_adapter_factory; + return factory_type::create(first, last); +} + +// Convenience shorthand from container to iterator +template +auto input_adapter(const ContainerType& container) -> decltype(input_adapter(begin(container), end(container))) +{ + // Enable ADL + using std::begin; + using std::end; + + return input_adapter(begin(container), end(container)); +} + +// Special cases with fast paths +inline file_input_adapter input_adapter(std::FILE* file) +{ + return file_input_adapter(file); +} + +inline input_stream_adapter input_adapter(std::istream& stream) +{ + return input_stream_adapter(stream); +} + +inline input_stream_adapter input_adapter(std::istream&& stream) +{ + return input_stream_adapter(stream); +} + +using contiguous_bytes_input_adapter = decltype(input_adapter(std::declval(), std::declval())); + +// Null-delimited strings, and the like. +template < typename CharT, + typename std::enable_if < + std::is_pointer::value&& + !std::is_array::value&& + std::is_integral::type>::value&& + sizeof(typename std::remove_pointer::type) == 1, + int >::type = 0 > +contiguous_bytes_input_adapter input_adapter(CharT b) +{ + auto length = std::strlen(reinterpret_cast(b)); + const auto* ptr = reinterpret_cast(b); + return input_adapter(ptr, ptr + length); +} + +template +auto input_adapter(T (&array)[N]) -> decltype(input_adapter(array, array + N)) +{ + return input_adapter(array, array + N); +} + +// This class only handles inputs of input_buffer_adapter type. +// It's required so that expressions like {ptr, len} can be implicitely casted +// to the correct adapter. 
+class span_input_adapter +{ + public: + template < typename CharT, + typename std::enable_if < + std::is_pointer::value&& + std::is_integral::type>::value&& + sizeof(typename std::remove_pointer::type) == 1, + int >::type = 0 > + span_input_adapter(CharT b, std::size_t l) + : ia(reinterpret_cast(b), reinterpret_cast(b) + l) {} + + template::iterator_category, std::random_access_iterator_tag>::value, + int>::type = 0> + span_input_adapter(IteratorType first, IteratorType last) + : ia(input_adapter(first, last)) {} + + contiguous_bytes_input_adapter&& get() + { + return std::move(ia); + } + + private: + contiguous_bytes_input_adapter ia; +}; +} // namespace detail +} // namespace nlohmann + +// #include + + +#include +#include // string +#include // move +#include // vector + +// #include + +// #include + + +namespace nlohmann +{ + +/*! +@brief SAX interface + +This class describes the SAX interface used by @ref nlohmann::json::sax_parse. +Each function is called in different situations while the input is parsed. The +boolean return value informs the parser whether to continue processing the +input. +*/ +template +struct json_sax +{ + using number_integer_t = typename BasicJsonType::number_integer_t; + using number_unsigned_t = typename BasicJsonType::number_unsigned_t; + using number_float_t = typename BasicJsonType::number_float_t; + using string_t = typename BasicJsonType::string_t; + using binary_t = typename BasicJsonType::binary_t; + + /*! + @brief a null value was read + @return whether parsing should proceed + */ + virtual bool null() = 0; + + /*! + @brief a boolean value was read + @param[in] val boolean value + @return whether parsing should proceed + */ + virtual bool boolean(bool val) = 0; + + /*! + @brief an integer number was read + @param[in] val integer value + @return whether parsing should proceed + */ + virtual bool number_integer(number_integer_t val) = 0; + + /*! 
+ @brief an unsigned integer number was read + @param[in] val unsigned integer value + @return whether parsing should proceed + */ + virtual bool number_unsigned(number_unsigned_t val) = 0; + + /*! + @brief an floating-point number was read + @param[in] val floating-point value + @param[in] s raw token value + @return whether parsing should proceed + */ + virtual bool number_float(number_float_t val, const string_t& s) = 0; + + /*! + @brief a string was read + @param[in] val string value + @return whether parsing should proceed + @note It is safe to move the passed string. + */ + virtual bool string(string_t& val) = 0; + + /*! + @brief a binary string was read + @param[in] val binary value + @return whether parsing should proceed + @note It is safe to move the passed binary. + */ + virtual bool binary(binary_t& val) = 0; + + /*! + @brief the beginning of an object was read + @param[in] elements number of object elements or -1 if unknown + @return whether parsing should proceed + @note binary formats may report the number of elements + */ + virtual bool start_object(std::size_t elements) = 0; + + /*! + @brief an object key was read + @param[in] val object key + @return whether parsing should proceed + @note It is safe to move the passed string. + */ + virtual bool key(string_t& val) = 0; + + /*! + @brief the end of an object was read + @return whether parsing should proceed + */ + virtual bool end_object() = 0; + + /*! + @brief the beginning of an array was read + @param[in] elements number of array elements or -1 if unknown + @return whether parsing should proceed + @note binary formats may report the number of elements + */ + virtual bool start_array(std::size_t elements) = 0; + + /*! + @brief the end of an array was read + @return whether parsing should proceed + */ + virtual bool end_array() = 0; + + /*! 
+ @brief a parse error occurred + @param[in] position the position in the input where the error occurs + @param[in] last_token the last read token + @param[in] ex an exception object describing the error + @return whether parsing should proceed (must return false) + */ + virtual bool parse_error(std::size_t position, + const std::string& last_token, + const detail::exception& ex) = 0; + + virtual ~json_sax() = default; +}; + + +namespace detail +{ +/*! +@brief SAX implementation to create a JSON value from SAX events + +This class implements the @ref json_sax interface and processes the SAX events +to create a JSON value which makes it basically a DOM parser. The structure or +hierarchy of the JSON value is managed by the stack `ref_stack` which contains +a pointer to the respective array or object for each recursion depth. + +After successful parsing, the value that is passed by reference to the +constructor contains the parsed value. + +@tparam BasicJsonType the JSON type +*/ +template +class json_sax_dom_parser +{ + public: + using number_integer_t = typename BasicJsonType::number_integer_t; + using number_unsigned_t = typename BasicJsonType::number_unsigned_t; + using number_float_t = typename BasicJsonType::number_float_t; + using string_t = typename BasicJsonType::string_t; + using binary_t = typename BasicJsonType::binary_t; + + /*! 
+ @param[in, out] r reference to a JSON value that is manipulated while + parsing + @param[in] allow_exceptions_ whether parse errors yield exceptions + */ + explicit json_sax_dom_parser(BasicJsonType& r, const bool allow_exceptions_ = true) + : root(r), allow_exceptions(allow_exceptions_) + {} + + // make class move-only + json_sax_dom_parser(const json_sax_dom_parser&) = delete; + json_sax_dom_parser(json_sax_dom_parser&&) = default; + json_sax_dom_parser& operator=(const json_sax_dom_parser&) = delete; + json_sax_dom_parser& operator=(json_sax_dom_parser&&) = default; + ~json_sax_dom_parser() = default; + + bool null() + { + handle_value(nullptr); + return true; + } + + bool boolean(bool val) + { + handle_value(val); + return true; + } + + bool number_integer(number_integer_t val) + { + handle_value(val); + return true; + } + + bool number_unsigned(number_unsigned_t val) + { + handle_value(val); + return true; + } + + bool number_float(number_float_t val, const string_t& /*unused*/) + { + handle_value(val); + return true; + } + + bool string(string_t& val) + { + handle_value(val); + return true; + } + + bool binary(binary_t& val) + { + handle_value(std::move(val)); + return true; + } + + bool start_object(std::size_t len) + { + ref_stack.push_back(handle_value(BasicJsonType::value_t::object)); + + if (JSON_HEDLEY_UNLIKELY(len != std::size_t(-1) && len > ref_stack.back()->max_size())) + { + JSON_THROW(out_of_range::create(408, + "excessive object size: " + std::to_string(len))); + } + + return true; + } + + bool key(string_t& val) + { + // add null at given key and store the reference for later + object_element = &(ref_stack.back()->m_value.object->operator[](val)); + return true; + } + + bool end_object() + { + ref_stack.pop_back(); + return true; + } + + bool start_array(std::size_t len) + { + ref_stack.push_back(handle_value(BasicJsonType::value_t::array)); + + if (JSON_HEDLEY_UNLIKELY(len != std::size_t(-1) && len > ref_stack.back()->max_size())) + { + 
JSON_THROW(out_of_range::create(408, + "excessive array size: " + std::to_string(len))); + } + + return true; + } + + bool end_array() + { + ref_stack.pop_back(); + return true; + } + + template + bool parse_error(std::size_t /*unused*/, const std::string& /*unused*/, + const Exception& ex) + { + errored = true; + static_cast(ex); + if (allow_exceptions) + { + JSON_THROW(ex); + } + return false; + } + + constexpr bool is_errored() const + { + return errored; + } + + private: + /*! + @invariant If the ref stack is empty, then the passed value will be the new + root. + @invariant If the ref stack contains a value, then it is an array or an + object to which we can add elements + */ + template + JSON_HEDLEY_RETURNS_NON_NULL + BasicJsonType* handle_value(Value&& v) + { + if (ref_stack.empty()) + { + root = BasicJsonType(std::forward(v)); + return &root; + } + + JSON_ASSERT(ref_stack.back()->is_array() || ref_stack.back()->is_object()); + + if (ref_stack.back()->is_array()) + { + ref_stack.back()->m_value.array->emplace_back(std::forward(v)); + return &(ref_stack.back()->m_value.array->back()); + } + + JSON_ASSERT(ref_stack.back()->is_object()); + JSON_ASSERT(object_element); + *object_element = BasicJsonType(std::forward(v)); + return object_element; + } + + /// the parsed JSON value + BasicJsonType& root; + /// stack to model hierarchy of values + std::vector ref_stack {}; + /// helper to hold the reference for the next object element + BasicJsonType* object_element = nullptr; + /// whether a syntax error occurred + bool errored = false; + /// whether to throw exceptions in case of errors + const bool allow_exceptions = true; +}; + +template +class json_sax_dom_callback_parser +{ + public: + using number_integer_t = typename BasicJsonType::number_integer_t; + using number_unsigned_t = typename BasicJsonType::number_unsigned_t; + using number_float_t = typename BasicJsonType::number_float_t; + using string_t = typename BasicJsonType::string_t; + using binary_t = 
typename BasicJsonType::binary_t; + using parser_callback_t = typename BasicJsonType::parser_callback_t; + using parse_event_t = typename BasicJsonType::parse_event_t; + + json_sax_dom_callback_parser(BasicJsonType& r, + const parser_callback_t cb, + const bool allow_exceptions_ = true) + : root(r), callback(cb), allow_exceptions(allow_exceptions_) + { + keep_stack.push_back(true); + } + + // make class move-only + json_sax_dom_callback_parser(const json_sax_dom_callback_parser&) = delete; + json_sax_dom_callback_parser(json_sax_dom_callback_parser&&) = default; + json_sax_dom_callback_parser& operator=(const json_sax_dom_callback_parser&) = delete; + json_sax_dom_callback_parser& operator=(json_sax_dom_callback_parser&&) = default; + ~json_sax_dom_callback_parser() = default; + + bool null() + { + handle_value(nullptr); + return true; + } + + bool boolean(bool val) + { + handle_value(val); + return true; + } + + bool number_integer(number_integer_t val) + { + handle_value(val); + return true; + } + + bool number_unsigned(number_unsigned_t val) + { + handle_value(val); + return true; + } + + bool number_float(number_float_t val, const string_t& /*unused*/) + { + handle_value(val); + return true; + } + + bool string(string_t& val) + { + handle_value(val); + return true; + } + + bool binary(binary_t& val) + { + handle_value(std::move(val)); + return true; + } + + bool start_object(std::size_t len) + { + // check callback for object start + const bool keep = callback(static_cast(ref_stack.size()), parse_event_t::object_start, discarded); + keep_stack.push_back(keep); + + auto val = handle_value(BasicJsonType::value_t::object, true); + ref_stack.push_back(val.second); + + // check object limit + if (ref_stack.back() && JSON_HEDLEY_UNLIKELY(len != std::size_t(-1) && len > ref_stack.back()->max_size())) + { + JSON_THROW(out_of_range::create(408, "excessive object size: " + std::to_string(len))); + } + + return true; + } + + bool key(string_t& val) + { + BasicJsonType k = 
BasicJsonType(val); + + // check callback for key + const bool keep = callback(static_cast(ref_stack.size()), parse_event_t::key, k); + key_keep_stack.push_back(keep); + + // add discarded value at given key and store the reference for later + if (keep && ref_stack.back()) + { + object_element = &(ref_stack.back()->m_value.object->operator[](val) = discarded); + } + + return true; + } + + bool end_object() + { + if (ref_stack.back() && !callback(static_cast(ref_stack.size()) - 1, parse_event_t::object_end, *ref_stack.back())) + { + // discard object + *ref_stack.back() = discarded; + } + + JSON_ASSERT(!ref_stack.empty()); + JSON_ASSERT(!keep_stack.empty()); + ref_stack.pop_back(); + keep_stack.pop_back(); + + if (!ref_stack.empty() && ref_stack.back() && ref_stack.back()->is_structured()) + { + // remove discarded value + for (auto it = ref_stack.back()->begin(); it != ref_stack.back()->end(); ++it) + { + if (it->is_discarded()) + { + ref_stack.back()->erase(it); + break; + } + } + } + + return true; + } + + bool start_array(std::size_t len) + { + const bool keep = callback(static_cast(ref_stack.size()), parse_event_t::array_start, discarded); + keep_stack.push_back(keep); + + auto val = handle_value(BasicJsonType::value_t::array, true); + ref_stack.push_back(val.second); + + // check array limit + if (ref_stack.back() && JSON_HEDLEY_UNLIKELY(len != std::size_t(-1) && len > ref_stack.back()->max_size())) + { + JSON_THROW(out_of_range::create(408, "excessive array size: " + std::to_string(len))); + } + + return true; + } + + bool end_array() + { + bool keep = true; + + if (ref_stack.back()) + { + keep = callback(static_cast(ref_stack.size()) - 1, parse_event_t::array_end, *ref_stack.back()); + if (!keep) + { + // discard array + *ref_stack.back() = discarded; + } + } + + JSON_ASSERT(!ref_stack.empty()); + JSON_ASSERT(!keep_stack.empty()); + ref_stack.pop_back(); + keep_stack.pop_back(); + + // remove discarded value + if (!keep && !ref_stack.empty() && 
ref_stack.back()->is_array()) + { + ref_stack.back()->m_value.array->pop_back(); + } + + return true; + } + + template + bool parse_error(std::size_t /*unused*/, const std::string& /*unused*/, + const Exception& ex) + { + errored = true; + static_cast(ex); + if (allow_exceptions) + { + JSON_THROW(ex); + } + return false; + } + + constexpr bool is_errored() const + { + return errored; + } + + private: + /*! + @param[in] v value to add to the JSON value we build during parsing + @param[in] skip_callback whether we should skip calling the callback + function; this is required after start_array() and + start_object() SAX events, because otherwise we would call the + callback function with an empty array or object, respectively. + + @invariant If the ref stack is empty, then the passed value will be the new + root. + @invariant If the ref stack contains a value, then it is an array or an + object to which we can add elements + + @return pair of boolean (whether value should be kept) and pointer (to the + passed value in the ref_stack hierarchy; nullptr if not kept) + */ + template + std::pair handle_value(Value&& v, const bool skip_callback = false) + { + JSON_ASSERT(!keep_stack.empty()); + + // do not handle this value if we know it would be added to a discarded + // container + if (!keep_stack.back()) + { + return {false, nullptr}; + } + + // create value + auto value = BasicJsonType(std::forward(v)); + + // check callback + const bool keep = skip_callback || callback(static_cast(ref_stack.size()), parse_event_t::value, value); + + // do not handle this value if we just learnt it shall be discarded + if (!keep) + { + return {false, nullptr}; + } + + if (ref_stack.empty()) + { + root = std::move(value); + return {true, &root}; + } + + // skip this value if we already decided to skip the parent + // (https://github.com/nlohmann/json/issues/971#issuecomment-413678360) + if (!ref_stack.back()) + { + return {false, nullptr}; + } + + // we now only expect arrays and objects 
+ JSON_ASSERT(ref_stack.back()->is_array() || ref_stack.back()->is_object()); + + // array + if (ref_stack.back()->is_array()) + { + ref_stack.back()->m_value.array->push_back(std::move(value)); + return {true, &(ref_stack.back()->m_value.array->back())}; + } + + // object + JSON_ASSERT(ref_stack.back()->is_object()); + // check if we should store an element for the current key + JSON_ASSERT(!key_keep_stack.empty()); + const bool store_element = key_keep_stack.back(); + key_keep_stack.pop_back(); + + if (!store_element) + { + return {false, nullptr}; + } + + JSON_ASSERT(object_element); + *object_element = std::move(value); + return {true, object_element}; + } + + /// the parsed JSON value + BasicJsonType& root; + /// stack to model hierarchy of values + std::vector ref_stack {}; + /// stack to manage which values to keep + std::vector keep_stack {}; + /// stack to manage which object keys to keep + std::vector key_keep_stack {}; + /// helper to hold the reference for the next object element + BasicJsonType* object_element = nullptr; + /// whether a syntax error occurred + bool errored = false; + /// callback function + const parser_callback_t callback = nullptr; + /// whether to throw exceptions in case of errors + const bool allow_exceptions = true; + /// a discarded value for the callback + BasicJsonType discarded = BasicJsonType::value_t::discarded; +}; + +template +class json_sax_acceptor +{ + public: + using number_integer_t = typename BasicJsonType::number_integer_t; + using number_unsigned_t = typename BasicJsonType::number_unsigned_t; + using number_float_t = typename BasicJsonType::number_float_t; + using string_t = typename BasicJsonType::string_t; + using binary_t = typename BasicJsonType::binary_t; + + bool null() + { + return true; + } + + bool boolean(bool /*unused*/) + { + return true; + } + + bool number_integer(number_integer_t /*unused*/) + { + return true; + } + + bool number_unsigned(number_unsigned_t /*unused*/) + { + return true; + } + + bool 
number_float(number_float_t /*unused*/, const string_t& /*unused*/) + { + return true; + } + + bool string(string_t& /*unused*/) + { + return true; + } + + bool binary(binary_t& /*unused*/) + { + return true; + } + + bool start_object(std::size_t /*unused*/ = std::size_t(-1)) + { + return true; + } + + bool key(string_t& /*unused*/) + { + return true; + } + + bool end_object() + { + return true; + } + + bool start_array(std::size_t /*unused*/ = std::size_t(-1)) + { + return true; + } + + bool end_array() + { + return true; + } + + bool parse_error(std::size_t /*unused*/, const std::string& /*unused*/, const detail::exception& /*unused*/) + { + return false; + } +}; +} // namespace detail + +} // namespace nlohmann + +// #include + + +#include // array +#include // localeconv +#include // size_t +#include // snprintf +#include // strtof, strtod, strtold, strtoll, strtoull +#include // initializer_list +#include // char_traits, string +#include // move +#include // vector + +// #include + +// #include + +// #include + + +namespace nlohmann +{ +namespace detail +{ +/////////// +// lexer // +/////////// + +template +class lexer_base +{ + public: + /// token types for the parser + enum class token_type + { + uninitialized, ///< indicating the scanner is uninitialized + literal_true, ///< the `true` literal + literal_false, ///< the `false` literal + literal_null, ///< the `null` literal + value_string, ///< a string -- use get_string() for actual value + value_unsigned, ///< an unsigned integer -- use get_number_unsigned() for actual value + value_integer, ///< a signed integer -- use get_number_integer() for actual value + value_float, ///< an floating point number -- use get_number_float() for actual value + begin_array, ///< the character for array begin `[` + begin_object, ///< the character for object begin `{` + end_array, ///< the character for array end `]` + end_object, ///< the character for object end `}` + name_separator, ///< the name separator `:` + 
value_separator, ///< the value separator `,` + parse_error, ///< indicating a parse error + end_of_input, ///< indicating the end of the input buffer + literal_or_value ///< a literal or the begin of a value (only for diagnostics) + }; + + /// return name of values of type token_type (only used for errors) + JSON_HEDLEY_RETURNS_NON_NULL + JSON_HEDLEY_CONST + static const char* token_type_name(const token_type t) noexcept + { + switch (t) + { + case token_type::uninitialized: + return ""; + case token_type::literal_true: + return "true literal"; + case token_type::literal_false: + return "false literal"; + case token_type::literal_null: + return "null literal"; + case token_type::value_string: + return "string literal"; + case token_type::value_unsigned: + case token_type::value_integer: + case token_type::value_float: + return "number literal"; + case token_type::begin_array: + return "'['"; + case token_type::begin_object: + return "'{'"; + case token_type::end_array: + return "']'"; + case token_type::end_object: + return "'}'"; + case token_type::name_separator: + return "':'"; + case token_type::value_separator: + return "','"; + case token_type::parse_error: + return ""; + case token_type::end_of_input: + return "end of input"; + case token_type::literal_or_value: + return "'[', '{', or a literal"; + // LCOV_EXCL_START + default: // catch non-enum values + return "unknown token"; + // LCOV_EXCL_STOP + } + } +}; +/*! +@brief lexical analysis + +This class organizes the lexical analysis during JSON deserialization. 
+*/ +template +class lexer : public lexer_base +{ + using number_integer_t = typename BasicJsonType::number_integer_t; + using number_unsigned_t = typename BasicJsonType::number_unsigned_t; + using number_float_t = typename BasicJsonType::number_float_t; + using string_t = typename BasicJsonType::string_t; + using char_type = typename InputAdapterType::char_type; + using char_int_type = typename std::char_traits::int_type; + + public: + using token_type = typename lexer_base::token_type; + + explicit lexer(InputAdapterType&& adapter, bool ignore_comments_ = false) + : ia(std::move(adapter)) + , ignore_comments(ignore_comments_) + , decimal_point_char(static_cast(get_decimal_point())) + {} + + // delete because of pointer members + lexer(const lexer&) = delete; + lexer(lexer&&) = default; + lexer& operator=(lexer&) = delete; + lexer& operator=(lexer&&) = default; + ~lexer() = default; + + private: + ///////////////////// + // locales + ///////////////////// + + /// return the locale-dependent decimal point + JSON_HEDLEY_PURE + static char get_decimal_point() noexcept + { + const auto* loc = localeconv(); + JSON_ASSERT(loc != nullptr); + return (loc->decimal_point == nullptr) ? '.' : *(loc->decimal_point); + } + + ///////////////////// + // scan functions + ///////////////////// + + /*! + @brief get codepoint from 4 hex characters following `\u` + + For input "\u c1 c2 c3 c4" the codepoint is: + (c1 * 0x1000) + (c2 * 0x0100) + (c3 * 0x0010) + c4 + = (c1 << 12) + (c2 << 8) + (c3 << 4) + (c4 << 0) + + Furthermore, the possible characters '0'..'9', 'A'..'F', and 'a'..'f' + must be converted to the integers 0x0..0x9, 0xA..0xF, 0xA..0xF, resp. The + conversion is done by subtracting the offset (0x30, 0x37, and 0x57) + between the ASCII value of the character and the desired integer value. + + @return codepoint (0x0000..0xFFFF) or -1 in case of an error (e.g. 
EOF or + non-hex character) + */ + int get_codepoint() + { + // this function only makes sense after reading `\u` + JSON_ASSERT(current == 'u'); + int codepoint = 0; + + const auto factors = { 12u, 8u, 4u, 0u }; + for (const auto factor : factors) + { + get(); + + if (current >= '0' && current <= '9') + { + codepoint += static_cast((static_cast(current) - 0x30u) << factor); + } + else if (current >= 'A' && current <= 'F') + { + codepoint += static_cast((static_cast(current) - 0x37u) << factor); + } + else if (current >= 'a' && current <= 'f') + { + codepoint += static_cast((static_cast(current) - 0x57u) << factor); + } + else + { + return -1; + } + } + + JSON_ASSERT(0x0000 <= codepoint && codepoint <= 0xFFFF); + return codepoint; + } + + /*! + @brief check if the next byte(s) are inside a given range + + Adds the current byte and, for each passed range, reads a new byte and + checks if it is inside the range. If a violation was detected, set up an + error message and return false. Otherwise, return true. + + @param[in] ranges list of integers; interpreted as list of pairs of + inclusive lower and upper bound, respectively + + @pre The passed list @a ranges must have 2, 4, or 6 elements; that is, + 1, 2, or 3 pairs. This precondition is enforced by an assertion. + + @return true if and only if no range violation was detected + */ + bool next_byte_in_range(std::initializer_list ranges) + { + JSON_ASSERT(ranges.size() == 2 || ranges.size() == 4 || ranges.size() == 6); + add(current); + + for (auto range = ranges.begin(); range != ranges.end(); ++range) + { + get(); + if (JSON_HEDLEY_LIKELY(*range <= current && current <= *(++range))) + { + add(current); + } + else + { + error_message = "invalid string: ill-formed UTF-8 byte"; + return false; + } + } + + return true; + } + + /*! + @brief scan a string literal + + This function scans a string according to Sect. 7 of RFC 7159. While + scanning, bytes are escaped and copied into buffer token_buffer. 
Then the + function returns successfully, token_buffer is *not* null-terminated (as it + may contain \0 bytes), and token_buffer.size() is the number of bytes in the + string. + + @return token_type::value_string if string could be successfully scanned, + token_type::parse_error otherwise + + @note In case of errors, variable error_message contains a textual + description. + */ + token_type scan_string() + { + // reset token_buffer (ignore opening quote) + reset(); + + // we entered the function by reading an open quote + JSON_ASSERT(current == '\"'); + + while (true) + { + // get next character + switch (get()) + { + // end of file while parsing string + case std::char_traits::eof(): + { + error_message = "invalid string: missing closing quote"; + return token_type::parse_error; + } + + // closing quote + case '\"': + { + return token_type::value_string; + } + + // escapes + case '\\': + { + switch (get()) + { + // quotation mark + case '\"': + add('\"'); + break; + // reverse solidus + case '\\': + add('\\'); + break; + // solidus + case '/': + add('/'); + break; + // backspace + case 'b': + add('\b'); + break; + // form feed + case 'f': + add('\f'); + break; + // line feed + case 'n': + add('\n'); + break; + // carriage return + case 'r': + add('\r'); + break; + // tab + case 't': + add('\t'); + break; + + // unicode escapes + case 'u': + { + const int codepoint1 = get_codepoint(); + int codepoint = codepoint1; // start with codepoint1 + + if (JSON_HEDLEY_UNLIKELY(codepoint1 == -1)) + { + error_message = "invalid string: '\\u' must be followed by 4 hex digits"; + return token_type::parse_error; + } + + // check if code point is a high surrogate + if (0xD800 <= codepoint1 && codepoint1 <= 0xDBFF) + { + // expect next \uxxxx entry + if (JSON_HEDLEY_LIKELY(get() == '\\' && get() == 'u')) + { + const int codepoint2 = get_codepoint(); + + if (JSON_HEDLEY_UNLIKELY(codepoint2 == -1)) + { + error_message = "invalid string: '\\u' must be followed by 4 hex digits"; + 
return token_type::parse_error; + } + + // check if codepoint2 is a low surrogate + if (JSON_HEDLEY_LIKELY(0xDC00 <= codepoint2 && codepoint2 <= 0xDFFF)) + { + // overwrite codepoint + codepoint = static_cast( + // high surrogate occupies the most significant 22 bits + (static_cast(codepoint1) << 10u) + // low surrogate occupies the least significant 15 bits + + static_cast(codepoint2) + // there is still the 0xD800, 0xDC00 and 0x10000 noise + // in the result so we have to subtract with: + // (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00 + - 0x35FDC00u); + } + else + { + error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF"; + return token_type::parse_error; + } + } + else + { + error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF"; + return token_type::parse_error; + } + } + else + { + if (JSON_HEDLEY_UNLIKELY(0xDC00 <= codepoint1 && codepoint1 <= 0xDFFF)) + { + error_message = "invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF"; + return token_type::parse_error; + } + } + + // result of the above calculation yields a proper codepoint + JSON_ASSERT(0x00 <= codepoint && codepoint <= 0x10FFFF); + + // translate codepoint into bytes + if (codepoint < 0x80) + { + // 1-byte characters: 0xxxxxxx (ASCII) + add(static_cast(codepoint)); + } + else if (codepoint <= 0x7FF) + { + // 2-byte characters: 110xxxxx 10xxxxxx + add(static_cast(0xC0u | (static_cast(codepoint) >> 6u))); + add(static_cast(0x80u | (static_cast(codepoint) & 0x3Fu))); + } + else if (codepoint <= 0xFFFF) + { + // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx + add(static_cast(0xE0u | (static_cast(codepoint) >> 12u))); + add(static_cast(0x80u | ((static_cast(codepoint) >> 6u) & 0x3Fu))); + add(static_cast(0x80u | (static_cast(codepoint) & 0x3Fu))); + } + else + { + // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + add(static_cast(0xF0u | (static_cast(codepoint) >> 18u))); + add(static_cast(0x80u | 
((static_cast(codepoint) >> 12u) & 0x3Fu))); + add(static_cast(0x80u | ((static_cast(codepoint) >> 6u) & 0x3Fu))); + add(static_cast(0x80u | (static_cast(codepoint) & 0x3Fu))); + } + + break; + } + + // other characters after escape + default: + error_message = "invalid string: forbidden character after backslash"; + return token_type::parse_error; + } + + break; + } + + // invalid control characters + case 0x00: + { + error_message = "invalid string: control character U+0000 (NUL) must be escaped to \\u0000"; + return token_type::parse_error; + } + + case 0x01: + { + error_message = "invalid string: control character U+0001 (SOH) must be escaped to \\u0001"; + return token_type::parse_error; + } + + case 0x02: + { + error_message = "invalid string: control character U+0002 (STX) must be escaped to \\u0002"; + return token_type::parse_error; + } + + case 0x03: + { + error_message = "invalid string: control character U+0003 (ETX) must be escaped to \\u0003"; + return token_type::parse_error; + } + + case 0x04: + { + error_message = "invalid string: control character U+0004 (EOT) must be escaped to \\u0004"; + return token_type::parse_error; + } + + case 0x05: + { + error_message = "invalid string: control character U+0005 (ENQ) must be escaped to \\u0005"; + return token_type::parse_error; + } + + case 0x06: + { + error_message = "invalid string: control character U+0006 (ACK) must be escaped to \\u0006"; + return token_type::parse_error; + } + + case 0x07: + { + error_message = "invalid string: control character U+0007 (BEL) must be escaped to \\u0007"; + return token_type::parse_error; + } + + case 0x08: + { + error_message = "invalid string: control character U+0008 (BS) must be escaped to \\u0008 or \\b"; + return token_type::parse_error; + } + + case 0x09: + { + error_message = "invalid string: control character U+0009 (HT) must be escaped to \\u0009 or \\t"; + return token_type::parse_error; + } + + case 0x0A: + { + error_message = "invalid string: control 
character U+000A (LF) must be escaped to \\u000A or \\n"; + return token_type::parse_error; + } + + case 0x0B: + { + error_message = "invalid string: control character U+000B (VT) must be escaped to \\u000B"; + return token_type::parse_error; + } + + case 0x0C: + { + error_message = "invalid string: control character U+000C (FF) must be escaped to \\u000C or \\f"; + return token_type::parse_error; + } + + case 0x0D: + { + error_message = "invalid string: control character U+000D (CR) must be escaped to \\u000D or \\r"; + return token_type::parse_error; + } + + case 0x0E: + { + error_message = "invalid string: control character U+000E (SO) must be escaped to \\u000E"; + return token_type::parse_error; + } + + case 0x0F: + { + error_message = "invalid string: control character U+000F (SI) must be escaped to \\u000F"; + return token_type::parse_error; + } + + case 0x10: + { + error_message = "invalid string: control character U+0010 (DLE) must be escaped to \\u0010"; + return token_type::parse_error; + } + + case 0x11: + { + error_message = "invalid string: control character U+0011 (DC1) must be escaped to \\u0011"; + return token_type::parse_error; + } + + case 0x12: + { + error_message = "invalid string: control character U+0012 (DC2) must be escaped to \\u0012"; + return token_type::parse_error; + } + + case 0x13: + { + error_message = "invalid string: control character U+0013 (DC3) must be escaped to \\u0013"; + return token_type::parse_error; + } + + case 0x14: + { + error_message = "invalid string: control character U+0014 (DC4) must be escaped to \\u0014"; + return token_type::parse_error; + } + + case 0x15: + { + error_message = "invalid string: control character U+0015 (NAK) must be escaped to \\u0015"; + return token_type::parse_error; + } + + case 0x16: + { + error_message = "invalid string: control character U+0016 (SYN) must be escaped to \\u0016"; + return token_type::parse_error; + } + + case 0x17: + { + error_message = "invalid string: control 
character U+0017 (ETB) must be escaped to \\u0017"; + return token_type::parse_error; + } + + case 0x18: + { + error_message = "invalid string: control character U+0018 (CAN) must be escaped to \\u0018"; + return token_type::parse_error; + } + + case 0x19: + { + error_message = "invalid string: control character U+0019 (EM) must be escaped to \\u0019"; + return token_type::parse_error; + } + + case 0x1A: + { + error_message = "invalid string: control character U+001A (SUB) must be escaped to \\u001A"; + return token_type::parse_error; + } + + case 0x1B: + { + error_message = "invalid string: control character U+001B (ESC) must be escaped to \\u001B"; + return token_type::parse_error; + } + + case 0x1C: + { + error_message = "invalid string: control character U+001C (FS) must be escaped to \\u001C"; + return token_type::parse_error; + } + + case 0x1D: + { + error_message = "invalid string: control character U+001D (GS) must be escaped to \\u001D"; + return token_type::parse_error; + } + + case 0x1E: + { + error_message = "invalid string: control character U+001E (RS) must be escaped to \\u001E"; + return token_type::parse_error; + } + + case 0x1F: + { + error_message = "invalid string: control character U+001F (US) must be escaped to \\u001F"; + return token_type::parse_error; + } + + // U+0020..U+007F (except U+0022 (quote) and U+005C (backspace)) + case 0x20: + case 0x21: + case 0x23: + case 0x24: + case 0x25: + case 0x26: + case 0x27: + case 0x28: + case 0x29: + case 0x2A: + case 0x2B: + case 0x2C: + case 0x2D: + case 0x2E: + case 0x2F: + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + case 0x3A: + case 0x3B: + case 0x3C: + case 0x3D: + case 0x3E: + case 0x3F: + case 0x40: + case 0x41: + case 0x42: + case 0x43: + case 0x44: + case 0x45: + case 0x46: + case 0x47: + case 0x48: + case 0x49: + case 0x4A: + case 0x4B: + case 0x4C: + case 0x4D: + case 0x4E: + case 0x4F: + case 0x50: + 
case 0x51: + case 0x52: + case 0x53: + case 0x54: + case 0x55: + case 0x56: + case 0x57: + case 0x58: + case 0x59: + case 0x5A: + case 0x5B: + case 0x5D: + case 0x5E: + case 0x5F: + case 0x60: + case 0x61: + case 0x62: + case 0x63: + case 0x64: + case 0x65: + case 0x66: + case 0x67: + case 0x68: + case 0x69: + case 0x6A: + case 0x6B: + case 0x6C: + case 0x6D: + case 0x6E: + case 0x6F: + case 0x70: + case 0x71: + case 0x72: + case 0x73: + case 0x74: + case 0x75: + case 0x76: + case 0x77: + case 0x78: + case 0x79: + case 0x7A: + case 0x7B: + case 0x7C: + case 0x7D: + case 0x7E: + case 0x7F: + { + add(current); + break; + } + + // U+0080..U+07FF: bytes C2..DF 80..BF + case 0xC2: + case 0xC3: + case 0xC4: + case 0xC5: + case 0xC6: + case 0xC7: + case 0xC8: + case 0xC9: + case 0xCA: + case 0xCB: + case 0xCC: + case 0xCD: + case 0xCE: + case 0xCF: + case 0xD0: + case 0xD1: + case 0xD2: + case 0xD3: + case 0xD4: + case 0xD5: + case 0xD6: + case 0xD7: + case 0xD8: + case 0xD9: + case 0xDA: + case 0xDB: + case 0xDC: + case 0xDD: + case 0xDE: + case 0xDF: + { + if (JSON_HEDLEY_UNLIKELY(!next_byte_in_range({0x80, 0xBF}))) + { + return token_type::parse_error; + } + break; + } + + // U+0800..U+0FFF: bytes E0 A0..BF 80..BF + case 0xE0: + { + if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0xA0, 0xBF, 0x80, 0xBF})))) + { + return token_type::parse_error; + } + break; + } + + // U+1000..U+CFFF: bytes E1..EC 80..BF 80..BF + // U+E000..U+FFFF: bytes EE..EF 80..BF 80..BF + case 0xE1: + case 0xE2: + case 0xE3: + case 0xE4: + case 0xE5: + case 0xE6: + case 0xE7: + case 0xE8: + case 0xE9: + case 0xEA: + case 0xEB: + case 0xEC: + case 0xEE: + case 0xEF: + { + if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF})))) + { + return token_type::parse_error; + } + break; + } + + // U+D000..U+D7FF: bytes ED 80..9F 80..BF + case 0xED: + { + if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x9F, 0x80, 0xBF})))) + { + return token_type::parse_error; + } + break; + } + + // 
U+10000..U+3FFFF F0 90..BF 80..BF 80..BF + case 0xF0: + { + if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF})))) + { + return token_type::parse_error; + } + break; + } + + // U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF + case 0xF1: + case 0xF2: + case 0xF3: + { + if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF})))) + { + return token_type::parse_error; + } + break; + } + + // U+100000..U+10FFFF F4 80..8F 80..BF 80..BF + case 0xF4: + { + if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF})))) + { + return token_type::parse_error; + } + break; + } + + // remaining bytes (80..C1 and F5..FF) are ill-formed + default: + { + error_message = "invalid string: ill-formed UTF-8 byte"; + return token_type::parse_error; + } + } + } + } + + /*! + * @brief scan a comment + * @return whether comment could be scanned successfully + */ + bool scan_comment() + { + switch (get()) + { + // single-line comments skip input until a newline or EOF is read + case '/': + { + while (true) + { + switch (get()) + { + case '\n': + case '\r': + case std::char_traits::eof(): + case '\0': + return true; + + default: + break; + } + } + } + + // multi-line comments skip input until */ is read + case '*': + { + while (true) + { + switch (get()) + { + case std::char_traits::eof(): + case '\0': + { + error_message = "invalid comment; missing closing '*/'"; + return false; + } + + case '*': + { + switch (get()) + { + case '/': + return true; + + default: + { + unget(); + continue; + } + } + } + + default: + continue; + } + } + } + + // unexpected character after reading '/' + default: + { + error_message = "invalid comment; expecting '/' or '*' after '/'"; + return false; + } + } + } + + JSON_HEDLEY_NON_NULL(2) + static void strtof(float& f, const char* str, char** endptr) noexcept + { + f = std::strtof(str, endptr); + } + + JSON_HEDLEY_NON_NULL(2) + static void strtof(double& f, const char* str, char** 
endptr) noexcept + { + f = std::strtod(str, endptr); + } + + JSON_HEDLEY_NON_NULL(2) + static void strtof(long double& f, const char* str, char** endptr) noexcept + { + f = std::strtold(str, endptr); + } + + /*! + @brief scan a number literal + + This function scans a string according to Sect. 6 of RFC 7159. + + The function is realized with a deterministic finite state machine derived + from the grammar described in RFC 7159. Starting in state "init", the + input is read and used to determined the next state. Only state "done" + accepts the number. State "error" is a trap state to model errors. In the + table below, "anything" means any character but the ones listed before. + + state | 0 | 1-9 | e E | + | - | . | anything + ---------|----------|----------|----------|---------|---------|----------|----------- + init | zero | any1 | [error] | [error] | minus | [error] | [error] + minus | zero | any1 | [error] | [error] | [error] | [error] | [error] + zero | done | done | exponent | done | done | decimal1 | done + any1 | any1 | any1 | exponent | done | done | decimal1 | done + decimal1 | decimal2 | decimal2 | [error] | [error] | [error] | [error] | [error] + decimal2 | decimal2 | decimal2 | exponent | done | done | done | done + exponent | any2 | any2 | [error] | sign | sign | [error] | [error] + sign | any2 | any2 | [error] | [error] | [error] | [error] | [error] + any2 | any2 | any2 | done | done | done | done | done + + The state machine is realized with one label per state (prefixed with + "scan_number_") and `goto` statements between them. The state machine + contains cycles, but any cycle can be left when EOF is read. Therefore, + the function is guaranteed to terminate. + + During scanning, the read bytes are stored in token_buffer. This string is + then converted to a signed integer, an unsigned integer, or a + floating-point number. 
+ + @return token_type::value_unsigned, token_type::value_integer, or + token_type::value_float if number could be successfully scanned, + token_type::parse_error otherwise + + @note The scanner is independent of the current locale. Internally, the + locale's decimal point is used instead of `.` to work with the + locale-dependent converters. + */ + token_type scan_number() // lgtm [cpp/use-of-goto] + { + // reset token_buffer to store the number's bytes + reset(); + + // the type of the parsed number; initially set to unsigned; will be + // changed if minus sign, decimal point or exponent is read + token_type number_type = token_type::value_unsigned; + + // state (init): we just found out we need to scan a number + switch (current) + { + case '-': + { + add(current); + goto scan_number_minus; + } + + case '0': + { + add(current); + goto scan_number_zero; + } + + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_any1; + } + + // all other characters are rejected outside scan_number() + default: // LCOV_EXCL_LINE + JSON_ASSERT(false); // LCOV_EXCL_LINE + } + +scan_number_minus: + // state: we just parsed a leading minus sign + number_type = token_type::value_integer; + switch (get()) + { + case '0': + { + add(current); + goto scan_number_zero; + } + + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_any1; + } + + default: + { + error_message = "invalid number; expected digit after '-'"; + return token_type::parse_error; + } + } + +scan_number_zero: + // state: we just parse a zero (maybe with a leading minus sign) + switch (get()) + { + case '.': + { + add(decimal_point_char); + goto scan_number_decimal1; + } + + case 'e': + case 'E': + { + add(current); + goto scan_number_exponent; + } + + default: + goto scan_number_done; + } + +scan_number_any1: + // state: we just parsed a 
number 0-9 (maybe with a leading minus sign) + switch (get()) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_any1; + } + + case '.': + { + add(decimal_point_char); + goto scan_number_decimal1; + } + + case 'e': + case 'E': + { + add(current); + goto scan_number_exponent; + } + + default: + goto scan_number_done; + } + +scan_number_decimal1: + // state: we just parsed a decimal point + number_type = token_type::value_float; + switch (get()) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_decimal2; + } + + default: + { + error_message = "invalid number; expected digit after '.'"; + return token_type::parse_error; + } + } + +scan_number_decimal2: + // we just parsed at least one number after a decimal point + switch (get()) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_decimal2; + } + + case 'e': + case 'E': + { + add(current); + goto scan_number_exponent; + } + + default: + goto scan_number_done; + } + +scan_number_exponent: + // we just parsed an exponent + number_type = token_type::value_float; + switch (get()) + { + case '+': + case '-': + { + add(current); + goto scan_number_sign; + } + + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_any2; + } + + default: + { + error_message = + "invalid number; expected '+', '-', or digit after exponent"; + return token_type::parse_error; + } + } + +scan_number_sign: + // we just parsed an exponent sign + switch (get()) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto 
scan_number_any2; + } + + default: + { + error_message = "invalid number; expected digit after exponent sign"; + return token_type::parse_error; + } + } + +scan_number_any2: + // we just parsed a number after the exponent or exponent sign + switch (get()) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_any2; + } + + default: + goto scan_number_done; + } + +scan_number_done: + // unget the character after the number (we only read it to know that + // we are done scanning a number) + unget(); + + char* endptr = nullptr; + errno = 0; + + // try to parse integers first and fall back to floats + if (number_type == token_type::value_unsigned) + { + const auto x = std::strtoull(token_buffer.data(), &endptr, 10); + + // we checked the number format before + JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size()); + + if (errno == 0) + { + value_unsigned = static_cast(x); + if (value_unsigned == x) + { + return token_type::value_unsigned; + } + } + } + else if (number_type == token_type::value_integer) + { + const auto x = std::strtoll(token_buffer.data(), &endptr, 10); + + // we checked the number format before + JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size()); + + if (errno == 0) + { + value_integer = static_cast(x); + if (value_integer == x) + { + return token_type::value_integer; + } + } + } + + // this code is reached if we parse a floating-point number or if an + // integer conversion above failed + strtof(value_float, token_buffer.data(), &endptr); + + // we checked the number format before + JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size()); + + return token_type::value_float; + } + + /*! 
+ @param[in] literal_text the literal text to expect + @param[in] length the length of the passed literal text + @param[in] return_type the token type to return on success + */ + JSON_HEDLEY_NON_NULL(2) + token_type scan_literal(const char_type* literal_text, const std::size_t length, + token_type return_type) + { + JSON_ASSERT(std::char_traits::to_char_type(current) == literal_text[0]); + for (std::size_t i = 1; i < length; ++i) + { + if (JSON_HEDLEY_UNLIKELY(std::char_traits::to_char_type(get()) != literal_text[i])) + { + error_message = "invalid literal"; + return token_type::parse_error; + } + } + return return_type; + } + + ///////////////////// + // input management + ///////////////////// + + /// reset token_buffer; current character is beginning of token + void reset() noexcept + { + token_buffer.clear(); + token_string.clear(); + token_string.push_back(std::char_traits::to_char_type(current)); + } + + /* + @brief get next character from the input + + This function provides the interface to the used input adapter. It does + not throw in case the input reached EOF, but returns a + `std::char_traits::eof()` in that case. Stores the scanned characters + for use in error messages. + + @return character read from the input + */ + char_int_type get() + { + ++position.chars_read_total; + ++position.chars_read_current_line; + + if (next_unget) + { + // just reset the next_unget variable and work with current + next_unget = false; + } + else + { + current = ia.get_character(); + } + + if (JSON_HEDLEY_LIKELY(current != std::char_traits::eof())) + { + token_string.push_back(std::char_traits::to_char_type(current)); + } + + if (current == '\n') + { + ++position.lines_read; + position.chars_read_current_line = 0; + } + + return current; + } + + /*! + @brief unget current character (read it again on next get) + + We implement unget by setting variable next_unget to true. 
The input is not + changed - we just simulate ungetting by modifying chars_read_total, + chars_read_current_line, and token_string. The next call to get() will + behave as if the unget character is read again. + */ + void unget() + { + next_unget = true; + + --position.chars_read_total; + + // in case we "unget" a newline, we have to also decrement the lines_read + if (position.chars_read_current_line == 0) + { + if (position.lines_read > 0) + { + --position.lines_read; + } + } + else + { + --position.chars_read_current_line; + } + + if (JSON_HEDLEY_LIKELY(current != std::char_traits::eof())) + { + JSON_ASSERT(!token_string.empty()); + token_string.pop_back(); + } + } + + /// add a character to token_buffer + void add(char_int_type c) + { + token_buffer.push_back(static_cast(c)); + } + + public: + ///////////////////// + // value getters + ///////////////////// + + /// return integer value + constexpr number_integer_t get_number_integer() const noexcept + { + return value_integer; + } + + /// return unsigned integer value + constexpr number_unsigned_t get_number_unsigned() const noexcept + { + return value_unsigned; + } + + /// return floating-point value + constexpr number_float_t get_number_float() const noexcept + { + return value_float; + } + + /// return current string value (implicitly resets the token; useful only once) + string_t& get_string() + { + return token_buffer; + } + + ///////////////////// + // diagnostics + ///////////////////// + + /// return position of last read token + constexpr position_t get_position() const noexcept + { + return position; + } + + /// return the last read token (for errors only). Will never contain EOF + /// (an arbitrary value that is not a valid char value, often -1), because + /// 255 may legitimately occur. May contain NUL, which should be escaped. 
+ std::string get_token_string() const + { + // escape control characters + std::string result; + for (const auto c : token_string) + { + if (static_cast(c) <= '\x1F') + { + // escape control characters + std::array cs{{}}; + (std::snprintf)(cs.data(), cs.size(), "", static_cast(c)); + result += cs.data(); + } + else + { + // add character as is + result.push_back(static_cast(c)); + } + } + + return result; + } + + /// return syntax error message + JSON_HEDLEY_RETURNS_NON_NULL + constexpr const char* get_error_message() const noexcept + { + return error_message; + } + + ///////////////////// + // actual scanner + ///////////////////// + + /*! + @brief skip the UTF-8 byte order mark + @return true iff there is no BOM or the correct BOM has been skipped + */ + bool skip_bom() + { + if (get() == 0xEF) + { + // check if we completely parse the BOM + return get() == 0xBB && get() == 0xBF; + } + + // the first character is not the beginning of the BOM; unget it to + // process is later + unget(); + return true; + } + + void skip_whitespace() + { + do + { + get(); + } + while (current == ' ' || current == '\t' || current == '\n' || current == '\r'); + } + + token_type scan() + { + // initially, skip the BOM + if (position.chars_read_total == 0 && !skip_bom()) + { + error_message = "invalid BOM; must be 0xEF 0xBB 0xBF if given"; + return token_type::parse_error; + } + + // read next character and ignore whitespace + skip_whitespace(); + + // ignore comments + while (ignore_comments && current == '/') + { + if (!scan_comment()) + { + return token_type::parse_error; + } + + // skip following whitespace + skip_whitespace(); + } + + switch (current) + { + // structural characters + case '[': + return token_type::begin_array; + case ']': + return token_type::end_array; + case '{': + return token_type::begin_object; + case '}': + return token_type::end_object; + case ':': + return token_type::name_separator; + case ',': + return token_type::value_separator; + + // literals + 
case 't': + { + std::array true_literal = {{'t', 'r', 'u', 'e'}}; + return scan_literal(true_literal.data(), true_literal.size(), token_type::literal_true); + } + case 'f': + { + std::array false_literal = {{'f', 'a', 'l', 's', 'e'}}; + return scan_literal(false_literal.data(), false_literal.size(), token_type::literal_false); + } + case 'n': + { + std::array null_literal = {{'n', 'u', 'l', 'l'}}; + return scan_literal(null_literal.data(), null_literal.size(), token_type::literal_null); + } + + // string + case '\"': + return scan_string(); + + // number + case '-': + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + return scan_number(); + + // end of input (the null byte is needed when parsing from + // string literals) + case '\0': + case std::char_traits::eof(): + return token_type::end_of_input; + + // error + default: + error_message = "invalid literal"; + return token_type::parse_error; + } + } + + private: + /// input adapter + InputAdapterType ia; + + /// whether comments should be ignored (true) or signaled as errors (false) + const bool ignore_comments = false; + + /// the current character + char_int_type current = std::char_traits::eof(); + + /// whether the next get() call should just return current + bool next_unget = false; + + /// the start position of the current token + position_t position {}; + + /// raw input token string (for error messages) + std::vector token_string {}; + + /// buffer for variable-length tokens (numbers, strings) + string_t token_buffer {}; + + /// a description of occurred lexer errors + const char* error_message = ""; + + // number values + number_integer_t value_integer = 0; + number_unsigned_t value_unsigned = 0; + number_float_t value_float = 0; + + /// the decimal point + const char_int_type decimal_point_char = '.'; +}; +} // namespace detail +} // namespace nlohmann + +// #include + +// #include + + +#include // size_t +#include // declval 
+#include // string + +// #include + +// #include + + +namespace nlohmann +{ +namespace detail +{ +template +using null_function_t = decltype(std::declval().null()); + +template +using boolean_function_t = + decltype(std::declval().boolean(std::declval())); + +template +using number_integer_function_t = + decltype(std::declval().number_integer(std::declval())); + +template +using number_unsigned_function_t = + decltype(std::declval().number_unsigned(std::declval())); + +template +using number_float_function_t = decltype(std::declval().number_float( + std::declval(), std::declval())); + +template +using string_function_t = + decltype(std::declval().string(std::declval())); + +template +using binary_function_t = + decltype(std::declval().binary(std::declval())); + +template +using start_object_function_t = + decltype(std::declval().start_object(std::declval())); + +template +using key_function_t = + decltype(std::declval().key(std::declval())); + +template +using end_object_function_t = decltype(std::declval().end_object()); + +template +using start_array_function_t = + decltype(std::declval().start_array(std::declval())); + +template +using end_array_function_t = decltype(std::declval().end_array()); + +template +using parse_error_function_t = decltype(std::declval().parse_error( + std::declval(), std::declval(), + std::declval())); + +template +struct is_sax +{ + private: + static_assert(is_basic_json::value, + "BasicJsonType must be of type basic_json<...>"); + + using number_integer_t = typename BasicJsonType::number_integer_t; + using number_unsigned_t = typename BasicJsonType::number_unsigned_t; + using number_float_t = typename BasicJsonType::number_float_t; + using string_t = typename BasicJsonType::string_t; + using binary_t = typename BasicJsonType::binary_t; + using exception_t = typename BasicJsonType::exception; + + public: + static constexpr bool value = + is_detected_exact::value && + is_detected_exact::value && + is_detected_exact::value && + 
is_detected_exact::value && + is_detected_exact::value && + is_detected_exact::value && + is_detected_exact::value && + is_detected_exact::value && + is_detected_exact::value && + is_detected_exact::value && + is_detected_exact::value && + is_detected_exact::value && + is_detected_exact::value; +}; + +template +struct is_sax_static_asserts +{ + private: + static_assert(is_basic_json::value, + "BasicJsonType must be of type basic_json<...>"); + + using number_integer_t = typename BasicJsonType::number_integer_t; + using number_unsigned_t = typename BasicJsonType::number_unsigned_t; + using number_float_t = typename BasicJsonType::number_float_t; + using string_t = typename BasicJsonType::string_t; + using binary_t = typename BasicJsonType::binary_t; + using exception_t = typename BasicJsonType::exception; + + public: + static_assert(is_detected_exact::value, + "Missing/invalid function: bool null()"); + static_assert(is_detected_exact::value, + "Missing/invalid function: bool boolean(bool)"); + static_assert(is_detected_exact::value, + "Missing/invalid function: bool boolean(bool)"); + static_assert( + is_detected_exact::value, + "Missing/invalid function: bool number_integer(number_integer_t)"); + static_assert( + is_detected_exact::value, + "Missing/invalid function: bool number_unsigned(number_unsigned_t)"); + static_assert(is_detected_exact::value, + "Missing/invalid function: bool number_float(number_float_t, const string_t&)"); + static_assert( + is_detected_exact::value, + "Missing/invalid function: bool string(string_t&)"); + static_assert( + is_detected_exact::value, + "Missing/invalid function: bool binary(binary_t&)"); + static_assert(is_detected_exact::value, + "Missing/invalid function: bool start_object(std::size_t)"); + static_assert(is_detected_exact::value, + "Missing/invalid function: bool key(string_t&)"); + static_assert(is_detected_exact::value, + "Missing/invalid function: bool end_object()"); + static_assert(is_detected_exact::value, + 
"Missing/invalid function: bool start_array(std::size_t)"); + static_assert(is_detected_exact::value, + "Missing/invalid function: bool end_array()"); + static_assert( + is_detected_exact::value, + "Missing/invalid function: bool parse_error(std::size_t, const " + "std::string&, const exception&)"); +}; +} // namespace detail +} // namespace nlohmann + +// #include + + +namespace nlohmann +{ +namespace detail +{ + +/// how to treat CBOR tags +enum class cbor_tag_handler_t +{ + error, ///< throw a parse_error exception in case of a tag + ignore ///< ignore tags +}; + +/*! +@brief determine system byte order + +@return true if and only if system's byte order is little endian + +@note from https://stackoverflow.com/a/1001328/266378 +*/ +static inline bool little_endianess(int num = 1) noexcept +{ + return *reinterpret_cast(&num) == 1; +} + + +/////////////////// +// binary reader // +/////////////////// + +/*! +@brief deserialization of CBOR, MessagePack, and UBJSON values +*/ +template> +class binary_reader +{ + using number_integer_t = typename BasicJsonType::number_integer_t; + using number_unsigned_t = typename BasicJsonType::number_unsigned_t; + using number_float_t = typename BasicJsonType::number_float_t; + using string_t = typename BasicJsonType::string_t; + using binary_t = typename BasicJsonType::binary_t; + using json_sax_t = SAX; + using char_type = typename InputAdapterType::char_type; + using char_int_type = typename std::char_traits::int_type; + + public: + /*! + @brief create a binary reader + + @param[in] adapter input adapter to read from + */ + explicit binary_reader(InputAdapterType&& adapter) : ia(std::move(adapter)) + { + (void)detail::is_sax_static_asserts {}; + } + + // make class move-only + binary_reader(const binary_reader&) = delete; + binary_reader(binary_reader&&) = default; + binary_reader& operator=(const binary_reader&) = delete; + binary_reader& operator=(binary_reader&&) = default; + ~binary_reader() = default; + + /*! 
+ @param[in] format the binary format to parse + @param[in] sax_ a SAX event processor + @param[in] strict whether to expect the input to be consumed completed + @param[in] tag_handler how to treat CBOR tags + + @return + */ + JSON_HEDLEY_NON_NULL(3) + bool sax_parse(const input_format_t format, + json_sax_t* sax_, + const bool strict = true, + const cbor_tag_handler_t tag_handler = cbor_tag_handler_t::error) + { + sax = sax_; + bool result = false; + + switch (format) + { + case input_format_t::bson: + result = parse_bson_internal(); + break; + + case input_format_t::cbor: + result = parse_cbor_internal(true, tag_handler); + break; + + case input_format_t::msgpack: + result = parse_msgpack_internal(); + break; + + case input_format_t::ubjson: + result = parse_ubjson_internal(); + break; + + default: // LCOV_EXCL_LINE + JSON_ASSERT(false); // LCOV_EXCL_LINE + } + + // strict mode: next byte must be EOF + if (result && strict) + { + if (format == input_format_t::ubjson) + { + get_ignore_noop(); + } + else + { + get(); + } + + if (JSON_HEDLEY_UNLIKELY(current != std::char_traits::eof())) + { + return sax->parse_error(chars_read, get_token_string(), + parse_error::create(110, chars_read, exception_message(format, "expected end of input; last byte: 0x" + get_token_string(), "value"))); + } + } + + return result; + } + + private: + ////////// + // BSON // + ////////// + + /*! + @brief Reads in a BSON-object and passes it to the SAX-parser. + @return whether a valid BSON-value was passed to the SAX parser + */ + bool parse_bson_internal() + { + std::int32_t document_size{}; + get_number(input_format_t::bson, document_size); + + if (JSON_HEDLEY_UNLIKELY(!sax->start_object(std::size_t(-1)))) + { + return false; + } + + if (JSON_HEDLEY_UNLIKELY(!parse_bson_element_list(/*is_array*/false))) + { + return false; + } + + return sax->end_object(); + } + + /*! + @brief Parses a C-style string from the BSON input. 
+ @param[in, out] result A reference to the string variable where the read + string is to be stored. + @return `true` if the \x00-byte indicating the end of the string was + encountered before the EOF; false` indicates an unexpected EOF. + */ + bool get_bson_cstr(string_t& result) + { + auto out = std::back_inserter(result); + while (true) + { + get(); + if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::bson, "cstring"))) + { + return false; + } + if (current == 0x00) + { + return true; + } + *out++ = static_cast(current); + } + } + + /*! + @brief Parses a zero-terminated string of length @a len from the BSON + input. + @param[in] len The length (including the zero-byte at the end) of the + string to be read. + @param[in, out] result A reference to the string variable where the read + string is to be stored. + @tparam NumberType The type of the length @a len + @pre len >= 1 + @return `true` if the string was successfully parsed + */ + template + bool get_bson_string(const NumberType len, string_t& result) + { + if (JSON_HEDLEY_UNLIKELY(len < 1)) + { + auto last_token = get_token_string(); + return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, exception_message(input_format_t::bson, "string length must be at least 1, is " + std::to_string(len), "string"))); + } + + return get_string(input_format_t::bson, len - static_cast(1), result) && get() != std::char_traits::eof(); + } + + /*! + @brief Parses a byte array input of length @a len from the BSON input. + @param[in] len The length of the byte array to be read. + @param[in, out] result A reference to the binary variable where the read + array is to be stored. 
+ @tparam NumberType The type of the length @a len + @pre len >= 0 + @return `true` if the byte array was successfully parsed + */ + template + bool get_bson_binary(const NumberType len, binary_t& result) + { + if (JSON_HEDLEY_UNLIKELY(len < 0)) + { + auto last_token = get_token_string(); + return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, exception_message(input_format_t::bson, "byte array length cannot be negative, is " + std::to_string(len), "binary"))); + } + + // All BSON binary values have a subtype + std::uint8_t subtype{}; + get_number(input_format_t::bson, subtype); + result.set_subtype(subtype); + + return get_binary(input_format_t::bson, len, result); + } + + /*! + @brief Read a BSON document element of the given @a element_type. + @param[in] element_type The BSON element type, c.f. http://bsonspec.org/spec.html + @param[in] element_type_parse_position The position in the input stream, + where the `element_type` was read. + @warning Not all BSON element types are supported yet. An unsupported + @a element_type will give rise to a parse_error.114: + Unsupported BSON record type 0x... 
+ @return whether a valid BSON-object/array was passed to the SAX parser + */ + bool parse_bson_element_internal(const char_int_type element_type, + const std::size_t element_type_parse_position) + { + switch (element_type) + { + case 0x01: // double + { + double number{}; + return get_number(input_format_t::bson, number) && sax->number_float(static_cast(number), ""); + } + + case 0x02: // string + { + std::int32_t len{}; + string_t value; + return get_number(input_format_t::bson, len) && get_bson_string(len, value) && sax->string(value); + } + + case 0x03: // object + { + return parse_bson_internal(); + } + + case 0x04: // array + { + return parse_bson_array(); + } + + case 0x05: // binary + { + std::int32_t len{}; + binary_t value; + return get_number(input_format_t::bson, len) && get_bson_binary(len, value) && sax->binary(value); + } + + case 0x08: // boolean + { + return sax->boolean(get() != 0); + } + + case 0x0A: // null + { + return sax->null(); + } + + case 0x10: // int32 + { + std::int32_t value{}; + return get_number(input_format_t::bson, value) && sax->number_integer(value); + } + + case 0x12: // int64 + { + std::int64_t value{}; + return get_number(input_format_t::bson, value) && sax->number_integer(value); + } + + default: // anything else not supported (yet) + { + std::array cr{{}}; + (std::snprintf)(cr.data(), cr.size(), "%.2hhX", static_cast(element_type)); + return sax->parse_error(element_type_parse_position, std::string(cr.data()), parse_error::create(114, element_type_parse_position, "Unsupported BSON record type 0x" + std::string(cr.data()))); + } + } + } + + /*! + @brief Read a BSON element list (as specified in the BSON-spec) + + The same binary layout is used for objects and arrays, hence it must be + indicated with the argument @a is_array which one is expected + (true --> array, false --> object). 
+ + @param[in] is_array Determines if the element list being read is to be + treated as an object (@a is_array == false), or as an + array (@a is_array == true). + @return whether a valid BSON-object/array was passed to the SAX parser + */ + bool parse_bson_element_list(const bool is_array) + { + string_t key; + + while (auto element_type = get()) + { + if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::bson, "element list"))) + { + return false; + } + + const std::size_t element_type_parse_position = chars_read; + if (JSON_HEDLEY_UNLIKELY(!get_bson_cstr(key))) + { + return false; + } + + if (!is_array && !sax->key(key)) + { + return false; + } + + if (JSON_HEDLEY_UNLIKELY(!parse_bson_element_internal(element_type, element_type_parse_position))) + { + return false; + } + + // get_bson_cstr only appends + key.clear(); + } + + return true; + } + + /*! + @brief Reads an array from the BSON input and passes it to the SAX-parser. + @return whether a valid BSON-array was passed to the SAX parser + */ + bool parse_bson_array() + { + std::int32_t document_size{}; + get_number(input_format_t::bson, document_size); + + if (JSON_HEDLEY_UNLIKELY(!sax->start_array(std::size_t(-1)))) + { + return false; + } + + if (JSON_HEDLEY_UNLIKELY(!parse_bson_element_list(/*is_array*/true))) + { + return false; + } + + return sax->end_array(); + } + + ////////// + // CBOR // + ////////// + + /*! + @param[in] get_char whether a new character should be retrieved from the + input (true) or whether the last read character should + be considered instead (false) + @param[in] tag_handler how CBOR tags should be treated + + @return whether a valid CBOR value was passed to the SAX parser + */ + bool parse_cbor_internal(const bool get_char, + const cbor_tag_handler_t tag_handler) + { + switch (get_char ? 
get() : current) + { + // EOF + case std::char_traits::eof(): + return unexpect_eof(input_format_t::cbor, "value"); + + // Integer 0x00..0x17 (0..23) + case 0x00: + case 0x01: + case 0x02: + case 0x03: + case 0x04: + case 0x05: + case 0x06: + case 0x07: + case 0x08: + case 0x09: + case 0x0A: + case 0x0B: + case 0x0C: + case 0x0D: + case 0x0E: + case 0x0F: + case 0x10: + case 0x11: + case 0x12: + case 0x13: + case 0x14: + case 0x15: + case 0x16: + case 0x17: + return sax->number_unsigned(static_cast(current)); + + case 0x18: // Unsigned integer (one-byte uint8_t follows) + { + std::uint8_t number{}; + return get_number(input_format_t::cbor, number) && sax->number_unsigned(number); + } + + case 0x19: // Unsigned integer (two-byte uint16_t follows) + { + std::uint16_t number{}; + return get_number(input_format_t::cbor, number) && sax->number_unsigned(number); + } + + case 0x1A: // Unsigned integer (four-byte uint32_t follows) + { + std::uint32_t number{}; + return get_number(input_format_t::cbor, number) && sax->number_unsigned(number); + } + + case 0x1B: // Unsigned integer (eight-byte uint64_t follows) + { + std::uint64_t number{}; + return get_number(input_format_t::cbor, number) && sax->number_unsigned(number); + } + + // Negative integer -1-0x00..-1-0x17 (-1..-24) + case 0x20: + case 0x21: + case 0x22: + case 0x23: + case 0x24: + case 0x25: + case 0x26: + case 0x27: + case 0x28: + case 0x29: + case 0x2A: + case 0x2B: + case 0x2C: + case 0x2D: + case 0x2E: + case 0x2F: + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + return sax->number_integer(static_cast(0x20 - 1 - current)); + + case 0x38: // Negative integer (one-byte uint8_t follows) + { + std::uint8_t number{}; + return get_number(input_format_t::cbor, number) && sax->number_integer(static_cast(-1) - number); + } + + case 0x39: // Negative integer -1-n (two-byte uint16_t follows) + { + std::uint16_t number{}; + return get_number(input_format_t::cbor, 
number) && sax->number_integer(static_cast(-1) - number); + } + + case 0x3A: // Negative integer -1-n (four-byte uint32_t follows) + { + std::uint32_t number{}; + return get_number(input_format_t::cbor, number) && sax->number_integer(static_cast(-1) - number); + } + + case 0x3B: // Negative integer -1-n (eight-byte uint64_t follows) + { + std::uint64_t number{}; + return get_number(input_format_t::cbor, number) && sax->number_integer(static_cast(-1) + - static_cast(number)); + } + + // Binary data (0x00..0x17 bytes follow) + case 0x40: + case 0x41: + case 0x42: + case 0x43: + case 0x44: + case 0x45: + case 0x46: + case 0x47: + case 0x48: + case 0x49: + case 0x4A: + case 0x4B: + case 0x4C: + case 0x4D: + case 0x4E: + case 0x4F: + case 0x50: + case 0x51: + case 0x52: + case 0x53: + case 0x54: + case 0x55: + case 0x56: + case 0x57: + case 0x58: // Binary data (one-byte uint8_t for n follows) + case 0x59: // Binary data (two-byte uint16_t for n follow) + case 0x5A: // Binary data (four-byte uint32_t for n follow) + case 0x5B: // Binary data (eight-byte uint64_t for n follow) + case 0x5F: // Binary data (indefinite length) + { + binary_t b; + return get_cbor_binary(b) && sax->binary(b); + } + + // UTF-8 string (0x00..0x17 bytes follow) + case 0x60: + case 0x61: + case 0x62: + case 0x63: + case 0x64: + case 0x65: + case 0x66: + case 0x67: + case 0x68: + case 0x69: + case 0x6A: + case 0x6B: + case 0x6C: + case 0x6D: + case 0x6E: + case 0x6F: + case 0x70: + case 0x71: + case 0x72: + case 0x73: + case 0x74: + case 0x75: + case 0x76: + case 0x77: + case 0x78: // UTF-8 string (one-byte uint8_t for n follows) + case 0x79: // UTF-8 string (two-byte uint16_t for n follow) + case 0x7A: // UTF-8 string (four-byte uint32_t for n follow) + case 0x7B: // UTF-8 string (eight-byte uint64_t for n follow) + case 0x7F: // UTF-8 string (indefinite length) + { + string_t s; + return get_cbor_string(s) && sax->string(s); + } + + // array (0x00..0x17 data items follow) + case 0x80: + case 
0x81: + case 0x82: + case 0x83: + case 0x84: + case 0x85: + case 0x86: + case 0x87: + case 0x88: + case 0x89: + case 0x8A: + case 0x8B: + case 0x8C: + case 0x8D: + case 0x8E: + case 0x8F: + case 0x90: + case 0x91: + case 0x92: + case 0x93: + case 0x94: + case 0x95: + case 0x96: + case 0x97: + return get_cbor_array(static_cast(static_cast(current) & 0x1Fu), tag_handler); + + case 0x98: // array (one-byte uint8_t for n follows) + { + std::uint8_t len{}; + return get_number(input_format_t::cbor, len) && get_cbor_array(static_cast(len), tag_handler); + } + + case 0x99: // array (two-byte uint16_t for n follow) + { + std::uint16_t len{}; + return get_number(input_format_t::cbor, len) && get_cbor_array(static_cast(len), tag_handler); + } + + case 0x9A: // array (four-byte uint32_t for n follow) + { + std::uint32_t len{}; + return get_number(input_format_t::cbor, len) && get_cbor_array(static_cast(len), tag_handler); + } + + case 0x9B: // array (eight-byte uint64_t for n follow) + { + std::uint64_t len{}; + return get_number(input_format_t::cbor, len) && get_cbor_array(static_cast(len), tag_handler); + } + + case 0x9F: // array (indefinite length) + return get_cbor_array(std::size_t(-1), tag_handler); + + // map (0x00..0x17 pairs of data items follow) + case 0xA0: + case 0xA1: + case 0xA2: + case 0xA3: + case 0xA4: + case 0xA5: + case 0xA6: + case 0xA7: + case 0xA8: + case 0xA9: + case 0xAA: + case 0xAB: + case 0xAC: + case 0xAD: + case 0xAE: + case 0xAF: + case 0xB0: + case 0xB1: + case 0xB2: + case 0xB3: + case 0xB4: + case 0xB5: + case 0xB6: + case 0xB7: + return get_cbor_object(static_cast(static_cast(current) & 0x1Fu), tag_handler); + + case 0xB8: // map (one-byte uint8_t for n follows) + { + std::uint8_t len{}; + return get_number(input_format_t::cbor, len) && get_cbor_object(static_cast(len), tag_handler); + } + + case 0xB9: // map (two-byte uint16_t for n follow) + { + std::uint16_t len{}; + return get_number(input_format_t::cbor, len) && 
get_cbor_object(static_cast(len), tag_handler); + } + + case 0xBA: // map (four-byte uint32_t for n follow) + { + std::uint32_t len{}; + return get_number(input_format_t::cbor, len) && get_cbor_object(static_cast(len), tag_handler); + } + + case 0xBB: // map (eight-byte uint64_t for n follow) + { + std::uint64_t len{}; + return get_number(input_format_t::cbor, len) && get_cbor_object(static_cast(len), tag_handler); + } + + case 0xBF: // map (indefinite length) + return get_cbor_object(std::size_t(-1), tag_handler); + + case 0xC6: // tagged item + case 0xC7: + case 0xC8: + case 0xC9: + case 0xCA: + case 0xCB: + case 0xCC: + case 0xCD: + case 0xCE: + case 0xCF: + case 0xD0: + case 0xD1: + case 0xD2: + case 0xD3: + case 0xD4: + case 0xD8: // tagged item (1 bytes follow) + case 0xD9: // tagged item (2 bytes follow) + case 0xDA: // tagged item (4 bytes follow) + case 0xDB: // tagged item (8 bytes follow) + { + switch (tag_handler) + { + case cbor_tag_handler_t::error: + { + auto last_token = get_token_string(); + return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, exception_message(input_format_t::cbor, "invalid byte: 0x" + last_token, "value"))); + } + + case cbor_tag_handler_t::ignore: + { + switch (current) + { + case 0xD8: + { + std::uint8_t len{}; + get_number(input_format_t::cbor, len); + break; + } + case 0xD9: + { + std::uint16_t len{}; + get_number(input_format_t::cbor, len); + break; + } + case 0xDA: + { + std::uint32_t len{}; + get_number(input_format_t::cbor, len); + break; + } + case 0xDB: + { + std::uint64_t len{}; + get_number(input_format_t::cbor, len); + break; + } + default: + break; + } + return parse_cbor_internal(true, tag_handler); + } + + default: // LCOV_EXCL_LINE + JSON_ASSERT(false); // LCOV_EXCL_LINE + } + } + + case 0xF4: // false + return sax->boolean(false); + + case 0xF5: // true + return sax->boolean(true); + + case 0xF6: // null + return sax->null(); + + case 0xF9: // Half-Precision Float (two-byte IEEE 
754) + { + const auto byte1_raw = get(); + if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::cbor, "number"))) + { + return false; + } + const auto byte2_raw = get(); + if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::cbor, "number"))) + { + return false; + } + + const auto byte1 = static_cast(byte1_raw); + const auto byte2 = static_cast(byte2_raw); + + // code from RFC 7049, Appendix D, Figure 3: + // As half-precision floating-point numbers were only added + // to IEEE 754 in 2008, today's programming platforms often + // still only have limited support for them. It is very + // easy to include at least decoding support for them even + // without such support. An example of a small decoder for + // half-precision floating-point numbers in the C language + // is shown in Fig. 3. + const auto half = static_cast((byte1 << 8u) + byte2); + const double val = [&half] + { + const int exp = (half >> 10u) & 0x1Fu; + const unsigned int mant = half & 0x3FFu; + JSON_ASSERT(0 <= exp&& exp <= 32); + JSON_ASSERT(mant <= 1024); + switch (exp) + { + case 0: + return std::ldexp(mant, -24); + case 31: + return (mant == 0) + ? std::numeric_limits::infinity() + : std::numeric_limits::quiet_NaN(); + default: + return std::ldexp(mant + 1024, exp - 25); + } + }(); + return sax->number_float((half & 0x8000u) != 0 + ? 
static_cast(-val) + : static_cast(val), ""); + } + + case 0xFA: // Single-Precision Float (four-byte IEEE 754) + { + float number{}; + return get_number(input_format_t::cbor, number) && sax->number_float(static_cast(number), ""); + } + + case 0xFB: // Double-Precision Float (eight-byte IEEE 754) + { + double number{}; + return get_number(input_format_t::cbor, number) && sax->number_float(static_cast(number), ""); + } + + default: // anything else (0xFF is handled inside the other types) + { + auto last_token = get_token_string(); + return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, exception_message(input_format_t::cbor, "invalid byte: 0x" + last_token, "value"))); + } + } + } + + /*! + @brief reads a CBOR string + + This function first reads starting bytes to determine the expected + string length and then copies this number of bytes into a string. + Additionally, CBOR's strings with indefinite lengths are supported. + + @param[out] result created string + + @return whether string creation completed + */ + bool get_cbor_string(string_t& result) + { + if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::cbor, "string"))) + { + return false; + } + + switch (current) + { + // UTF-8 string (0x00..0x17 bytes follow) + case 0x60: + case 0x61: + case 0x62: + case 0x63: + case 0x64: + case 0x65: + case 0x66: + case 0x67: + case 0x68: + case 0x69: + case 0x6A: + case 0x6B: + case 0x6C: + case 0x6D: + case 0x6E: + case 0x6F: + case 0x70: + case 0x71: + case 0x72: + case 0x73: + case 0x74: + case 0x75: + case 0x76: + case 0x77: + { + return get_string(input_format_t::cbor, static_cast(current) & 0x1Fu, result); + } + + case 0x78: // UTF-8 string (one-byte uint8_t for n follows) + { + std::uint8_t len{}; + return get_number(input_format_t::cbor, len) && get_string(input_format_t::cbor, len, result); + } + + case 0x79: // UTF-8 string (two-byte uint16_t for n follow) + { + std::uint16_t len{}; + return get_number(input_format_t::cbor, len) 
&& get_string(input_format_t::cbor, len, result); + } + + case 0x7A: // UTF-8 string (four-byte uint32_t for n follow) + { + std::uint32_t len{}; + return get_number(input_format_t::cbor, len) && get_string(input_format_t::cbor, len, result); + } + + case 0x7B: // UTF-8 string (eight-byte uint64_t for n follow) + { + std::uint64_t len{}; + return get_number(input_format_t::cbor, len) && get_string(input_format_t::cbor, len, result); + } + + case 0x7F: // UTF-8 string (indefinite length) + { + while (get() != 0xFF) + { + string_t chunk; + if (!get_cbor_string(chunk)) + { + return false; + } + result.append(chunk); + } + return true; + } + + default: + { + auto last_token = get_token_string(); + return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, exception_message(input_format_t::cbor, "expected length specification (0x60-0x7B) or indefinite string type (0x7F); last byte: 0x" + last_token, "string"))); + } + } + } + + /*! + @brief reads a CBOR byte array + + This function first reads starting bytes to determine the expected + byte array length and then copies this number of bytes into the byte array. + Additionally, CBOR's byte arrays with indefinite lengths are supported. 
+ + @param[out] result created byte array + + @return whether byte array creation completed + */ + bool get_cbor_binary(binary_t& result) + { + if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::cbor, "binary"))) + { + return false; + } + + switch (current) + { + // Binary data (0x00..0x17 bytes follow) + case 0x40: + case 0x41: + case 0x42: + case 0x43: + case 0x44: + case 0x45: + case 0x46: + case 0x47: + case 0x48: + case 0x49: + case 0x4A: + case 0x4B: + case 0x4C: + case 0x4D: + case 0x4E: + case 0x4F: + case 0x50: + case 0x51: + case 0x52: + case 0x53: + case 0x54: + case 0x55: + case 0x56: + case 0x57: + { + return get_binary(input_format_t::cbor, static_cast(current) & 0x1Fu, result); + } + + case 0x58: // Binary data (one-byte uint8_t for n follows) + { + std::uint8_t len{}; + return get_number(input_format_t::cbor, len) && + get_binary(input_format_t::cbor, len, result); + } + + case 0x59: // Binary data (two-byte uint16_t for n follow) + { + std::uint16_t len{}; + return get_number(input_format_t::cbor, len) && + get_binary(input_format_t::cbor, len, result); + } + + case 0x5A: // Binary data (four-byte uint32_t for n follow) + { + std::uint32_t len{}; + return get_number(input_format_t::cbor, len) && + get_binary(input_format_t::cbor, len, result); + } + + case 0x5B: // Binary data (eight-byte uint64_t for n follow) + { + std::uint64_t len{}; + return get_number(input_format_t::cbor, len) && + get_binary(input_format_t::cbor, len, result); + } + + case 0x5F: // Binary data (indefinite length) + { + while (get() != 0xFF) + { + binary_t chunk; + if (!get_cbor_binary(chunk)) + { + return false; + } + result.insert(result.end(), chunk.begin(), chunk.end()); + } + return true; + } + + default: + { + auto last_token = get_token_string(); + return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, exception_message(input_format_t::cbor, "expected length specification (0x40-0x5B) or indefinite binary array type (0x5F); last byte: 
0x" + last_token, "binary"))); + } + } + } + + /*! + @param[in] len the length of the array or std::size_t(-1) for an + array of indefinite size + @param[in] tag_handler how CBOR tags should be treated + @return whether array creation completed + */ + bool get_cbor_array(const std::size_t len, + const cbor_tag_handler_t tag_handler) + { + if (JSON_HEDLEY_UNLIKELY(!sax->start_array(len))) + { + return false; + } + + if (len != std::size_t(-1)) + { + for (std::size_t i = 0; i < len; ++i) + { + if (JSON_HEDLEY_UNLIKELY(!parse_cbor_internal(true, tag_handler))) + { + return false; + } + } + } + else + { + while (get() != 0xFF) + { + if (JSON_HEDLEY_UNLIKELY(!parse_cbor_internal(false, tag_handler))) + { + return false; + } + } + } + + return sax->end_array(); + } + + /*! + @param[in] len the length of the object or std::size_t(-1) for an + object of indefinite size + @param[in] tag_handler how CBOR tags should be treated + @return whether object creation completed + */ + bool get_cbor_object(const std::size_t len, + const cbor_tag_handler_t tag_handler) + { + if (JSON_HEDLEY_UNLIKELY(!sax->start_object(len))) + { + return false; + } + + string_t key; + if (len != std::size_t(-1)) + { + for (std::size_t i = 0; i < len; ++i) + { + get(); + if (JSON_HEDLEY_UNLIKELY(!get_cbor_string(key) || !sax->key(key))) + { + return false; + } + + if (JSON_HEDLEY_UNLIKELY(!parse_cbor_internal(true, tag_handler))) + { + return false; + } + key.clear(); + } + } + else + { + while (get() != 0xFF) + { + if (JSON_HEDLEY_UNLIKELY(!get_cbor_string(key) || !sax->key(key))) + { + return false; + } + + if (JSON_HEDLEY_UNLIKELY(!parse_cbor_internal(true, tag_handler))) + { + return false; + } + key.clear(); + } + } + + return sax->end_object(); + } + + ///////////// + // MsgPack // + ///////////// + + /*! 
+ @return whether a valid MessagePack value was passed to the SAX parser + */ + bool parse_msgpack_internal() + { + switch (get()) + { + // EOF + case std::char_traits::eof(): + return unexpect_eof(input_format_t::msgpack, "value"); + + // positive fixint + case 0x00: + case 0x01: + case 0x02: + case 0x03: + case 0x04: + case 0x05: + case 0x06: + case 0x07: + case 0x08: + case 0x09: + case 0x0A: + case 0x0B: + case 0x0C: + case 0x0D: + case 0x0E: + case 0x0F: + case 0x10: + case 0x11: + case 0x12: + case 0x13: + case 0x14: + case 0x15: + case 0x16: + case 0x17: + case 0x18: + case 0x19: + case 0x1A: + case 0x1B: + case 0x1C: + case 0x1D: + case 0x1E: + case 0x1F: + case 0x20: + case 0x21: + case 0x22: + case 0x23: + case 0x24: + case 0x25: + case 0x26: + case 0x27: + case 0x28: + case 0x29: + case 0x2A: + case 0x2B: + case 0x2C: + case 0x2D: + case 0x2E: + case 0x2F: + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + case 0x3A: + case 0x3B: + case 0x3C: + case 0x3D: + case 0x3E: + case 0x3F: + case 0x40: + case 0x41: + case 0x42: + case 0x43: + case 0x44: + case 0x45: + case 0x46: + case 0x47: + case 0x48: + case 0x49: + case 0x4A: + case 0x4B: + case 0x4C: + case 0x4D: + case 0x4E: + case 0x4F: + case 0x50: + case 0x51: + case 0x52: + case 0x53: + case 0x54: + case 0x55: + case 0x56: + case 0x57: + case 0x58: + case 0x59: + case 0x5A: + case 0x5B: + case 0x5C: + case 0x5D: + case 0x5E: + case 0x5F: + case 0x60: + case 0x61: + case 0x62: + case 0x63: + case 0x64: + case 0x65: + case 0x66: + case 0x67: + case 0x68: + case 0x69: + case 0x6A: + case 0x6B: + case 0x6C: + case 0x6D: + case 0x6E: + case 0x6F: + case 0x70: + case 0x71: + case 0x72: + case 0x73: + case 0x74: + case 0x75: + case 0x76: + case 0x77: + case 0x78: + case 0x79: + case 0x7A: + case 0x7B: + case 0x7C: + case 0x7D: + case 0x7E: + case 0x7F: + return sax->number_unsigned(static_cast(current)); + + // fixmap + case 0x80: 
+ case 0x81: + case 0x82: + case 0x83: + case 0x84: + case 0x85: + case 0x86: + case 0x87: + case 0x88: + case 0x89: + case 0x8A: + case 0x8B: + case 0x8C: + case 0x8D: + case 0x8E: + case 0x8F: + return get_msgpack_object(static_cast(static_cast(current) & 0x0Fu)); + + // fixarray + case 0x90: + case 0x91: + case 0x92: + case 0x93: + case 0x94: + case 0x95: + case 0x96: + case 0x97: + case 0x98: + case 0x99: + case 0x9A: + case 0x9B: + case 0x9C: + case 0x9D: + case 0x9E: + case 0x9F: + return get_msgpack_array(static_cast(static_cast(current) & 0x0Fu)); + + // fixstr + case 0xA0: + case 0xA1: + case 0xA2: + case 0xA3: + case 0xA4: + case 0xA5: + case 0xA6: + case 0xA7: + case 0xA8: + case 0xA9: + case 0xAA: + case 0xAB: + case 0xAC: + case 0xAD: + case 0xAE: + case 0xAF: + case 0xB0: + case 0xB1: + case 0xB2: + case 0xB3: + case 0xB4: + case 0xB5: + case 0xB6: + case 0xB7: + case 0xB8: + case 0xB9: + case 0xBA: + case 0xBB: + case 0xBC: + case 0xBD: + case 0xBE: + case 0xBF: + case 0xD9: // str 8 + case 0xDA: // str 16 + case 0xDB: // str 32 + { + string_t s; + return get_msgpack_string(s) && sax->string(s); + } + + case 0xC0: // nil + return sax->null(); + + case 0xC2: // false + return sax->boolean(false); + + case 0xC3: // true + return sax->boolean(true); + + case 0xC4: // bin 8 + case 0xC5: // bin 16 + case 0xC6: // bin 32 + case 0xC7: // ext 8 + case 0xC8: // ext 16 + case 0xC9: // ext 32 + case 0xD4: // fixext 1 + case 0xD5: // fixext 2 + case 0xD6: // fixext 4 + case 0xD7: // fixext 8 + case 0xD8: // fixext 16 + { + binary_t b; + return get_msgpack_binary(b) && sax->binary(b); + } + + case 0xCA: // float 32 + { + float number{}; + return get_number(input_format_t::msgpack, number) && sax->number_float(static_cast(number), ""); + } + + case 0xCB: // float 64 + { + double number{}; + return get_number(input_format_t::msgpack, number) && sax->number_float(static_cast(number), ""); + } + + case 0xCC: // uint 8 + { + std::uint8_t number{}; + return 
get_number(input_format_t::msgpack, number) && sax->number_unsigned(number); + } + + case 0xCD: // uint 16 + { + std::uint16_t number{}; + return get_number(input_format_t::msgpack, number) && sax->number_unsigned(number); + } + + case 0xCE: // uint 32 + { + std::uint32_t number{}; + return get_number(input_format_t::msgpack, number) && sax->number_unsigned(number); + } + + case 0xCF: // uint 64 + { + std::uint64_t number{}; + return get_number(input_format_t::msgpack, number) && sax->number_unsigned(number); + } + + case 0xD0: // int 8 + { + std::int8_t number{}; + return get_number(input_format_t::msgpack, number) && sax->number_integer(number); + } + + case 0xD1: // int 16 + { + std::int16_t number{}; + return get_number(input_format_t::msgpack, number) && sax->number_integer(number); + } + + case 0xD2: // int 32 + { + std::int32_t number{}; + return get_number(input_format_t::msgpack, number) && sax->number_integer(number); + } + + case 0xD3: // int 64 + { + std::int64_t number{}; + return get_number(input_format_t::msgpack, number) && sax->number_integer(number); + } + + case 0xDC: // array 16 + { + std::uint16_t len{}; + return get_number(input_format_t::msgpack, len) && get_msgpack_array(static_cast(len)); + } + + case 0xDD: // array 32 + { + std::uint32_t len{}; + return get_number(input_format_t::msgpack, len) && get_msgpack_array(static_cast(len)); + } + + case 0xDE: // map 16 + { + std::uint16_t len{}; + return get_number(input_format_t::msgpack, len) && get_msgpack_object(static_cast(len)); + } + + case 0xDF: // map 32 + { + std::uint32_t len{}; + return get_number(input_format_t::msgpack, len) && get_msgpack_object(static_cast(len)); + } + + // negative fixint + case 0xE0: + case 0xE1: + case 0xE2: + case 0xE3: + case 0xE4: + case 0xE5: + case 0xE6: + case 0xE7: + case 0xE8: + case 0xE9: + case 0xEA: + case 0xEB: + case 0xEC: + case 0xED: + case 0xEE: + case 0xEF: + case 0xF0: + case 0xF1: + case 0xF2: + case 0xF3: + case 0xF4: + case 0xF5: + case 
0xF6: + case 0xF7: + case 0xF8: + case 0xF9: + case 0xFA: + case 0xFB: + case 0xFC: + case 0xFD: + case 0xFE: + case 0xFF: + return sax->number_integer(static_cast(current)); + + default: // anything else + { + auto last_token = get_token_string(); + return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, exception_message(input_format_t::msgpack, "invalid byte: 0x" + last_token, "value"))); + } + } + } + + /*! + @brief reads a MessagePack string + + This function first reads starting bytes to determine the expected + string length and then copies this number of bytes into a string. + + @param[out] result created string + + @return whether string creation completed + */ + bool get_msgpack_string(string_t& result) + { + if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::msgpack, "string"))) + { + return false; + } + + switch (current) + { + // fixstr + case 0xA0: + case 0xA1: + case 0xA2: + case 0xA3: + case 0xA4: + case 0xA5: + case 0xA6: + case 0xA7: + case 0xA8: + case 0xA9: + case 0xAA: + case 0xAB: + case 0xAC: + case 0xAD: + case 0xAE: + case 0xAF: + case 0xB0: + case 0xB1: + case 0xB2: + case 0xB3: + case 0xB4: + case 0xB5: + case 0xB6: + case 0xB7: + case 0xB8: + case 0xB9: + case 0xBA: + case 0xBB: + case 0xBC: + case 0xBD: + case 0xBE: + case 0xBF: + { + return get_string(input_format_t::msgpack, static_cast(current) & 0x1Fu, result); + } + + case 0xD9: // str 8 + { + std::uint8_t len{}; + return get_number(input_format_t::msgpack, len) && get_string(input_format_t::msgpack, len, result); + } + + case 0xDA: // str 16 + { + std::uint16_t len{}; + return get_number(input_format_t::msgpack, len) && get_string(input_format_t::msgpack, len, result); + } + + case 0xDB: // str 32 + { + std::uint32_t len{}; + return get_number(input_format_t::msgpack, len) && get_string(input_format_t::msgpack, len, result); + } + + default: + { + auto last_token = get_token_string(); + return sax->parse_error(chars_read, last_token, 
parse_error::create(113, chars_read, exception_message(input_format_t::msgpack, "expected length specification (0xA0-0xBF, 0xD9-0xDB); last byte: 0x" + last_token, "string"))); + } + } + } + + /*! + @brief reads a MessagePack byte array + + This function first reads starting bytes to determine the expected + byte array length and then copies this number of bytes into a byte array. + + @param[out] result created byte array + + @return whether byte array creation completed + */ + bool get_msgpack_binary(binary_t& result) + { + // helper function to set the subtype + auto assign_and_return_true = [&result](std::int8_t subtype) + { + result.set_subtype(static_cast(subtype)); + return true; + }; + + switch (current) + { + case 0xC4: // bin 8 + { + std::uint8_t len{}; + return get_number(input_format_t::msgpack, len) && + get_binary(input_format_t::msgpack, len, result); + } + + case 0xC5: // bin 16 + { + std::uint16_t len{}; + return get_number(input_format_t::msgpack, len) && + get_binary(input_format_t::msgpack, len, result); + } + + case 0xC6: // bin 32 + { + std::uint32_t len{}; + return get_number(input_format_t::msgpack, len) && + get_binary(input_format_t::msgpack, len, result); + } + + case 0xC7: // ext 8 + { + std::uint8_t len{}; + std::int8_t subtype{}; + return get_number(input_format_t::msgpack, len) && + get_number(input_format_t::msgpack, subtype) && + get_binary(input_format_t::msgpack, len, result) && + assign_and_return_true(subtype); + } + + case 0xC8: // ext 16 + { + std::uint16_t len{}; + std::int8_t subtype{}; + return get_number(input_format_t::msgpack, len) && + get_number(input_format_t::msgpack, subtype) && + get_binary(input_format_t::msgpack, len, result) && + assign_and_return_true(subtype); + } + + case 0xC9: // ext 32 + { + std::uint32_t len{}; + std::int8_t subtype{}; + return get_number(input_format_t::msgpack, len) && + get_number(input_format_t::msgpack, subtype) && + get_binary(input_format_t::msgpack, len, result) && + 
assign_and_return_true(subtype); + } + + case 0xD4: // fixext 1 + { + std::int8_t subtype{}; + return get_number(input_format_t::msgpack, subtype) && + get_binary(input_format_t::msgpack, 1, result) && + assign_and_return_true(subtype); + } + + case 0xD5: // fixext 2 + { + std::int8_t subtype{}; + return get_number(input_format_t::msgpack, subtype) && + get_binary(input_format_t::msgpack, 2, result) && + assign_and_return_true(subtype); + } + + case 0xD6: // fixext 4 + { + std::int8_t subtype{}; + return get_number(input_format_t::msgpack, subtype) && + get_binary(input_format_t::msgpack, 4, result) && + assign_and_return_true(subtype); + } + + case 0xD7: // fixext 8 + { + std::int8_t subtype{}; + return get_number(input_format_t::msgpack, subtype) && + get_binary(input_format_t::msgpack, 8, result) && + assign_and_return_true(subtype); + } + + case 0xD8: // fixext 16 + { + std::int8_t subtype{}; + return get_number(input_format_t::msgpack, subtype) && + get_binary(input_format_t::msgpack, 16, result) && + assign_and_return_true(subtype); + } + + default: // LCOV_EXCL_LINE + return false; // LCOV_EXCL_LINE + } + } + + /*! + @param[in] len the length of the array + @return whether array creation completed + */ + bool get_msgpack_array(const std::size_t len) + { + if (JSON_HEDLEY_UNLIKELY(!sax->start_array(len))) + { + return false; + } + + for (std::size_t i = 0; i < len; ++i) + { + if (JSON_HEDLEY_UNLIKELY(!parse_msgpack_internal())) + { + return false; + } + } + + return sax->end_array(); + } + + /*! 
+ @param[in] len the length of the object + @return whether object creation completed + */ + bool get_msgpack_object(const std::size_t len) + { + if (JSON_HEDLEY_UNLIKELY(!sax->start_object(len))) + { + return false; + } + + string_t key; + for (std::size_t i = 0; i < len; ++i) + { + get(); + if (JSON_HEDLEY_UNLIKELY(!get_msgpack_string(key) || !sax->key(key))) + { + return false; + } + + if (JSON_HEDLEY_UNLIKELY(!parse_msgpack_internal())) + { + return false; + } + key.clear(); + } + + return sax->end_object(); + } + + //////////// + // UBJSON // + //////////// + + /*! + @param[in] get_char whether a new character should be retrieved from the + input (true, default) or whether the last read + character should be considered instead + + @return whether a valid UBJSON value was passed to the SAX parser + */ + bool parse_ubjson_internal(const bool get_char = true) + { + return get_ubjson_value(get_char ? get_ignore_noop() : current); + } + + /*! + @brief reads a UBJSON string + + This function is either called after reading the 'S' byte explicitly + indicating a string, or in case of an object key where the 'S' byte can be + left out. + + @param[out] result created string + @param[in] get_char whether a new character should be retrieved from the + input (true, default) or whether the last read + character should be considered instead + + @return whether string creation completed + */ + bool get_ubjson_string(string_t& result, const bool get_char = true) + { + if (get_char) + { + get(); // TODO(niels): may we ignore N here? 
+ } + + if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::ubjson, "value"))) + { + return false; + } + + switch (current) + { + case 'U': + { + std::uint8_t len{}; + return get_number(input_format_t::ubjson, len) && get_string(input_format_t::ubjson, len, result); + } + + case 'i': + { + std::int8_t len{}; + return get_number(input_format_t::ubjson, len) && get_string(input_format_t::ubjson, len, result); + } + + case 'I': + { + std::int16_t len{}; + return get_number(input_format_t::ubjson, len) && get_string(input_format_t::ubjson, len, result); + } + + case 'l': + { + std::int32_t len{}; + return get_number(input_format_t::ubjson, len) && get_string(input_format_t::ubjson, len, result); + } + + case 'L': + { + std::int64_t len{}; + return get_number(input_format_t::ubjson, len) && get_string(input_format_t::ubjson, len, result); + } + + default: + auto last_token = get_token_string(); + return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, exception_message(input_format_t::ubjson, "expected length type specification (U, i, I, l, L); last byte: 0x" + last_token, "string"))); + } + } + + /*! 
+ @param[out] result determined size + @return whether size determination completed + */ + bool get_ubjson_size_value(std::size_t& result) + { + switch (get_ignore_noop()) + { + case 'U': + { + std::uint8_t number{}; + if (JSON_HEDLEY_UNLIKELY(!get_number(input_format_t::ubjson, number))) + { + return false; + } + result = static_cast(number); + return true; + } + + case 'i': + { + std::int8_t number{}; + if (JSON_HEDLEY_UNLIKELY(!get_number(input_format_t::ubjson, number))) + { + return false; + } + result = static_cast(number); + return true; + } + + case 'I': + { + std::int16_t number{}; + if (JSON_HEDLEY_UNLIKELY(!get_number(input_format_t::ubjson, number))) + { + return false; + } + result = static_cast(number); + return true; + } + + case 'l': + { + std::int32_t number{}; + if (JSON_HEDLEY_UNLIKELY(!get_number(input_format_t::ubjson, number))) + { + return false; + } + result = static_cast(number); + return true; + } + + case 'L': + { + std::int64_t number{}; + if (JSON_HEDLEY_UNLIKELY(!get_number(input_format_t::ubjson, number))) + { + return false; + } + result = static_cast(number); + return true; + } + + default: + { + auto last_token = get_token_string(); + return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, exception_message(input_format_t::ubjson, "expected length type specification (U, i, I, l, L) after '#'; last byte: 0x" + last_token, "size"))); + } + } + } + + /*! + @brief determine the type and size for a container + + In the optimized UBJSON format, a type and a size can be provided to allow + for a more compact representation. 
+ + @param[out] result pair of the size and the type + + @return whether pair creation completed + */ + bool get_ubjson_size_type(std::pair& result) + { + result.first = string_t::npos; // size + result.second = 0; // type + + get_ignore_noop(); + + if (current == '$') + { + result.second = get(); // must not ignore 'N', because 'N' maybe the type + if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::ubjson, "type"))) + { + return false; + } + + get_ignore_noop(); + if (JSON_HEDLEY_UNLIKELY(current != '#')) + { + if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::ubjson, "value"))) + { + return false; + } + auto last_token = get_token_string(); + return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, exception_message(input_format_t::ubjson, "expected '#' after type information; last byte: 0x" + last_token, "size"))); + } + + return get_ubjson_size_value(result.first); + } + + if (current == '#') + { + return get_ubjson_size_value(result.first); + } + + return true; + } + + /*! 
+ @param prefix the previously read or set type prefix + @return whether value creation completed + */ + bool get_ubjson_value(const char_int_type prefix) + { + switch (prefix) + { + case std::char_traits::eof(): // EOF + return unexpect_eof(input_format_t::ubjson, "value"); + + case 'T': // true + return sax->boolean(true); + case 'F': // false + return sax->boolean(false); + + case 'Z': // null + return sax->null(); + + case 'U': + { + std::uint8_t number{}; + return get_number(input_format_t::ubjson, number) && sax->number_unsigned(number); + } + + case 'i': + { + std::int8_t number{}; + return get_number(input_format_t::ubjson, number) && sax->number_integer(number); + } + + case 'I': + { + std::int16_t number{}; + return get_number(input_format_t::ubjson, number) && sax->number_integer(number); + } + + case 'l': + { + std::int32_t number{}; + return get_number(input_format_t::ubjson, number) && sax->number_integer(number); + } + + case 'L': + { + std::int64_t number{}; + return get_number(input_format_t::ubjson, number) && sax->number_integer(number); + } + + case 'd': + { + float number{}; + return get_number(input_format_t::ubjson, number) && sax->number_float(static_cast(number), ""); + } + + case 'D': + { + double number{}; + return get_number(input_format_t::ubjson, number) && sax->number_float(static_cast(number), ""); + } + + case 'H': + { + return get_ubjson_high_precision_number(); + } + + case 'C': // char + { + get(); + if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::ubjson, "char"))) + { + return false; + } + if (JSON_HEDLEY_UNLIKELY(current > 127)) + { + auto last_token = get_token_string(); + return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, exception_message(input_format_t::ubjson, "byte after 'C' must be in range 0x00..0x7F; last byte: 0x" + last_token, "char"))); + } + string_t s(1, static_cast(current)); + return sax->string(s); + } + + case 'S': // string + { + string_t s; + return 
get_ubjson_string(s) && sax->string(s); + } + + case '[': // array + return get_ubjson_array(); + + case '{': // object + return get_ubjson_object(); + + default: // anything else + { + auto last_token = get_token_string(); + return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, exception_message(input_format_t::ubjson, "invalid byte: 0x" + last_token, "value"))); + } + } + } + + /*! + @return whether array creation completed + */ + bool get_ubjson_array() + { + std::pair size_and_type; + if (JSON_HEDLEY_UNLIKELY(!get_ubjson_size_type(size_and_type))) + { + return false; + } + + if (size_and_type.first != string_t::npos) + { + if (JSON_HEDLEY_UNLIKELY(!sax->start_array(size_and_type.first))) + { + return false; + } + + if (size_and_type.second != 0) + { + if (size_and_type.second != 'N') + { + for (std::size_t i = 0; i < size_and_type.first; ++i) + { + if (JSON_HEDLEY_UNLIKELY(!get_ubjson_value(size_and_type.second))) + { + return false; + } + } + } + } + else + { + for (std::size_t i = 0; i < size_and_type.first; ++i) + { + if (JSON_HEDLEY_UNLIKELY(!parse_ubjson_internal())) + { + return false; + } + } + } + } + else + { + if (JSON_HEDLEY_UNLIKELY(!sax->start_array(std::size_t(-1)))) + { + return false; + } + + while (current != ']') + { + if (JSON_HEDLEY_UNLIKELY(!parse_ubjson_internal(false))) + { + return false; + } + get_ignore_noop(); + } + } + + return sax->end_array(); + } + + /*! 
+ @return whether object creation completed + */ + bool get_ubjson_object() + { + std::pair size_and_type; + if (JSON_HEDLEY_UNLIKELY(!get_ubjson_size_type(size_and_type))) + { + return false; + } + + string_t key; + if (size_and_type.first != string_t::npos) + { + if (JSON_HEDLEY_UNLIKELY(!sax->start_object(size_and_type.first))) + { + return false; + } + + if (size_and_type.second != 0) + { + for (std::size_t i = 0; i < size_and_type.first; ++i) + { + if (JSON_HEDLEY_UNLIKELY(!get_ubjson_string(key) || !sax->key(key))) + { + return false; + } + if (JSON_HEDLEY_UNLIKELY(!get_ubjson_value(size_and_type.second))) + { + return false; + } + key.clear(); + } + } + else + { + for (std::size_t i = 0; i < size_and_type.first; ++i) + { + if (JSON_HEDLEY_UNLIKELY(!get_ubjson_string(key) || !sax->key(key))) + { + return false; + } + if (JSON_HEDLEY_UNLIKELY(!parse_ubjson_internal())) + { + return false; + } + key.clear(); + } + } + } + else + { + if (JSON_HEDLEY_UNLIKELY(!sax->start_object(std::size_t(-1)))) + { + return false; + } + + while (current != '}') + { + if (JSON_HEDLEY_UNLIKELY(!get_ubjson_string(key, false) || !sax->key(key))) + { + return false; + } + if (JSON_HEDLEY_UNLIKELY(!parse_ubjson_internal())) + { + return false; + } + get_ignore_noop(); + key.clear(); + } + } + + return sax->end_object(); + } + + // Note, no reader for UBJSON binary types is implemented because they do + // not exist + + bool get_ubjson_high_precision_number() + { + // get size of following number string + std::size_t size{}; + auto res = get_ubjson_size_value(size); + if (JSON_HEDLEY_UNLIKELY(!res)) + { + return res; + } + + // get number string + std::vector number_vector; + for (std::size_t i = 0; i < size; ++i) + { + get(); + if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::ubjson, "number"))) + { + return false; + } + number_vector.push_back(static_cast(current)); + } + + // parse number string + auto number_ia = detail::input_adapter(std::forward(number_vector)); + auto 
number_lexer = detail::lexer(std::move(number_ia), false); + const auto result_number = number_lexer.scan(); + const auto number_string = number_lexer.get_token_string(); + const auto result_remainder = number_lexer.scan(); + + using token_type = typename detail::lexer_base::token_type; + + if (JSON_HEDLEY_UNLIKELY(result_remainder != token_type::end_of_input)) + { + return sax->parse_error(chars_read, number_string, parse_error::create(115, chars_read, exception_message(input_format_t::ubjson, "invalid number text: " + number_lexer.get_token_string(), "high-precision number"))); + } + + switch (result_number) + { + case token_type::value_integer: + return sax->number_integer(number_lexer.get_number_integer()); + case token_type::value_unsigned: + return sax->number_unsigned(number_lexer.get_number_unsigned()); + case token_type::value_float: + return sax->number_float(number_lexer.get_number_float(), std::move(number_string)); + default: + return sax->parse_error(chars_read, number_string, parse_error::create(115, chars_read, exception_message(input_format_t::ubjson, "invalid number text: " + number_lexer.get_token_string(), "high-precision number"))); + } + } + + /////////////////////// + // Utility functions // + /////////////////////// + + /*! + @brief get next character from the input + + This function provides the interface to the used input adapter. It does + not throw in case the input reached EOF, but returns a -'ve valued + `std::char_traits::eof()` in that case. + + @return character read from the input + */ + char_int_type get() + { + ++chars_read; + return current = ia.get_character(); + } + + /*! 
+ @return character read from the input after ignoring all 'N' entries + */ + char_int_type get_ignore_noop() + { + do + { + get(); + } + while (current == 'N'); + + return current; + } + + /* + @brief read a number from the input + + @tparam NumberType the type of the number + @param[in] format the current format (for diagnostics) + @param[out] result number of type @a NumberType + + @return whether conversion completed + + @note This function needs to respect the system's endianess, because + bytes in CBOR, MessagePack, and UBJSON are stored in network order + (big endian) and therefore need reordering on little endian systems. + */ + template + bool get_number(const input_format_t format, NumberType& result) + { + // step 1: read input into array with system's byte order + std::array vec; + for (std::size_t i = 0; i < sizeof(NumberType); ++i) + { + get(); + if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(format, "number"))) + { + return false; + } + + // reverse byte order prior to conversion if necessary + if (is_little_endian != InputIsLittleEndian) + { + vec[sizeof(NumberType) - i - 1] = static_cast(current); + } + else + { + vec[i] = static_cast(current); // LCOV_EXCL_LINE + } + } + + // step 2: convert array into number of type T and return + std::memcpy(&result, vec.data(), sizeof(NumberType)); + return true; + } + + /*! + @brief create a string by reading characters from the input + + @tparam NumberType the type of the number + @param[in] format the current format (for diagnostics) + @param[in] len number of characters to read + @param[out] result string created by reading @a len bytes + + @return whether string creation completed + + @note We can not reserve @a len bytes for the result, because @a len + may be too large. Usually, @ref unexpect_eof() detects the end of + the input before we run out of string memory. 
+ */ + template + bool get_string(const input_format_t format, + const NumberType len, + string_t& result) + { + bool success = true; + for (NumberType i = 0; i < len; i++) + { + get(); + if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(format, "string"))) + { + success = false; + break; + } + result.push_back(static_cast(current)); + }; + return success; + } + + /*! + @brief create a byte array by reading bytes from the input + + @tparam NumberType the type of the number + @param[in] format the current format (for diagnostics) + @param[in] len number of bytes to read + @param[out] result byte array created by reading @a len bytes + + @return whether byte array creation completed + + @note We can not reserve @a len bytes for the result, because @a len + may be too large. Usually, @ref unexpect_eof() detects the end of + the input before we run out of memory. + */ + template + bool get_binary(const input_format_t format, + const NumberType len, + binary_t& result) + { + bool success = true; + for (NumberType i = 0; i < len; i++) + { + get(); + if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(format, "binary"))) + { + success = false; + break; + } + result.push_back(static_cast(current)); + } + return success; + } + + /*! + @param[in] format the current format (for diagnostics) + @param[in] context further context information (for diagnostics) + @return whether the last read character is not EOF + */ + JSON_HEDLEY_NON_NULL(3) + bool unexpect_eof(const input_format_t format, const char* context) const + { + if (JSON_HEDLEY_UNLIKELY(current == std::char_traits::eof())) + { + return sax->parse_error(chars_read, "", + parse_error::create(110, chars_read, exception_message(format, "unexpected end of input", context))); + } + return true; + } + + /*! + @return a string representation of the last read byte + */ + std::string get_token_string() const + { + std::array cr{{}}; + (std::snprintf)(cr.data(), cr.size(), "%.2hhX", static_cast(current)); + return std::string{cr.data()}; + } + + /*! 
+ @param[in] format the current format + @param[in] detail a detailed error message + @param[in] context further context information + @return a message string to use in the parse_error exceptions + */ + std::string exception_message(const input_format_t format, + const std::string& detail, + const std::string& context) const + { + std::string error_msg = "syntax error while parsing "; + + switch (format) + { + case input_format_t::cbor: + error_msg += "CBOR"; + break; + + case input_format_t::msgpack: + error_msg += "MessagePack"; + break; + + case input_format_t::ubjson: + error_msg += "UBJSON"; + break; + + case input_format_t::bson: + error_msg += "BSON"; + break; + + default: // LCOV_EXCL_LINE + JSON_ASSERT(false); // LCOV_EXCL_LINE + } + + return error_msg + " " + context + ": " + detail; + } + + private: + /// input adapter + InputAdapterType ia; + + /// the current character + char_int_type current = std::char_traits::eof(); + + /// the number of characters read + std::size_t chars_read = 0; + + /// whether we can assume little endianess + const bool is_little_endian = little_endianess(); + + /// the SAX parser + json_sax_t* sax = nullptr; +}; +} // namespace detail +} // namespace nlohmann + +// #include + +// #include + +// #include + + +#include // isfinite +#include // uint8_t +#include // function +#include // string +#include // move +#include // vector + +// #include + +// #include + +// #include + +// #include + +// #include + +// #include + +// #include + + +namespace nlohmann +{ +namespace detail +{ +//////////// +// parser // +//////////// + +enum class parse_event_t : uint8_t +{ + /// the parser read `{` and started to process a JSON object + object_start, + /// the parser read `}` and finished processing a JSON object + object_end, + /// the parser read `[` and started to process a JSON array + array_start, + /// the parser read `]` and finished processing a JSON array + array_end, + /// the parser read a key of a value in an object + key, + 
/// the parser finished reading a JSON value + value +}; + +template +using parser_callback_t = + std::function; + +/*! +@brief syntax analysis + +This class implements a recursive descent parser. +*/ +template +class parser +{ + using number_integer_t = typename BasicJsonType::number_integer_t; + using number_unsigned_t = typename BasicJsonType::number_unsigned_t; + using number_float_t = typename BasicJsonType::number_float_t; + using string_t = typename BasicJsonType::string_t; + using lexer_t = lexer; + using token_type = typename lexer_t::token_type; + + public: + /// a parser reading from an input adapter + explicit parser(InputAdapterType&& adapter, + const parser_callback_t cb = nullptr, + const bool allow_exceptions_ = true, + const bool skip_comments = false) + : callback(cb) + , m_lexer(std::move(adapter), skip_comments) + , allow_exceptions(allow_exceptions_) + { + // read first token + get_token(); + } + + /*! + @brief public parser interface + + @param[in] strict whether to expect the last token to be EOF + @param[in,out] result parsed JSON value + + @throw parse_error.101 in case of an unexpected token + @throw parse_error.102 if to_unicode fails or surrogate error + @throw parse_error.103 if to_unicode fails + */ + void parse(const bool strict, BasicJsonType& result) + { + if (callback) + { + json_sax_dom_callback_parser sdp(result, callback, allow_exceptions); + sax_parse_internal(&sdp); + result.assert_invariant(); + + // in strict mode, input must be completely read + if (strict && (get_token() != token_type::end_of_input)) + { + sdp.parse_error(m_lexer.get_position(), + m_lexer.get_token_string(), + parse_error::create(101, m_lexer.get_position(), + exception_message(token_type::end_of_input, "value"))); + } + + // in case of an error, return discarded value + if (sdp.is_errored()) + { + result = value_t::discarded; + return; + } + + // set top-level value to null if it was discarded by the callback + // function + if (result.is_discarded()) + { 
+ result = nullptr; + } + } + else + { + json_sax_dom_parser sdp(result, allow_exceptions); + sax_parse_internal(&sdp); + result.assert_invariant(); + + // in strict mode, input must be completely read + if (strict && (get_token() != token_type::end_of_input)) + { + sdp.parse_error(m_lexer.get_position(), + m_lexer.get_token_string(), + parse_error::create(101, m_lexer.get_position(), + exception_message(token_type::end_of_input, "value"))); + } + + // in case of an error, return discarded value + if (sdp.is_errored()) + { + result = value_t::discarded; + return; + } + } + } + + /*! + @brief public accept interface + + @param[in] strict whether to expect the last token to be EOF + @return whether the input is a proper JSON text + */ + bool accept(const bool strict = true) + { + json_sax_acceptor sax_acceptor; + return sax_parse(&sax_acceptor, strict); + } + + template + JSON_HEDLEY_NON_NULL(2) + bool sax_parse(SAX* sax, const bool strict = true) + { + (void)detail::is_sax_static_asserts {}; + const bool result = sax_parse_internal(sax); + + // strict mode: next byte must be EOF + if (result && strict && (get_token() != token_type::end_of_input)) + { + return sax->parse_error(m_lexer.get_position(), + m_lexer.get_token_string(), + parse_error::create(101, m_lexer.get_position(), + exception_message(token_type::end_of_input, "value"))); + } + + return result; + } + + private: + template + JSON_HEDLEY_NON_NULL(2) + bool sax_parse_internal(SAX* sax) + { + // stack to remember the hierarchy of structured values we are parsing + // true = array; false = object + std::vector states; + // value to avoid a goto (see comment where set to true) + bool skip_to_state_evaluation = false; + + while (true) + { + if (!skip_to_state_evaluation) + { + // invariant: get_token() was called before each iteration + switch (last_token) + { + case token_type::begin_object: + { + if (JSON_HEDLEY_UNLIKELY(!sax->start_object(std::size_t(-1)))) + { + return false; + } + + // closing } -> we 
are done + if (get_token() == token_type::end_object) + { + if (JSON_HEDLEY_UNLIKELY(!sax->end_object())) + { + return false; + } + break; + } + + // parse key + if (JSON_HEDLEY_UNLIKELY(last_token != token_type::value_string)) + { + return sax->parse_error(m_lexer.get_position(), + m_lexer.get_token_string(), + parse_error::create(101, m_lexer.get_position(), + exception_message(token_type::value_string, "object key"))); + } + if (JSON_HEDLEY_UNLIKELY(!sax->key(m_lexer.get_string()))) + { + return false; + } + + // parse separator (:) + if (JSON_HEDLEY_UNLIKELY(get_token() != token_type::name_separator)) + { + return sax->parse_error(m_lexer.get_position(), + m_lexer.get_token_string(), + parse_error::create(101, m_lexer.get_position(), + exception_message(token_type::name_separator, "object separator"))); + } + + // remember we are now inside an object + states.push_back(false); + + // parse values + get_token(); + continue; + } + + case token_type::begin_array: + { + if (JSON_HEDLEY_UNLIKELY(!sax->start_array(std::size_t(-1)))) + { + return false; + } + + // closing ] -> we are done + if (get_token() == token_type::end_array) + { + if (JSON_HEDLEY_UNLIKELY(!sax->end_array())) + { + return false; + } + break; + } + + // remember we are now inside an array + states.push_back(true); + + // parse values (no need to call get_token) + continue; + } + + case token_type::value_float: + { + const auto res = m_lexer.get_number_float(); + + if (JSON_HEDLEY_UNLIKELY(!std::isfinite(res))) + { + return sax->parse_error(m_lexer.get_position(), + m_lexer.get_token_string(), + out_of_range::create(406, "number overflow parsing '" + m_lexer.get_token_string() + "'")); + } + + if (JSON_HEDLEY_UNLIKELY(!sax->number_float(res, m_lexer.get_string()))) + { + return false; + } + + break; + } + + case token_type::literal_false: + { + if (JSON_HEDLEY_UNLIKELY(!sax->boolean(false))) + { + return false; + } + break; + } + + case token_type::literal_null: + { + if 
(JSON_HEDLEY_UNLIKELY(!sax->null())) + { + return false; + } + break; + } + + case token_type::literal_true: + { + if (JSON_HEDLEY_UNLIKELY(!sax->boolean(true))) + { + return false; + } + break; + } + + case token_type::value_integer: + { + if (JSON_HEDLEY_UNLIKELY(!sax->number_integer(m_lexer.get_number_integer()))) + { + return false; + } + break; + } + + case token_type::value_string: + { + if (JSON_HEDLEY_UNLIKELY(!sax->string(m_lexer.get_string()))) + { + return false; + } + break; + } + + case token_type::value_unsigned: + { + if (JSON_HEDLEY_UNLIKELY(!sax->number_unsigned(m_lexer.get_number_unsigned()))) + { + return false; + } + break; + } + + case token_type::parse_error: + { + // using "uninitialized" to avoid "expected" message + return sax->parse_error(m_lexer.get_position(), + m_lexer.get_token_string(), + parse_error::create(101, m_lexer.get_position(), + exception_message(token_type::uninitialized, "value"))); + } + + default: // the last token was unexpected + { + return sax->parse_error(m_lexer.get_position(), + m_lexer.get_token_string(), + parse_error::create(101, m_lexer.get_position(), + exception_message(token_type::literal_or_value, "value"))); + } + } + } + else + { + skip_to_state_evaluation = false; + } + + // we reached this line after we successfully parsed a value + if (states.empty()) + { + // empty stack: we reached the end of the hierarchy: done + return true; + } + + if (states.back()) // array + { + // comma -> next value + if (get_token() == token_type::value_separator) + { + // parse a new value + get_token(); + continue; + } + + // closing ] + if (JSON_HEDLEY_LIKELY(last_token == token_type::end_array)) + { + if (JSON_HEDLEY_UNLIKELY(!sax->end_array())) + { + return false; + } + + // We are done with this array. Before we can parse a + // new value, we need to evaluate the new state first. + // By setting skip_to_state_evaluation to false, we + // are effectively jumping to the beginning of this if. 
+ JSON_ASSERT(!states.empty()); + states.pop_back(); + skip_to_state_evaluation = true; + continue; + } + + return sax->parse_error(m_lexer.get_position(), + m_lexer.get_token_string(), + parse_error::create(101, m_lexer.get_position(), + exception_message(token_type::end_array, "array"))); + } + else // object + { + // comma -> next value + if (get_token() == token_type::value_separator) + { + // parse key + if (JSON_HEDLEY_UNLIKELY(get_token() != token_type::value_string)) + { + return sax->parse_error(m_lexer.get_position(), + m_lexer.get_token_string(), + parse_error::create(101, m_lexer.get_position(), + exception_message(token_type::value_string, "object key"))); + } + + if (JSON_HEDLEY_UNLIKELY(!sax->key(m_lexer.get_string()))) + { + return false; + } + + // parse separator (:) + if (JSON_HEDLEY_UNLIKELY(get_token() != token_type::name_separator)) + { + return sax->parse_error(m_lexer.get_position(), + m_lexer.get_token_string(), + parse_error::create(101, m_lexer.get_position(), + exception_message(token_type::name_separator, "object separator"))); + } + + // parse values + get_token(); + continue; + } + + // closing } + if (JSON_HEDLEY_LIKELY(last_token == token_type::end_object)) + { + if (JSON_HEDLEY_UNLIKELY(!sax->end_object())) + { + return false; + } + + // We are done with this object. Before we can parse a + // new value, we need to evaluate the new state first. + // By setting skip_to_state_evaluation to false, we + // are effectively jumping to the beginning of this if. 
+ JSON_ASSERT(!states.empty()); + states.pop_back(); + skip_to_state_evaluation = true; + continue; + } + + return sax->parse_error(m_lexer.get_position(), + m_lexer.get_token_string(), + parse_error::create(101, m_lexer.get_position(), + exception_message(token_type::end_object, "object"))); + } + } + } + + /// get next token from lexer + token_type get_token() + { + return last_token = m_lexer.scan(); + } + + std::string exception_message(const token_type expected, const std::string& context) + { + std::string error_msg = "syntax error "; + + if (!context.empty()) + { + error_msg += "while parsing " + context + " "; + } + + error_msg += "- "; + + if (last_token == token_type::parse_error) + { + error_msg += std::string(m_lexer.get_error_message()) + "; last read: '" + + m_lexer.get_token_string() + "'"; + } + else + { + error_msg += "unexpected " + std::string(lexer_t::token_type_name(last_token)); + } + + if (expected != token_type::uninitialized) + { + error_msg += "; expected " + std::string(lexer_t::token_type_name(expected)); + } + + return error_msg; + } + + private: + /// callback function + const parser_callback_t callback = nullptr; + /// the type of the last read token + token_type last_token = token_type::uninitialized; + /// the lexer + lexer_t m_lexer; + /// whether to throw exceptions in case of errors + const bool allow_exceptions = true; +}; +} // namespace detail +} // namespace nlohmann + +// #include + + +// #include + + +#include // ptrdiff_t +#include // numeric_limits + +namespace nlohmann +{ +namespace detail +{ +/* +@brief an iterator for primitive JSON types + +This class models an iterator for primitive JSON types (boolean, number, +string). It's only purpose is to allow the iterator/const_iterator classes +to "iterate" over primitive values. Internally, the iterator is modeled by +a `difference_type` variable. Value begin_value (`0`) models the begin, +end_value (`1`) models past the end. 
+*/ +class primitive_iterator_t +{ + private: + using difference_type = std::ptrdiff_t; + static constexpr difference_type begin_value = 0; + static constexpr difference_type end_value = begin_value + 1; + + /// iterator as signed integer type + difference_type m_it = (std::numeric_limits::min)(); + + public: + constexpr difference_type get_value() const noexcept + { + return m_it; + } + + /// set iterator to a defined beginning + void set_begin() noexcept + { + m_it = begin_value; + } + + /// set iterator to a defined past the end + void set_end() noexcept + { + m_it = end_value; + } + + /// return whether the iterator can be dereferenced + constexpr bool is_begin() const noexcept + { + return m_it == begin_value; + } + + /// return whether the iterator is at end + constexpr bool is_end() const noexcept + { + return m_it == end_value; + } + + friend constexpr bool operator==(primitive_iterator_t lhs, primitive_iterator_t rhs) noexcept + { + return lhs.m_it == rhs.m_it; + } + + friend constexpr bool operator<(primitive_iterator_t lhs, primitive_iterator_t rhs) noexcept + { + return lhs.m_it < rhs.m_it; + } + + primitive_iterator_t operator+(difference_type n) noexcept + { + auto result = *this; + result += n; + return result; + } + + friend constexpr difference_type operator-(primitive_iterator_t lhs, primitive_iterator_t rhs) noexcept + { + return lhs.m_it - rhs.m_it; + } + + primitive_iterator_t& operator++() noexcept + { + ++m_it; + return *this; + } + + primitive_iterator_t const operator++(int) noexcept + { + auto result = *this; + ++m_it; + return result; + } + + primitive_iterator_t& operator--() noexcept + { + --m_it; + return *this; + } + + primitive_iterator_t const operator--(int) noexcept + { + auto result = *this; + --m_it; + return result; + } + + primitive_iterator_t& operator+=(difference_type n) noexcept + { + m_it += n; + return *this; + } + + primitive_iterator_t& operator-=(difference_type n) noexcept + { + m_it -= n; + return *this; + } +}; +} 
// namespace detail +} // namespace nlohmann + + +namespace nlohmann +{ +namespace detail +{ +/*! +@brief an iterator value + +@note This structure could easily be a union, but MSVC currently does not allow +unions members with complex constructors, see https://github.com/nlohmann/json/pull/105. +*/ +template struct internal_iterator +{ + /// iterator for JSON objects + typename BasicJsonType::object_t::iterator object_iterator {}; + /// iterator for JSON arrays + typename BasicJsonType::array_t::iterator array_iterator {}; + /// generic iterator for all other types + primitive_iterator_t primitive_iterator {}; +}; +} // namespace detail +} // namespace nlohmann + +// #include + + +#include // iterator, random_access_iterator_tag, bidirectional_iterator_tag, advance, next +#include // conditional, is_const, remove_const + +// #include + +// #include + +// #include + +// #include + +// #include + +// #include + +// #include + + +namespace nlohmann +{ +namespace detail +{ +// forward declare, to be able to friend it later on +template class iteration_proxy; +template class iteration_proxy_value; + +/*! +@brief a template for a bidirectional iterator for the @ref basic_json class +This class implements a both iterators (iterator and const_iterator) for the +@ref basic_json class. +@note An iterator is called *initialized* when a pointer to a JSON value has + been set (e.g., by a constructor or a copy assignment). If the iterator is + default-constructed, it is *uninitialized* and most methods are undefined. + **The library uses assertions to detect calls on uninitialized iterators.** +@requirement The class satisfies the following concept requirements: +- +[BidirectionalIterator](https://en.cppreference.com/w/cpp/named_req/BidirectionalIterator): + The iterator that can be moved can be moved in both directions (i.e. + incremented and decremented). 
+@since version 1.0.0, simplified in version 2.0.9, change to bidirectional + iterators in version 3.0.0 (see https://github.com/nlohmann/json/issues/593) +*/ +template +class iter_impl +{ + /// allow basic_json to access private members + friend iter_impl::value, typename std::remove_const::type, const BasicJsonType>::type>; + friend BasicJsonType; + friend iteration_proxy; + friend iteration_proxy_value; + + using object_t = typename BasicJsonType::object_t; + using array_t = typename BasicJsonType::array_t; + // make sure BasicJsonType is basic_json or const basic_json + static_assert(is_basic_json::type>::value, + "iter_impl only accepts (const) basic_json"); + + public: + + /// The std::iterator class template (used as a base class to provide typedefs) is deprecated in C++17. + /// The C++ Standard has never required user-defined iterators to derive from std::iterator. + /// A user-defined iterator should provide publicly accessible typedefs named + /// iterator_category, value_type, difference_type, pointer, and reference. + /// Note that value_type is required to be non-const, even for constant iterators. + using iterator_category = std::bidirectional_iterator_tag; + + /// the type of the values when the iterator is dereferenced + using value_type = typename BasicJsonType::value_type; + /// a type to represent differences between iterators + using difference_type = typename BasicJsonType::difference_type; + /// defines a pointer to the type iterated over (value_type) + using pointer = typename std::conditional::value, + typename BasicJsonType::const_pointer, + typename BasicJsonType::pointer>::type; + /// defines a reference to the type iterated over (value_type) + using reference = + typename std::conditional::value, + typename BasicJsonType::const_reference, + typename BasicJsonType::reference>::type; + + /// default constructor + iter_impl() = default; + + /*! 
+ @brief constructor for a given JSON instance + @param[in] object pointer to a JSON object for this iterator + @pre object != nullptr + @post The iterator is initialized; i.e. `m_object != nullptr`. + */ + explicit iter_impl(pointer object) noexcept : m_object(object) + { + JSON_ASSERT(m_object != nullptr); + + switch (m_object->m_type) + { + case value_t::object: + { + m_it.object_iterator = typename object_t::iterator(); + break; + } + + case value_t::array: + { + m_it.array_iterator = typename array_t::iterator(); + break; + } + + default: + { + m_it.primitive_iterator = primitive_iterator_t(); + break; + } + } + } + + /*! + @note The conventional copy constructor and copy assignment are implicitly + defined. Combined with the following converting constructor and + assignment, they support: (1) copy from iterator to iterator, (2) + copy from const iterator to const iterator, and (3) conversion from + iterator to const iterator. However conversion from const iterator + to iterator is not defined. + */ + + /*! + @brief const copy constructor + @param[in] other const iterator to copy from + @note This copy constructor had to be defined explicitly to circumvent a bug + occurring on msvc v19.0 compiler (VS 2015) debug build. For more + information refer to: https://github.com/nlohmann/json/issues/1608 + */ + iter_impl(const iter_impl& other) noexcept + : m_object(other.m_object), m_it(other.m_it) + {} + + /*! + @brief converting assignment + @param[in] other const iterator to copy from + @return const/non-const iterator + @note It is not checked whether @a other is initialized. + */ + iter_impl& operator=(const iter_impl& other) noexcept + { + m_object = other.m_object; + m_it = other.m_it; + return *this; + } + + /*! + @brief converting constructor + @param[in] other non-const iterator to copy from + @note It is not checked whether @a other is initialized. 
+ */ + iter_impl(const iter_impl::type>& other) noexcept + : m_object(other.m_object), m_it(other.m_it) + {} + + /*! + @brief converting assignment + @param[in] other non-const iterator to copy from + @return const/non-const iterator + @note It is not checked whether @a other is initialized. + */ + iter_impl& operator=(const iter_impl::type>& other) noexcept + { + m_object = other.m_object; + m_it = other.m_it; + return *this; + } + + private: + /*! + @brief set the iterator to the first value + @pre The iterator is initialized; i.e. `m_object != nullptr`. + */ + void set_begin() noexcept + { + JSON_ASSERT(m_object != nullptr); + + switch (m_object->m_type) + { + case value_t::object: + { + m_it.object_iterator = m_object->m_value.object->begin(); + break; + } + + case value_t::array: + { + m_it.array_iterator = m_object->m_value.array->begin(); + break; + } + + case value_t::null: + { + // set to end so begin()==end() is true: null is empty + m_it.primitive_iterator.set_end(); + break; + } + + default: + { + m_it.primitive_iterator.set_begin(); + break; + } + } + } + + /*! + @brief set the iterator past the last value + @pre The iterator is initialized; i.e. `m_object != nullptr`. + */ + void set_end() noexcept + { + JSON_ASSERT(m_object != nullptr); + + switch (m_object->m_type) + { + case value_t::object: + { + m_it.object_iterator = m_object->m_value.object->end(); + break; + } + + case value_t::array: + { + m_it.array_iterator = m_object->m_value.array->end(); + break; + } + + default: + { + m_it.primitive_iterator.set_end(); + break; + } + } + } + + public: + /*! + @brief return a reference to the value pointed to by the iterator + @pre The iterator is initialized; i.e. `m_object != nullptr`. 
+ */ + reference operator*() const + { + JSON_ASSERT(m_object != nullptr); + + switch (m_object->m_type) + { + case value_t::object: + { + JSON_ASSERT(m_it.object_iterator != m_object->m_value.object->end()); + return m_it.object_iterator->second; + } + + case value_t::array: + { + JSON_ASSERT(m_it.array_iterator != m_object->m_value.array->end()); + return *m_it.array_iterator; + } + + case value_t::null: + JSON_THROW(invalid_iterator::create(214, "cannot get value")); + + default: + { + if (JSON_HEDLEY_LIKELY(m_it.primitive_iterator.is_begin())) + { + return *m_object; + } + + JSON_THROW(invalid_iterator::create(214, "cannot get value")); + } + } + } + + /*! + @brief dereference the iterator + @pre The iterator is initialized; i.e. `m_object != nullptr`. + */ + pointer operator->() const + { + JSON_ASSERT(m_object != nullptr); + + switch (m_object->m_type) + { + case value_t::object: + { + JSON_ASSERT(m_it.object_iterator != m_object->m_value.object->end()); + return &(m_it.object_iterator->second); + } + + case value_t::array: + { + JSON_ASSERT(m_it.array_iterator != m_object->m_value.array->end()); + return &*m_it.array_iterator; + } + + default: + { + if (JSON_HEDLEY_LIKELY(m_it.primitive_iterator.is_begin())) + { + return m_object; + } + + JSON_THROW(invalid_iterator::create(214, "cannot get value")); + } + } + } + + /*! + @brief post-increment (it++) + @pre The iterator is initialized; i.e. `m_object != nullptr`. + */ + iter_impl const operator++(int) + { + auto result = *this; + ++(*this); + return result; + } + + /*! + @brief pre-increment (++it) + @pre The iterator is initialized; i.e. `m_object != nullptr`. 
+ */ + iter_impl& operator++() + { + JSON_ASSERT(m_object != nullptr); + + switch (m_object->m_type) + { + case value_t::object: + { + std::advance(m_it.object_iterator, 1); + break; + } + + case value_t::array: + { + std::advance(m_it.array_iterator, 1); + break; + } + + default: + { + ++m_it.primitive_iterator; + break; + } + } + + return *this; + } + + /*! + @brief post-decrement (it--) + @pre The iterator is initialized; i.e. `m_object != nullptr`. + */ + iter_impl const operator--(int) + { + auto result = *this; + --(*this); + return result; + } + + /*! + @brief pre-decrement (--it) + @pre The iterator is initialized; i.e. `m_object != nullptr`. + */ + iter_impl& operator--() + { + JSON_ASSERT(m_object != nullptr); + + switch (m_object->m_type) + { + case value_t::object: + { + std::advance(m_it.object_iterator, -1); + break; + } + + case value_t::array: + { + std::advance(m_it.array_iterator, -1); + break; + } + + default: + { + --m_it.primitive_iterator; + break; + } + } + + return *this; + } + + /*! + @brief comparison: equal + @pre The iterator is initialized; i.e. `m_object != nullptr`. + */ + bool operator==(const iter_impl& other) const + { + // if objects are not the same, the comparison is undefined + if (JSON_HEDLEY_UNLIKELY(m_object != other.m_object)) + { + JSON_THROW(invalid_iterator::create(212, "cannot compare iterators of different containers")); + } + + JSON_ASSERT(m_object != nullptr); + + switch (m_object->m_type) + { + case value_t::object: + return (m_it.object_iterator == other.m_it.object_iterator); + + case value_t::array: + return (m_it.array_iterator == other.m_it.array_iterator); + + default: + return (m_it.primitive_iterator == other.m_it.primitive_iterator); + } + } + + /*! + @brief comparison: not equal + @pre The iterator is initialized; i.e. `m_object != nullptr`. + */ + bool operator!=(const iter_impl& other) const + { + return !operator==(other); + } + + /*! + @brief comparison: smaller + @pre The iterator is initialized; i.e. 
`m_object != nullptr`. + */ + bool operator<(const iter_impl& other) const + { + // if objects are not the same, the comparison is undefined + if (JSON_HEDLEY_UNLIKELY(m_object != other.m_object)) + { + JSON_THROW(invalid_iterator::create(212, "cannot compare iterators of different containers")); + } + + JSON_ASSERT(m_object != nullptr); + + switch (m_object->m_type) + { + case value_t::object: + JSON_THROW(invalid_iterator::create(213, "cannot compare order of object iterators")); + + case value_t::array: + return (m_it.array_iterator < other.m_it.array_iterator); + + default: + return (m_it.primitive_iterator < other.m_it.primitive_iterator); + } + } + + /*! + @brief comparison: less than or equal + @pre The iterator is initialized; i.e. `m_object != nullptr`. + */ + bool operator<=(const iter_impl& other) const + { + return !other.operator < (*this); + } + + /*! + @brief comparison: greater than + @pre The iterator is initialized; i.e. `m_object != nullptr`. + */ + bool operator>(const iter_impl& other) const + { + return !operator<=(other); + } + + /*! + @brief comparison: greater than or equal + @pre The iterator is initialized; i.e. `m_object != nullptr`. + */ + bool operator>=(const iter_impl& other) const + { + return !operator<(other); + } + + /*! + @brief add to iterator + @pre The iterator is initialized; i.e. `m_object != nullptr`. + */ + iter_impl& operator+=(difference_type i) + { + JSON_ASSERT(m_object != nullptr); + + switch (m_object->m_type) + { + case value_t::object: + JSON_THROW(invalid_iterator::create(209, "cannot use offsets with object iterators")); + + case value_t::array: + { + std::advance(m_it.array_iterator, i); + break; + } + + default: + { + m_it.primitive_iterator += i; + break; + } + } + + return *this; + } + + /*! + @brief subtract from iterator + @pre The iterator is initialized; i.e. `m_object != nullptr`. + */ + iter_impl& operator-=(difference_type i) + { + return operator+=(-i); + } + + /*! 
+ @brief add to iterator + @pre The iterator is initialized; i.e. `m_object != nullptr`. + */ + iter_impl operator+(difference_type i) const + { + auto result = *this; + result += i; + return result; + } + + /*! + @brief addition of distance and iterator + @pre The iterator is initialized; i.e. `m_object != nullptr`. + */ + friend iter_impl operator+(difference_type i, const iter_impl& it) + { + auto result = it; + result += i; + return result; + } + + /*! + @brief subtract from iterator + @pre The iterator is initialized; i.e. `m_object != nullptr`. + */ + iter_impl operator-(difference_type i) const + { + auto result = *this; + result -= i; + return result; + } + + /*! + @brief return difference + @pre The iterator is initialized; i.e. `m_object != nullptr`. + */ + difference_type operator-(const iter_impl& other) const + { + JSON_ASSERT(m_object != nullptr); + + switch (m_object->m_type) + { + case value_t::object: + JSON_THROW(invalid_iterator::create(209, "cannot use offsets with object iterators")); + + case value_t::array: + return m_it.array_iterator - other.m_it.array_iterator; + + default: + return m_it.primitive_iterator - other.m_it.primitive_iterator; + } + } + + /*! + @brief access to successor + @pre The iterator is initialized; i.e. `m_object != nullptr`. + */ + reference operator[](difference_type n) const + { + JSON_ASSERT(m_object != nullptr); + + switch (m_object->m_type) + { + case value_t::object: + JSON_THROW(invalid_iterator::create(208, "cannot use operator[] for object iterators")); + + case value_t::array: + return *std::next(m_it.array_iterator, n); + + case value_t::null: + JSON_THROW(invalid_iterator::create(214, "cannot get value")); + + default: + { + if (JSON_HEDLEY_LIKELY(m_it.primitive_iterator.get_value() == -n)) + { + return *m_object; + } + + JSON_THROW(invalid_iterator::create(214, "cannot get value")); + } + } + } + + /*! + @brief return the key of an object iterator + @pre The iterator is initialized; i.e. 
`m_object != nullptr`. + */ + const typename object_t::key_type& key() const + { + JSON_ASSERT(m_object != nullptr); + + if (JSON_HEDLEY_LIKELY(m_object->is_object())) + { + return m_it.object_iterator->first; + } + + JSON_THROW(invalid_iterator::create(207, "cannot use key() for non-object iterators")); + } + + /*! + @brief return the value of an iterator + @pre The iterator is initialized; i.e. `m_object != nullptr`. + */ + reference value() const + { + return operator*(); + } + + private: + /// associated JSON instance + pointer m_object = nullptr; + /// the actual iterator of the associated instance + internal_iterator::type> m_it {}; +}; +} // namespace detail +} // namespace nlohmann + +// #include + +// #include + + +#include // ptrdiff_t +#include // reverse_iterator +#include // declval + +namespace nlohmann +{ +namespace detail +{ +////////////////////// +// reverse_iterator // +////////////////////// + +/*! +@brief a template for a reverse iterator class + +@tparam Base the base iterator type to reverse. Valid types are @ref +iterator (to create @ref reverse_iterator) and @ref const_iterator (to +create @ref const_reverse_iterator). + +@requirement The class satisfies the following concept requirements: +- +[BidirectionalIterator](https://en.cppreference.com/w/cpp/named_req/BidirectionalIterator): + The iterator that can be moved can be moved in both directions (i.e. + incremented and decremented). +- [OutputIterator](https://en.cppreference.com/w/cpp/named_req/OutputIterator): + It is possible to write to the pointed-to element (only if @a Base is + @ref iterator). 
+ +@since version 1.0.0 +*/ +template +class json_reverse_iterator : public std::reverse_iterator +{ + public: + using difference_type = std::ptrdiff_t; + /// shortcut to the reverse iterator adapter + using base_iterator = std::reverse_iterator; + /// the reference type for the pointed-to element + using reference = typename Base::reference; + + /// create reverse iterator from iterator + explicit json_reverse_iterator(const typename base_iterator::iterator_type& it) noexcept + : base_iterator(it) {} + + /// create reverse iterator from base class + explicit json_reverse_iterator(const base_iterator& it) noexcept : base_iterator(it) {} + + /// post-increment (it++) + json_reverse_iterator const operator++(int) + { + return static_cast(base_iterator::operator++(1)); + } + + /// pre-increment (++it) + json_reverse_iterator& operator++() + { + return static_cast(base_iterator::operator++()); + } + + /// post-decrement (it--) + json_reverse_iterator const operator--(int) + { + return static_cast(base_iterator::operator--(1)); + } + + /// pre-decrement (--it) + json_reverse_iterator& operator--() + { + return static_cast(base_iterator::operator--()); + } + + /// add to iterator + json_reverse_iterator& operator+=(difference_type i) + { + return static_cast(base_iterator::operator+=(i)); + } + + /// add to iterator + json_reverse_iterator operator+(difference_type i) const + { + return static_cast(base_iterator::operator+(i)); + } + + /// subtract from iterator + json_reverse_iterator operator-(difference_type i) const + { + return static_cast(base_iterator::operator-(i)); + } + + /// return difference + difference_type operator-(const json_reverse_iterator& other) const + { + return base_iterator(*this) - base_iterator(other); + } + + /// access to successor + reference operator[](difference_type n) const + { + return *(this->operator+(n)); + } + + /// return the key of an object iterator + auto key() const -> decltype(std::declval().key()) + { + auto it = 
--this->base(); + return it.key(); + } + + /// return the value of an iterator + reference value() const + { + auto it = --this->base(); + return it.operator * (); + } +}; +} // namespace detail +} // namespace nlohmann + +// #include + +// #include + + +#include // all_of +#include // isdigit +#include // max +#include // accumulate +#include // string +#include // move +#include // vector + +// #include + +// #include + +// #include + + +namespace nlohmann +{ +template +class json_pointer +{ + // allow basic_json to access private members + NLOHMANN_BASIC_JSON_TPL_DECLARATION + friend class basic_json; + + public: + /*! + @brief create JSON pointer + + Create a JSON pointer according to the syntax described in + [Section 3 of RFC6901](https://tools.ietf.org/html/rfc6901#section-3). + + @param[in] s string representing the JSON pointer; if omitted, the empty + string is assumed which references the whole JSON value + + @throw parse_error.107 if the given JSON pointer @a s is nonempty and does + not begin with a slash (`/`); see example below + + @throw parse_error.108 if a tilde (`~`) in the given JSON pointer @a s is + not followed by `0` (representing `~`) or `1` (representing `/`); see + example below + + @liveexample{The example shows the construction several valid JSON pointers + as well as the exceptional behavior.,json_pointer} + + @since version 2.0.0 + */ + explicit json_pointer(const std::string& s = "") + : reference_tokens(split(s)) + {} + + /*! 
+ @brief return a string representation of the JSON pointer + + @invariant For each JSON pointer `ptr`, it holds: + @code {.cpp} + ptr == json_pointer(ptr.to_string()); + @endcode + + @return a string representation of the JSON pointer + + @liveexample{The example shows the result of `to_string`.,json_pointer__to_string} + + @since version 2.0.0 + */ + std::string to_string() const + { + return std::accumulate(reference_tokens.begin(), reference_tokens.end(), + std::string{}, + [](const std::string & a, const std::string & b) + { + return a + "/" + escape(b); + }); + } + + /// @copydoc to_string() + operator std::string() const + { + return to_string(); + } + + /*! + @brief append another JSON pointer at the end of this JSON pointer + + @param[in] ptr JSON pointer to append + @return JSON pointer with @a ptr appended + + @liveexample{The example shows the usage of `operator/=`.,json_pointer__operator_add} + + @complexity Linear in the length of @a ptr. + + @sa @ref operator/=(std::string) to append a reference token + @sa @ref operator/=(std::size_t) to append an array index + @sa @ref operator/(const json_pointer&, const json_pointer&) for a binary operator + + @since version 3.6.0 + */ + json_pointer& operator/=(const json_pointer& ptr) + { + reference_tokens.insert(reference_tokens.end(), + ptr.reference_tokens.begin(), + ptr.reference_tokens.end()); + return *this; + } + + /*! + @brief append an unescaped reference token at the end of this JSON pointer + + @param[in] token reference token to append + @return JSON pointer with @a token appended without escaping @a token + + @liveexample{The example shows the usage of `operator/=`.,json_pointer__operator_add} + + @complexity Amortized constant. 
+ + @sa @ref operator/=(const json_pointer&) to append a JSON pointer + @sa @ref operator/=(std::size_t) to append an array index + @sa @ref operator/(const json_pointer&, std::size_t) for a binary operator + + @since version 3.6.0 + */ + json_pointer& operator/=(std::string token) + { + push_back(std::move(token)); + return *this; + } + + /*! + @brief append an array index at the end of this JSON pointer + + @param[in] array_idx array index to append + @return JSON pointer with @a array_idx appended + + @liveexample{The example shows the usage of `operator/=`.,json_pointer__operator_add} + + @complexity Amortized constant. + + @sa @ref operator/=(const json_pointer&) to append a JSON pointer + @sa @ref operator/=(std::string) to append a reference token + @sa @ref operator/(const json_pointer&, std::string) for a binary operator + + @since version 3.6.0 + */ + json_pointer& operator/=(std::size_t array_idx) + { + return *this /= std::to_string(array_idx); + } + + /*! + @brief create a new JSON pointer by appending the right JSON pointer at the end of the left JSON pointer + + @param[in] lhs JSON pointer + @param[in] rhs JSON pointer + @return a new JSON pointer with @a rhs appended to @a lhs + + @liveexample{The example shows the usage of `operator/`.,json_pointer__operator_add_binary} + + @complexity Linear in the length of @a lhs and @a rhs. + + @sa @ref operator/=(const json_pointer&) to append a JSON pointer + + @since version 3.6.0 + */ + friend json_pointer operator/(const json_pointer& lhs, + const json_pointer& rhs) + { + return json_pointer(lhs) /= rhs; + } + + /*! + @brief create a new JSON pointer by appending the unescaped token at the end of the JSON pointer + + @param[in] ptr JSON pointer + @param[in] token reference token + @return a new JSON pointer with unescaped @a token appended to @a ptr + + @liveexample{The example shows the usage of `operator/`.,json_pointer__operator_add_binary} + + @complexity Linear in the length of @a ptr. 
+ + @sa @ref operator/=(std::string) to append a reference token + + @since version 3.6.0 + */ + friend json_pointer operator/(const json_pointer& ptr, std::string token) + { + return json_pointer(ptr) /= std::move(token); + } + + /*! + @brief create a new JSON pointer by appending the array-index-token at the end of the JSON pointer + + @param[in] ptr JSON pointer + @param[in] array_idx array index + @return a new JSON pointer with @a array_idx appended to @a ptr + + @liveexample{The example shows the usage of `operator/`.,json_pointer__operator_add_binary} + + @complexity Linear in the length of @a ptr. + + @sa @ref operator/=(std::size_t) to append an array index + + @since version 3.6.0 + */ + friend json_pointer operator/(const json_pointer& ptr, std::size_t array_idx) + { + return json_pointer(ptr) /= array_idx; + } + + /*! + @brief returns the parent of this JSON pointer + + @return parent of this JSON pointer; in case this JSON pointer is the root, + the root itself is returned + + @complexity Linear in the length of the JSON pointer. + + @liveexample{The example shows the result of `parent_pointer` for different + JSON Pointers.,json_pointer__parent_pointer} + + @since version 3.6.0 + */ + json_pointer parent_pointer() const + { + if (empty()) + { + return *this; + } + + json_pointer res = *this; + res.pop_back(); + return res; + } + + /*! + @brief remove last reference token + + @pre not `empty()` + + @liveexample{The example shows the usage of `pop_back`.,json_pointer__pop_back} + + @complexity Constant. + + @throw out_of_range.405 if JSON pointer has no parent + + @since version 3.6.0 + */ + void pop_back() + { + if (JSON_HEDLEY_UNLIKELY(empty())) + { + JSON_THROW(detail::out_of_range::create(405, "JSON pointer has no parent")); + } + + reference_tokens.pop_back(); + } + + /*! 
+ @brief return last reference token + + @pre not `empty()` + @return last reference token + + @liveexample{The example shows the usage of `back`.,json_pointer__back} + + @complexity Constant. + + @throw out_of_range.405 if JSON pointer has no parent + + @since version 3.6.0 + */ + const std::string& back() const + { + if (JSON_HEDLEY_UNLIKELY(empty())) + { + JSON_THROW(detail::out_of_range::create(405, "JSON pointer has no parent")); + } + + return reference_tokens.back(); + } + + /*! + @brief append an unescaped token at the end of the reference pointer + + @param[in] token token to add + + @complexity Amortized constant. + + @liveexample{The example shows the result of `push_back` for different + JSON Pointers.,json_pointer__push_back} + + @since version 3.6.0 + */ + void push_back(const std::string& token) + { + reference_tokens.push_back(token); + } + + /// @copydoc push_back(const std::string&) + void push_back(std::string&& token) + { + reference_tokens.push_back(std::move(token)); + } + + /*! + @brief return whether pointer points to the root document + + @return true iff the JSON pointer points to the root document + + @complexity Constant. + + @exceptionsafety No-throw guarantee: this function never throws exceptions. + + @liveexample{The example shows the result of `empty` for different JSON + Pointers.,json_pointer__empty} + + @since version 3.6.0 + */ + bool empty() const noexcept + { + return reference_tokens.empty(); + } + + private: + /*! 
+ @param[in] s reference token to be converted into an array index + + @return integer representation of @a s + + @throw parse_error.106 if an array index begins with '0' + @throw parse_error.109 if an array index begins not with a digit + @throw out_of_range.404 if string @a s could not be converted to an integer + @throw out_of_range.410 if an array index exceeds size_type + */ + static typename BasicJsonType::size_type array_index(const std::string& s) + { + using size_type = typename BasicJsonType::size_type; + + // error condition (cf. RFC 6901, Sect. 4) + if (JSON_HEDLEY_UNLIKELY(s.size() > 1 && s[0] == '0')) + { + JSON_THROW(detail::parse_error::create(106, 0, + "array index '" + s + + "' must not begin with '0'")); + } + + // error condition (cf. RFC 6901, Sect. 4) + if (JSON_HEDLEY_UNLIKELY(s.size() > 1 && !(s[0] >= '1' && s[0] <= '9'))) + { + JSON_THROW(detail::parse_error::create(109, 0, "array index '" + s + "' is not a number")); + } + + std::size_t processed_chars = 0; + unsigned long long res = 0; + JSON_TRY + { + res = std::stoull(s, &processed_chars); + } + JSON_CATCH(std::out_of_range&) + { + JSON_THROW(detail::out_of_range::create(404, "unresolved reference token '" + s + "'")); + } + + // check if the string was completely read + if (JSON_HEDLEY_UNLIKELY(processed_chars != s.size())) + { + JSON_THROW(detail::out_of_range::create(404, "unresolved reference token '" + s + "'")); + } + + // only triggered on special platforms (like 32bit), see also + // https://github.com/nlohmann/json/pull/2203 + if (res >= static_cast((std::numeric_limits::max)())) + { + JSON_THROW(detail::out_of_range::create(410, "array index " + s + " exceeds size_type")); // LCOV_EXCL_LINE + } + + return static_cast(res); + } + + json_pointer top() const + { + if (JSON_HEDLEY_UNLIKELY(empty())) + { + JSON_THROW(detail::out_of_range::create(405, "JSON pointer has no parent")); + } + + json_pointer result = *this; + result.reference_tokens = {reference_tokens[0]}; + return 
result; + } + + /*! + @brief create and return a reference to the pointed to value + + @complexity Linear in the number of reference tokens. + + @throw parse_error.109 if array index is not a number + @throw type_error.313 if value cannot be unflattened + */ + BasicJsonType& get_and_create(BasicJsonType& j) const + { + auto result = &j; + + // in case no reference tokens exist, return a reference to the JSON value + // j which will be overwritten by a primitive value + for (const auto& reference_token : reference_tokens) + { + switch (result->type()) + { + case detail::value_t::null: + { + if (reference_token == "0") + { + // start a new array if reference token is 0 + result = &result->operator[](0); + } + else + { + // start a new object otherwise + result = &result->operator[](reference_token); + } + break; + } + + case detail::value_t::object: + { + // create an entry in the object + result = &result->operator[](reference_token); + break; + } + + case detail::value_t::array: + { + // create an entry in the array + result = &result->operator[](array_index(reference_token)); + break; + } + + /* + The following code is only reached if there exists a reference + token _and_ the current value is primitive. In this case, we have + an error situation, because primitive values may only occur as + single value; that is, with an empty list of reference tokens. + */ + default: + JSON_THROW(detail::type_error::create(313, "invalid value to unflatten")); + } + } + + return *result; + } + + /*! + @brief return a reference to the pointed to value + + @note This version does not throw if a value is not present, but tries to + create nested values instead. For instance, calling this function + with pointer `"/this/that"` on a null value is equivalent to calling + `operator[]("this").operator[]("that")` on that value, effectively + changing the null value to an object. 
+ + @param[in] ptr a JSON value + + @return reference to the JSON value pointed to by the JSON pointer + + @complexity Linear in the length of the JSON pointer. + + @throw parse_error.106 if an array index begins with '0' + @throw parse_error.109 if an array index was not a number + @throw out_of_range.404 if the JSON pointer can not be resolved + */ + BasicJsonType& get_unchecked(BasicJsonType* ptr) const + { + for (const auto& reference_token : reference_tokens) + { + // convert null values to arrays or objects before continuing + if (ptr->is_null()) + { + // check if reference token is a number + const bool nums = + std::all_of(reference_token.begin(), reference_token.end(), + [](const unsigned char x) + { + return std::isdigit(x); + }); + + // change value to array for numbers or "-" or to object otherwise + *ptr = (nums || reference_token == "-") + ? detail::value_t::array + : detail::value_t::object; + } + + switch (ptr->type()) + { + case detail::value_t::object: + { + // use unchecked object access + ptr = &ptr->operator[](reference_token); + break; + } + + case detail::value_t::array: + { + if (reference_token == "-") + { + // explicitly treat "-" as index beyond the end + ptr = &ptr->operator[](ptr->m_value.array->size()); + } + else + { + // convert array index to number; unchecked access + ptr = &ptr->operator[](array_index(reference_token)); + } + break; + } + + default: + JSON_THROW(detail::out_of_range::create(404, "unresolved reference token '" + reference_token + "'")); + } + } + + return *ptr; + } + + /*! 
+ @throw parse_error.106 if an array index begins with '0' + @throw parse_error.109 if an array index was not a number + @throw out_of_range.402 if the array index '-' is used + @throw out_of_range.404 if the JSON pointer can not be resolved + */ + BasicJsonType& get_checked(BasicJsonType* ptr) const + { + for (const auto& reference_token : reference_tokens) + { + switch (ptr->type()) + { + case detail::value_t::object: + { + // note: at performs range check + ptr = &ptr->at(reference_token); + break; + } + + case detail::value_t::array: + { + if (JSON_HEDLEY_UNLIKELY(reference_token == "-")) + { + // "-" always fails the range check + JSON_THROW(detail::out_of_range::create(402, + "array index '-' (" + std::to_string(ptr->m_value.array->size()) + + ") is out of range")); + } + + // note: at performs range check + ptr = &ptr->at(array_index(reference_token)); + break; + } + + default: + JSON_THROW(detail::out_of_range::create(404, "unresolved reference token '" + reference_token + "'")); + } + } + + return *ptr; + } + + /*! 
+ @brief return a const reference to the pointed to value + + @param[in] ptr a JSON value + + @return const reference to the JSON value pointed to by the JSON + pointer + + @throw parse_error.106 if an array index begins with '0' + @throw parse_error.109 if an array index was not a number + @throw out_of_range.402 if the array index '-' is used + @throw out_of_range.404 if the JSON pointer can not be resolved + */ + const BasicJsonType& get_unchecked(const BasicJsonType* ptr) const + { + for (const auto& reference_token : reference_tokens) + { + switch (ptr->type()) + { + case detail::value_t::object: + { + // use unchecked object access + ptr = &ptr->operator[](reference_token); + break; + } + + case detail::value_t::array: + { + if (JSON_HEDLEY_UNLIKELY(reference_token == "-")) + { + // "-" cannot be used for const access + JSON_THROW(detail::out_of_range::create(402, + "array index '-' (" + std::to_string(ptr->m_value.array->size()) + + ") is out of range")); + } + + // use unchecked array access + ptr = &ptr->operator[](array_index(reference_token)); + break; + } + + default: + JSON_THROW(detail::out_of_range::create(404, "unresolved reference token '" + reference_token + "'")); + } + } + + return *ptr; + } + + /*! 
+ @throw parse_error.106 if an array index begins with '0' + @throw parse_error.109 if an array index was not a number + @throw out_of_range.402 if the array index '-' is used + @throw out_of_range.404 if the JSON pointer can not be resolved + */ + const BasicJsonType& get_checked(const BasicJsonType* ptr) const + { + for (const auto& reference_token : reference_tokens) + { + switch (ptr->type()) + { + case detail::value_t::object: + { + // note: at performs range check + ptr = &ptr->at(reference_token); + break; + } + + case detail::value_t::array: + { + if (JSON_HEDLEY_UNLIKELY(reference_token == "-")) + { + // "-" always fails the range check + JSON_THROW(detail::out_of_range::create(402, + "array index '-' (" + std::to_string(ptr->m_value.array->size()) + + ") is out of range")); + } + + // note: at performs range check + ptr = &ptr->at(array_index(reference_token)); + break; + } + + default: + JSON_THROW(detail::out_of_range::create(404, "unresolved reference token '" + reference_token + "'")); + } + } + + return *ptr; + } + + /*! 
+ @throw parse_error.106 if an array index begins with '0' + @throw parse_error.109 if an array index was not a number + */ + bool contains(const BasicJsonType* ptr) const + { + for (const auto& reference_token : reference_tokens) + { + switch (ptr->type()) + { + case detail::value_t::object: + { + if (!ptr->contains(reference_token)) + { + // we did not find the key in the object + return false; + } + + ptr = &ptr->operator[](reference_token); + break; + } + + case detail::value_t::array: + { + if (JSON_HEDLEY_UNLIKELY(reference_token == "-")) + { + // "-" always fails the range check + return false; + } + if (JSON_HEDLEY_UNLIKELY(reference_token.size() == 1 && !("0" <= reference_token && reference_token <= "9"))) + { + // invalid char + return false; + } + if (JSON_HEDLEY_UNLIKELY(reference_token.size() > 1)) + { + if (JSON_HEDLEY_UNLIKELY(!('1' <= reference_token[0] && reference_token[0] <= '9'))) + { + // first char should be between '1' and '9' + return false; + } + for (std::size_t i = 1; i < reference_token.size(); i++) + { + if (JSON_HEDLEY_UNLIKELY(!('0' <= reference_token[i] && reference_token[i] <= '9'))) + { + // other char should be between '0' and '9' + return false; + } + } + } + + const auto idx = array_index(reference_token); + if (idx >= ptr->size()) + { + // index out of range + return false; + } + + ptr = &ptr->operator[](idx); + break; + } + + default: + { + // we do not expect primitive values if there is still a + // reference token to process + return false; + } + } + } + + // no reference token left means we found a primitive value + return true; + } + + /*! + @brief split the string input to reference tokens + + @note This function is only called by the json_pointer constructor. + All exceptions below are documented there. 
+ + @throw parse_error.107 if the pointer is not empty or begins with '/' + @throw parse_error.108 if character '~' is not followed by '0' or '1' + */ + static std::vector split(const std::string& reference_string) + { + std::vector result; + + // special case: empty reference string -> no reference tokens + if (reference_string.empty()) + { + return result; + } + + // check if nonempty reference string begins with slash + if (JSON_HEDLEY_UNLIKELY(reference_string[0] != '/')) + { + JSON_THROW(detail::parse_error::create(107, 1, + "JSON pointer must be empty or begin with '/' - was: '" + + reference_string + "'")); + } + + // extract the reference tokens: + // - slash: position of the last read slash (or end of string) + // - start: position after the previous slash + for ( + // search for the first slash after the first character + std::size_t slash = reference_string.find_first_of('/', 1), + // set the beginning of the first reference token + start = 1; + // we can stop if start == 0 (if slash == std::string::npos) + start != 0; + // set the beginning of the next reference token + // (will eventually be 0 if slash == std::string::npos) + start = (slash == std::string::npos) ? 0 : slash + 1, + // find next slash + slash = reference_string.find_first_of('/', start)) + { + // use the text between the beginning of the reference token + // (start) and the last slash (slash). 
+ auto reference_token = reference_string.substr(start, slash - start); + + // check reference tokens are properly escaped + for (std::size_t pos = reference_token.find_first_of('~'); + pos != std::string::npos; + pos = reference_token.find_first_of('~', pos + 1)) + { + JSON_ASSERT(reference_token[pos] == '~'); + + // ~ must be followed by 0 or 1 + if (JSON_HEDLEY_UNLIKELY(pos == reference_token.size() - 1 || + (reference_token[pos + 1] != '0' && + reference_token[pos + 1] != '1'))) + { + JSON_THROW(detail::parse_error::create(108, 0, "escape character '~' must be followed with '0' or '1'")); + } + } + + // finally, store the reference token + unescape(reference_token); + result.push_back(reference_token); + } + + return result; + } + + /*! + @brief replace all occurrences of a substring by another string + + @param[in,out] s the string to manipulate; changed so that all + occurrences of @a f are replaced with @a t + @param[in] f the substring to replace with @a t + @param[in] t the string to replace @a f + + @pre The search string @a f must not be empty. **This precondition is + enforced with an assertion.** + + @since version 2.0.0 + */ + static void replace_substring(std::string& s, const std::string& f, + const std::string& t) + { + JSON_ASSERT(!f.empty()); + for (auto pos = s.find(f); // find first occurrence of f + pos != std::string::npos; // make sure f was found + s.replace(pos, f.size(), t), // replace with t, and + pos = s.find(f, pos + t.size())) // find next occurrence of f + {} + } + + /// escape "~" to "~0" and "/" to "~1" + static std::string escape(std::string s) + { + replace_substring(s, "~", "~0"); + replace_substring(s, "/", "~1"); + return s; + } + + /// unescape "~1" to tilde and "~0" to slash (order is important!) + static void unescape(std::string& s) + { + replace_substring(s, "~1", "/"); + replace_substring(s, "~0", "~"); + } + + /*! 
+ @param[in] reference_string the reference string to the current value + @param[in] value the value to consider + @param[in,out] result the result object to insert values to + + @note Empty objects or arrays are flattened to `null`. + */ + static void flatten(const std::string& reference_string, + const BasicJsonType& value, + BasicJsonType& result) + { + switch (value.type()) + { + case detail::value_t::array: + { + if (value.m_value.array->empty()) + { + // flatten empty array as null + result[reference_string] = nullptr; + } + else + { + // iterate array and use index as reference string + for (std::size_t i = 0; i < value.m_value.array->size(); ++i) + { + flatten(reference_string + "/" + std::to_string(i), + value.m_value.array->operator[](i), result); + } + } + break; + } + + case detail::value_t::object: + { + if (value.m_value.object->empty()) + { + // flatten empty object as null + result[reference_string] = nullptr; + } + else + { + // iterate object and use keys as reference string + for (const auto& element : *value.m_value.object) + { + flatten(reference_string + "/" + escape(element.first), element.second, result); + } + } + break; + } + + default: + { + // add primitive value with its reference string + result[reference_string] = value; + break; + } + } + } + + /*! 
+ @param[in] value flattened JSON + + @return unflattened JSON + + @throw parse_error.109 if array index is not a number + @throw type_error.314 if value is not an object + @throw type_error.315 if object values are not primitive + @throw type_error.313 if value cannot be unflattened + */ + static BasicJsonType + unflatten(const BasicJsonType& value) + { + if (JSON_HEDLEY_UNLIKELY(!value.is_object())) + { + JSON_THROW(detail::type_error::create(314, "only objects can be unflattened")); + } + + BasicJsonType result; + + // iterate the JSON object values + for (const auto& element : *value.m_value.object) + { + if (JSON_HEDLEY_UNLIKELY(!element.second.is_primitive())) + { + JSON_THROW(detail::type_error::create(315, "values in object must be primitive")); + } + + // assign value to reference pointed to by JSON pointer; Note that if + // the JSON pointer is "" (i.e., points to the whole value), function + // get_and_create returns a reference to result itself. An assignment + // will then create a primitive value. + json_pointer(element.first).get_and_create(result) = element.second; + } + + return result; + } + + /*! + @brief compares two JSON pointers for equality + + @param[in] lhs JSON pointer to compare + @param[in] rhs JSON pointer to compare + @return whether @a lhs is equal to @a rhs + + @complexity Linear in the length of the JSON pointer + + @exceptionsafety No-throw guarantee: this function never throws exceptions. + */ + friend bool operator==(json_pointer const& lhs, + json_pointer const& rhs) noexcept + { + return lhs.reference_tokens == rhs.reference_tokens; + } + + /*! + @brief compares two JSON pointers for inequality + + @param[in] lhs JSON pointer to compare + @param[in] rhs JSON pointer to compare + @return whether @a lhs is not equal @a rhs + + @complexity Linear in the length of the JSON pointer + + @exceptionsafety No-throw guarantee: this function never throws exceptions. 
+ */ + friend bool operator!=(json_pointer const& lhs, + json_pointer const& rhs) noexcept + { + return !(lhs == rhs); + } + + /// the reference tokens + std::vector reference_tokens; +}; +} // namespace nlohmann + +// #include + + +#include +#include + +// #include + + +namespace nlohmann +{ +namespace detail +{ +template +class json_ref +{ + public: + using value_type = BasicJsonType; + + json_ref(value_type&& value) + : owned_value(std::move(value)) + , value_ref(&owned_value) + , is_rvalue(true) + {} + + json_ref(const value_type& value) + : value_ref(const_cast(&value)) + , is_rvalue(false) + {} + + json_ref(std::initializer_list init) + : owned_value(init) + , value_ref(&owned_value) + , is_rvalue(true) + {} + + template < + class... Args, + enable_if_t::value, int> = 0 > + json_ref(Args && ... args) + : owned_value(std::forward(args)...) + , value_ref(&owned_value) + , is_rvalue(true) + {} + + // class should be movable only + json_ref(json_ref&&) = default; + json_ref(const json_ref&) = delete; + json_ref& operator=(const json_ref&) = delete; + json_ref& operator=(json_ref&&) = delete; + ~json_ref() = default; + + value_type moved_or_copied() const + { + if (is_rvalue) + { + return std::move(*value_ref); + } + return *value_ref; + } + + value_type const& operator*() const + { + return *static_cast(value_ref); + } + + value_type const* operator->() const + { + return static_cast(value_ref); + } + + private: + mutable value_type owned_value = nullptr; + value_type* value_ref = nullptr; + const bool is_rvalue = true; +}; +} // namespace detail +} // namespace nlohmann + +// #include + +// #include + +// #include + +// #include + + +#include // reverse +#include // array +#include // uint8_t, uint16_t, uint32_t, uint64_t +#include // memcpy +#include // numeric_limits +#include // string +#include // isnan, isinf + +// #include + +// #include + +// #include + + +#include // copy +#include // size_t +#include // streamsize +#include // back_inserter +#include // 
shared_ptr, make_shared +#include // basic_ostream +#include // basic_string +#include // vector +// #include + + +namespace nlohmann +{ +namespace detail +{ +/// abstract output adapter interface +template struct output_adapter_protocol +{ + virtual void write_character(CharType c) = 0; + virtual void write_characters(const CharType* s, std::size_t length) = 0; + virtual ~output_adapter_protocol() = default; +}; + +/// a type to simplify interfaces +template +using output_adapter_t = std::shared_ptr>; + +/// output adapter for byte vectors +template +class output_vector_adapter : public output_adapter_protocol +{ + public: + explicit output_vector_adapter(std::vector& vec) noexcept + : v(vec) + {} + + void write_character(CharType c) override + { + v.push_back(c); + } + + JSON_HEDLEY_NON_NULL(2) + void write_characters(const CharType* s, std::size_t length) override + { + std::copy(s, s + length, std::back_inserter(v)); + } + + private: + std::vector& v; +}; + +/// output adapter for output streams +template +class output_stream_adapter : public output_adapter_protocol +{ + public: + explicit output_stream_adapter(std::basic_ostream& s) noexcept + : stream(s) + {} + + void write_character(CharType c) override + { + stream.put(c); + } + + JSON_HEDLEY_NON_NULL(2) + void write_characters(const CharType* s, std::size_t length) override + { + stream.write(s, static_cast(length)); + } + + private: + std::basic_ostream& stream; +}; + +/// output adapter for basic_string +template> +class output_string_adapter : public output_adapter_protocol +{ + public: + explicit output_string_adapter(StringType& s) noexcept + : str(s) + {} + + void write_character(CharType c) override + { + str.push_back(c); + } + + JSON_HEDLEY_NON_NULL(2) + void write_characters(const CharType* s, std::size_t length) override + { + str.append(s, length); + } + + private: + StringType& str; +}; + +template> +class output_adapter +{ + public: + output_adapter(std::vector& vec) + : 
oa(std::make_shared>(vec)) {} + + output_adapter(std::basic_ostream& s) + : oa(std::make_shared>(s)) {} + + output_adapter(StringType& s) + : oa(std::make_shared>(s)) {} + + operator output_adapter_t() + { + return oa; + } + + private: + output_adapter_t oa = nullptr; +}; +} // namespace detail +} // namespace nlohmann + + +namespace nlohmann +{ +namespace detail +{ +/////////////////// +// binary writer // +/////////////////// + +/*! +@brief serialization to CBOR and MessagePack values +*/ +template +class binary_writer +{ + using string_t = typename BasicJsonType::string_t; + using binary_t = typename BasicJsonType::binary_t; + using number_float_t = typename BasicJsonType::number_float_t; + + public: + /*! + @brief create a binary writer + + @param[in] adapter output adapter to write to + */ + explicit binary_writer(output_adapter_t adapter) : oa(adapter) + { + JSON_ASSERT(oa); + } + + /*! + @param[in] j JSON value to serialize + @pre j.type() == value_t::object + */ + void write_bson(const BasicJsonType& j) + { + switch (j.type()) + { + case value_t::object: + { + write_bson_object(*j.m_value.object); + break; + } + + default: + { + JSON_THROW(type_error::create(317, "to serialize to BSON, top-level type must be object, but is " + std::string(j.type_name()))); + } + } + } + + /*! + @param[in] j JSON value to serialize + */ + void write_cbor(const BasicJsonType& j) + { + switch (j.type()) + { + case value_t::null: + { + oa->write_character(to_char_type(0xF6)); + break; + } + + case value_t::boolean: + { + oa->write_character(j.m_value.boolean + ? to_char_type(0xF5) + : to_char_type(0xF4)); + break; + } + + case value_t::number_integer: + { + if (j.m_value.number_integer >= 0) + { + // CBOR does not differentiate between positive signed + // integers and unsigned integers. Therefore, we used the + // code from the value_t::number_unsigned case here. 
+ if (j.m_value.number_integer <= 0x17) + { + write_number(static_cast(j.m_value.number_integer)); + } + else if (j.m_value.number_integer <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0x18)); + write_number(static_cast(j.m_value.number_integer)); + } + else if (j.m_value.number_integer <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0x19)); + write_number(static_cast(j.m_value.number_integer)); + } + else if (j.m_value.number_integer <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0x1A)); + write_number(static_cast(j.m_value.number_integer)); + } + else + { + oa->write_character(to_char_type(0x1B)); + write_number(static_cast(j.m_value.number_integer)); + } + } + else + { + // The conversions below encode the sign in the first + // byte, and the value is converted to a positive number. + const auto positive_number = -1 - j.m_value.number_integer; + if (j.m_value.number_integer >= -24) + { + write_number(static_cast(0x20 + positive_number)); + } + else if (positive_number <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0x38)); + write_number(static_cast(positive_number)); + } + else if (positive_number <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0x39)); + write_number(static_cast(positive_number)); + } + else if (positive_number <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0x3A)); + write_number(static_cast(positive_number)); + } + else + { + oa->write_character(to_char_type(0x3B)); + write_number(static_cast(positive_number)); + } + } + break; + } + + case value_t::number_unsigned: + { + if (j.m_value.number_unsigned <= 0x17) + { + write_number(static_cast(j.m_value.number_unsigned)); + } + else if (j.m_value.number_unsigned <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0x18)); + write_number(static_cast(j.m_value.number_unsigned)); + } + else if (j.m_value.number_unsigned <= 
(std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0x19)); + write_number(static_cast(j.m_value.number_unsigned)); + } + else if (j.m_value.number_unsigned <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0x1A)); + write_number(static_cast(j.m_value.number_unsigned)); + } + else + { + oa->write_character(to_char_type(0x1B)); + write_number(static_cast(j.m_value.number_unsigned)); + } + break; + } + + case value_t::number_float: + { + if (std::isnan(j.m_value.number_float)) + { + // NaN is 0xf97e00 in CBOR + oa->write_character(to_char_type(0xF9)); + oa->write_character(to_char_type(0x7E)); + oa->write_character(to_char_type(0x00)); + } + else if (std::isinf(j.m_value.number_float)) + { + // Infinity is 0xf97c00, -Infinity is 0xf9fc00 + oa->write_character(to_char_type(0xf9)); + oa->write_character(j.m_value.number_float > 0 ? to_char_type(0x7C) : to_char_type(0xFC)); + oa->write_character(to_char_type(0x00)); + } + else + { + write_compact_float(j.m_value.number_float, detail::input_format_t::cbor); + } + break; + } + + case value_t::string: + { + // step 1: write control byte and the string length + const auto N = j.m_value.string->size(); + if (N <= 0x17) + { + write_number(static_cast(0x60 + N)); + } + else if (N <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0x78)); + write_number(static_cast(N)); + } + else if (N <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0x79)); + write_number(static_cast(N)); + } + else if (N <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0x7A)); + write_number(static_cast(N)); + } + // LCOV_EXCL_START + else if (N <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0x7B)); + write_number(static_cast(N)); + } + // LCOV_EXCL_STOP + + // step 2: write the string + oa->write_characters( + reinterpret_cast(j.m_value.string->c_str()), + j.m_value.string->size()); + break; + } + + case value_t::array: + { + // 
step 1: write control byte and the array size + const auto N = j.m_value.array->size(); + if (N <= 0x17) + { + write_number(static_cast(0x80 + N)); + } + else if (N <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0x98)); + write_number(static_cast(N)); + } + else if (N <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0x99)); + write_number(static_cast(N)); + } + else if (N <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0x9A)); + write_number(static_cast(N)); + } + // LCOV_EXCL_START + else if (N <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0x9B)); + write_number(static_cast(N)); + } + // LCOV_EXCL_STOP + + // step 2: write each element + for (const auto& el : *j.m_value.array) + { + write_cbor(el); + } + break; + } + + case value_t::binary: + { + if (j.m_value.binary->has_subtype()) + { + write_number(static_cast(0xd8)); + write_number(j.m_value.binary->subtype()); + } + + // step 1: write control byte and the binary array size + const auto N = j.m_value.binary->size(); + if (N <= 0x17) + { + write_number(static_cast(0x40 + N)); + } + else if (N <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0x58)); + write_number(static_cast(N)); + } + else if (N <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0x59)); + write_number(static_cast(N)); + } + else if (N <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0x5A)); + write_number(static_cast(N)); + } + // LCOV_EXCL_START + else if (N <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0x5B)); + write_number(static_cast(N)); + } + // LCOV_EXCL_STOP + + // step 2: write each element + oa->write_characters( + reinterpret_cast(j.m_value.binary->data()), + N); + + break; + } + + case value_t::object: + { + // step 1: write control byte and the object size + const auto N = j.m_value.object->size(); + if (N <= 0x17) + { + 
write_number(static_cast(0xA0 + N)); + } + else if (N <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0xB8)); + write_number(static_cast(N)); + } + else if (N <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0xB9)); + write_number(static_cast(N)); + } + else if (N <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0xBA)); + write_number(static_cast(N)); + } + // LCOV_EXCL_START + else if (N <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0xBB)); + write_number(static_cast(N)); + } + // LCOV_EXCL_STOP + + // step 2: write each element + for (const auto& el : *j.m_value.object) + { + write_cbor(el.first); + write_cbor(el.second); + } + break; + } + + default: + break; + } + } + + /*! + @param[in] j JSON value to serialize + */ + void write_msgpack(const BasicJsonType& j) + { + switch (j.type()) + { + case value_t::null: // nil + { + oa->write_character(to_char_type(0xC0)); + break; + } + + case value_t::boolean: // true and false + { + oa->write_character(j.m_value.boolean + ? to_char_type(0xC3) + : to_char_type(0xC2)); + break; + } + + case value_t::number_integer: + { + if (j.m_value.number_integer >= 0) + { + // MessagePack does not differentiate between positive + // signed integers and unsigned integers. Therefore, we used + // the code from the value_t::number_unsigned case here. 
+ if (j.m_value.number_unsigned < 128) + { + // positive fixnum + write_number(static_cast(j.m_value.number_integer)); + } + else if (j.m_value.number_unsigned <= (std::numeric_limits::max)()) + { + // uint 8 + oa->write_character(to_char_type(0xCC)); + write_number(static_cast(j.m_value.number_integer)); + } + else if (j.m_value.number_unsigned <= (std::numeric_limits::max)()) + { + // uint 16 + oa->write_character(to_char_type(0xCD)); + write_number(static_cast(j.m_value.number_integer)); + } + else if (j.m_value.number_unsigned <= (std::numeric_limits::max)()) + { + // uint 32 + oa->write_character(to_char_type(0xCE)); + write_number(static_cast(j.m_value.number_integer)); + } + else if (j.m_value.number_unsigned <= (std::numeric_limits::max)()) + { + // uint 64 + oa->write_character(to_char_type(0xCF)); + write_number(static_cast(j.m_value.number_integer)); + } + } + else + { + if (j.m_value.number_integer >= -32) + { + // negative fixnum + write_number(static_cast(j.m_value.number_integer)); + } + else if (j.m_value.number_integer >= (std::numeric_limits::min)() && + j.m_value.number_integer <= (std::numeric_limits::max)()) + { + // int 8 + oa->write_character(to_char_type(0xD0)); + write_number(static_cast(j.m_value.number_integer)); + } + else if (j.m_value.number_integer >= (std::numeric_limits::min)() && + j.m_value.number_integer <= (std::numeric_limits::max)()) + { + // int 16 + oa->write_character(to_char_type(0xD1)); + write_number(static_cast(j.m_value.number_integer)); + } + else if (j.m_value.number_integer >= (std::numeric_limits::min)() && + j.m_value.number_integer <= (std::numeric_limits::max)()) + { + // int 32 + oa->write_character(to_char_type(0xD2)); + write_number(static_cast(j.m_value.number_integer)); + } + else if (j.m_value.number_integer >= (std::numeric_limits::min)() && + j.m_value.number_integer <= (std::numeric_limits::max)()) + { + // int 64 + oa->write_character(to_char_type(0xD3)); + 
write_number(static_cast(j.m_value.number_integer)); + } + } + break; + } + + case value_t::number_unsigned: + { + if (j.m_value.number_unsigned < 128) + { + // positive fixnum + write_number(static_cast(j.m_value.number_integer)); + } + else if (j.m_value.number_unsigned <= (std::numeric_limits::max)()) + { + // uint 8 + oa->write_character(to_char_type(0xCC)); + write_number(static_cast(j.m_value.number_integer)); + } + else if (j.m_value.number_unsigned <= (std::numeric_limits::max)()) + { + // uint 16 + oa->write_character(to_char_type(0xCD)); + write_number(static_cast(j.m_value.number_integer)); + } + else if (j.m_value.number_unsigned <= (std::numeric_limits::max)()) + { + // uint 32 + oa->write_character(to_char_type(0xCE)); + write_number(static_cast(j.m_value.number_integer)); + } + else if (j.m_value.number_unsigned <= (std::numeric_limits::max)()) + { + // uint 64 + oa->write_character(to_char_type(0xCF)); + write_number(static_cast(j.m_value.number_integer)); + } + break; + } + + case value_t::number_float: + { + write_compact_float(j.m_value.number_float, detail::input_format_t::msgpack); + break; + } + + case value_t::string: + { + // step 1: write control byte and the string length + const auto N = j.m_value.string->size(); + if (N <= 31) + { + // fixstr + write_number(static_cast(0xA0 | N)); + } + else if (N <= (std::numeric_limits::max)()) + { + // str 8 + oa->write_character(to_char_type(0xD9)); + write_number(static_cast(N)); + } + else if (N <= (std::numeric_limits::max)()) + { + // str 16 + oa->write_character(to_char_type(0xDA)); + write_number(static_cast(N)); + } + else if (N <= (std::numeric_limits::max)()) + { + // str 32 + oa->write_character(to_char_type(0xDB)); + write_number(static_cast(N)); + } + + // step 2: write the string + oa->write_characters( + reinterpret_cast(j.m_value.string->c_str()), + j.m_value.string->size()); + break; + } + + case value_t::array: + { + // step 1: write control byte and the array size + const auto N = 
j.m_value.array->size(); + if (N <= 15) + { + // fixarray + write_number(static_cast(0x90 | N)); + } + else if (N <= (std::numeric_limits::max)()) + { + // array 16 + oa->write_character(to_char_type(0xDC)); + write_number(static_cast(N)); + } + else if (N <= (std::numeric_limits::max)()) + { + // array 32 + oa->write_character(to_char_type(0xDD)); + write_number(static_cast(N)); + } + + // step 2: write each element + for (const auto& el : *j.m_value.array) + { + write_msgpack(el); + } + break; + } + + case value_t::binary: + { + // step 0: determine if the binary type has a set subtype to + // determine whether or not to use the ext or fixext types + const bool use_ext = j.m_value.binary->has_subtype(); + + // step 1: write control byte and the byte string length + const auto N = j.m_value.binary->size(); + if (N <= (std::numeric_limits::max)()) + { + std::uint8_t output_type{}; + bool fixed = true; + if (use_ext) + { + switch (N) + { + case 1: + output_type = 0xD4; // fixext 1 + break; + case 2: + output_type = 0xD5; // fixext 2 + break; + case 4: + output_type = 0xD6; // fixext 4 + break; + case 8: + output_type = 0xD7; // fixext 8 + break; + case 16: + output_type = 0xD8; // fixext 16 + break; + default: + output_type = 0xC7; // ext 8 + fixed = false; + break; + } + + } + else + { + output_type = 0xC4; // bin 8 + fixed = false; + } + + oa->write_character(to_char_type(output_type)); + if (!fixed) + { + write_number(static_cast(N)); + } + } + else if (N <= (std::numeric_limits::max)()) + { + std::uint8_t output_type = use_ext + ? 0xC8 // ext 16 + : 0xC5; // bin 16 + + oa->write_character(to_char_type(output_type)); + write_number(static_cast(N)); + } + else if (N <= (std::numeric_limits::max)()) + { + std::uint8_t output_type = use_ext + ? 
0xC9 // ext 32 + : 0xC6; // bin 32 + + oa->write_character(to_char_type(output_type)); + write_number(static_cast(N)); + } + + // step 1.5: if this is an ext type, write the subtype + if (use_ext) + { + write_number(static_cast(j.m_value.binary->subtype())); + } + + // step 2: write the byte string + oa->write_characters( + reinterpret_cast(j.m_value.binary->data()), + N); + + break; + } + + case value_t::object: + { + // step 1: write control byte and the object size + const auto N = j.m_value.object->size(); + if (N <= 15) + { + // fixmap + write_number(static_cast(0x80 | (N & 0xF))); + } + else if (N <= (std::numeric_limits::max)()) + { + // map 16 + oa->write_character(to_char_type(0xDE)); + write_number(static_cast(N)); + } + else if (N <= (std::numeric_limits::max)()) + { + // map 32 + oa->write_character(to_char_type(0xDF)); + write_number(static_cast(N)); + } + + // step 2: write each element + for (const auto& el : *j.m_value.object) + { + write_msgpack(el.first); + write_msgpack(el.second); + } + break; + } + + default: + break; + } + } + + /*! + @param[in] j JSON value to serialize + @param[in] use_count whether to use '#' prefixes (optimized format) + @param[in] use_type whether to use '$' prefixes (optimized format) + @param[in] add_prefix whether prefixes need to be used for this value + */ + void write_ubjson(const BasicJsonType& j, const bool use_count, + const bool use_type, const bool add_prefix = true) + { + switch (j.type()) + { + case value_t::null: + { + if (add_prefix) + { + oa->write_character(to_char_type('Z')); + } + break; + } + + case value_t::boolean: + { + if (add_prefix) + { + oa->write_character(j.m_value.boolean + ? 
to_char_type('T') + : to_char_type('F')); + } + break; + } + + case value_t::number_integer: + { + write_number_with_ubjson_prefix(j.m_value.number_integer, add_prefix); + break; + } + + case value_t::number_unsigned: + { + write_number_with_ubjson_prefix(j.m_value.number_unsigned, add_prefix); + break; + } + + case value_t::number_float: + { + write_number_with_ubjson_prefix(j.m_value.number_float, add_prefix); + break; + } + + case value_t::string: + { + if (add_prefix) + { + oa->write_character(to_char_type('S')); + } + write_number_with_ubjson_prefix(j.m_value.string->size(), true); + oa->write_characters( + reinterpret_cast(j.m_value.string->c_str()), + j.m_value.string->size()); + break; + } + + case value_t::array: + { + if (add_prefix) + { + oa->write_character(to_char_type('[')); + } + + bool prefix_required = true; + if (use_type && !j.m_value.array->empty()) + { + JSON_ASSERT(use_count); + const CharType first_prefix = ubjson_prefix(j.front()); + const bool same_prefix = std::all_of(j.begin() + 1, j.end(), + [this, first_prefix](const BasicJsonType & v) + { + return ubjson_prefix(v) == first_prefix; + }); + + if (same_prefix) + { + prefix_required = false; + oa->write_character(to_char_type('$')); + oa->write_character(first_prefix); + } + } + + if (use_count) + { + oa->write_character(to_char_type('#')); + write_number_with_ubjson_prefix(j.m_value.array->size(), true); + } + + for (const auto& el : *j.m_value.array) + { + write_ubjson(el, use_count, use_type, prefix_required); + } + + if (!use_count) + { + oa->write_character(to_char_type(']')); + } + + break; + } + + case value_t::binary: + { + if (add_prefix) + { + oa->write_character(to_char_type('[')); + } + + if (use_type && !j.m_value.binary->empty()) + { + JSON_ASSERT(use_count); + oa->write_character(to_char_type('$')); + oa->write_character('U'); + } + + if (use_count) + { + oa->write_character(to_char_type('#')); + write_number_with_ubjson_prefix(j.m_value.binary->size(), true); + } + + if 
(use_type) + { + oa->write_characters( + reinterpret_cast(j.m_value.binary->data()), + j.m_value.binary->size()); + } + else + { + for (size_t i = 0; i < j.m_value.binary->size(); ++i) + { + oa->write_character(to_char_type('U')); + oa->write_character(j.m_value.binary->data()[i]); + } + } + + if (!use_count) + { + oa->write_character(to_char_type(']')); + } + + break; + } + + case value_t::object: + { + if (add_prefix) + { + oa->write_character(to_char_type('{')); + } + + bool prefix_required = true; + if (use_type && !j.m_value.object->empty()) + { + JSON_ASSERT(use_count); + const CharType first_prefix = ubjson_prefix(j.front()); + const bool same_prefix = std::all_of(j.begin(), j.end(), + [this, first_prefix](const BasicJsonType & v) + { + return ubjson_prefix(v) == first_prefix; + }); + + if (same_prefix) + { + prefix_required = false; + oa->write_character(to_char_type('$')); + oa->write_character(first_prefix); + } + } + + if (use_count) + { + oa->write_character(to_char_type('#')); + write_number_with_ubjson_prefix(j.m_value.object->size(), true); + } + + for (const auto& el : *j.m_value.object) + { + write_number_with_ubjson_prefix(el.first.size(), true); + oa->write_characters( + reinterpret_cast(el.first.c_str()), + el.first.size()); + write_ubjson(el.second, use_count, use_type, prefix_required); + } + + if (!use_count) + { + oa->write_character(to_char_type('}')); + } + + break; + } + + default: + break; + } + } + + private: + ////////// + // BSON // + ////////// + + /*! + @return The size of a BSON document entry header, including the id marker + and the entry name size (and its null-terminator). 
+ */ + static std::size_t calc_bson_entry_header_size(const string_t& name) + { + const auto it = name.find(static_cast(0)); + if (JSON_HEDLEY_UNLIKELY(it != BasicJsonType::string_t::npos)) + { + JSON_THROW(out_of_range::create(409, + "BSON key cannot contain code point U+0000 (at byte " + std::to_string(it) + ")")); + } + + return /*id*/ 1ul + name.size() + /*zero-terminator*/1u; + } + + /*! + @brief Writes the given @a element_type and @a name to the output adapter + */ + void write_bson_entry_header(const string_t& name, + const std::uint8_t element_type) + { + oa->write_character(to_char_type(element_type)); // boolean + oa->write_characters( + reinterpret_cast(name.c_str()), + name.size() + 1u); + } + + /*! + @brief Writes a BSON element with key @a name and boolean value @a value + */ + void write_bson_boolean(const string_t& name, + const bool value) + { + write_bson_entry_header(name, 0x08); + oa->write_character(value ? to_char_type(0x01) : to_char_type(0x00)); + } + + /*! + @brief Writes a BSON element with key @a name and double value @a value + */ + void write_bson_double(const string_t& name, + const double value) + { + write_bson_entry_header(name, 0x01); + write_number(value); + } + + /*! + @return The size of the BSON-encoded string in @a value + */ + static std::size_t calc_bson_string_size(const string_t& value) + { + return sizeof(std::int32_t) + value.size() + 1ul; + } + + /*! + @brief Writes a BSON element with key @a name and string value @a value + */ + void write_bson_string(const string_t& name, + const string_t& value) + { + write_bson_entry_header(name, 0x02); + + write_number(static_cast(value.size() + 1ul)); + oa->write_characters( + reinterpret_cast(value.c_str()), + value.size() + 1); + } + + /*! + @brief Writes a BSON element with key @a name and null value + */ + void write_bson_null(const string_t& name) + { + write_bson_entry_header(name, 0x0A); + } + + /*! 
+ @return The size of the BSON-encoded integer @a value + */ + static std::size_t calc_bson_integer_size(const std::int64_t value) + { + return (std::numeric_limits::min)() <= value && value <= (std::numeric_limits::max)() + ? sizeof(std::int32_t) + : sizeof(std::int64_t); + } + + /*! + @brief Writes a BSON element with key @a name and integer @a value + */ + void write_bson_integer(const string_t& name, + const std::int64_t value) + { + if ((std::numeric_limits::min)() <= value && value <= (std::numeric_limits::max)()) + { + write_bson_entry_header(name, 0x10); // int32 + write_number(static_cast(value)); + } + else + { + write_bson_entry_header(name, 0x12); // int64 + write_number(static_cast(value)); + } + } + + /*! + @return The size of the BSON-encoded unsigned integer in @a j + */ + static constexpr std::size_t calc_bson_unsigned_size(const std::uint64_t value) noexcept + { + return (value <= static_cast((std::numeric_limits::max)())) + ? sizeof(std::int32_t) + : sizeof(std::int64_t); + } + + /*! + @brief Writes a BSON element with key @a name and unsigned @a value + */ + void write_bson_unsigned(const string_t& name, + const std::uint64_t value) + { + if (value <= static_cast((std::numeric_limits::max)())) + { + write_bson_entry_header(name, 0x10 /* int32 */); + write_number(static_cast(value)); + } + else if (value <= static_cast((std::numeric_limits::max)())) + { + write_bson_entry_header(name, 0x12 /* int64 */); + write_number(static_cast(value)); + } + else + { + JSON_THROW(out_of_range::create(407, "integer number " + std::to_string(value) + " cannot be represented by BSON as it does not fit int64")); + } + } + + /*! + @brief Writes a BSON element with key @a name and object @a value + */ + void write_bson_object_entry(const string_t& name, + const typename BasicJsonType::object_t& value) + { + write_bson_entry_header(name, 0x03); // object + write_bson_object(value); + } + + /*! 
+ @return The size of the BSON-encoded array @a value + */ + static std::size_t calc_bson_array_size(const typename BasicJsonType::array_t& value) + { + std::size_t array_index = 0ul; + + const std::size_t embedded_document_size = std::accumulate(std::begin(value), std::end(value), std::size_t(0), [&array_index](std::size_t result, const typename BasicJsonType::array_t::value_type & el) + { + return result + calc_bson_element_size(std::to_string(array_index++), el); + }); + + return sizeof(std::int32_t) + embedded_document_size + 1ul; + } + + /*! + @return The size of the BSON-encoded binary array @a value + */ + static std::size_t calc_bson_binary_size(const typename BasicJsonType::binary_t& value) + { + return sizeof(std::int32_t) + value.size() + 1ul; + } + + /*! + @brief Writes a BSON element with key @a name and array @a value + */ + void write_bson_array(const string_t& name, + const typename BasicJsonType::array_t& value) + { + write_bson_entry_header(name, 0x04); // array + write_number(static_cast(calc_bson_array_size(value))); + + std::size_t array_index = 0ul; + + for (const auto& el : value) + { + write_bson_element(std::to_string(array_index++), el); + } + + oa->write_character(to_char_type(0x00)); + } + + /*! + @brief Writes a BSON element with key @a name and binary value @a value + */ + void write_bson_binary(const string_t& name, + const binary_t& value) + { + write_bson_entry_header(name, 0x05); + + write_number(static_cast(value.size())); + write_number(value.has_subtype() ? value.subtype() : std::uint8_t(0x00)); + + oa->write_characters(reinterpret_cast(value.data()), value.size()); + } + + /*! + @brief Calculates the size necessary to serialize the JSON value @a j with its @a name + @return The calculated size for the BSON document entry for @a j with the given @a name. 
+ */ + static std::size_t calc_bson_element_size(const string_t& name, + const BasicJsonType& j) + { + const auto header_size = calc_bson_entry_header_size(name); + switch (j.type()) + { + case value_t::object: + return header_size + calc_bson_object_size(*j.m_value.object); + + case value_t::array: + return header_size + calc_bson_array_size(*j.m_value.array); + + case value_t::binary: + return header_size + calc_bson_binary_size(*j.m_value.binary); + + case value_t::boolean: + return header_size + 1ul; + + case value_t::number_float: + return header_size + 8ul; + + case value_t::number_integer: + return header_size + calc_bson_integer_size(j.m_value.number_integer); + + case value_t::number_unsigned: + return header_size + calc_bson_unsigned_size(j.m_value.number_unsigned); + + case value_t::string: + return header_size + calc_bson_string_size(*j.m_value.string); + + case value_t::null: + return header_size + 0ul; + + // LCOV_EXCL_START + default: + JSON_ASSERT(false); + return 0ul; + // LCOV_EXCL_STOP + } + } + + /*! + @brief Serializes the JSON value @a j to BSON and associates it with the + key @a name. 
+ @param name The name to associate with the JSON entity @a j within the + current BSON document + @return The size of the BSON entry + */ + void write_bson_element(const string_t& name, + const BasicJsonType& j) + { + switch (j.type()) + { + case value_t::object: + return write_bson_object_entry(name, *j.m_value.object); + + case value_t::array: + return write_bson_array(name, *j.m_value.array); + + case value_t::binary: + return write_bson_binary(name, *j.m_value.binary); + + case value_t::boolean: + return write_bson_boolean(name, j.m_value.boolean); + + case value_t::number_float: + return write_bson_double(name, j.m_value.number_float); + + case value_t::number_integer: + return write_bson_integer(name, j.m_value.number_integer); + + case value_t::number_unsigned: + return write_bson_unsigned(name, j.m_value.number_unsigned); + + case value_t::string: + return write_bson_string(name, *j.m_value.string); + + case value_t::null: + return write_bson_null(name); + + // LCOV_EXCL_START + default: + JSON_ASSERT(false); + return; + // LCOV_EXCL_STOP + } + } + + /*! + @brief Calculates the size of the BSON serialization of the given + JSON-object @a j. + @param[in] j JSON value to serialize + @pre j.type() == value_t::object + */ + static std::size_t calc_bson_object_size(const typename BasicJsonType::object_t& value) + { + std::size_t document_size = std::accumulate(value.begin(), value.end(), std::size_t(0), + [](size_t result, const typename BasicJsonType::object_t::value_type & el) + { + return result += calc_bson_element_size(el.first, el.second); + }); + + return sizeof(std::int32_t) + document_size + 1ul; + } + + /*! 
+ @param[in] j JSON value to serialize + @pre j.type() == value_t::object + */ + void write_bson_object(const typename BasicJsonType::object_t& value) + { + write_number(static_cast(calc_bson_object_size(value))); + + for (const auto& el : value) + { + write_bson_element(el.first, el.second); + } + + oa->write_character(to_char_type(0x00)); + } + + ////////// + // CBOR // + ////////// + + static constexpr CharType get_cbor_float_prefix(float /*unused*/) + { + return to_char_type(0xFA); // Single-Precision Float + } + + static constexpr CharType get_cbor_float_prefix(double /*unused*/) + { + return to_char_type(0xFB); // Double-Precision Float + } + + ///////////// + // MsgPack // + ///////////// + + static constexpr CharType get_msgpack_float_prefix(float /*unused*/) + { + return to_char_type(0xCA); // float 32 + } + + static constexpr CharType get_msgpack_float_prefix(double /*unused*/) + { + return to_char_type(0xCB); // float 64 + } + + //////////// + // UBJSON // + //////////// + + // UBJSON: write number (floating point) + template::value, int>::type = 0> + void write_number_with_ubjson_prefix(const NumberType n, + const bool add_prefix) + { + if (add_prefix) + { + oa->write_character(get_ubjson_float_prefix(n)); + } + write_number(n); + } + + // UBJSON: write number (unsigned integer) + template::value, int>::type = 0> + void write_number_with_ubjson_prefix(const NumberType n, + const bool add_prefix) + { + if (n <= static_cast((std::numeric_limits::max)())) + { + if (add_prefix) + { + oa->write_character(to_char_type('i')); // int8 + } + write_number(static_cast(n)); + } + else if (n <= (std::numeric_limits::max)()) + { + if (add_prefix) + { + oa->write_character(to_char_type('U')); // uint8 + } + write_number(static_cast(n)); + } + else if (n <= static_cast((std::numeric_limits::max)())) + { + if (add_prefix) + { + oa->write_character(to_char_type('I')); // int16 + } + write_number(static_cast(n)); + } + else if (n <= 
static_cast((std::numeric_limits::max)())) + { + if (add_prefix) + { + oa->write_character(to_char_type('l')); // int32 + } + write_number(static_cast(n)); + } + else if (n <= static_cast((std::numeric_limits::max)())) + { + if (add_prefix) + { + oa->write_character(to_char_type('L')); // int64 + } + write_number(static_cast(n)); + } + else + { + if (add_prefix) + { + oa->write_character(to_char_type('H')); // high-precision number + } + + const auto number = BasicJsonType(n).dump(); + write_number_with_ubjson_prefix(number.size(), true); + for (std::size_t i = 0; i < number.size(); ++i) + { + oa->write_character(to_char_type(static_cast(number[i]))); + } + } + } + + // UBJSON: write number (signed integer) + template < typename NumberType, typename std::enable_if < + std::is_signed::value&& + !std::is_floating_point::value, int >::type = 0 > + void write_number_with_ubjson_prefix(const NumberType n, + const bool add_prefix) + { + if ((std::numeric_limits::min)() <= n && n <= (std::numeric_limits::max)()) + { + if (add_prefix) + { + oa->write_character(to_char_type('i')); // int8 + } + write_number(static_cast(n)); + } + else if (static_cast((std::numeric_limits::min)()) <= n && n <= static_cast((std::numeric_limits::max)())) + { + if (add_prefix) + { + oa->write_character(to_char_type('U')); // uint8 + } + write_number(static_cast(n)); + } + else if ((std::numeric_limits::min)() <= n && n <= (std::numeric_limits::max)()) + { + if (add_prefix) + { + oa->write_character(to_char_type('I')); // int16 + } + write_number(static_cast(n)); + } + else if ((std::numeric_limits::min)() <= n && n <= (std::numeric_limits::max)()) + { + if (add_prefix) + { + oa->write_character(to_char_type('l')); // int32 + } + write_number(static_cast(n)); + } + else if ((std::numeric_limits::min)() <= n && n <= (std::numeric_limits::max)()) + { + if (add_prefix) + { + oa->write_character(to_char_type('L')); // int64 + } + write_number(static_cast(n)); + } + // LCOV_EXCL_START + else + { + if 
(add_prefix) + { + oa->write_character(to_char_type('H')); // high-precision number + } + + const auto number = BasicJsonType(n).dump(); + write_number_with_ubjson_prefix(number.size(), true); + for (std::size_t i = 0; i < number.size(); ++i) + { + oa->write_character(to_char_type(static_cast(number[i]))); + } + } + // LCOV_EXCL_STOP + } + + /*! + @brief determine the type prefix of container values + */ + CharType ubjson_prefix(const BasicJsonType& j) const noexcept + { + switch (j.type()) + { + case value_t::null: + return 'Z'; + + case value_t::boolean: + return j.m_value.boolean ? 'T' : 'F'; + + case value_t::number_integer: + { + if ((std::numeric_limits::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits::max)()) + { + return 'i'; + } + if ((std::numeric_limits::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits::max)()) + { + return 'U'; + } + if ((std::numeric_limits::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits::max)()) + { + return 'I'; + } + if ((std::numeric_limits::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits::max)()) + { + return 'l'; + } + if ((std::numeric_limits::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits::max)()) + { + return 'L'; + } + // anything else is treated as high-precision number + return 'H'; // LCOV_EXCL_LINE + } + + case value_t::number_unsigned: + { + if (j.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) + { + return 'i'; + } + if (j.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) + { + return 'U'; + } + if (j.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) + { + return 'I'; + } + if (j.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) + { + return 'l'; + } + if (j.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) + { + return 'L'; + 
} + // anything else is treated as high-precision number + return 'H'; // LCOV_EXCL_LINE + } + + case value_t::number_float: + return get_ubjson_float_prefix(j.m_value.number_float); + + case value_t::string: + return 'S'; + + case value_t::array: // fallthrough + case value_t::binary: + return '['; + + case value_t::object: + return '{'; + + default: // discarded values + return 'N'; + } + } + + static constexpr CharType get_ubjson_float_prefix(float /*unused*/) + { + return 'd'; // float 32 + } + + static constexpr CharType get_ubjson_float_prefix(double /*unused*/) + { + return 'D'; // float 64 + } + + /////////////////////// + // Utility functions // + /////////////////////// + + /* + @brief write a number to output input + @param[in] n number of type @a NumberType + @tparam NumberType the type of the number + @tparam OutputIsLittleEndian Set to true if output data is + required to be little endian + + @note This function needs to respect the system's endianess, because bytes + in CBOR, MessagePack, and UBJSON are stored in network order (big + endian) and therefore need reordering on little endian systems. + */ + template + void write_number(const NumberType n) + { + // step 1: write number to array of length NumberType + std::array vec; + std::memcpy(vec.data(), &n, sizeof(NumberType)); + + // step 2: write array to output (with possible reordering) + if (is_little_endian != OutputIsLittleEndian) + { + // reverse byte order prior to conversion if necessary + std::reverse(vec.begin(), vec.end()); + } + + oa->write_characters(vec.data(), sizeof(NumberType)); + } + + void write_compact_float(const number_float_t n, detail::input_format_t format) + { + if (static_cast(n) >= static_cast(std::numeric_limits::lowest()) && + static_cast(n) <= static_cast((std::numeric_limits::max)()) && + static_cast(static_cast(n)) == static_cast(n)) + { + oa->write_character(format == detail::input_format_t::cbor + ? 
get_cbor_float_prefix(static_cast(n)) + : get_msgpack_float_prefix(static_cast(n))); + write_number(static_cast(n)); + } + else + { + oa->write_character(format == detail::input_format_t::cbor + ? get_cbor_float_prefix(n) + : get_msgpack_float_prefix(n)); + write_number(n); + } + } + + public: + // The following to_char_type functions are implement the conversion + // between uint8_t and CharType. In case CharType is not unsigned, + // such a conversion is required to allow values greater than 128. + // See for a discussion. + template < typename C = CharType, + enable_if_t < std::is_signed::value && std::is_signed::value > * = nullptr > + static constexpr CharType to_char_type(std::uint8_t x) noexcept + { + return *reinterpret_cast(&x); + } + + template < typename C = CharType, + enable_if_t < std::is_signed::value && std::is_unsigned::value > * = nullptr > + static CharType to_char_type(std::uint8_t x) noexcept + { + static_assert(sizeof(std::uint8_t) == sizeof(CharType), "size of CharType must be equal to std::uint8_t"); + static_assert(std::is_trivial::value, "CharType must be trivial"); + CharType result; + std::memcpy(&result, &x, sizeof(x)); + return result; + } + + template::value>* = nullptr> + static constexpr CharType to_char_type(std::uint8_t x) noexcept + { + return x; + } + + template < typename InputCharType, typename C = CharType, + enable_if_t < + std::is_signed::value && + std::is_signed::value && + std::is_same::type>::value + > * = nullptr > + static constexpr CharType to_char_type(InputCharType x) noexcept + { + return x; + } + + private: + /// whether we can assume little endianess + const bool is_little_endian = little_endianess(); + + /// the output + output_adapter_t oa = nullptr; +}; +} // namespace detail +} // namespace nlohmann + +// #include + +// #include + + +#include // reverse, remove, fill, find, none_of +#include // array +#include // localeconv, lconv +#include // labs, isfinite, isnan, signbit +#include // size_t, ptrdiff_t 
+#include // uint8_t +#include // snprintf +#include // numeric_limits +#include // string, char_traits +#include // is_same +#include // move + +// #include + + +#include // array +#include // signbit, isfinite +#include // intN_t, uintN_t +#include // memcpy, memmove +#include // numeric_limits +#include // conditional + +// #include + + +namespace nlohmann +{ +namespace detail +{ + +/*! +@brief implements the Grisu2 algorithm for binary to decimal floating-point +conversion. + +This implementation is a slightly modified version of the reference +implementation which may be obtained from +http://florian.loitsch.com/publications (bench.tar.gz). + +The code is distributed under the MIT license, Copyright (c) 2009 Florian Loitsch. + +For a detailed description of the algorithm see: + +[1] Loitsch, "Printing Floating-Point Numbers Quickly and Accurately with + Integers", Proceedings of the ACM SIGPLAN 2010 Conference on Programming + Language Design and Implementation, PLDI 2010 +[2] Burger, Dybvig, "Printing Floating-Point Numbers Quickly and Accurately", + Proceedings of the ACM SIGPLAN 1996 Conference on Programming Language + Design and Implementation, PLDI 1996 +*/ +namespace dtoa_impl +{ + +template +Target reinterpret_bits(const Source source) +{ + static_assert(sizeof(Target) == sizeof(Source), "size mismatch"); + + Target target; + std::memcpy(&target, &source, sizeof(Source)); + return target; +} + +struct diyfp // f * 2^e +{ + static constexpr int kPrecision = 64; // = q + + std::uint64_t f = 0; + int e = 0; + + constexpr diyfp(std::uint64_t f_, int e_) noexcept : f(f_), e(e_) {} + + /*! + @brief returns x - y + @pre x.e == y.e and x.f >= y.f + */ + static diyfp sub(const diyfp& x, const diyfp& y) noexcept + { + JSON_ASSERT(x.e == y.e); + JSON_ASSERT(x.f >= y.f); + + return {x.f - y.f, x.e}; + } + + /*! + @brief returns x * y + @note The result is rounded. (Only the upper q bits are returned.) 
+ */ + static diyfp mul(const diyfp& x, const diyfp& y) noexcept + { + static_assert(kPrecision == 64, "internal error"); + + // Computes: + // f = round((x.f * y.f) / 2^q) + // e = x.e + y.e + q + + // Emulate the 64-bit * 64-bit multiplication: + // + // p = u * v + // = (u_lo + 2^32 u_hi) (v_lo + 2^32 v_hi) + // = (u_lo v_lo ) + 2^32 ((u_lo v_hi ) + (u_hi v_lo )) + 2^64 (u_hi v_hi ) + // = (p0 ) + 2^32 ((p1 ) + (p2 )) + 2^64 (p3 ) + // = (p0_lo + 2^32 p0_hi) + 2^32 ((p1_lo + 2^32 p1_hi) + (p2_lo + 2^32 p2_hi)) + 2^64 (p3 ) + // = (p0_lo ) + 2^32 (p0_hi + p1_lo + p2_lo ) + 2^64 (p1_hi + p2_hi + p3) + // = (p0_lo ) + 2^32 (Q ) + 2^64 (H ) + // = (p0_lo ) + 2^32 (Q_lo + 2^32 Q_hi ) + 2^64 (H ) + // + // (Since Q might be larger than 2^32 - 1) + // + // = (p0_lo + 2^32 Q_lo) + 2^64 (Q_hi + H) + // + // (Q_hi + H does not overflow a 64-bit int) + // + // = p_lo + 2^64 p_hi + + const std::uint64_t u_lo = x.f & 0xFFFFFFFFu; + const std::uint64_t u_hi = x.f >> 32u; + const std::uint64_t v_lo = y.f & 0xFFFFFFFFu; + const std::uint64_t v_hi = y.f >> 32u; + + const std::uint64_t p0 = u_lo * v_lo; + const std::uint64_t p1 = u_lo * v_hi; + const std::uint64_t p2 = u_hi * v_lo; + const std::uint64_t p3 = u_hi * v_hi; + + const std::uint64_t p0_hi = p0 >> 32u; + const std::uint64_t p1_lo = p1 & 0xFFFFFFFFu; + const std::uint64_t p1_hi = p1 >> 32u; + const std::uint64_t p2_lo = p2 & 0xFFFFFFFFu; + const std::uint64_t p2_hi = p2 >> 32u; + + std::uint64_t Q = p0_hi + p1_lo + p2_lo; + + // The full product might now be computed as + // + // p_hi = p3 + p2_hi + p1_hi + (Q >> 32) + // p_lo = p0_lo + (Q << 32) + // + // But in this particular case here, the full p_lo is not required. + // Effectively we only need to add the highest bit in p_lo to p_hi (and + // Q_hi + 1 does not overflow). + + Q += std::uint64_t{1} << (64u - 32u - 1u); // round, ties up + + const std::uint64_t h = p3 + p2_hi + p1_hi + (Q >> 32u); + + return {h, x.e + y.e + 64}; + } + + /*! 
+ @brief normalize x such that the significand is >= 2^(q-1) + @pre x.f != 0 + */ + static diyfp normalize(diyfp x) noexcept + { + JSON_ASSERT(x.f != 0); + + while ((x.f >> 63u) == 0) + { + x.f <<= 1u; + x.e--; + } + + return x; + } + + /*! + @brief normalize x such that the result has the exponent E + @pre e >= x.e and the upper e - x.e bits of x.f must be zero. + */ + static diyfp normalize_to(const diyfp& x, const int target_exponent) noexcept + { + const int delta = x.e - target_exponent; + + JSON_ASSERT(delta >= 0); + JSON_ASSERT(((x.f << delta) >> delta) == x.f); + + return {x.f << delta, target_exponent}; + } +}; + +struct boundaries +{ + diyfp w; + diyfp minus; + diyfp plus; +}; + +/*! +Compute the (normalized) diyfp representing the input number 'value' and its +boundaries. + +@pre value must be finite and positive +*/ +template +boundaries compute_boundaries(FloatType value) +{ + JSON_ASSERT(std::isfinite(value)); + JSON_ASSERT(value > 0); + + // Convert the IEEE representation into a diyfp. + // + // If v is denormal: + // value = 0.F * 2^(1 - bias) = ( F) * 2^(1 - bias - (p-1)) + // If v is normalized: + // value = 1.F * 2^(E - bias) = (2^(p-1) + F) * 2^(E - bias - (p-1)) + + static_assert(std::numeric_limits::is_iec559, + "internal error: dtoa_short requires an IEEE-754 floating-point implementation"); + + constexpr int kPrecision = std::numeric_limits::digits; // = p (includes the hidden bit) + constexpr int kBias = std::numeric_limits::max_exponent - 1 + (kPrecision - 1); + constexpr int kMinExp = 1 - kBias; + constexpr std::uint64_t kHiddenBit = std::uint64_t{1} << (kPrecision - 1); // = 2^(p-1) + + using bits_type = typename std::conditional::type; + + const std::uint64_t bits = reinterpret_bits(value); + const std::uint64_t E = bits >> (kPrecision - 1); + const std::uint64_t F = bits & (kHiddenBit - 1); + + const bool is_denormal = E == 0; + const diyfp v = is_denormal + ? 
diyfp(F, kMinExp) + : diyfp(F + kHiddenBit, static_cast(E) - kBias); + + // Compute the boundaries m- and m+ of the floating-point value + // v = f * 2^e. + // + // Determine v- and v+, the floating-point predecessor and successor if v, + // respectively. + // + // v- = v - 2^e if f != 2^(p-1) or e == e_min (A) + // = v - 2^(e-1) if f == 2^(p-1) and e > e_min (B) + // + // v+ = v + 2^e + // + // Let m- = (v- + v) / 2 and m+ = (v + v+) / 2. All real numbers _strictly_ + // between m- and m+ round to v, regardless of how the input rounding + // algorithm breaks ties. + // + // ---+-------------+-------------+-------------+-------------+--- (A) + // v- m- v m+ v+ + // + // -----------------+------+------+-------------+-------------+--- (B) + // v- m- v m+ v+ + + const bool lower_boundary_is_closer = F == 0 && E > 1; + const diyfp m_plus = diyfp(2 * v.f + 1, v.e - 1); + const diyfp m_minus = lower_boundary_is_closer + ? diyfp(4 * v.f - 1, v.e - 2) // (B) + : diyfp(2 * v.f - 1, v.e - 1); // (A) + + // Determine the normalized w+ = m+. + const diyfp w_plus = diyfp::normalize(m_plus); + + // Determine w- = m- such that e_(w-) = e_(w+). + const diyfp w_minus = diyfp::normalize_to(m_minus, w_plus.e); + + return {diyfp::normalize(v), w_minus, w_plus}; +} + +// Given normalized diyfp w, Grisu needs to find a (normalized) cached +// power-of-ten c, such that the exponent of the product c * w = f * 2^e lies +// within a certain range [alpha, gamma] (Definition 3.2 from [1]) +// +// alpha <= e = e_c + e_w + q <= gamma +// +// or +// +// f_c * f_w * 2^alpha <= f_c 2^(e_c) * f_w 2^(e_w) * 2^q +// <= f_c * f_w * 2^gamma +// +// Since c and w are normalized, i.e. 2^(q-1) <= f < 2^q, this implies +// +// 2^(q-1) * 2^(q-1) * 2^alpha <= c * w * 2^q < 2^q * 2^q * 2^gamma +// +// or +// +// 2^(q - 2 + alpha) <= c * w < 2^(q + gamma) +// +// The choice of (alpha,gamma) determines the size of the table and the form of +// the digit generation procedure. 
Using (alpha,gamma)=(-60,-32) works out well +// in practice: +// +// The idea is to cut the number c * w = f * 2^e into two parts, which can be +// processed independently: An integral part p1, and a fractional part p2: +// +// f * 2^e = ( (f div 2^-e) * 2^-e + (f mod 2^-e) ) * 2^e +// = (f div 2^-e) + (f mod 2^-e) * 2^e +// = p1 + p2 * 2^e +// +// The conversion of p1 into decimal form requires a series of divisions and +// modulos by (a power of) 10. These operations are faster for 32-bit than for +// 64-bit integers, so p1 should ideally fit into a 32-bit integer. This can be +// achieved by choosing +// +// -e >= 32 or e <= -32 := gamma +// +// In order to convert the fractional part +// +// p2 * 2^e = p2 / 2^-e = d[-1] / 10^1 + d[-2] / 10^2 + ... +// +// into decimal form, the fraction is repeatedly multiplied by 10 and the digits +// d[-i] are extracted in order: +// +// (10 * p2) div 2^-e = d[-1] +// (10 * p2) mod 2^-e = d[-2] / 10^1 + ... +// +// The multiplication by 10 must not overflow. It is sufficient to choose +// +// 10 * p2 < 16 * p2 = 2^4 * p2 <= 2^64. +// +// Since p2 = f mod 2^-e < 2^-e, +// +// -e <= 60 or e >= -60 := alpha + +constexpr int kAlpha = -60; +constexpr int kGamma = -32; + +struct cached_power // c = f * 2^e ~= 10^k +{ + std::uint64_t f; + int e; + int k; +}; + +/*! +For a normalized diyfp w = f * 2^e, this function returns a (normalized) cached +power-of-ten c = f_c * 2^e_c, such that the exponent of the product w * c +satisfies (Definition 3.2 from [1]) + + alpha <= e_c + e + q <= gamma. +*/ +inline cached_power get_cached_power_for_binary_exponent(int e) +{ + // Now + // + // alpha <= e_c + e + q <= gamma (1) + // ==> f_c * 2^alpha <= c * 2^e * 2^q + // + // and since the c's are normalized, 2^(q-1) <= f_c, + // + // ==> 2^(q - 1 + alpha) <= c * 2^(e + q) + // ==> 2^(alpha - e - 1) <= c + // + // If c were an exact power of ten, i.e. 
c = 10^k, one may determine k as + // + // k = ceil( log_10( 2^(alpha - e - 1) ) ) + // = ceil( (alpha - e - 1) * log_10(2) ) + // + // From the paper: + // "In theory the result of the procedure could be wrong since c is rounded, + // and the computation itself is approximated [...]. In practice, however, + // this simple function is sufficient." + // + // For IEEE double precision floating-point numbers converted into + // normalized diyfp's w = f * 2^e, with q = 64, + // + // e >= -1022 (min IEEE exponent) + // -52 (p - 1) + // -52 (p - 1, possibly normalize denormal IEEE numbers) + // -11 (normalize the diyfp) + // = -1137 + // + // and + // + // e <= +1023 (max IEEE exponent) + // -52 (p - 1) + // -11 (normalize the diyfp) + // = 960 + // + // This binary exponent range [-1137,960] results in a decimal exponent + // range [-307,324]. One does not need to store a cached power for each + // k in this range. For each such k it suffices to find a cached power + // such that the exponent of the product lies in [alpha,gamma]. + // This implies that the difference of the decimal exponents of adjacent + // table entries must be less than or equal to + // + // floor( (gamma - alpha) * log_10(2) ) = 8. + // + // (A smaller distance gamma-alpha would require a larger table.) + + // NB: + // Actually this function returns c, such that -60 <= e_c + e + 64 <= -34. 
+ + constexpr int kCachedPowersMinDecExp = -300; + constexpr int kCachedPowersDecStep = 8; + + static constexpr std::array kCachedPowers = + { + { + { 0xAB70FE17C79AC6CA, -1060, -300 }, + { 0xFF77B1FCBEBCDC4F, -1034, -292 }, + { 0xBE5691EF416BD60C, -1007, -284 }, + { 0x8DD01FAD907FFC3C, -980, -276 }, + { 0xD3515C2831559A83, -954, -268 }, + { 0x9D71AC8FADA6C9B5, -927, -260 }, + { 0xEA9C227723EE8BCB, -901, -252 }, + { 0xAECC49914078536D, -874, -244 }, + { 0x823C12795DB6CE57, -847, -236 }, + { 0xC21094364DFB5637, -821, -228 }, + { 0x9096EA6F3848984F, -794, -220 }, + { 0xD77485CB25823AC7, -768, -212 }, + { 0xA086CFCD97BF97F4, -741, -204 }, + { 0xEF340A98172AACE5, -715, -196 }, + { 0xB23867FB2A35B28E, -688, -188 }, + { 0x84C8D4DFD2C63F3B, -661, -180 }, + { 0xC5DD44271AD3CDBA, -635, -172 }, + { 0x936B9FCEBB25C996, -608, -164 }, + { 0xDBAC6C247D62A584, -582, -156 }, + { 0xA3AB66580D5FDAF6, -555, -148 }, + { 0xF3E2F893DEC3F126, -529, -140 }, + { 0xB5B5ADA8AAFF80B8, -502, -132 }, + { 0x87625F056C7C4A8B, -475, -124 }, + { 0xC9BCFF6034C13053, -449, -116 }, + { 0x964E858C91BA2655, -422, -108 }, + { 0xDFF9772470297EBD, -396, -100 }, + { 0xA6DFBD9FB8E5B88F, -369, -92 }, + { 0xF8A95FCF88747D94, -343, -84 }, + { 0xB94470938FA89BCF, -316, -76 }, + { 0x8A08F0F8BF0F156B, -289, -68 }, + { 0xCDB02555653131B6, -263, -60 }, + { 0x993FE2C6D07B7FAC, -236, -52 }, + { 0xE45C10C42A2B3B06, -210, -44 }, + { 0xAA242499697392D3, -183, -36 }, + { 0xFD87B5F28300CA0E, -157, -28 }, + { 0xBCE5086492111AEB, -130, -20 }, + { 0x8CBCCC096F5088CC, -103, -12 }, + { 0xD1B71758E219652C, -77, -4 }, + { 0x9C40000000000000, -50, 4 }, + { 0xE8D4A51000000000, -24, 12 }, + { 0xAD78EBC5AC620000, 3, 20 }, + { 0x813F3978F8940984, 30, 28 }, + { 0xC097CE7BC90715B3, 56, 36 }, + { 0x8F7E32CE7BEA5C70, 83, 44 }, + { 0xD5D238A4ABE98068, 109, 52 }, + { 0x9F4F2726179A2245, 136, 60 }, + { 0xED63A231D4C4FB27, 162, 68 }, + { 0xB0DE65388CC8ADA8, 189, 76 }, + { 0x83C7088E1AAB65DB, 216, 84 }, + { 0xC45D1DF942711D9A, 242, 92 }, + { 
0x924D692CA61BE758, 269, 100 }, + { 0xDA01EE641A708DEA, 295, 108 }, + { 0xA26DA3999AEF774A, 322, 116 }, + { 0xF209787BB47D6B85, 348, 124 }, + { 0xB454E4A179DD1877, 375, 132 }, + { 0x865B86925B9BC5C2, 402, 140 }, + { 0xC83553C5C8965D3D, 428, 148 }, + { 0x952AB45CFA97A0B3, 455, 156 }, + { 0xDE469FBD99A05FE3, 481, 164 }, + { 0xA59BC234DB398C25, 508, 172 }, + { 0xF6C69A72A3989F5C, 534, 180 }, + { 0xB7DCBF5354E9BECE, 561, 188 }, + { 0x88FCF317F22241E2, 588, 196 }, + { 0xCC20CE9BD35C78A5, 614, 204 }, + { 0x98165AF37B2153DF, 641, 212 }, + { 0xE2A0B5DC971F303A, 667, 220 }, + { 0xA8D9D1535CE3B396, 694, 228 }, + { 0xFB9B7CD9A4A7443C, 720, 236 }, + { 0xBB764C4CA7A44410, 747, 244 }, + { 0x8BAB8EEFB6409C1A, 774, 252 }, + { 0xD01FEF10A657842C, 800, 260 }, + { 0x9B10A4E5E9913129, 827, 268 }, + { 0xE7109BFBA19C0C9D, 853, 276 }, + { 0xAC2820D9623BF429, 880, 284 }, + { 0x80444B5E7AA7CF85, 907, 292 }, + { 0xBF21E44003ACDD2D, 933, 300 }, + { 0x8E679C2F5E44FF8F, 960, 308 }, + { 0xD433179D9C8CB841, 986, 316 }, + { 0x9E19DB92B4E31BA9, 1013, 324 }, + } + }; + + // This computation gives exactly the same results for k as + // k = ceil((kAlpha - e - 1) * 0.30102999566398114) + // for |e| <= 1500, but doesn't require floating-point operations. + // NB: log_10(2) ~= 78913 / 2^18 + JSON_ASSERT(e >= -1500); + JSON_ASSERT(e <= 1500); + const int f = kAlpha - e - 1; + const int k = (f * 78913) / (1 << 18) + static_cast(f > 0); + + const int index = (-kCachedPowersMinDecExp + k + (kCachedPowersDecStep - 1)) / kCachedPowersDecStep; + JSON_ASSERT(index >= 0); + JSON_ASSERT(static_cast(index) < kCachedPowers.size()); + + const cached_power cached = kCachedPowers[static_cast(index)]; + JSON_ASSERT(kAlpha <= cached.e + e + 64); + JSON_ASSERT(kGamma >= cached.e + e + 64); + + return cached; +} + +/*! +For n != 0, returns k, such that pow10 := 10^(k-1) <= n < 10^k. +For n == 0, returns 1 and sets pow10 := 1. 
+*/ +inline int find_largest_pow10(const std::uint32_t n, std::uint32_t& pow10) +{ + // LCOV_EXCL_START + if (n >= 1000000000) + { + pow10 = 1000000000; + return 10; + } + // LCOV_EXCL_STOP + else if (n >= 100000000) + { + pow10 = 100000000; + return 9; + } + else if (n >= 10000000) + { + pow10 = 10000000; + return 8; + } + else if (n >= 1000000) + { + pow10 = 1000000; + return 7; + } + else if (n >= 100000) + { + pow10 = 100000; + return 6; + } + else if (n >= 10000) + { + pow10 = 10000; + return 5; + } + else if (n >= 1000) + { + pow10 = 1000; + return 4; + } + else if (n >= 100) + { + pow10 = 100; + return 3; + } + else if (n >= 10) + { + pow10 = 10; + return 2; + } + else + { + pow10 = 1; + return 1; + } +} + +inline void grisu2_round(char* buf, int len, std::uint64_t dist, std::uint64_t delta, + std::uint64_t rest, std::uint64_t ten_k) +{ + JSON_ASSERT(len >= 1); + JSON_ASSERT(dist <= delta); + JSON_ASSERT(rest <= delta); + JSON_ASSERT(ten_k > 0); + + // <--------------------------- delta ----> + // <---- dist ---------> + // --------------[------------------+-------------------]-------------- + // M- w M+ + // + // ten_k + // <------> + // <---- rest ----> + // --------------[------------------+----+--------------]-------------- + // w V + // = buf * 10^k + // + // ten_k represents a unit-in-the-last-place in the decimal representation + // stored in buf. + // Decrement buf by ten_k while this takes buf closer to w. + + // The tests are written in this order to avoid overflow in unsigned + // integer arithmetic. + + while (rest < dist + && delta - rest >= ten_k + && (rest + ten_k < dist || dist - rest > rest + ten_k - dist)) + { + JSON_ASSERT(buf[len - 1] != '0'); + buf[len - 1]--; + rest += ten_k; + } +} + +/*! +Generates V = buffer * 10^decimal_exponent, such that M- <= V <= M+. +M- and M+ must be normalized and share the same exponent -60 <= e <= -32. 
+*/ +inline void grisu2_digit_gen(char* buffer, int& length, int& decimal_exponent, + diyfp M_minus, diyfp w, diyfp M_plus) +{ + static_assert(kAlpha >= -60, "internal error"); + static_assert(kGamma <= -32, "internal error"); + + // Generates the digits (and the exponent) of a decimal floating-point + // number V = buffer * 10^decimal_exponent in the range [M-, M+]. The diyfp's + // w, M- and M+ share the same exponent e, which satisfies alpha <= e <= gamma. + // + // <--------------------------- delta ----> + // <---- dist ---------> + // --------------[------------------+-------------------]-------------- + // M- w M+ + // + // Grisu2 generates the digits of M+ from left to right and stops as soon as + // V is in [M-,M+]. + + JSON_ASSERT(M_plus.e >= kAlpha); + JSON_ASSERT(M_plus.e <= kGamma); + + std::uint64_t delta = diyfp::sub(M_plus, M_minus).f; // (significand of (M+ - M-), implicit exponent is e) + std::uint64_t dist = diyfp::sub(M_plus, w ).f; // (significand of (M+ - w ), implicit exponent is e) + + // Split M+ = f * 2^e into two parts p1 and p2 (note: e < 0): + // + // M+ = f * 2^e + // = ((f div 2^-e) * 2^-e + (f mod 2^-e)) * 2^e + // = ((p1 ) * 2^-e + (p2 )) * 2^e + // = p1 + p2 * 2^e + + const diyfp one(std::uint64_t{1} << -M_plus.e, M_plus.e); + + auto p1 = static_cast(M_plus.f >> -one.e); // p1 = f div 2^-e (Since -e >= 32, p1 fits into a 32-bit int.) 
+ std::uint64_t p2 = M_plus.f & (one.f - 1); // p2 = f mod 2^-e + + // 1) + // + // Generate the digits of the integral part p1 = d[n-1]...d[1]d[0] + + JSON_ASSERT(p1 > 0); + + std::uint32_t pow10; + const int k = find_largest_pow10(p1, pow10); + + // 10^(k-1) <= p1 < 10^k, pow10 = 10^(k-1) + // + // p1 = (p1 div 10^(k-1)) * 10^(k-1) + (p1 mod 10^(k-1)) + // = (d[k-1] ) * 10^(k-1) + (p1 mod 10^(k-1)) + // + // M+ = p1 + p2 * 2^e + // = d[k-1] * 10^(k-1) + (p1 mod 10^(k-1)) + p2 * 2^e + // = d[k-1] * 10^(k-1) + ((p1 mod 10^(k-1)) * 2^-e + p2) * 2^e + // = d[k-1] * 10^(k-1) + ( rest) * 2^e + // + // Now generate the digits d[n] of p1 from left to right (n = k-1,...,0) + // + // p1 = d[k-1]...d[n] * 10^n + d[n-1]...d[0] + // + // but stop as soon as + // + // rest * 2^e = (d[n-1]...d[0] * 2^-e + p2) * 2^e <= delta * 2^e + + int n = k; + while (n > 0) + { + // Invariants: + // M+ = buffer * 10^n + (p1 + p2 * 2^e) (buffer = 0 for n = k) + // pow10 = 10^(n-1) <= p1 < 10^n + // + const std::uint32_t d = p1 / pow10; // d = p1 div 10^(n-1) + const std::uint32_t r = p1 % pow10; // r = p1 mod 10^(n-1) + // + // M+ = buffer * 10^n + (d * 10^(n-1) + r) + p2 * 2^e + // = (buffer * 10 + d) * 10^(n-1) + (r + p2 * 2^e) + // + JSON_ASSERT(d <= 9); + buffer[length++] = static_cast('0' + d); // buffer := buffer * 10 + d + // + // M+ = buffer * 10^(n-1) + (r + p2 * 2^e) + // + p1 = r; + n--; + // + // M+ = buffer * 10^n + (p1 + p2 * 2^e) + // pow10 = 10^n + // + + // Now check if enough digits have been generated. + // Compute + // + // p1 + p2 * 2^e = (p1 * 2^-e + p2) * 2^e = rest * 2^e + // + // Note: + // Since rest and delta share the same exponent e, it suffices to + // compare the significands. + const std::uint64_t rest = (std::uint64_t{p1} << -one.e) + p2; + if (rest <= delta) + { + // V = buffer * 10^n, with M- <= V <= M+. + + decimal_exponent += n; + + // We may now just stop. But instead look if the buffer could be + // decremented to bring V closer to w. 
+ // + // pow10 = 10^n is now 1 ulp in the decimal representation V. + // The rounding procedure works with diyfp's with an implicit + // exponent of e. + // + // 10^n = (10^n * 2^-e) * 2^e = ulp * 2^e + // + const std::uint64_t ten_n = std::uint64_t{pow10} << -one.e; + grisu2_round(buffer, length, dist, delta, rest, ten_n); + + return; + } + + pow10 /= 10; + // + // pow10 = 10^(n-1) <= p1 < 10^n + // Invariants restored. + } + + // 2) + // + // The digits of the integral part have been generated: + // + // M+ = d[k-1]...d[1]d[0] + p2 * 2^e + // = buffer + p2 * 2^e + // + // Now generate the digits of the fractional part p2 * 2^e. + // + // Note: + // No decimal point is generated: the exponent is adjusted instead. + // + // p2 actually represents the fraction + // + // p2 * 2^e + // = p2 / 2^-e + // = d[-1] / 10^1 + d[-2] / 10^2 + ... + // + // Now generate the digits d[-m] of p1 from left to right (m = 1,2,...) + // + // p2 * 2^e = d[-1]d[-2]...d[-m] * 10^-m + // + 10^-m * (d[-m-1] / 10^1 + d[-m-2] / 10^2 + ...) + // + // using + // + // 10^m * p2 = ((10^m * p2) div 2^-e) * 2^-e + ((10^m * p2) mod 2^-e) + // = ( d) * 2^-e + ( r) + // + // or + // 10^m * p2 * 2^e = d + r * 2^e + // + // i.e. + // + // M+ = buffer + p2 * 2^e + // = buffer + 10^-m * (d + r * 2^e) + // = (buffer * 10^m + d) * 10^-m + 10^-m * r * 2^e + // + // and stop as soon as 10^-m * r * 2^e <= delta * 2^e + + JSON_ASSERT(p2 > delta); + + int m = 0; + for (;;) + { + // Invariant: + // M+ = buffer * 10^-m + 10^-m * (d[-m-1] / 10 + d[-m-2] / 10^2 + ...) 
* 2^e + // = buffer * 10^-m + 10^-m * (p2 ) * 2^e + // = buffer * 10^-m + 10^-m * (1/10 * (10 * p2) ) * 2^e + // = buffer * 10^-m + 10^-m * (1/10 * ((10*p2 div 2^-e) * 2^-e + (10*p2 mod 2^-e)) * 2^e + // + JSON_ASSERT(p2 <= (std::numeric_limits::max)() / 10); + p2 *= 10; + const std::uint64_t d = p2 >> -one.e; // d = (10 * p2) div 2^-e + const std::uint64_t r = p2 & (one.f - 1); // r = (10 * p2) mod 2^-e + // + // M+ = buffer * 10^-m + 10^-m * (1/10 * (d * 2^-e + r) * 2^e + // = buffer * 10^-m + 10^-m * (1/10 * (d + r * 2^e)) + // = (buffer * 10 + d) * 10^(-m-1) + 10^(-m-1) * r * 2^e + // + JSON_ASSERT(d <= 9); + buffer[length++] = static_cast('0' + d); // buffer := buffer * 10 + d + // + // M+ = buffer * 10^(-m-1) + 10^(-m-1) * r * 2^e + // + p2 = r; + m++; + // + // M+ = buffer * 10^-m + 10^-m * p2 * 2^e + // Invariant restored. + + // Check if enough digits have been generated. + // + // 10^-m * p2 * 2^e <= delta * 2^e + // p2 * 2^e <= 10^m * delta * 2^e + // p2 <= 10^m * delta + delta *= 10; + dist *= 10; + if (p2 <= delta) + { + break; + } + } + + // V = buffer * 10^-m, with M- <= V <= M+. + + decimal_exponent -= m; + + // 1 ulp in the decimal representation is now 10^-m. + // Since delta and dist are now scaled by 10^m, we need to do the + // same with ulp in order to keep the units in sync. + // + // 10^m * 10^-m = 1 = 2^-e * 2^e = ten_m * 2^e + // + const std::uint64_t ten_m = one.f; + grisu2_round(buffer, length, dist, delta, p2, ten_m); + + // By construction this algorithm generates the shortest possible decimal + // number (Loitsch, Theorem 6.2) which rounds back to w. + // For an input number of precision p, at least + // + // N = 1 + ceil(p * log_10(2)) + // + // decimal digits are sufficient to identify all binary floating-point + // numbers (Matula, "In-and-Out conversions"). + // This implies that the algorithm does not produce more than N decimal + // digits. 
+ // + // N = 17 for p = 53 (IEEE double precision) + // N = 9 for p = 24 (IEEE single precision) +} + +/*! +v = buf * 10^decimal_exponent +len is the length of the buffer (number of decimal digits) +The buffer must be large enough, i.e. >= max_digits10. +*/ +JSON_HEDLEY_NON_NULL(1) +inline void grisu2(char* buf, int& len, int& decimal_exponent, + diyfp m_minus, diyfp v, diyfp m_plus) +{ + JSON_ASSERT(m_plus.e == m_minus.e); + JSON_ASSERT(m_plus.e == v.e); + + // --------(-----------------------+-----------------------)-------- (A) + // m- v m+ + // + // --------------------(-----------+-----------------------)-------- (B) + // m- v m+ + // + // First scale v (and m- and m+) such that the exponent is in the range + // [alpha, gamma]. + + const cached_power cached = get_cached_power_for_binary_exponent(m_plus.e); + + const diyfp c_minus_k(cached.f, cached.e); // = c ~= 10^-k + + // The exponent of the products is = v.e + c_minus_k.e + q and is in the range [alpha,gamma] + const diyfp w = diyfp::mul(v, c_minus_k); + const diyfp w_minus = diyfp::mul(m_minus, c_minus_k); + const diyfp w_plus = diyfp::mul(m_plus, c_minus_k); + + // ----(---+---)---------------(---+---)---------------(---+---)---- + // w- w w+ + // = c*m- = c*v = c*m+ + // + // diyfp::mul rounds its result and c_minus_k is approximated too. w, w- and + // w+ are now off by a small amount. + // In fact: + // + // w - v * 10^k < 1 ulp + // + // To account for this inaccuracy, add resp. subtract 1 ulp. + // + // --------+---[---------------(---+---)---------------]---+-------- + // w- M- w M+ w+ + // + // Now any number in [M-, M+] (bounds included) will round to w when input, + // regardless of how the input rounding algorithm breaks ties. + // + // And digit_gen generates the shortest possible such number in [M-, M+]. + // Note that this does not mean that Grisu2 always generates the shortest + // possible number in the interval (m-, m+). 
+ const diyfp M_minus(w_minus.f + 1, w_minus.e); + const diyfp M_plus (w_plus.f - 1, w_plus.e ); + + decimal_exponent = -cached.k; // = -(-k) = k + + grisu2_digit_gen(buf, len, decimal_exponent, M_minus, w, M_plus); +} + +/*! +v = buf * 10^decimal_exponent +len is the length of the buffer (number of decimal digits) +The buffer must be large enough, i.e. >= max_digits10. +*/ +template +JSON_HEDLEY_NON_NULL(1) +void grisu2(char* buf, int& len, int& decimal_exponent, FloatType value) +{ + static_assert(diyfp::kPrecision >= std::numeric_limits::digits + 3, + "internal error: not enough precision"); + + JSON_ASSERT(std::isfinite(value)); + JSON_ASSERT(value > 0); + + // If the neighbors (and boundaries) of 'value' are always computed for double-precision + // numbers, all float's can be recovered using strtod (and strtof). However, the resulting + // decimal representations are not exactly "short". + // + // The documentation for 'std::to_chars' (https://en.cppreference.com/w/cpp/utility/to_chars) + // says "value is converted to a string as if by std::sprintf in the default ("C") locale" + // and since sprintf promotes float's to double's, I think this is exactly what 'std::to_chars' + // does. + // On the other hand, the documentation for 'std::to_chars' requires that "parsing the + // representation using the corresponding std::from_chars function recovers value exactly". That + // indicates that single precision floating-point numbers should be recovered using + // 'std::strtof'. + // + // NB: If the neighbors are computed for single-precision numbers, there is a single float + // (7.0385307e-26f) which can't be recovered using strtod. The resulting double precision + // value is off by 1 ulp. +#if 0 + const boundaries w = compute_boundaries(static_cast(value)); +#else + const boundaries w = compute_boundaries(value); +#endif + + grisu2(buf, len, decimal_exponent, w.minus, w.w, w.plus); +} + +/*! 
+@brief appends a decimal representation of e to buf +@return a pointer to the element following the exponent. +@pre -1000 < e < 1000 +*/ +JSON_HEDLEY_NON_NULL(1) +JSON_HEDLEY_RETURNS_NON_NULL +inline char* append_exponent(char* buf, int e) +{ + JSON_ASSERT(e > -1000); + JSON_ASSERT(e < 1000); + + if (e < 0) + { + e = -e; + *buf++ = '-'; + } + else + { + *buf++ = '+'; + } + + auto k = static_cast(e); + if (k < 10) + { + // Always print at least two digits in the exponent. + // This is for compatibility with printf("%g"). + *buf++ = '0'; + *buf++ = static_cast('0' + k); + } + else if (k < 100) + { + *buf++ = static_cast('0' + k / 10); + k %= 10; + *buf++ = static_cast('0' + k); + } + else + { + *buf++ = static_cast('0' + k / 100); + k %= 100; + *buf++ = static_cast('0' + k / 10); + k %= 10; + *buf++ = static_cast('0' + k); + } + + return buf; +} + +/*! +@brief prettify v = buf * 10^decimal_exponent + +If v is in the range [10^min_exp, 10^max_exp) it will be printed in fixed-point +notation. Otherwise it will be printed in exponential notation. + +@pre min_exp < 0 +@pre max_exp > 0 +*/ +JSON_HEDLEY_NON_NULL(1) +JSON_HEDLEY_RETURNS_NON_NULL +inline char* format_buffer(char* buf, int len, int decimal_exponent, + int min_exp, int max_exp) +{ + JSON_ASSERT(min_exp < 0); + JSON_ASSERT(max_exp > 0); + + const int k = len; + const int n = len + decimal_exponent; + + // v = buf * 10^(n-k) + // k is the length of the buffer (number of decimal digits) + // n is the position of the decimal point relative to the start of the buffer. 
+ + if (k <= n && n <= max_exp) + { + // digits[000] + // len <= max_exp + 2 + + std::memset(buf + k, '0', static_cast(n) - static_cast(k)); + // Make it look like a floating-point number (#362, #378) + buf[n + 0] = '.'; + buf[n + 1] = '0'; + return buf + (static_cast(n) + 2); + } + + if (0 < n && n <= max_exp) + { + // dig.its + // len <= max_digits10 + 1 + + JSON_ASSERT(k > n); + + std::memmove(buf + (static_cast(n) + 1), buf + n, static_cast(k) - static_cast(n)); + buf[n] = '.'; + return buf + (static_cast(k) + 1U); + } + + if (min_exp < n && n <= 0) + { + // 0.[000]digits + // len <= 2 + (-min_exp - 1) + max_digits10 + + std::memmove(buf + (2 + static_cast(-n)), buf, static_cast(k)); + buf[0] = '0'; + buf[1] = '.'; + std::memset(buf + 2, '0', static_cast(-n)); + return buf + (2U + static_cast(-n) + static_cast(k)); + } + + if (k == 1) + { + // dE+123 + // len <= 1 + 5 + + buf += 1; + } + else + { + // d.igitsE+123 + // len <= max_digits10 + 1 + 5 + + std::memmove(buf + 2, buf + 1, static_cast(k) - 1); + buf[1] = '.'; + buf += 1 + static_cast(k); + } + + *buf++ = 'e'; + return append_exponent(buf, n - 1); +} + +} // namespace dtoa_impl + +/*! +@brief generates a decimal representation of the floating-point number value in [first, last). + +The format of the resulting decimal representation is similar to printf's %g +format. Returns an iterator pointing past-the-end of the decimal representation. + +@note The input number must be finite, i.e. NaN's and Inf's are not supported. +@note The buffer must be large enough. +@note The result is NOT null-terminated. +*/ +template +JSON_HEDLEY_NON_NULL(1, 2) +JSON_HEDLEY_RETURNS_NON_NULL +char* to_chars(char* first, const char* last, FloatType value) +{ + static_cast(last); // maybe unused - fix warning + JSON_ASSERT(std::isfinite(value)); + + // Use signbit(value) instead of (value < 0) since signbit works for -0. 
+ if (std::signbit(value)) + { + value = -value; + *first++ = '-'; + } + + if (value == 0) // +-0 + { + *first++ = '0'; + // Make it look like a floating-point number (#362, #378) + *first++ = '.'; + *first++ = '0'; + return first; + } + + JSON_ASSERT(last - first >= std::numeric_limits::max_digits10); + + // Compute v = buffer * 10^decimal_exponent. + // The decimal digits are stored in the buffer, which needs to be interpreted + // as an unsigned decimal integer. + // len is the length of the buffer, i.e. the number of decimal digits. + int len = 0; + int decimal_exponent = 0; + dtoa_impl::grisu2(first, len, decimal_exponent, value); + + JSON_ASSERT(len <= std::numeric_limits::max_digits10); + + // Format the buffer like printf("%.*g", prec, value) + constexpr int kMinExp = -4; + // Use digits10 here to increase compatibility with version 2. + constexpr int kMaxExp = std::numeric_limits::digits10; + + JSON_ASSERT(last - first >= kMaxExp + 2); + JSON_ASSERT(last - first >= 2 + (-kMinExp - 1) + std::numeric_limits::max_digits10); + JSON_ASSERT(last - first >= std::numeric_limits::max_digits10 + 6); + + return dtoa_impl::format_buffer(first, len, decimal_exponent, kMinExp, kMaxExp); +} + +} // namespace detail +} // namespace nlohmann + +// #include + +// #include + +// #include + +// #include + +// #include + +// #include + + +namespace nlohmann +{ +namespace detail +{ +/////////////////// +// serialization // +/////////////////// + +/// how to treat decoding errors +enum class error_handler_t +{ + strict, ///< throw a type_error exception in case of invalid UTF-8 + replace, ///< replace invalid UTF-8 sequences with U+FFFD + ignore ///< ignore invalid UTF-8 sequences +}; + +template +class serializer +{ + using string_t = typename BasicJsonType::string_t; + using number_float_t = typename BasicJsonType::number_float_t; + using number_integer_t = typename BasicJsonType::number_integer_t; + using number_unsigned_t = typename BasicJsonType::number_unsigned_t; + using 
binary_char_t = typename BasicJsonType::binary_t::value_type; + static constexpr std::uint8_t UTF8_ACCEPT = 0; + static constexpr std::uint8_t UTF8_REJECT = 1; + + public: + /*! + @param[in] s output stream to serialize to + @param[in] ichar indentation character to use + @param[in] error_handler_ how to react on decoding errors + */ + serializer(output_adapter_t s, const char ichar, + error_handler_t error_handler_ = error_handler_t::strict) + : o(std::move(s)) + , loc(std::localeconv()) + , thousands_sep(loc->thousands_sep == nullptr ? '\0' : std::char_traits::to_char_type(* (loc->thousands_sep))) + , decimal_point(loc->decimal_point == nullptr ? '\0' : std::char_traits::to_char_type(* (loc->decimal_point))) + , indent_char(ichar) + , indent_string(512, indent_char) + , error_handler(error_handler_) + {} + + // delete because of pointer members + serializer(const serializer&) = delete; + serializer& operator=(const serializer&) = delete; + serializer(serializer&&) = delete; + serializer& operator=(serializer&&) = delete; + ~serializer() = default; + + /*! + @brief internal implementation of the serialization function + + This function is called by the public member function dump and organizes + the serialization internally. The indentation level is propagated as + additional parameter. In case of arrays and objects, the function is + called recursively. + + - strings and object keys are escaped using `escape_string()` + - integer numbers are converted implicitly via `operator<<` + - floating-point numbers are converted to a string using `"%g"` format + - binary values are serialized as objects containing the subtype and the + byte array + + @param[in] val value to serialize + @param[in] pretty_print whether the output shall be pretty-printed + @param[in] ensure_ascii If @a ensure_ascii is true, all non-ASCII characters + in the output are escaped with `\uXXXX` sequences, and the result consists + of ASCII characters only. 
+ @param[in] indent_step the indent level + @param[in] current_indent the current indent level (only used internally) + */ + void dump(const BasicJsonType& val, + const bool pretty_print, + const bool ensure_ascii, + const unsigned int indent_step, + const unsigned int current_indent = 0) + { + switch (val.m_type) + { + case value_t::object: + { + if (val.m_value.object->empty()) + { + o->write_characters("{}", 2); + return; + } + + if (pretty_print) + { + o->write_characters("{\n", 2); + + // variable to hold indentation for recursive calls + const auto new_indent = current_indent + indent_step; + if (JSON_HEDLEY_UNLIKELY(indent_string.size() < new_indent)) + { + indent_string.resize(indent_string.size() * 2, ' '); + } + + // first n-1 elements + auto i = val.m_value.object->cbegin(); + for (std::size_t cnt = 0; cnt < val.m_value.object->size() - 1; ++cnt, ++i) + { + o->write_characters(indent_string.c_str(), new_indent); + o->write_character('\"'); + dump_escaped(i->first, ensure_ascii); + o->write_characters("\": ", 3); + dump(i->second, true, ensure_ascii, indent_step, new_indent); + o->write_characters(",\n", 2); + } + + // last element + JSON_ASSERT(i != val.m_value.object->cend()); + JSON_ASSERT(std::next(i) == val.m_value.object->cend()); + o->write_characters(indent_string.c_str(), new_indent); + o->write_character('\"'); + dump_escaped(i->first, ensure_ascii); + o->write_characters("\": ", 3); + dump(i->second, true, ensure_ascii, indent_step, new_indent); + + o->write_character('\n'); + o->write_characters(indent_string.c_str(), current_indent); + o->write_character('}'); + } + else + { + o->write_character('{'); + + // first n-1 elements + auto i = val.m_value.object->cbegin(); + for (std::size_t cnt = 0; cnt < val.m_value.object->size() - 1; ++cnt, ++i) + { + o->write_character('\"'); + dump_escaped(i->first, ensure_ascii); + o->write_characters("\":", 2); + dump(i->second, false, ensure_ascii, indent_step, current_indent); + o->write_character(','); + 
} + + // last element + JSON_ASSERT(i != val.m_value.object->cend()); + JSON_ASSERT(std::next(i) == val.m_value.object->cend()); + o->write_character('\"'); + dump_escaped(i->first, ensure_ascii); + o->write_characters("\":", 2); + dump(i->second, false, ensure_ascii, indent_step, current_indent); + + o->write_character('}'); + } + + return; + } + + case value_t::array: + { + if (val.m_value.array->empty()) + { + o->write_characters("[]", 2); + return; + } + + if (pretty_print) + { + o->write_characters("[\n", 2); + + // variable to hold indentation for recursive calls + const auto new_indent = current_indent + indent_step; + if (JSON_HEDLEY_UNLIKELY(indent_string.size() < new_indent)) + { + indent_string.resize(indent_string.size() * 2, ' '); + } + + // first n-1 elements + for (auto i = val.m_value.array->cbegin(); + i != val.m_value.array->cend() - 1; ++i) + { + o->write_characters(indent_string.c_str(), new_indent); + dump(*i, true, ensure_ascii, indent_step, new_indent); + o->write_characters(",\n", 2); + } + + // last element + JSON_ASSERT(!val.m_value.array->empty()); + o->write_characters(indent_string.c_str(), new_indent); + dump(val.m_value.array->back(), true, ensure_ascii, indent_step, new_indent); + + o->write_character('\n'); + o->write_characters(indent_string.c_str(), current_indent); + o->write_character(']'); + } + else + { + o->write_character('['); + + // first n-1 elements + for (auto i = val.m_value.array->cbegin(); + i != val.m_value.array->cend() - 1; ++i) + { + dump(*i, false, ensure_ascii, indent_step, current_indent); + o->write_character(','); + } + + // last element + JSON_ASSERT(!val.m_value.array->empty()); + dump(val.m_value.array->back(), false, ensure_ascii, indent_step, current_indent); + + o->write_character(']'); + } + + return; + } + + case value_t::string: + { + o->write_character('\"'); + dump_escaped(*val.m_value.string, ensure_ascii); + o->write_character('\"'); + return; + } + + case value_t::binary: + { + if 
(pretty_print) + { + o->write_characters("{\n", 2); + + // variable to hold indentation for recursive calls + const auto new_indent = current_indent + indent_step; + if (JSON_HEDLEY_UNLIKELY(indent_string.size() < new_indent)) + { + indent_string.resize(indent_string.size() * 2, ' '); + } + + o->write_characters(indent_string.c_str(), new_indent); + + o->write_characters("\"bytes\": [", 10); + + if (!val.m_value.binary->empty()) + { + for (auto i = val.m_value.binary->cbegin(); + i != val.m_value.binary->cend() - 1; ++i) + { + dump_integer(*i); + o->write_characters(", ", 2); + } + dump_integer(val.m_value.binary->back()); + } + + o->write_characters("],\n", 3); + o->write_characters(indent_string.c_str(), new_indent); + + o->write_characters("\"subtype\": ", 11); + if (val.m_value.binary->has_subtype()) + { + dump_integer(val.m_value.binary->subtype()); + } + else + { + o->write_characters("null", 4); + } + o->write_character('\n'); + o->write_characters(indent_string.c_str(), current_indent); + o->write_character('}'); + } + else + { + o->write_characters("{\"bytes\":[", 10); + + if (!val.m_value.binary->empty()) + { + for (auto i = val.m_value.binary->cbegin(); + i != val.m_value.binary->cend() - 1; ++i) + { + dump_integer(*i); + o->write_character(','); + } + dump_integer(val.m_value.binary->back()); + } + + o->write_characters("],\"subtype\":", 12); + if (val.m_value.binary->has_subtype()) + { + dump_integer(val.m_value.binary->subtype()); + o->write_character('}'); + } + else + { + o->write_characters("null}", 5); + } + } + return; + } + + case value_t::boolean: + { + if (val.m_value.boolean) + { + o->write_characters("true", 4); + } + else + { + o->write_characters("false", 5); + } + return; + } + + case value_t::number_integer: + { + dump_integer(val.m_value.number_integer); + return; + } + + case value_t::number_unsigned: + { + dump_integer(val.m_value.number_unsigned); + return; + } + + case value_t::number_float: + { + 
dump_float(val.m_value.number_float); + return; + } + + case value_t::discarded: + { + o->write_characters("", 11); + return; + } + + case value_t::null: + { + o->write_characters("null", 4); + return; + } + + default: // LCOV_EXCL_LINE + JSON_ASSERT(false); // LCOV_EXCL_LINE + } + } + + private: + /*! + @brief dump escaped string + + Escape a string by replacing certain special characters by a sequence of an + escape character (backslash) and another character and other control + characters by a sequence of "\u" followed by a four-digit hex + representation. The escaped string is written to output stream @a o. + + @param[in] s the string to escape + @param[in] ensure_ascii whether to escape non-ASCII characters with + \uXXXX sequences + + @complexity Linear in the length of string @a s. + */ + void dump_escaped(const string_t& s, const bool ensure_ascii) + { + std::uint32_t codepoint; + std::uint8_t state = UTF8_ACCEPT; + std::size_t bytes = 0; // number of bytes written to string_buffer + + // number of bytes written at the point of the last valid byte + std::size_t bytes_after_last_accept = 0; + std::size_t undumped_chars = 0; + + for (std::size_t i = 0; i < s.size(); ++i) + { + const auto byte = static_cast(s[i]); + + switch (decode(state, codepoint, byte)) + { + case UTF8_ACCEPT: // decode found a new code point + { + switch (codepoint) + { + case 0x08: // backspace + { + string_buffer[bytes++] = '\\'; + string_buffer[bytes++] = 'b'; + break; + } + + case 0x09: // horizontal tab + { + string_buffer[bytes++] = '\\'; + string_buffer[bytes++] = 't'; + break; + } + + case 0x0A: // newline + { + string_buffer[bytes++] = '\\'; + string_buffer[bytes++] = 'n'; + break; + } + + case 0x0C: // formfeed + { + string_buffer[bytes++] = '\\'; + string_buffer[bytes++] = 'f'; + break; + } + + case 0x0D: // carriage return + { + string_buffer[bytes++] = '\\'; + string_buffer[bytes++] = 'r'; + break; + } + + case 0x22: // quotation mark + { + string_buffer[bytes++] = '\\'; + 
string_buffer[bytes++] = '\"'; + break; + } + + case 0x5C: // reverse solidus + { + string_buffer[bytes++] = '\\'; + string_buffer[bytes++] = '\\'; + break; + } + + default: + { + // escape control characters (0x00..0x1F) or, if + // ensure_ascii parameter is used, non-ASCII characters + if ((codepoint <= 0x1F) || (ensure_ascii && (codepoint >= 0x7F))) + { + if (codepoint <= 0xFFFF) + { + (std::snprintf)(string_buffer.data() + bytes, 7, "\\u%04x", + static_cast(codepoint)); + bytes += 6; + } + else + { + (std::snprintf)(string_buffer.data() + bytes, 13, "\\u%04x\\u%04x", + static_cast(0xD7C0u + (codepoint >> 10u)), + static_cast(0xDC00u + (codepoint & 0x3FFu))); + bytes += 12; + } + } + else + { + // copy byte to buffer (all previous bytes + // been copied have in default case above) + string_buffer[bytes++] = s[i]; + } + break; + } + } + + // write buffer and reset index; there must be 13 bytes + // left, as this is the maximal number of bytes to be + // written ("\uxxxx\uxxxx\0") for one code point + if (string_buffer.size() - bytes < 13) + { + o->write_characters(string_buffer.data(), bytes); + bytes = 0; + } + + // remember the byte position of this accept + bytes_after_last_accept = bytes; + undumped_chars = 0; + break; + } + + case UTF8_REJECT: // decode found invalid UTF-8 byte + { + switch (error_handler) + { + case error_handler_t::strict: + { + std::string sn(3, '\0'); + (std::snprintf)(&sn[0], sn.size(), "%.2X", byte); + JSON_THROW(type_error::create(316, "invalid UTF-8 byte at index " + std::to_string(i) + ": 0x" + sn)); + } + + case error_handler_t::ignore: + case error_handler_t::replace: + { + // in case we saw this character the first time, we + // would like to read it again, because the byte + // may be OK for itself, but just not OK for the + // previous sequence + if (undumped_chars > 0) + { + --i; + } + + // reset length buffer to the last accepted index; + // thus removing/ignoring the invalid characters + bytes = bytes_after_last_accept; + + 
if (error_handler == error_handler_t::replace) + { + // add a replacement character + if (ensure_ascii) + { + string_buffer[bytes++] = '\\'; + string_buffer[bytes++] = 'u'; + string_buffer[bytes++] = 'f'; + string_buffer[bytes++] = 'f'; + string_buffer[bytes++] = 'f'; + string_buffer[bytes++] = 'd'; + } + else + { + string_buffer[bytes++] = detail::binary_writer::to_char_type('\xEF'); + string_buffer[bytes++] = detail::binary_writer::to_char_type('\xBF'); + string_buffer[bytes++] = detail::binary_writer::to_char_type('\xBD'); + } + + // write buffer and reset index; there must be 13 bytes + // left, as this is the maximal number of bytes to be + // written ("\uxxxx\uxxxx\0") for one code point + if (string_buffer.size() - bytes < 13) + { + o->write_characters(string_buffer.data(), bytes); + bytes = 0; + } + + bytes_after_last_accept = bytes; + } + + undumped_chars = 0; + + // continue processing the string + state = UTF8_ACCEPT; + break; + } + + default: // LCOV_EXCL_LINE + JSON_ASSERT(false); // LCOV_EXCL_LINE + } + break; + } + + default: // decode found yet incomplete multi-byte code point + { + if (!ensure_ascii) + { + // code point will not be escaped - copy byte to buffer + string_buffer[bytes++] = s[i]; + } + ++undumped_chars; + break; + } + } + } + + // we finished processing the string + if (JSON_HEDLEY_LIKELY(state == UTF8_ACCEPT)) + { + // write buffer + if (bytes > 0) + { + o->write_characters(string_buffer.data(), bytes); + } + } + else + { + // we finish reading, but do not accept: string was incomplete + switch (error_handler) + { + case error_handler_t::strict: + { + std::string sn(3, '\0'); + (std::snprintf)(&sn[0], sn.size(), "%.2X", static_cast(s.back())); + JSON_THROW(type_error::create(316, "incomplete UTF-8 string; last byte: 0x" + sn)); + } + + case error_handler_t::ignore: + { + // write all accepted bytes + o->write_characters(string_buffer.data(), bytes_after_last_accept); + break; + } + + case error_handler_t::replace: + { + // write all 
accepted bytes + o->write_characters(string_buffer.data(), bytes_after_last_accept); + // add a replacement character + if (ensure_ascii) + { + o->write_characters("\\ufffd", 6); + } + else + { + o->write_characters("\xEF\xBF\xBD", 3); + } + break; + } + + default: // LCOV_EXCL_LINE + JSON_ASSERT(false); // LCOV_EXCL_LINE + } + } + } + + /*! + @brief count digits + + Count the number of decimal (base 10) digits for an input unsigned integer. + + @param[in] x unsigned integer number to count its digits + @return number of decimal digits + */ + inline unsigned int count_digits(number_unsigned_t x) noexcept + { + unsigned int n_digits = 1; + for (;;) + { + if (x < 10) + { + return n_digits; + } + if (x < 100) + { + return n_digits + 1; + } + if (x < 1000) + { + return n_digits + 2; + } + if (x < 10000) + { + return n_digits + 3; + } + x = x / 10000u; + n_digits += 4; + } + } + + /*! + @brief dump an integer + + Dump a given integer to output stream @a o. Works internally with + @a number_buffer. 
+ + @param[in] x integer number (signed or unsigned) to dump + @tparam NumberType either @a number_integer_t or @a number_unsigned_t + */ + template < typename NumberType, detail::enable_if_t < + std::is_same::value || + std::is_same::value || + std::is_same::value, + int > = 0 > + void dump_integer(NumberType x) + { + static constexpr std::array, 100> digits_to_99 + { + { + {{'0', '0'}}, {{'0', '1'}}, {{'0', '2'}}, {{'0', '3'}}, {{'0', '4'}}, {{'0', '5'}}, {{'0', '6'}}, {{'0', '7'}}, {{'0', '8'}}, {{'0', '9'}}, + {{'1', '0'}}, {{'1', '1'}}, {{'1', '2'}}, {{'1', '3'}}, {{'1', '4'}}, {{'1', '5'}}, {{'1', '6'}}, {{'1', '7'}}, {{'1', '8'}}, {{'1', '9'}}, + {{'2', '0'}}, {{'2', '1'}}, {{'2', '2'}}, {{'2', '3'}}, {{'2', '4'}}, {{'2', '5'}}, {{'2', '6'}}, {{'2', '7'}}, {{'2', '8'}}, {{'2', '9'}}, + {{'3', '0'}}, {{'3', '1'}}, {{'3', '2'}}, {{'3', '3'}}, {{'3', '4'}}, {{'3', '5'}}, {{'3', '6'}}, {{'3', '7'}}, {{'3', '8'}}, {{'3', '9'}}, + {{'4', '0'}}, {{'4', '1'}}, {{'4', '2'}}, {{'4', '3'}}, {{'4', '4'}}, {{'4', '5'}}, {{'4', '6'}}, {{'4', '7'}}, {{'4', '8'}}, {{'4', '9'}}, + {{'5', '0'}}, {{'5', '1'}}, {{'5', '2'}}, {{'5', '3'}}, {{'5', '4'}}, {{'5', '5'}}, {{'5', '6'}}, {{'5', '7'}}, {{'5', '8'}}, {{'5', '9'}}, + {{'6', '0'}}, {{'6', '1'}}, {{'6', '2'}}, {{'6', '3'}}, {{'6', '4'}}, {{'6', '5'}}, {{'6', '6'}}, {{'6', '7'}}, {{'6', '8'}}, {{'6', '9'}}, + {{'7', '0'}}, {{'7', '1'}}, {{'7', '2'}}, {{'7', '3'}}, {{'7', '4'}}, {{'7', '5'}}, {{'7', '6'}}, {{'7', '7'}}, {{'7', '8'}}, {{'7', '9'}}, + {{'8', '0'}}, {{'8', '1'}}, {{'8', '2'}}, {{'8', '3'}}, {{'8', '4'}}, {{'8', '5'}}, {{'8', '6'}}, {{'8', '7'}}, {{'8', '8'}}, {{'8', '9'}}, + {{'9', '0'}}, {{'9', '1'}}, {{'9', '2'}}, {{'9', '3'}}, {{'9', '4'}}, {{'9', '5'}}, {{'9', '6'}}, {{'9', '7'}}, {{'9', '8'}}, {{'9', '9'}}, + } + }; + + // special case for "0" + if (x == 0) + { + o->write_character('0'); + return; + } + + // use a pointer to fill the buffer + auto buffer_ptr = number_buffer.begin(); + + const bool 
is_negative = std::is_same::value && !(x >= 0); // see issue #755 + number_unsigned_t abs_value; + + unsigned int n_chars; + + if (is_negative) + { + *buffer_ptr = '-'; + abs_value = remove_sign(static_cast(x)); + + // account one more byte for the minus sign + n_chars = 1 + count_digits(abs_value); + } + else + { + abs_value = static_cast(x); + n_chars = count_digits(abs_value); + } + + // spare 1 byte for '\0' + JSON_ASSERT(n_chars < number_buffer.size() - 1); + + // jump to the end to generate the string from backward + // so we later avoid reversing the result + buffer_ptr += n_chars; + + // Fast int2ascii implementation inspired by "Fastware" talk by Andrei Alexandrescu + // See: https://www.youtube.com/watch?v=o4-CwDo2zpg + while (abs_value >= 100) + { + const auto digits_index = static_cast((abs_value % 100)); + abs_value /= 100; + *(--buffer_ptr) = digits_to_99[digits_index][1]; + *(--buffer_ptr) = digits_to_99[digits_index][0]; + } + + if (abs_value >= 10) + { + const auto digits_index = static_cast(abs_value); + *(--buffer_ptr) = digits_to_99[digits_index][1]; + *(--buffer_ptr) = digits_to_99[digits_index][0]; + } + else + { + *(--buffer_ptr) = static_cast('0' + abs_value); + } + + o->write_characters(number_buffer.data(), n_chars); + } + + /*! + @brief dump a floating-point number + + Dump a given floating-point number to output stream @a o. Works internally + with @a number_buffer. + + @param[in] x floating-point number to dump + */ + void dump_float(number_float_t x) + { + // NaN / inf + if (!std::isfinite(x)) + { + o->write_characters("null", 4); + return; + } + + // If number_float_t is an IEEE-754 single or double precision number, + // use the Grisu2 algorithm to produce short numbers which are + // guaranteed to round-trip, using strtof and strtod, resp. + // + // NB: The test below works if == . 
+ static constexpr bool is_ieee_single_or_double + = (std::numeric_limits::is_iec559 && std::numeric_limits::digits == 24 && std::numeric_limits::max_exponent == 128) || + (std::numeric_limits::is_iec559 && std::numeric_limits::digits == 53 && std::numeric_limits::max_exponent == 1024); + + dump_float(x, std::integral_constant()); + } + + void dump_float(number_float_t x, std::true_type /*is_ieee_single_or_double*/) + { + char* begin = number_buffer.data(); + char* end = ::nlohmann::detail::to_chars(begin, begin + number_buffer.size(), x); + + o->write_characters(begin, static_cast(end - begin)); + } + + void dump_float(number_float_t x, std::false_type /*is_ieee_single_or_double*/) + { + // get number of digits for a float -> text -> float round-trip + static constexpr auto d = std::numeric_limits::max_digits10; + + // the actual conversion + std::ptrdiff_t len = (std::snprintf)(number_buffer.data(), number_buffer.size(), "%.*g", d, x); + + // negative value indicates an error + JSON_ASSERT(len > 0); + // check if buffer was large enough + JSON_ASSERT(static_cast(len) < number_buffer.size()); + + // erase thousands separator + if (thousands_sep != '\0') + { + const auto end = std::remove(number_buffer.begin(), + number_buffer.begin() + len, thousands_sep); + std::fill(end, number_buffer.end(), '\0'); + JSON_ASSERT((end - number_buffer.begin()) <= len); + len = (end - number_buffer.begin()); + } + + // convert decimal point to '.' + if (decimal_point != '\0' && decimal_point != '.') + { + const auto dec_pos = std::find(number_buffer.begin(), number_buffer.end(), decimal_point); + if (dec_pos != number_buffer.end()) + { + *dec_pos = '.'; + } + } + + o->write_characters(number_buffer.data(), static_cast(len)); + + // determine if need to append ".0" + const bool value_is_int_like = + std::none_of(number_buffer.begin(), number_buffer.begin() + len + 1, + [](char c) + { + return c == '.' 
|| c == 'e'; + }); + + if (value_is_int_like) + { + o->write_characters(".0", 2); + } + } + + /*! + @brief check whether a string is UTF-8 encoded + + The function checks each byte of a string whether it is UTF-8 encoded. The + result of the check is stored in the @a state parameter. The function must + be called initially with state 0 (accept). State 1 means the string must + be rejected, because the current byte is not allowed. If the string is + completely processed, but the state is non-zero, the string ended + prematurely; that is, the last byte indicated more bytes should have + followed. + + @param[in,out] state the state of the decoding + @param[in,out] codep codepoint (valid only if resulting state is UTF8_ACCEPT) + @param[in] byte next byte to decode + @return new state + + @note The function has been edited: a std::array is used. + + @copyright Copyright (c) 2008-2009 Bjoern Hoehrmann + @sa http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ + */ + static std::uint8_t decode(std::uint8_t& state, std::uint32_t& codep, const std::uint8_t byte) noexcept + { + static const std::array utf8d = + { + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00..1F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20..3F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40..5F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60..7F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // 80..9F + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // A0..BF + 8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0..DF + 0xA, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, 0x3, 0x3, // E0..EF + 0xB, 0x6, 0x6, 0x6, 0x5, 0x8, 
0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, // F0..FF + 0x0, 0x1, 0x2, 0x3, 0x5, 0x8, 0x7, 0x1, 0x1, 0x1, 0x4, 0x6, 0x1, 0x1, 0x1, 0x1, // s0..s0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, // s1..s2 + 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, // s3..s4 + 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, // s5..s6 + 1, 3, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // s7..s8 + } + }; + + const std::uint8_t type = utf8d[byte]; + + codep = (state != UTF8_ACCEPT) + ? (byte & 0x3fu) | (codep << 6u) + : (0xFFu >> type) & (byte); + + std::size_t index = 256u + static_cast(state) * 16u + static_cast(type); + JSON_ASSERT(index < 400); + state = utf8d[index]; + return state; + } + + /* + * Overload to make the compiler happy while it is instantiating + * dump_integer for number_unsigned_t. + * Must never be called. + */ + number_unsigned_t remove_sign(number_unsigned_t x) + { + JSON_ASSERT(false); // LCOV_EXCL_LINE + return x; // LCOV_EXCL_LINE + } + + /* + * Helper function for dump_integer + * + * This function takes a negative signed integer and returns its absolute + * value as unsigned integer. The plus/minus shuffling is necessary as we can + * not directly remove the sign of an arbitrary signed integer as the + * absolute values of INT_MIN and INT_MAX are usually not the same. See + * #1708 for details. 
+ */ + inline number_unsigned_t remove_sign(number_integer_t x) noexcept + { + JSON_ASSERT(x < 0 && x < (std::numeric_limits::max)()); + return static_cast(-(x + 1)) + 1; + } + + private: + /// the output of the serializer + output_adapter_t o = nullptr; + + /// a (hopefully) large enough character buffer + std::array number_buffer{{}}; + + /// the locale + const std::lconv* loc = nullptr; + /// the locale's thousand separator character + const char thousands_sep = '\0'; + /// the locale's decimal point character + const char decimal_point = '\0'; + + /// string buffer + std::array string_buffer{{}}; + + /// the indentation character + const char indent_char; + /// the indentation string + string_t indent_string; + + /// error_handler how to react on decoding errors + const error_handler_t error_handler; +}; +} // namespace detail +} // namespace nlohmann + +// #include + +// #include + +// #include + + +#include // less +#include // allocator +#include // pair +#include // vector + +namespace nlohmann +{ + +/// ordered_map: a minimal map-like container that preserves insertion order +/// for use within nlohmann::basic_json +template , + class Allocator = std::allocator>> + struct ordered_map : std::vector, Allocator> +{ + using key_type = Key; + using mapped_type = T; + using Container = std::vector, Allocator>; + using typename Container::iterator; + using typename Container::const_iterator; + using typename Container::size_type; + using typename Container::value_type; + + // Explicit constructors instead of `using Container::Container` + // otherwise older compilers choke on it (GCC <= 5.5, xcode <= 9.4) + ordered_map(const Allocator& alloc = Allocator()) : Container{alloc} {} + template + ordered_map(It first, It last, const Allocator& alloc = Allocator()) + : Container{first, last, alloc} {} + ordered_map(std::initializer_list init, const Allocator& alloc = Allocator() ) + : Container{init, alloc} {} + + std::pair emplace(const key_type& key, T&& t) + { + for 
(auto it = this->begin(); it != this->end(); ++it) + { + if (it->first == key) + { + return {it, false}; + } + } + Container::emplace_back(key, t); + return {--this->end(), true}; + } + + T& operator[](const Key& key) + { + return emplace(key, T{}).first->second; + } + + const T& operator[](const Key& key) const + { + return at(key); + } + + T& at(const Key& key) + { + for (auto it = this->begin(); it != this->end(); ++it) + { + if (it->first == key) + { + return it->second; + } + } + + throw std::out_of_range("key not found"); + } + + const T& at(const Key& key) const + { + for (auto it = this->begin(); it != this->end(); ++it) + { + if (it->first == key) + { + return it->second; + } + } + + throw std::out_of_range("key not found"); + } + + size_type erase(const Key& key) + { + for (auto it = this->begin(); it != this->end(); ++it) + { + if (it->first == key) + { + // Since we cannot move const Keys, re-construct them in place + for (auto next = it; ++next != this->end(); ++it) + { + it->~value_type(); // Destroy but keep allocation + new (&*it) value_type{std::move(*next)}; + } + Container::pop_back(); + return 1; + } + } + return 0; + } + + iterator erase(iterator pos) + { + auto it = pos; + + // Since we cannot move const Keys, re-construct them in place + for (auto next = it; ++next != this->end(); ++it) + { + it->~value_type(); // Destroy but keep allocation + new (&*it) value_type{std::move(*next)}; + } + Container::pop_back(); + return pos; + } + + size_type count(const Key& key) const + { + for (auto it = this->begin(); it != this->end(); ++it) + { + if (it->first == key) + { + return 1; + } + } + return 0; + } + + iterator find(const Key& key) + { + for (auto it = this->begin(); it != this->end(); ++it) + { + if (it->first == key) + { + return it; + } + } + return Container::end(); + } + + const_iterator find(const Key& key) const + { + for (auto it = this->begin(); it != this->end(); ++it) + { + if (it->first == key) + { + return it; + } + } + return 
Container::end(); + } + + std::pair insert( value_type&& value ) + { + return emplace(value.first, std::move(value.second)); + } + + std::pair insert( const value_type& value ) + { + for (auto it = this->begin(); it != this->end(); ++it) + { + if (it->first == value.first) + { + return {it, false}; + } + } + Container::push_back(value); + return {--this->end(), true}; + } +}; + +} // namespace nlohmann + + +/*! +@brief namespace for Niels Lohmann +@see https://github.com/nlohmann +@since version 1.0.0 +*/ +namespace nlohmann +{ + +/*! +@brief a class to store JSON values + +@tparam ObjectType type for JSON objects (`std::map` by default; will be used +in @ref object_t) +@tparam ArrayType type for JSON arrays (`std::vector` by default; will be used +in @ref array_t) +@tparam StringType type for JSON strings and object keys (`std::string` by +default; will be used in @ref string_t) +@tparam BooleanType type for JSON booleans (`bool` by default; will be used +in @ref boolean_t) +@tparam NumberIntegerType type for JSON integer numbers (`int64_t` by +default; will be used in @ref number_integer_t) +@tparam NumberUnsignedType type for JSON unsigned integer numbers (@c +`uint64_t` by default; will be used in @ref number_unsigned_t) +@tparam NumberFloatType type for JSON floating-point numbers (`double` by +default; will be used in @ref number_float_t) +@tparam BinaryType type for packed binary data for compatibility with binary +serialization formats (`std::vector` by default; will be used in +@ref binary_t) +@tparam AllocatorType type of the allocator to use (`std::allocator` by +default) +@tparam JSONSerializer the serializer to resolve internal calls to `to_json()` +and `from_json()` (@ref adl_serializer by default) + +@requirement The class satisfies the following concept requirements: +- Basic + - [DefaultConstructible](https://en.cppreference.com/w/cpp/named_req/DefaultConstructible): + JSON values can be default constructed. The result will be a JSON null + value. 
+ - [MoveConstructible](https://en.cppreference.com/w/cpp/named_req/MoveConstructible): + A JSON value can be constructed from an rvalue argument. + - [CopyConstructible](https://en.cppreference.com/w/cpp/named_req/CopyConstructible): + A JSON value can be copy-constructed from an lvalue expression. + - [MoveAssignable](https://en.cppreference.com/w/cpp/named_req/MoveAssignable): + A JSON value can be assigned from an rvalue argument. + - [CopyAssignable](https://en.cppreference.com/w/cpp/named_req/CopyAssignable): + A JSON value can be copy-assigned from an lvalue expression. + - [Destructible](https://en.cppreference.com/w/cpp/named_req/Destructible): + JSON values can be destructed. +- Layout + - [StandardLayoutType](https://en.cppreference.com/w/cpp/named_req/StandardLayoutType): + JSON values have + [standard layout](https://en.cppreference.com/w/cpp/language/data_members#Standard_layout): + All non-static data members are private and standard layout types, the + class has no virtual functions or (virtual) base classes. +- Library-wide + - [EqualityComparable](https://en.cppreference.com/w/cpp/named_req/EqualityComparable): + JSON values can be compared with `==`, see @ref + operator==(const_reference,const_reference). + - [LessThanComparable](https://en.cppreference.com/w/cpp/named_req/LessThanComparable): + JSON values can be compared with `<`, see @ref + operator<(const_reference,const_reference). + - [Swappable](https://en.cppreference.com/w/cpp/named_req/Swappable): + Any JSON lvalue or rvalue can be swapped with any lvalue or rvalue of + other compatible types, using unqualified function call @ref swap(). + - [NullablePointer](https://en.cppreference.com/w/cpp/named_req/NullablePointer): + JSON values can be compared against `std::nullptr_t` objects which are used + to model the `null` value. +- Container + - [Container](https://en.cppreference.com/w/cpp/named_req/Container): + JSON values can be used like STL containers and provide iterator access. 
+ - [ReversibleContainer](https://en.cppreference.com/w/cpp/named_req/ReversibleContainer); + JSON values can be used like STL containers and provide reverse iterator + access. + +@invariant The member variables @a m_value and @a m_type have the following +relationship: +- If `m_type == value_t::object`, then `m_value.object != nullptr`. +- If `m_type == value_t::array`, then `m_value.array != nullptr`. +- If `m_type == value_t::string`, then `m_value.string != nullptr`. +The invariants are checked by member function assert_invariant(). + +@internal +@note ObjectType trick from https://stackoverflow.com/a/9860911 +@endinternal + +@see [RFC 7159: The JavaScript Object Notation (JSON) Data Interchange +Format](http://rfc7159.net/rfc7159) + +@since version 1.0.0 + +@nosubgrouping +*/ +NLOHMANN_BASIC_JSON_TPL_DECLARATION +class basic_json +{ + private: + template friend struct detail::external_constructor; + friend ::nlohmann::json_pointer; + + template + friend class ::nlohmann::detail::parser; + friend ::nlohmann::detail::serializer; + template + friend class ::nlohmann::detail::iter_impl; + template + friend class ::nlohmann::detail::binary_writer; + template + friend class ::nlohmann::detail::binary_reader; + template + friend class ::nlohmann::detail::json_sax_dom_parser; + template + friend class ::nlohmann::detail::json_sax_dom_callback_parser; + + /// workaround type for MSVC + using basic_json_t = NLOHMANN_BASIC_JSON_TPL; + + // convenience aliases for types residing in namespace detail; + using lexer = ::nlohmann::detail::lexer_base; + + template + static ::nlohmann::detail::parser parser( + InputAdapterType adapter, + detail::parser_callback_tcb = nullptr, + const bool allow_exceptions = true, + const bool ignore_comments = false + ) + { + return ::nlohmann::detail::parser(std::move(adapter), + std::move(cb), allow_exceptions, ignore_comments); + } + + using primitive_iterator_t = ::nlohmann::detail::primitive_iterator_t; + template + using internal_iterator 
= ::nlohmann::detail::internal_iterator; + template + using iter_impl = ::nlohmann::detail::iter_impl; + template + using iteration_proxy = ::nlohmann::detail::iteration_proxy; + template using json_reverse_iterator = ::nlohmann::detail::json_reverse_iterator; + + template + using output_adapter_t = ::nlohmann::detail::output_adapter_t; + + template + using binary_reader = ::nlohmann::detail::binary_reader; + template using binary_writer = ::nlohmann::detail::binary_writer; + + using serializer = ::nlohmann::detail::serializer; + + public: + using value_t = detail::value_t; + /// JSON Pointer, see @ref nlohmann::json_pointer + using json_pointer = ::nlohmann::json_pointer; + template + using json_serializer = JSONSerializer; + /// how to treat decoding errors + using error_handler_t = detail::error_handler_t; + /// how to treat CBOR tags + using cbor_tag_handler_t = detail::cbor_tag_handler_t; + /// helper type for initializer lists of basic_json values + using initializer_list_t = std::initializer_list>; + + using input_format_t = detail::input_format_t; + /// SAX interface type, see @ref nlohmann::json_sax + using json_sax_t = json_sax; + + //////////////// + // exceptions // + //////////////// + + /// @name exceptions + /// Classes to implement user-defined exceptions. + /// @{ + + /// @copydoc detail::exception + using exception = detail::exception; + /// @copydoc detail::parse_error + using parse_error = detail::parse_error; + /// @copydoc detail::invalid_iterator + using invalid_iterator = detail::invalid_iterator; + /// @copydoc detail::type_error + using type_error = detail::type_error; + /// @copydoc detail::out_of_range + using out_of_range = detail::out_of_range; + /// @copydoc detail::other_error + using other_error = detail::other_error; + + /// @} + + + ///////////////////// + // container types // + ///////////////////// + + /// @name container types + /// The canonic container types to use @ref basic_json like any other STL + /// container. 
+ /// @{ + + /// the type of elements in a basic_json container + using value_type = basic_json; + + /// the type of an element reference + using reference = value_type&; + /// the type of an element const reference + using const_reference = const value_type&; + + /// a type to represent differences between iterators + using difference_type = std::ptrdiff_t; + /// a type to represent container sizes + using size_type = std::size_t; + + /// the allocator type + using allocator_type = AllocatorType; + + /// the type of an element pointer + using pointer = typename std::allocator_traits::pointer; + /// the type of an element const pointer + using const_pointer = typename std::allocator_traits::const_pointer; + + /// an iterator for a basic_json container + using iterator = iter_impl; + /// a const iterator for a basic_json container + using const_iterator = iter_impl; + /// a reverse iterator for a basic_json container + using reverse_iterator = json_reverse_iterator; + /// a const reverse iterator for a basic_json container + using const_reverse_iterator = json_reverse_iterator; + + /// @} + + + /*! + @brief returns the allocator associated with the container + */ + static allocator_type get_allocator() + { + return allocator_type(); + } + + /*! + @brief returns version information on the library + + This function returns a JSON object with information about the library, + including the version number and information on the platform and compiler. + + @return JSON object holding version information + key | description + ----------- | --------------- + `compiler` | Information on the used compiler. It is an object with the following keys: `c++` (the used C++ standard), `family` (the compiler family; possible values are `clang`, `icc`, `gcc`, `ilecpp`, `msvc`, `pgcpp`, `sunpro`, and `unknown`), and `version` (the compiler version). + `copyright` | The copyright line for the library as string. + `name` | The name of the library as string. 
+ `platform` | The used platform as string. Possible values are `win32`, `linux`, `apple`, `unix`, and `unknown`. + `url` | The URL of the project as string. + `version` | The version of the library. It is an object with the following keys: `major`, `minor`, and `patch` as defined by [Semantic Versioning](http://semver.org), and `string` (the version string). + + @liveexample{The following code shows an example output of the `meta()` + function.,meta} + + @exceptionsafety Strong guarantee: if an exception is thrown, there are no + changes to any JSON value. + + @complexity Constant. + + @since 2.1.0 + */ + JSON_HEDLEY_WARN_UNUSED_RESULT + static basic_json meta() + { + basic_json result; + + result["copyright"] = "(C) 2013-2020 Niels Lohmann"; + result["name"] = "JSON for Modern C++"; + result["url"] = "https://github.com/nlohmann/json"; + result["version"]["string"] = + std::to_string(NLOHMANN_JSON_VERSION_MAJOR) + "." + + std::to_string(NLOHMANN_JSON_VERSION_MINOR) + "." + + std::to_string(NLOHMANN_JSON_VERSION_PATCH); + result["version"]["major"] = NLOHMANN_JSON_VERSION_MAJOR; + result["version"]["minor"] = NLOHMANN_JSON_VERSION_MINOR; + result["version"]["patch"] = NLOHMANN_JSON_VERSION_PATCH; + +#ifdef _WIN32 + result["platform"] = "win32"; +#elif defined __linux__ + result["platform"] = "linux"; +#elif defined __APPLE__ + result["platform"] = "apple"; +#elif defined __unix__ + result["platform"] = "unix"; +#else + result["platform"] = "unknown"; +#endif + +#if defined(__ICC) || defined(__INTEL_COMPILER) + result["compiler"] = {{"family", "icc"}, {"version", __INTEL_COMPILER}}; +#elif defined(__clang__) + result["compiler"] = {{"family", "clang"}, {"version", __clang_version__}}; +#elif defined(__GNUC__) || defined(__GNUG__) + result["compiler"] = {{"family", "gcc"}, {"version", std::to_string(__GNUC__) + "." + std::to_string(__GNUC_MINOR__) + "." 
+ std::to_string(__GNUC_PATCHLEVEL__)}}; +#elif defined(__HP_cc) || defined(__HP_aCC) + result["compiler"] = "hp" +#elif defined(__IBMCPP__) + result["compiler"] = {{"family", "ilecpp"}, {"version", __IBMCPP__}}; +#elif defined(_MSC_VER) + result["compiler"] = {{"family", "msvc"}, {"version", _MSC_VER}}; +#elif defined(__PGI) + result["compiler"] = {{"family", "pgcpp"}, {"version", __PGI}}; +#elif defined(__SUNPRO_CC) + result["compiler"] = {{"family", "sunpro"}, {"version", __SUNPRO_CC}}; +#else + result["compiler"] = {{"family", "unknown"}, {"version", "unknown"}}; +#endif + +#ifdef __cplusplus + result["compiler"]["c++"] = std::to_string(__cplusplus); +#else + result["compiler"]["c++"] = "unknown"; +#endif + return result; + } + + + /////////////////////////// + // JSON value data types // + /////////////////////////// + + /// @name JSON value data types + /// The data types to store a JSON value. These types are derived from + /// the template arguments passed to class @ref basic_json. + /// @{ + +#if defined(JSON_HAS_CPP_14) + // Use transparent comparator if possible, combined with perfect forwarding + // on find() and count() calls prevents unnecessary string construction. + using object_comparator_t = std::less<>; +#else + using object_comparator_t = std::less; +#endif + + /*! + @brief a type for an object + + [RFC 7159](http://rfc7159.net/rfc7159) describes JSON objects as follows: + > An object is an unordered collection of zero or more name/value pairs, + > where a name is a string and a value is a string, number, boolean, null, + > object, or array. + + To store objects in C++, a type is defined by the template parameters + described below. + + @tparam ObjectType the container to store objects (e.g., `std::map` or + `std::unordered_map`) + @tparam StringType the type of the keys or names (e.g., `std::string`). + The comparison function `std::less` is used to order elements + inside the container. 
+ @tparam AllocatorType the allocator to use for objects (e.g., + `std::allocator`) + + #### Default type + + With the default values for @a ObjectType (`std::map`), @a StringType + (`std::string`), and @a AllocatorType (`std::allocator`), the default + value for @a object_t is: + + @code {.cpp} + std::map< + std::string, // key_type + basic_json, // value_type + std::less, // key_compare + std::allocator> // allocator_type + > + @endcode + + #### Behavior + + The choice of @a object_t influences the behavior of the JSON class. With + the default type, objects have the following behavior: + + - When all names are unique, objects will be interoperable in the sense + that all software implementations receiving that object will agree on + the name-value mappings. + - When the names within an object are not unique, it is unspecified which + one of the values for a given key will be chosen. For instance, + `{"key": 2, "key": 1}` could be equal to either `{"key": 1}` or + `{"key": 2}`. + - Internally, name/value pairs are stored in lexicographical order of the + names. Objects will also be serialized (see @ref dump) in this order. + For instance, `{"b": 1, "a": 2}` and `{"a": 2, "b": 1}` will be stored + and serialized as `{"a": 2, "b": 1}`. + - When comparing objects, the order of the name/value pairs is irrelevant. + This makes objects interoperable in the sense that they will not be + affected by these differences. For instance, `{"b": 1, "a": 2}` and + `{"a": 2, "b": 1}` will be treated as equal. + + #### Limits + + [RFC 7159](http://rfc7159.net/rfc7159) specifies: + > An implementation may set limits on the maximum depth of nesting. + + In this class, the object's limit of nesting is not explicitly constrained. + However, a maximum depth of nesting may be introduced by the compiler or + runtime environment. A theoretical limit can be queried by calling the + @ref max_size function of a JSON object. 
+ + #### Storage + + Objects are stored as pointers in a @ref basic_json type. That is, for any + access to object values, a pointer of type `object_t*` must be + dereferenced. + + @sa @ref array_t -- type for an array value + + @since version 1.0.0 + + @note The order name/value pairs are added to the object is *not* + preserved by the library. Therefore, iterating an object may return + name/value pairs in a different order than they were originally stored. In + fact, keys will be traversed in alphabetical order as `std::map` with + `std::less` is used by default. Please note this behavior conforms to [RFC + 7159](http://rfc7159.net/rfc7159), because any order implements the + specified "unordered" nature of JSON objects. + */ + using object_t = ObjectType>>; + + /*! + @brief a type for an array + + [RFC 7159](http://rfc7159.net/rfc7159) describes JSON arrays as follows: + > An array is an ordered sequence of zero or more values. + + To store objects in C++, a type is defined by the template parameters + explained below. + + @tparam ArrayType container type to store arrays (e.g., `std::vector` or + `std::list`) + @tparam AllocatorType allocator to use for arrays (e.g., `std::allocator`) + + #### Default type + + With the default values for @a ArrayType (`std::vector`) and @a + AllocatorType (`std::allocator`), the default value for @a array_t is: + + @code {.cpp} + std::vector< + basic_json, // value_type + std::allocator // allocator_type + > + @endcode + + #### Limits + + [RFC 7159](http://rfc7159.net/rfc7159) specifies: + > An implementation may set limits on the maximum depth of nesting. + + In this class, the array's limit of nesting is not explicitly constrained. + However, a maximum depth of nesting may be introduced by the compiler or + runtime environment. A theoretical limit can be queried by calling the + @ref max_size function of a JSON array. + + #### Storage + + Arrays are stored as pointers in a @ref basic_json type. 
That is, for any + access to array values, a pointer of type `array_t*` must be dereferenced. + + @sa @ref object_t -- type for an object value + + @since version 1.0.0 + */ + using array_t = ArrayType>; + + /*! + @brief a type for a string + + [RFC 7159](http://rfc7159.net/rfc7159) describes JSON strings as follows: + > A string is a sequence of zero or more Unicode characters. + + To store objects in C++, a type is defined by the template parameter + described below. Unicode values are split by the JSON class into + byte-sized characters during deserialization. + + @tparam StringType the container to store strings (e.g., `std::string`). + Note this container is used for keys/names in objects, see @ref object_t. + + #### Default type + + With the default values for @a StringType (`std::string`), the default + value for @a string_t is: + + @code {.cpp} + std::string + @endcode + + #### Encoding + + Strings are stored in UTF-8 encoding. Therefore, functions like + `std::string::size()` or `std::string::length()` return the number of + bytes in the string rather than the number of characters or glyphs. + + #### String comparison + + [RFC 7159](http://rfc7159.net/rfc7159) states: + > Software implementations are typically required to test names of object + > members for equality. Implementations that transform the textual + > representation into sequences of Unicode code units and then perform the + > comparison numerically, code unit by code unit, are interoperable in the + > sense that implementations will agree in all cases on equality or + > inequality of two strings. For example, implementations that compare + > strings with escaped characters unconverted may incorrectly find that + > `"a\\b"` and `"a\u005Cb"` are not equal. + + This implementation is interoperable as it does compare strings code unit + by code unit. + + #### Storage + + String values are stored as pointers in a @ref basic_json type. 
That is, + for any access to string values, a pointer of type `string_t*` must be + dereferenced. + + @since version 1.0.0 + */ + using string_t = StringType; + + /*! + @brief a type for a boolean + + [RFC 7159](http://rfc7159.net/rfc7159) implicitly describes a boolean as a + type which differentiates the two literals `true` and `false`. + + To store objects in C++, a type is defined by the template parameter @a + BooleanType which chooses the type to use. + + #### Default type + + With the default values for @a BooleanType (`bool`), the default value for + @a boolean_t is: + + @code {.cpp} + bool + @endcode + + #### Storage + + Boolean values are stored directly inside a @ref basic_json type. + + @since version 1.0.0 + */ + using boolean_t = BooleanType; + + /*! + @brief a type for a number (integer) + + [RFC 7159](http://rfc7159.net/rfc7159) describes numbers as follows: + > The representation of numbers is similar to that used in most + > programming languages. A number is represented in base 10 using decimal + > digits. It contains an integer component that may be prefixed with an + > optional minus sign, which may be followed by a fraction part and/or an + > exponent part. Leading zeros are not allowed. (...) Numeric values that + > cannot be represented in the grammar below (such as Infinity and NaN) + > are not permitted. + + This description includes both integer and floating-point numbers. + However, C++ allows more precise storage if it is known whether the number + is a signed integer, an unsigned integer or a floating-point number. + Therefore, three different types, @ref number_integer_t, @ref + number_unsigned_t and @ref number_float_t are used. + + To store integer numbers in C++, a type is defined by the template + parameter @a NumberIntegerType which chooses the type to use. 
+ + #### Default type + + With the default values for @a NumberIntegerType (`int64_t`), the default + value for @a number_integer_t is: + + @code {.cpp} + int64_t + @endcode + + #### Default behavior + + - The restrictions about leading zeros is not enforced in C++. Instead, + leading zeros in integer literals lead to an interpretation as octal + number. Internally, the value will be stored as decimal number. For + instance, the C++ integer literal `010` will be serialized to `8`. + During deserialization, leading zeros yield an error. + - Not-a-number (NaN) values will be serialized to `null`. + + #### Limits + + [RFC 7159](http://rfc7159.net/rfc7159) specifies: + > An implementation may set limits on the range and precision of numbers. + + When the default type is used, the maximal integer number that can be + stored is `9223372036854775807` (INT64_MAX) and the minimal integer number + that can be stored is `-9223372036854775808` (INT64_MIN). Integer numbers + that are out of range will yield over/underflow when used in a + constructor. During deserialization, too large or small integer numbers + will be automatically be stored as @ref number_unsigned_t or @ref + number_float_t. + + [RFC 7159](http://rfc7159.net/rfc7159) further states: + > Note that when such software is used, numbers that are integers and are + > in the range \f$[-2^{53}+1, 2^{53}-1]\f$ are interoperable in the sense + > that implementations will agree exactly on their numeric values. + + As this range is a subrange of the exactly supported range [INT64_MIN, + INT64_MAX], this class's integer type is interoperable. + + #### Storage + + Integer number values are stored directly inside a @ref basic_json type. + + @sa @ref number_float_t -- type for number values (floating-point) + + @sa @ref number_unsigned_t -- type for number values (unsigned integer) + + @since version 1.0.0 + */ + using number_integer_t = NumberIntegerType; + + /*! 
+ @brief a type for a number (unsigned) + + [RFC 7159](http://rfc7159.net/rfc7159) describes numbers as follows: + > The representation of numbers is similar to that used in most + > programming languages. A number is represented in base 10 using decimal + > digits. It contains an integer component that may be prefixed with an + > optional minus sign, which may be followed by a fraction part and/or an + > exponent part. Leading zeros are not allowed. (...) Numeric values that + > cannot be represented in the grammar below (such as Infinity and NaN) + > are not permitted. + + This description includes both integer and floating-point numbers. + However, C++ allows more precise storage if it is known whether the number + is a signed integer, an unsigned integer or a floating-point number. + Therefore, three different types, @ref number_integer_t, @ref + number_unsigned_t and @ref number_float_t are used. + + To store unsigned integer numbers in C++, a type is defined by the + template parameter @a NumberUnsignedType which chooses the type to use. + + #### Default type + + With the default values for @a NumberUnsignedType (`uint64_t`), the + default value for @a number_unsigned_t is: + + @code {.cpp} + uint64_t + @endcode + + #### Default behavior + + - The restrictions about leading zeros is not enforced in C++. Instead, + leading zeros in integer literals lead to an interpretation as octal + number. Internally, the value will be stored as decimal number. For + instance, the C++ integer literal `010` will be serialized to `8`. + During deserialization, leading zeros yield an error. + - Not-a-number (NaN) values will be serialized to `null`. + + #### Limits + + [RFC 7159](http://rfc7159.net/rfc7159) specifies: + > An implementation may set limits on the range and precision of numbers. + + When the default type is used, the maximal integer number that can be + stored is `18446744073709551615` (UINT64_MAX) and the minimal integer + number that can be stored is `0`. 
Integer numbers that are out of range + will yield over/underflow when used in a constructor. During + deserialization, too large or small integer numbers will be automatically + be stored as @ref number_integer_t or @ref number_float_t. + + [RFC 7159](http://rfc7159.net/rfc7159) further states: + > Note that when such software is used, numbers that are integers and are + > in the range \f$[-2^{53}+1, 2^{53}-1]\f$ are interoperable in the sense + > that implementations will agree exactly on their numeric values. + + As this range is a subrange (when considered in conjunction with the + number_integer_t type) of the exactly supported range [0, UINT64_MAX], + this class's integer type is interoperable. + + #### Storage + + Integer number values are stored directly inside a @ref basic_json type. + + @sa @ref number_float_t -- type for number values (floating-point) + @sa @ref number_integer_t -- type for number values (integer) + + @since version 2.0.0 + */ + using number_unsigned_t = NumberUnsignedType; + + /*! + @brief a type for a number (floating-point) + + [RFC 7159](http://rfc7159.net/rfc7159) describes numbers as follows: + > The representation of numbers is similar to that used in most + > programming languages. A number is represented in base 10 using decimal + > digits. It contains an integer component that may be prefixed with an + > optional minus sign, which may be followed by a fraction part and/or an + > exponent part. Leading zeros are not allowed. (...) Numeric values that + > cannot be represented in the grammar below (such as Infinity and NaN) + > are not permitted. + + This description includes both integer and floating-point numbers. + However, C++ allows more precise storage if it is known whether the number + is a signed integer, an unsigned integer or a floating-point number. + Therefore, three different types, @ref number_integer_t, @ref + number_unsigned_t and @ref number_float_t are used. 
+ + To store floating-point numbers in C++, a type is defined by the template + parameter @a NumberFloatType which chooses the type to use. + + #### Default type + + With the default values for @a NumberFloatType (`double`), the default + value for @a number_float_t is: + + @code {.cpp} + double + @endcode + + #### Default behavior + + - The restrictions about leading zeros is not enforced in C++. Instead, + leading zeros in floating-point literals will be ignored. Internally, + the value will be stored as decimal number. For instance, the C++ + floating-point literal `01.2` will be serialized to `1.2`. During + deserialization, leading zeros yield an error. + - Not-a-number (NaN) values will be serialized to `null`. + + #### Limits + + [RFC 7159](http://rfc7159.net/rfc7159) states: + > This specification allows implementations to set limits on the range and + > precision of numbers accepted. Since software that implements IEEE + > 754-2008 binary64 (double precision) numbers is generally available and + > widely used, good interoperability can be achieved by implementations + > that expect no more precision or range than these provide, in the sense + > that implementations will approximate JSON numbers within the expected + > precision. + + This implementation does exactly follow this approach, as it uses double + precision floating-point numbers. Note values smaller than + `-1.79769313486232e+308` and values greater than `1.79769313486232e+308` + will be stored as NaN internally and be serialized to `null`. + + #### Storage + + Floating-point number values are stored directly inside a @ref basic_json + type. + + @sa @ref number_integer_t -- type for number values (integer) + + @sa @ref number_unsigned_t -- type for number values (unsigned integer) + + @since version 1.0.0 + */ + using number_float_t = NumberFloatType; + + /*! 
+ @brief a type for a packed binary type + + This type is a type designed to carry binary data that appears in various + serialized formats, such as CBOR's Major Type 2, MessagePack's bin, and + BSON's generic binary subtype. This type is NOT a part of standard JSON and + exists solely for compatibility with these binary types. As such, it is + simply defined as an ordered sequence of zero or more byte values. + + Additionally, as an implementation detail, the subtype of the binary data is + carried around as a `std::uint8_t`, which is compatible with both of the + binary data formats that use binary subtyping, (though the specific + numbering is incompatible with each other, and it is up to the user to + translate between them). + + [CBOR's RFC 7049](https://tools.ietf.org/html/rfc7049) describes this type + as: + > Major type 2: a byte string. The string's length in bytes is represented + > following the rules for positive integers (major type 0). + + [MessagePack's documentation on the bin type + family](https://github.com/msgpack/msgpack/blob/master/spec.md#bin-format-family) + describes this type as: + > Bin format family stores an byte array in 2, 3, or 5 bytes of extra bytes + > in addition to the size of the byte array. + + [BSON's specifications](http://bsonspec.org/spec.html) describe several + binary types; however, this type is intended to represent the generic binary + type which has the description: + > Generic binary subtype - This is the most commonly used binary subtype and + > should be the 'default' for drivers and tools. + + None of these impose any limitations on the internal representation other + than the basic unit of storage be some type of array whose parts are + decomposable into bytes. + + The default representation of this binary format is a + `std::vector`, which is a very common way to represent a byte + array in modern C++. 
+ + #### Default type + + The default values for @a BinaryType is `std::vector` + + #### Storage + + Binary Arrays are stored as pointers in a @ref basic_json type. That is, + for any access to array values, a pointer of the type `binary_t*` must be + dereferenced. + + #### Notes on subtypes + + - CBOR + - Binary values are represented as byte strings. No subtypes are + supported and will be ignored when CBOR is written. + - MessagePack + - If a subtype is given and the binary array contains exactly 1, 2, 4, 8, + or 16 elements, the fixext family (fixext1, fixext2, fixext4, fixext8) + is used. For other sizes, the ext family (ext8, ext16, ext32) is used. + The subtype is then added as singed 8-bit integer. + - If no subtype is given, the bin family (bin8, bin16, bin32) is used. + - BSON + - If a subtype is given, it is used and added as unsigned 8-bit integer. + - If no subtype is given, the generic binary subtype 0x00 is used. + + @sa @ref binary -- create a binary array + + @since version 3.8.0 + */ + using binary_t = nlohmann::byte_container_with_subtype; + /// @} + + private: + + /// helper for exception-safe object creation + template + JSON_HEDLEY_RETURNS_NON_NULL + static T* create(Args&& ... args) + { + AllocatorType alloc; + using AllocatorTraits = std::allocator_traits>; + + auto deleter = [&](T * object) + { + AllocatorTraits::deallocate(alloc, object, 1); + }; + std::unique_ptr object(AllocatorTraits::allocate(alloc, 1), deleter); + AllocatorTraits::construct(alloc, object.get(), std::forward(args)...); + JSON_ASSERT(object != nullptr); + return object.release(); + } + + //////////////////////// + // JSON value storage // + //////////////////////// + + /*! + @brief a JSON value + + The actual storage for a JSON value of the @ref basic_json class. This + union combines the different storage types for the JSON value types + defined in @ref value_t. 
+ + JSON type | value_t type | used type + --------- | --------------- | ------------------------ + object | object | pointer to @ref object_t + array | array | pointer to @ref array_t + string | string | pointer to @ref string_t + boolean | boolean | @ref boolean_t + number | number_integer | @ref number_integer_t + number | number_unsigned | @ref number_unsigned_t + number | number_float | @ref number_float_t + binary | binary | pointer to @ref binary_t + null | null | *no value is stored* + + @note Variable-length types (objects, arrays, and strings) are stored as + pointers. The size of the union should not exceed 64 bits if the default + value types are used. + + @since version 1.0.0 + */ + union json_value + { + /// object (stored with pointer to save storage) + object_t* object; + /// array (stored with pointer to save storage) + array_t* array; + /// string (stored with pointer to save storage) + string_t* string; + /// binary (stored with pointer to save storage) + binary_t* binary; + /// boolean + boolean_t boolean; + /// number (integer) + number_integer_t number_integer; + /// number (unsigned integer) + number_unsigned_t number_unsigned; + /// number (floating-point) + number_float_t number_float; + + /// default constructor (for null values) + json_value() = default; + /// constructor for booleans + json_value(boolean_t v) noexcept : boolean(v) {} + /// constructor for numbers (integer) + json_value(number_integer_t v) noexcept : number_integer(v) {} + /// constructor for numbers (unsigned) + json_value(number_unsigned_t v) noexcept : number_unsigned(v) {} + /// constructor for numbers (floating-point) + json_value(number_float_t v) noexcept : number_float(v) {} + /// constructor for empty values of a given type + json_value(value_t t) + { + switch (t) + { + case value_t::object: + { + object = create(); + break; + } + + case value_t::array: + { + array = create(); + break; + } + + case value_t::string: + { + string = create(""); + break; + } + + case 
value_t::binary: + { + binary = create(); + break; + } + + case value_t::boolean: + { + boolean = boolean_t(false); + break; + } + + case value_t::number_integer: + { + number_integer = number_integer_t(0); + break; + } + + case value_t::number_unsigned: + { + number_unsigned = number_unsigned_t(0); + break; + } + + case value_t::number_float: + { + number_float = number_float_t(0.0); + break; + } + + case value_t::null: + { + object = nullptr; // silence warning, see #821 + break; + } + + default: + { + object = nullptr; // silence warning, see #821 + if (JSON_HEDLEY_UNLIKELY(t == value_t::null)) + { + JSON_THROW(other_error::create(500, "961c151d2e87f2686a955a9be24d316f1362bf21 3.9.1")); // LCOV_EXCL_LINE + } + break; + } + } + } + + /// constructor for strings + json_value(const string_t& value) + { + string = create(value); + } + + /// constructor for rvalue strings + json_value(string_t&& value) + { + string = create(std::move(value)); + } + + /// constructor for objects + json_value(const object_t& value) + { + object = create(value); + } + + /// constructor for rvalue objects + json_value(object_t&& value) + { + object = create(std::move(value)); + } + + /// constructor for arrays + json_value(const array_t& value) + { + array = create(value); + } + + /// constructor for rvalue arrays + json_value(array_t&& value) + { + array = create(std::move(value)); + } + + /// constructor for binary arrays + json_value(const typename binary_t::container_type& value) + { + binary = create(value); + } + + /// constructor for rvalue binary arrays + json_value(typename binary_t::container_type&& value) + { + binary = create(std::move(value)); + } + + /// constructor for binary arrays (internal type) + json_value(const binary_t& value) + { + binary = create(value); + } + + /// constructor for rvalue binary arrays (internal type) + json_value(binary_t&& value) + { + binary = create(std::move(value)); + } + + void destroy(value_t t) noexcept + { + // flatten the current 
json_value to a heap-allocated stack + std::vector stack; + + // move the top-level items to stack + if (t == value_t::array) + { + stack.reserve(array->size()); + std::move(array->begin(), array->end(), std::back_inserter(stack)); + } + else if (t == value_t::object) + { + stack.reserve(object->size()); + for (auto&& it : *object) + { + stack.push_back(std::move(it.second)); + } + } + + while (!stack.empty()) + { + // move the last item to local variable to be processed + basic_json current_item(std::move(stack.back())); + stack.pop_back(); + + // if current_item is array/object, move + // its children to the stack to be processed later + if (current_item.is_array()) + { + std::move(current_item.m_value.array->begin(), current_item.m_value.array->end(), + std::back_inserter(stack)); + + current_item.m_value.array->clear(); + } + else if (current_item.is_object()) + { + for (auto&& it : *current_item.m_value.object) + { + stack.push_back(std::move(it.second)); + } + + current_item.m_value.object->clear(); + } + + // it's now safe that current_item get destructed + // since it doesn't have any children + } + + switch (t) + { + case value_t::object: + { + AllocatorType alloc; + std::allocator_traits::destroy(alloc, object); + std::allocator_traits::deallocate(alloc, object, 1); + break; + } + + case value_t::array: + { + AllocatorType alloc; + std::allocator_traits::destroy(alloc, array); + std::allocator_traits::deallocate(alloc, array, 1); + break; + } + + case value_t::string: + { + AllocatorType alloc; + std::allocator_traits::destroy(alloc, string); + std::allocator_traits::deallocate(alloc, string, 1); + break; + } + + case value_t::binary: + { + AllocatorType alloc; + std::allocator_traits::destroy(alloc, binary); + std::allocator_traits::deallocate(alloc, binary, 1); + break; + } + + default: + { + break; + } + } + } + }; + + /*! + @brief checks the class invariants + + This function asserts the class invariants. 
It needs to be called at the + end of every constructor to make sure that created objects respect the + invariant. Furthermore, it has to be called each time the type of a JSON + value is changed, because the invariant expresses a relationship between + @a m_type and @a m_value. + */ + void assert_invariant() const noexcept + { + JSON_ASSERT(m_type != value_t::object || m_value.object != nullptr); + JSON_ASSERT(m_type != value_t::array || m_value.array != nullptr); + JSON_ASSERT(m_type != value_t::string || m_value.string != nullptr); + JSON_ASSERT(m_type != value_t::binary || m_value.binary != nullptr); + } + + public: + ////////////////////////// + // JSON parser callback // + ////////////////////////// + + /*! + @brief parser event types + + The parser callback distinguishes the following events: + - `object_start`: the parser read `{` and started to process a JSON object + - `key`: the parser read a key of a value in an object + - `object_end`: the parser read `}` and finished processing a JSON object + - `array_start`: the parser read `[` and started to process a JSON array + - `array_end`: the parser read `]` and finished processing a JSON array + - `value`: the parser finished reading a JSON value + + @image html callback_events.png "Example when certain parse events are triggered" + + @sa @ref parser_callback_t for more information and examples + */ + using parse_event_t = detail::parse_event_t; + + /*! + @brief per-element parser callback type + + With a parser callback function, the result of parsing a JSON text can be + influenced. When passed to @ref parse, it is called on certain events + (passed as @ref parse_event_t via parameter @a event) with a set recursion + depth @a depth and context JSON value @a parsed. The return value of the + callback function is a boolean indicating whether the element that emitted + the callback shall be kept or not. 
+ + We distinguish six scenarios (determined by the event type) in which the + callback function can be called. The following table describes the values + of the parameters @a depth, @a event, and @a parsed. + + parameter @a event | description | parameter @a depth | parameter @a parsed + ------------------ | ----------- | ------------------ | ------------------- + parse_event_t::object_start | the parser read `{` and started to process a JSON object | depth of the parent of the JSON object | a JSON value with type discarded + parse_event_t::key | the parser read a key of a value in an object | depth of the currently parsed JSON object | a JSON string containing the key + parse_event_t::object_end | the parser read `}` and finished processing a JSON object | depth of the parent of the JSON object | the parsed JSON object + parse_event_t::array_start | the parser read `[` and started to process a JSON array | depth of the parent of the JSON array | a JSON value with type discarded + parse_event_t::array_end | the parser read `]` and finished processing a JSON array | depth of the parent of the JSON array | the parsed JSON array + parse_event_t::value | the parser finished reading a JSON value | depth of the value | the parsed JSON value + + @image html callback_events.png "Example when certain parse events are triggered" + + Discarding a value (i.e., returning `false`) has different effects + depending on the context in which function was called: + + - Discarded values in structured types are skipped. That is, the parser + will behave as if the discarded value was never read. + - In case a value outside a structured type is skipped, it is replaced + with `null`. This case happens if the top-level element is skipped. 
+ + @param[in] depth the depth of the recursion during parsing + + @param[in] event an event of type parse_event_t indicating the context in + the callback function has been called + + @param[in,out] parsed the current intermediate parse result; note that + writing to this value has no effect for parse_event_t::key events + + @return Whether the JSON value which called the function during parsing + should be kept (`true`) or not (`false`). In the latter case, it is either + skipped completely or replaced by an empty discarded object. + + @sa @ref parse for examples + + @since version 1.0.0 + */ + using parser_callback_t = detail::parser_callback_t; + + ////////////////// + // constructors // + ////////////////// + + /// @name constructors and destructors + /// Constructors of class @ref basic_json, copy/move constructor, copy + /// assignment, static functions creating objects, and the destructor. + /// @{ + + /*! + @brief create an empty value with a given type + + Create an empty JSON value with a given type. The value will be default + initialized with an empty value which depends on the type: + + Value type | initial value + ----------- | ------------- + null | `null` + boolean | `false` + string | `""` + number | `0` + object | `{}` + array | `[]` + binary | empty array + + @param[in] v the type of the value to create + + @complexity Constant. + + @exceptionsafety Strong guarantee: if an exception is thrown, there are no + changes to any JSON value. + + @liveexample{The following code shows the constructor for different @ref + value_t values,basic_json__value_t} + + @sa @ref clear() -- restores the postcondition of this constructor + + @since version 1.0.0 + */ + basic_json(const value_t v) + : m_type(v), m_value(v) + { + assert_invariant(); + } + + /*! + @brief create a null object + + Create a `null` JSON value. It either takes a null pointer as parameter + (explicitly creating `null`) or no parameter (implicitly creating `null`). 
+ The passed null pointer itself is not read -- it is only used to choose + the right constructor. + + @complexity Constant. + + @exceptionsafety No-throw guarantee: this constructor never throws + exceptions. + + @liveexample{The following code shows the constructor with and without a + null pointer parameter.,basic_json__nullptr_t} + + @since version 1.0.0 + */ + basic_json(std::nullptr_t = nullptr) noexcept + : basic_json(value_t::null) + { + assert_invariant(); + } + + /*! + @brief create a JSON value + + This is a "catch all" constructor for all compatible JSON types; that is, + types for which a `to_json()` method exists. The constructor forwards the + parameter @a val to that method (to `json_serializer::to_json` method + with `U = uncvref_t`, to be exact). + + Template type @a CompatibleType includes, but is not limited to, the + following types: + - **arrays**: @ref array_t and all kinds of compatible containers such as + `std::vector`, `std::deque`, `std::list`, `std::forward_list`, + `std::array`, `std::valarray`, `std::set`, `std::unordered_set`, + `std::multiset`, and `std::unordered_multiset` with a `value_type` from + which a @ref basic_json value can be constructed. + - **objects**: @ref object_t and all kinds of compatible associative + containers such as `std::map`, `std::unordered_map`, `std::multimap`, + and `std::unordered_multimap` with a `key_type` compatible to + @ref string_t and a `value_type` from which a @ref basic_json value can + be constructed. + - **strings**: @ref string_t, string literals, and all compatible string + containers can be used. + - **numbers**: @ref number_integer_t, @ref number_unsigned_t, + @ref number_float_t, and all convertible number types such as `int`, + `size_t`, `int64_t`, `float` or `double` can be used. + - **boolean**: @ref boolean_t / `bool` can be used. 
+ - **binary**: @ref binary_t / `std::vector` may be used, + unfortunately because string literals cannot be distinguished from binary + character arrays by the C++ type system, all types compatible with `const + char*` will be directed to the string constructor instead. This is both + for backwards compatibility, and due to the fact that a binary type is not + a standard JSON type. + + See the examples below. + + @tparam CompatibleType a type such that: + - @a CompatibleType is not derived from `std::istream`, + - @a CompatibleType is not @ref basic_json (to avoid hijacking copy/move + constructors), + - @a CompatibleType is not a different @ref basic_json type (i.e. with different template arguments) + - @a CompatibleType is not a @ref basic_json nested type (e.g., + @ref json_pointer, @ref iterator, etc ...) + - @ref @ref json_serializer has a + `to_json(basic_json_t&, CompatibleType&&)` method + + @tparam U = `uncvref_t` + + @param[in] val the value to be forwarded to the respective constructor + + @complexity Usually linear in the size of the passed @a val, also + depending on the implementation of the called `to_json()` + method. + + @exceptionsafety Depends on the called constructor. For types directly + supported by the library (i.e., all types for which no `to_json()` function + was provided), strong guarantee holds: if an exception is thrown, there are + no changes to any JSON value. + + @liveexample{The following code shows the constructor with several + compatible types.,basic_json__CompatibleType} + + @since version 2.1.0 + */ + template < typename CompatibleType, + typename U = detail::uncvref_t, + detail::enable_if_t < + !detail::is_basic_json::value && detail::is_compatible_type::value, int > = 0 > + basic_json(CompatibleType && val) noexcept(noexcept( + JSONSerializer::to_json(std::declval(), + std::forward(val)))) + { + JSONSerializer::to_json(*this, std::forward(val)); + assert_invariant(); + } + + /*! 
+ @brief create a JSON value from an existing one + + This is a constructor for existing @ref basic_json types. + It does not hijack copy/move constructors, since the parameter has different + template arguments than the current ones. + + The constructor tries to convert the internal @ref m_value of the parameter. + + @tparam BasicJsonType a type such that: + - @a BasicJsonType is a @ref basic_json type. + - @a BasicJsonType has different template arguments than @ref basic_json_t. + + @param[in] val the @ref basic_json value to be converted. + + @complexity Usually linear in the size of the passed @a val, also + depending on the implementation of the called `to_json()` + method. + + @exceptionsafety Depends on the called constructor. For types directly + supported by the library (i.e., all types for which no `to_json()` function + was provided), strong guarantee holds: if an exception is thrown, there are + no changes to any JSON value. + + @since version 3.2.0 + */ + template < typename BasicJsonType, + detail::enable_if_t < + detail::is_basic_json::value&& !std::is_same::value, int > = 0 > + basic_json(const BasicJsonType& val) + { + using other_boolean_t = typename BasicJsonType::boolean_t; + using other_number_float_t = typename BasicJsonType::number_float_t; + using other_number_integer_t = typename BasicJsonType::number_integer_t; + using other_number_unsigned_t = typename BasicJsonType::number_unsigned_t; + using other_string_t = typename BasicJsonType::string_t; + using other_object_t = typename BasicJsonType::object_t; + using other_array_t = typename BasicJsonType::array_t; + using other_binary_t = typename BasicJsonType::binary_t; + + switch (val.type()) + { + case value_t::boolean: + JSONSerializer::to_json(*this, val.template get()); + break; + case value_t::number_float: + JSONSerializer::to_json(*this, val.template get()); + break; + case value_t::number_integer: + JSONSerializer::to_json(*this, val.template get()); + break; + case 
value_t::number_unsigned: + JSONSerializer::to_json(*this, val.template get()); + break; + case value_t::string: + JSONSerializer::to_json(*this, val.template get_ref()); + break; + case value_t::object: + JSONSerializer::to_json(*this, val.template get_ref()); + break; + case value_t::array: + JSONSerializer::to_json(*this, val.template get_ref()); + break; + case value_t::binary: + JSONSerializer::to_json(*this, val.template get_ref()); + break; + case value_t::null: + *this = nullptr; + break; + case value_t::discarded: + m_type = value_t::discarded; + break; + default: // LCOV_EXCL_LINE + JSON_ASSERT(false); // LCOV_EXCL_LINE + } + assert_invariant(); + } + + /*! + @brief create a container (array or object) from an initializer list + + Creates a JSON value of type array or object from the passed initializer + list @a init. In case @a type_deduction is `true` (default), the type of + the JSON value to be created is deducted from the initializer list @a init + according to the following rules: + + 1. If the list is empty, an empty JSON object value `{}` is created. + 2. If the list consists of pairs whose first element is a string, a JSON + object value is created where the first elements of the pairs are + treated as keys and the second elements are as values. + 3. In all other cases, an array is created. + + The rules aim to create the best fit between a C++ initializer list and + JSON values. The rationale is as follows: + + 1. The empty initializer list is written as `{}` which is exactly an empty + JSON object. + 2. C++ has no way of describing mapped types other than to list a list of + pairs. As JSON requires that keys must be of type string, rule 2 is the + weakest constraint one can pose on initializer lists to interpret them + as an object. + 3. In all other cases, the initializer list could not be interpreted as + JSON object type, so interpreting it as JSON array type is safe. 
+ + With the rules described above, the following JSON values cannot be + expressed by an initializer list: + + - the empty array (`[]`): use @ref array(initializer_list_t) + with an empty initializer list in this case + - arrays whose elements satisfy rule 2: use @ref + array(initializer_list_t) with the same initializer list + in this case + + @note When used without parentheses around an empty initializer list, @ref + basic_json() is called instead of this function, yielding the JSON null + value. + + @param[in] init initializer list with JSON values + + @param[in] type_deduction internal parameter; when set to `true`, the type + of the JSON value is deducted from the initializer list @a init; when set + to `false`, the type provided via @a manual_type is forced. This mode is + used by the functions @ref array(initializer_list_t) and + @ref object(initializer_list_t). + + @param[in] manual_type internal parameter; when @a type_deduction is set + to `false`, the created JSON value will use the provided type (only @ref + value_t::array and @ref value_t::object are valid); when @a type_deduction + is set to `true`, this parameter has no effect + + @throw type_error.301 if @a type_deduction is `false`, @a manual_type is + `value_t::object`, but @a init contains an element which is not a pair + whose first element is a string. In this case, the constructor could not + create an object. If @a type_deduction would have be `true`, an array + would have been created. See @ref object(initializer_list_t) + for an example. + + @complexity Linear in the size of the initializer list @a init. + + @exceptionsafety Strong guarantee: if an exception is thrown, there are no + changes to any JSON value. 
+ + @liveexample{The example below shows how JSON values are created from + initializer lists.,basic_json__list_init_t} + + @sa @ref array(initializer_list_t) -- create a JSON array + value from an initializer list + @sa @ref object(initializer_list_t) -- create a JSON object + value from an initializer list + + @since version 1.0.0 + */ + basic_json(initializer_list_t init, + bool type_deduction = true, + value_t manual_type = value_t::array) + { + // check if each element is an array with two elements whose first + // element is a string + bool is_an_object = std::all_of(init.begin(), init.end(), + [](const detail::json_ref& element_ref) + { + return element_ref->is_array() && element_ref->size() == 2 && (*element_ref)[0].is_string(); + }); + + // adjust type if type deduction is not wanted + if (!type_deduction) + { + // if array is wanted, do not create an object though possible + if (manual_type == value_t::array) + { + is_an_object = false; + } + + // if object is wanted but impossible, throw an exception + if (JSON_HEDLEY_UNLIKELY(manual_type == value_t::object && !is_an_object)) + { + JSON_THROW(type_error::create(301, "cannot create object from initializer list")); + } + } + + if (is_an_object) + { + // the initializer list is a list of pairs -> create object + m_type = value_t::object; + m_value = value_t::object; + + std::for_each(init.begin(), init.end(), [this](const detail::json_ref& element_ref) + { + auto element = element_ref.moved_or_copied(); + m_value.object->emplace( + std::move(*((*element.m_value.array)[0].m_value.string)), + std::move((*element.m_value.array)[1])); + }); + } + else + { + // the initializer list describes an array -> create array + m_type = value_t::array; + m_value.array = create(init.begin(), init.end()); + } + + assert_invariant(); + } + + /*! + @brief explicitly create a binary array (without subtype) + + Creates a JSON binary array value from a given binary container. 
Binary + values are part of various binary formats, such as CBOR, MessagePack, and + BSON. This constructor is used to create a value for serialization to those + formats. + + @note Note, this function exists because of the difficulty in correctly + specifying the correct template overload in the standard value ctor, as both + JSON arrays and JSON binary arrays are backed with some form of a + `std::vector`. Because JSON binary arrays are a non-standard extension it + was decided that it would be best to prevent automatic initialization of a + binary array type, for backwards compatibility and so it does not happen on + accident. + + @param[in] init container containing bytes to use as binary type + + @return JSON binary array value + + @complexity Linear in the size of @a init. + + @exceptionsafety Strong guarantee: if an exception is thrown, there are no + changes to any JSON value. + + @since version 3.8.0 + */ + JSON_HEDLEY_WARN_UNUSED_RESULT + static basic_json binary(const typename binary_t::container_type& init) + { + auto res = basic_json(); + res.m_type = value_t::binary; + res.m_value = init; + return res; + } + + /*! + @brief explicitly create a binary array (with subtype) + + Creates a JSON binary array value from a given binary container. Binary + values are part of various binary formats, such as CBOR, MessagePack, and + BSON. This constructor is used to create a value for serialization to those + formats. + + @note Note, this function exists because of the difficulty in correctly + specifying the correct template overload in the standard value ctor, as both + JSON arrays and JSON binary arrays are backed with some form of a + `std::vector`. Because JSON binary arrays are a non-standard extension it + was decided that it would be best to prevent automatic initialization of a + binary array type, for backwards compatibility and so it does not happen on + accident. 
+ + @param[in] init container containing bytes to use as binary type + @param[in] subtype subtype to use in MessagePack and BSON + + @return JSON binary array value + + @complexity Linear in the size of @a init. + + @exceptionsafety Strong guarantee: if an exception is thrown, there are no + changes to any JSON value. + + @since version 3.8.0 + */ + JSON_HEDLEY_WARN_UNUSED_RESULT + static basic_json binary(const typename binary_t::container_type& init, std::uint8_t subtype) + { + auto res = basic_json(); + res.m_type = value_t::binary; + res.m_value = binary_t(init, subtype); + return res; + } + + /// @copydoc binary(const typename binary_t::container_type&) + JSON_HEDLEY_WARN_UNUSED_RESULT + static basic_json binary(typename binary_t::container_type&& init) + { + auto res = basic_json(); + res.m_type = value_t::binary; + res.m_value = std::move(init); + return res; + } + + /// @copydoc binary(const typename binary_t::container_type&, std::uint8_t) + JSON_HEDLEY_WARN_UNUSED_RESULT + static basic_json binary(typename binary_t::container_type&& init, std::uint8_t subtype) + { + auto res = basic_json(); + res.m_type = value_t::binary; + res.m_value = binary_t(std::move(init), subtype); + return res; + } + + /*! + @brief explicitly create an array from an initializer list + + Creates a JSON array value from a given initializer list. That is, given a + list of values `a, b, c`, creates the JSON value `[a, b, c]`. If the + initializer list is empty, the empty array `[]` is created. + + @note This function is only needed to express two edge cases that cannot + be realized with the initializer list constructor (@ref + basic_json(initializer_list_t, bool, value_t)). These cases + are: + 1. creating an array whose elements are all pairs whose first element is a + string -- in this case, the initializer list constructor would create an + object, taking the first elements as keys + 2. 
creating an empty array -- passing the empty initializer list to the + initializer list constructor yields an empty object + + @param[in] init initializer list with JSON values to create an array from + (optional) + + @return JSON array value + + @complexity Linear in the size of @a init. + + @exceptionsafety Strong guarantee: if an exception is thrown, there are no + changes to any JSON value. + + @liveexample{The following code shows an example for the `array` + function.,array} + + @sa @ref basic_json(initializer_list_t, bool, value_t) -- + create a JSON value from an initializer list + @sa @ref object(initializer_list_t) -- create a JSON object + value from an initializer list + + @since version 1.0.0 + */ + JSON_HEDLEY_WARN_UNUSED_RESULT + static basic_json array(initializer_list_t init = {}) + { + return basic_json(init, false, value_t::array); + } + + /*! + @brief explicitly create an object from an initializer list + + Creates a JSON object value from a given initializer list. The initializer + lists elements must be pairs, and their first elements must be strings. If + the initializer list is empty, the empty object `{}` is created. + + @note This function is only added for symmetry reasons. In contrast to the + related function @ref array(initializer_list_t), there are + no cases which can only be expressed by this function. That is, any + initializer list @a init can also be passed to the initializer list + constructor @ref basic_json(initializer_list_t, bool, value_t). + + @param[in] init initializer list to create an object from (optional) + + @return JSON object value + + @throw type_error.301 if @a init is not a list of pairs whose first + elements are strings. In this case, no object can be created. When such a + value is passed to @ref basic_json(initializer_list_t, bool, value_t), + an array would have been created from the passed initializer list @a init. + See example below. + + @complexity Linear in the size of @a init. 
+ + @exceptionsafety Strong guarantee: if an exception is thrown, there are no + changes to any JSON value. + + @liveexample{The following code shows an example for the `object` + function.,object} + + @sa @ref basic_json(initializer_list_t, bool, value_t) -- + create a JSON value from an initializer list + @sa @ref array(initializer_list_t) -- create a JSON array + value from an initializer list + + @since version 1.0.0 + */ + JSON_HEDLEY_WARN_UNUSED_RESULT + static basic_json object(initializer_list_t init = {}) + { + return basic_json(init, false, value_t::object); + } + + /*! + @brief construct an array with count copies of given value + + Constructs a JSON array value by creating @a cnt copies of a passed value. + In case @a cnt is `0`, an empty array is created. + + @param[in] cnt the number of JSON copies of @a val to create + @param[in] val the JSON value to copy + + @post `std::distance(begin(),end()) == cnt` holds. + + @complexity Linear in @a cnt. + + @exceptionsafety Strong guarantee: if an exception is thrown, there are no + changes to any JSON value. + + @liveexample{The following code shows examples for the @ref + basic_json(size_type\, const basic_json&) + constructor.,basic_json__size_type_basic_json} + + @since version 1.0.0 + */ + basic_json(size_type cnt, const basic_json& val) + : m_type(value_t::array) + { + m_value.array = create(cnt, val); + assert_invariant(); + } + + /*! + @brief construct a JSON container given an iterator range + + Constructs the JSON value with the contents of the range `[first, last)`. + The semantics depends on the different types a JSON value can have: + - In case of a null type, invalid_iterator.206 is thrown. + - In case of other primitive types (number, boolean, or string), @a first + must be `begin()` and @a last must be `end()`. In this case, the value is + copied. Otherwise, invalid_iterator.204 is thrown. 
+ - In case of structured types (array, object), the constructor behaves as + similar versions for `std::vector` or `std::map`; that is, a JSON array + or object is constructed from the values in the range. + + @tparam InputIT an input iterator type (@ref iterator or @ref + const_iterator) + + @param[in] first begin of the range to copy from (included) + @param[in] last end of the range to copy from (excluded) + + @pre Iterators @a first and @a last must be initialized. **This + precondition is enforced with an assertion (see warning).** If + assertions are switched off, a violation of this precondition yields + undefined behavior. + + @pre Range `[first, last)` is valid. Usually, this precondition cannot be + checked efficiently. Only certain edge cases are detected; see the + description of the exceptions below. A violation of this precondition + yields undefined behavior. + + @warning A precondition is enforced with a runtime assertion that will + result in calling `std::abort` if this precondition is not met. + Assertions can be disabled by defining `NDEBUG` at compile time. + See https://en.cppreference.com/w/cpp/error/assert for more + information. + + @throw invalid_iterator.201 if iterators @a first and @a last are not + compatible (i.e., do not belong to the same JSON value). In this case, + the range `[first, last)` is undefined. + @throw invalid_iterator.204 if iterators @a first and @a last belong to a + primitive type (number, boolean, or string), but @a first does not point + to the first element any more. In this case, the range `[first, last)` is + undefined. See example code below. + @throw invalid_iterator.206 if iterators @a first and @a last belong to a + null value. In this case, the range `[first, last)` is undefined. + + @complexity Linear in distance between @a first and @a last. + + @exceptionsafety Strong guarantee: if an exception is thrown, there are no + changes to any JSON value. 
+ + @liveexample{The example below shows several ways to create JSON values by + specifying a subrange with iterators.,basic_json__InputIt_InputIt} + + @since version 1.0.0 + */ + template < class InputIT, typename std::enable_if < + std::is_same::value || + std::is_same::value, int >::type = 0 > + basic_json(InputIT first, InputIT last) + { + JSON_ASSERT(first.m_object != nullptr); + JSON_ASSERT(last.m_object != nullptr); + + // make sure iterator fits the current value + if (JSON_HEDLEY_UNLIKELY(first.m_object != last.m_object)) + { + JSON_THROW(invalid_iterator::create(201, "iterators are not compatible")); + } + + // copy type from first iterator + m_type = first.m_object->m_type; + + // check if iterator range is complete for primitive values + switch (m_type) + { + case value_t::boolean: + case value_t::number_float: + case value_t::number_integer: + case value_t::number_unsigned: + case value_t::string: + { + if (JSON_HEDLEY_UNLIKELY(!first.m_it.primitive_iterator.is_begin() + || !last.m_it.primitive_iterator.is_end())) + { + JSON_THROW(invalid_iterator::create(204, "iterators out of range")); + } + break; + } + + default: + break; + } + + switch (m_type) + { + case value_t::number_integer: + { + m_value.number_integer = first.m_object->m_value.number_integer; + break; + } + + case value_t::number_unsigned: + { + m_value.number_unsigned = first.m_object->m_value.number_unsigned; + break; + } + + case value_t::number_float: + { + m_value.number_float = first.m_object->m_value.number_float; + break; + } + + case value_t::boolean: + { + m_value.boolean = first.m_object->m_value.boolean; + break; + } + + case value_t::string: + { + m_value = *first.m_object->m_value.string; + break; + } + + case value_t::object: + { + m_value.object = create(first.m_it.object_iterator, + last.m_it.object_iterator); + break; + } + + case value_t::array: + { + m_value.array = create(first.m_it.array_iterator, + last.m_it.array_iterator); + break; + } + + case value_t::binary: + { 
+ m_value = *first.m_object->m_value.binary; + break; + } + + default: + JSON_THROW(invalid_iterator::create(206, "cannot construct with iterators from " + + std::string(first.m_object->type_name()))); + } + + assert_invariant(); + } + + + /////////////////////////////////////// + // other constructors and destructor // + /////////////////////////////////////// + + template, + std::is_same>::value, int> = 0 > + basic_json(const JsonRef& ref) : basic_json(ref.moved_or_copied()) {} + + /*! + @brief copy constructor + + Creates a copy of a given JSON value. + + @param[in] other the JSON value to copy + + @post `*this == other` + + @complexity Linear in the size of @a other. + + @exceptionsafety Strong guarantee: if an exception is thrown, there are no + changes to any JSON value. + + @requirement This function helps `basic_json` satisfying the + [Container](https://en.cppreference.com/w/cpp/named_req/Container) + requirements: + - The complexity is linear. + - As postcondition, it holds: `other == basic_json(other)`. 
+ + @liveexample{The following code shows an example for the copy + constructor.,basic_json__basic_json} + + @since version 1.0.0 + */ + basic_json(const basic_json& other) + : m_type(other.m_type) + { + // check of passed value is valid + other.assert_invariant(); + + switch (m_type) + { + case value_t::object: + { + m_value = *other.m_value.object; + break; + } + + case value_t::array: + { + m_value = *other.m_value.array; + break; + } + + case value_t::string: + { + m_value = *other.m_value.string; + break; + } + + case value_t::boolean: + { + m_value = other.m_value.boolean; + break; + } + + case value_t::number_integer: + { + m_value = other.m_value.number_integer; + break; + } + + case value_t::number_unsigned: + { + m_value = other.m_value.number_unsigned; + break; + } + + case value_t::number_float: + { + m_value = other.m_value.number_float; + break; + } + + case value_t::binary: + { + m_value = *other.m_value.binary; + break; + } + + default: + break; + } + + assert_invariant(); + } + + /*! + @brief move constructor + + Move constructor. Constructs a JSON value with the contents of the given + value @a other using move semantics. It "steals" the resources from @a + other and leaves it as JSON null value. + + @param[in,out] other value to move to this object + + @post `*this` has the same value as @a other before the call. + @post @a other is a JSON null value. + + @complexity Constant. + + @exceptionsafety No-throw guarantee: this constructor never throws + exceptions. + + @requirement This function helps `basic_json` satisfying the + [MoveConstructible](https://en.cppreference.com/w/cpp/named_req/MoveConstructible) + requirements. 
+ + @liveexample{The code below shows the move constructor explicitly called + via std::move.,basic_json__moveconstructor} + + @since version 1.0.0 + */ + basic_json(basic_json&& other) noexcept + : m_type(std::move(other.m_type)), + m_value(std::move(other.m_value)) + { + // check that passed value is valid + other.assert_invariant(); + + // invalidate payload + other.m_type = value_t::null; + other.m_value = {}; + + assert_invariant(); + } + + /*! + @brief copy assignment + + Copy assignment operator. Copies a JSON value via the "copy and swap" + strategy: It is expressed in terms of the copy constructor, destructor, + and the `swap()` member function. + + @param[in] other value to copy from + + @complexity Linear. + + @requirement This function helps `basic_json` satisfying the + [Container](https://en.cppreference.com/w/cpp/named_req/Container) + requirements: + - The complexity is linear. + + @liveexample{The code below shows and example for the copy assignment. It + creates a copy of value `a` which is then swapped with `b`. Finally\, the + copy of `a` (which is the null value after the swap) is + destroyed.,basic_json__copyassignment} + + @since version 1.0.0 + */ + basic_json& operator=(basic_json other) noexcept ( + std::is_nothrow_move_constructible::value&& + std::is_nothrow_move_assignable::value&& + std::is_nothrow_move_constructible::value&& + std::is_nothrow_move_assignable::value + ) + { + // check that passed value is valid + other.assert_invariant(); + + using std::swap; + swap(m_type, other.m_type); + swap(m_value, other.m_value); + + assert_invariant(); + return *this; + } + + /*! + @brief destructor + + Destroys the JSON value and frees all allocated memory. + + @complexity Linear. + + @requirement This function helps `basic_json` satisfying the + [Container](https://en.cppreference.com/w/cpp/named_req/Container) + requirements: + - The complexity is linear. + - All stored elements are destroyed and all memory is freed. 
+ + @since version 1.0.0 + */ + ~basic_json() noexcept + { + assert_invariant(); + m_value.destroy(m_type); + } + + /// @} + + public: + /////////////////////// + // object inspection // + /////////////////////// + + /// @name object inspection + /// Functions to inspect the type of a JSON value. + /// @{ + + /*! + @brief serialization + + Serialization function for JSON values. The function tries to mimic + Python's `json.dumps()` function, and currently supports its @a indent + and @a ensure_ascii parameters. + + @param[in] indent If indent is nonnegative, then array elements and object + members will be pretty-printed with that indent level. An indent level of + `0` will only insert newlines. `-1` (the default) selects the most compact + representation. + @param[in] indent_char The character to use for indentation if @a indent is + greater than `0`. The default is ` ` (space). + @param[in] ensure_ascii If @a ensure_ascii is true, all non-ASCII characters + in the output are escaped with `\uXXXX` sequences, and the result consists + of ASCII characters only. + @param[in] error_handler how to react on decoding errors; there are three + possible values: `strict` (throws and exception in case a decoding error + occurs; default), `replace` (replace invalid UTF-8 sequences with U+FFFD), + and `ignore` (ignore invalid UTF-8 sequences during serialization; all + bytes are copied to the output unchanged). + + @return string containing the serialization of the JSON value + + @throw type_error.316 if a string stored inside the JSON value is not + UTF-8 encoded and @a error_handler is set to strict + + @note Binary values are serialized as object containing two keys: + - "bytes": an array of bytes as integers + - "subtype": the subtype as integer or "null" if the binary has no subtype + + @complexity Linear. + + @exceptionsafety Strong guarantee: if an exception is thrown, there are no + changes in the JSON value. 
+ + @liveexample{The following example shows the effect of different @a indent\, + @a indent_char\, and @a ensure_ascii parameters to the result of the + serialization.,dump} + + @see https://docs.python.org/2/library/json.html#json.dump + + @since version 1.0.0; indentation character @a indent_char, option + @a ensure_ascii and exceptions added in version 3.0.0; error + handlers added in version 3.4.0; serialization of binary values added + in version 3.8.0. + */ + string_t dump(const int indent = -1, + const char indent_char = ' ', + const bool ensure_ascii = false, + const error_handler_t error_handler = error_handler_t::strict) const + { + string_t result; + serializer s(detail::output_adapter(result), indent_char, error_handler); + + if (indent >= 0) + { + s.dump(*this, true, ensure_ascii, static_cast(indent)); + } + else + { + s.dump(*this, false, ensure_ascii, 0); + } + + return result; + } + + /*! + @brief return the type of the JSON value (explicit) + + Return the type of the JSON value as a value from the @ref value_t + enumeration. + + @return the type of the JSON value + Value type | return value + ------------------------- | ------------------------- + null | value_t::null + boolean | value_t::boolean + string | value_t::string + number (integer) | value_t::number_integer + number (unsigned integer) | value_t::number_unsigned + number (floating-point) | value_t::number_float + object | value_t::object + array | value_t::array + binary | value_t::binary + discarded | value_t::discarded + + @complexity Constant. + + @exceptionsafety No-throw guarantee: this member function never throws + exceptions. + + @liveexample{The following code exemplifies `type()` for all JSON + types.,type} + + @sa @ref operator value_t() -- return the type of the JSON value (implicit) + @sa @ref type_name() -- return the type as string + + @since version 1.0.0 + */ + constexpr value_t type() const noexcept + { + return m_type; + } + + /*! 
+ @brief return whether type is primitive + + This function returns true if and only if the JSON type is primitive + (string, number, boolean, or null). + + @return `true` if type is primitive (string, number, boolean, or null), + `false` otherwise. + + @complexity Constant. + + @exceptionsafety No-throw guarantee: this member function never throws + exceptions. + + @liveexample{The following code exemplifies `is_primitive()` for all JSON + types.,is_primitive} + + @sa @ref is_structured() -- returns whether JSON value is structured + @sa @ref is_null() -- returns whether JSON value is `null` + @sa @ref is_string() -- returns whether JSON value is a string + @sa @ref is_boolean() -- returns whether JSON value is a boolean + @sa @ref is_number() -- returns whether JSON value is a number + @sa @ref is_binary() -- returns whether JSON value is a binary array + + @since version 1.0.0 + */ + constexpr bool is_primitive() const noexcept + { + return is_null() || is_string() || is_boolean() || is_number() || is_binary(); + } + + /*! + @brief return whether type is structured + + This function returns true if and only if the JSON type is structured + (array or object). + + @return `true` if type is structured (array or object), `false` otherwise. + + @complexity Constant. + + @exceptionsafety No-throw guarantee: this member function never throws + exceptions. + + @liveexample{The following code exemplifies `is_structured()` for all JSON + types.,is_structured} + + @sa @ref is_primitive() -- returns whether value is primitive + @sa @ref is_array() -- returns whether value is an array + @sa @ref is_object() -- returns whether value is an object + + @since version 1.0.0 + */ + constexpr bool is_structured() const noexcept + { + return is_array() || is_object(); + } + + /*! + @brief return whether value is null + + This function returns true if and only if the JSON value is null. + + @return `true` if type is null, `false` otherwise. + + @complexity Constant. 
+ + @exceptionsafety No-throw guarantee: this member function never throws + exceptions. + + @liveexample{The following code exemplifies `is_null()` for all JSON + types.,is_null} + + @since version 1.0.0 + */ + constexpr bool is_null() const noexcept + { + return m_type == value_t::null; + } + + /*! + @brief return whether value is a boolean + + This function returns true if and only if the JSON value is a boolean. + + @return `true` if type is boolean, `false` otherwise. + + @complexity Constant. + + @exceptionsafety No-throw guarantee: this member function never throws + exceptions. + + @liveexample{The following code exemplifies `is_boolean()` for all JSON + types.,is_boolean} + + @since version 1.0.0 + */ + constexpr bool is_boolean() const noexcept + { + return m_type == value_t::boolean; + } + + /*! + @brief return whether value is a number + + This function returns true if and only if the JSON value is a number. This + includes both integer (signed and unsigned) and floating-point values. + + @return `true` if type is number (regardless whether integer, unsigned + integer or floating-type), `false` otherwise. + + @complexity Constant. + + @exceptionsafety No-throw guarantee: this member function never throws + exceptions. + + @liveexample{The following code exemplifies `is_number()` for all JSON + types.,is_number} + + @sa @ref is_number_integer() -- check if value is an integer or unsigned + integer number + @sa @ref is_number_unsigned() -- check if value is an unsigned integer + number + @sa @ref is_number_float() -- check if value is a floating-point number + + @since version 1.0.0 + */ + constexpr bool is_number() const noexcept + { + return is_number_integer() || is_number_float(); + } + + /*! + @brief return whether value is an integer number + + This function returns true if and only if the JSON value is a signed or + unsigned integer number. This excludes floating-point values. 
+ + @return `true` if type is an integer or unsigned integer number, `false` + otherwise. + + @complexity Constant. + + @exceptionsafety No-throw guarantee: this member function never throws + exceptions. + + @liveexample{The following code exemplifies `is_number_integer()` for all + JSON types.,is_number_integer} + + @sa @ref is_number() -- check if value is a number + @sa @ref is_number_unsigned() -- check if value is an unsigned integer + number + @sa @ref is_number_float() -- check if value is a floating-point number + + @since version 1.0.0 + */ + constexpr bool is_number_integer() const noexcept + { + return m_type == value_t::number_integer || m_type == value_t::number_unsigned; + } + + /*! + @brief return whether value is an unsigned integer number + + This function returns true if and only if the JSON value is an unsigned + integer number. This excludes floating-point and signed integer values. + + @return `true` if type is an unsigned integer number, `false` otherwise. + + @complexity Constant. + + @exceptionsafety No-throw guarantee: this member function never throws + exceptions. + + @liveexample{The following code exemplifies `is_number_unsigned()` for all + JSON types.,is_number_unsigned} + + @sa @ref is_number() -- check if value is a number + @sa @ref is_number_integer() -- check if value is an integer or unsigned + integer number + @sa @ref is_number_float() -- check if value is a floating-point number + + @since version 2.0.0 + */ + constexpr bool is_number_unsigned() const noexcept + { + return m_type == value_t::number_unsigned; + } + + /*! + @brief return whether value is a floating-point number + + This function returns true if and only if the JSON value is a + floating-point number. This excludes signed and unsigned integer values. + + @return `true` if type is a floating-point number, `false` otherwise. + + @complexity Constant. + + @exceptionsafety No-throw guarantee: this member function never throws + exceptions. 
+ + @liveexample{The following code exemplifies `is_number_float()` for all + JSON types.,is_number_float} + + @sa @ref is_number() -- check if value is number + @sa @ref is_number_integer() -- check if value is an integer number + @sa @ref is_number_unsigned() -- check if value is an unsigned integer + number + + @since version 1.0.0 + */ + constexpr bool is_number_float() const noexcept + { + return m_type == value_t::number_float; + } + + /*! + @brief return whether value is an object + + This function returns true if and only if the JSON value is an object. + + @return `true` if type is object, `false` otherwise. + + @complexity Constant. + + @exceptionsafety No-throw guarantee: this member function never throws + exceptions. + + @liveexample{The following code exemplifies `is_object()` for all JSON + types.,is_object} + + @since version 1.0.0 + */ + constexpr bool is_object() const noexcept + { + return m_type == value_t::object; + } + + /*! + @brief return whether value is an array + + This function returns true if and only if the JSON value is an array. + + @return `true` if type is array, `false` otherwise. + + @complexity Constant. + + @exceptionsafety No-throw guarantee: this member function never throws + exceptions. + + @liveexample{The following code exemplifies `is_array()` for all JSON + types.,is_array} + + @since version 1.0.0 + */ + constexpr bool is_array() const noexcept + { + return m_type == value_t::array; + } + + /*! + @brief return whether value is a string + + This function returns true if and only if the JSON value is a string. + + @return `true` if type is string, `false` otherwise. + + @complexity Constant. + + @exceptionsafety No-throw guarantee: this member function never throws + exceptions. + + @liveexample{The following code exemplifies `is_string()` for all JSON + types.,is_string} + + @since version 1.0.0 + */ + constexpr bool is_string() const noexcept + { + return m_type == value_t::string; + } + + /*! 
+ @brief return whether value is a binary array + + This function returns true if and only if the JSON value is a binary array. + + @return `true` if type is binary array, `false` otherwise. + + @complexity Constant. + + @exceptionsafety No-throw guarantee: this member function never throws + exceptions. + + @liveexample{The following code exemplifies `is_binary()` for all JSON + types.,is_binary} + + @since version 3.8.0 + */ + constexpr bool is_binary() const noexcept + { + return m_type == value_t::binary; + } + + /*! + @brief return whether value is discarded + + This function returns true if and only if the JSON value was discarded + during parsing with a callback function (see @ref parser_callback_t). + + @note This function will always be `false` for JSON values after parsing. + That is, discarded values can only occur during parsing, but will be + removed when inside a structured value or replaced by null in other cases. + + @return `true` if type is discarded, `false` otherwise. + + @complexity Constant. + + @exceptionsafety No-throw guarantee: this member function never throws + exceptions. + + @liveexample{The following code exemplifies `is_discarded()` for all JSON + types.,is_discarded} + + @since version 1.0.0 + */ + constexpr bool is_discarded() const noexcept + { + return m_type == value_t::discarded; + } + + /*! + @brief return the type of the JSON value (implicit) + + Implicitly return the type of the JSON value as a value from the @ref + value_t enumeration. + + @return the type of the JSON value + + @complexity Constant. + + @exceptionsafety No-throw guarantee: this member function never throws + exceptions. 
+ + @liveexample{The following code exemplifies the @ref value_t operator for + all JSON types.,operator__value_t} + + @sa @ref type() -- return the type of the JSON value (explicit) + @sa @ref type_name() -- return the type as string + + @since version 1.0.0 + */ + constexpr operator value_t() const noexcept + { + return m_type; + } + + /// @} + + private: + ////////////////// + // value access // + ////////////////// + + /// get a boolean (explicit) + boolean_t get_impl(boolean_t* /*unused*/) const + { + if (JSON_HEDLEY_LIKELY(is_boolean())) + { + return m_value.boolean; + } + + JSON_THROW(type_error::create(302, "type must be boolean, but is " + std::string(type_name()))); + } + + /// get a pointer to the value (object) + object_t* get_impl_ptr(object_t* /*unused*/) noexcept + { + return is_object() ? m_value.object : nullptr; + } + + /// get a pointer to the value (object) + constexpr const object_t* get_impl_ptr(const object_t* /*unused*/) const noexcept + { + return is_object() ? m_value.object : nullptr; + } + + /// get a pointer to the value (array) + array_t* get_impl_ptr(array_t* /*unused*/) noexcept + { + return is_array() ? m_value.array : nullptr; + } + + /// get a pointer to the value (array) + constexpr const array_t* get_impl_ptr(const array_t* /*unused*/) const noexcept + { + return is_array() ? m_value.array : nullptr; + } + + /// get a pointer to the value (string) + string_t* get_impl_ptr(string_t* /*unused*/) noexcept + { + return is_string() ? m_value.string : nullptr; + } + + /// get a pointer to the value (string) + constexpr const string_t* get_impl_ptr(const string_t* /*unused*/) const noexcept + { + return is_string() ? m_value.string : nullptr; + } + + /// get a pointer to the value (boolean) + boolean_t* get_impl_ptr(boolean_t* /*unused*/) noexcept + { + return is_boolean() ? 
&m_value.boolean : nullptr; + } + + /// get a pointer to the value (boolean) + constexpr const boolean_t* get_impl_ptr(const boolean_t* /*unused*/) const noexcept + { + return is_boolean() ? &m_value.boolean : nullptr; + } + + /// get a pointer to the value (integer number) + number_integer_t* get_impl_ptr(number_integer_t* /*unused*/) noexcept + { + return is_number_integer() ? &m_value.number_integer : nullptr; + } + + /// get a pointer to the value (integer number) + constexpr const number_integer_t* get_impl_ptr(const number_integer_t* /*unused*/) const noexcept + { + return is_number_integer() ? &m_value.number_integer : nullptr; + } + + /// get a pointer to the value (unsigned number) + number_unsigned_t* get_impl_ptr(number_unsigned_t* /*unused*/) noexcept + { + return is_number_unsigned() ? &m_value.number_unsigned : nullptr; + } + + /// get a pointer to the value (unsigned number) + constexpr const number_unsigned_t* get_impl_ptr(const number_unsigned_t* /*unused*/) const noexcept + { + return is_number_unsigned() ? &m_value.number_unsigned : nullptr; + } + + /// get a pointer to the value (floating-point number) + number_float_t* get_impl_ptr(number_float_t* /*unused*/) noexcept + { + return is_number_float() ? &m_value.number_float : nullptr; + } + + /// get a pointer to the value (floating-point number) + constexpr const number_float_t* get_impl_ptr(const number_float_t* /*unused*/) const noexcept + { + return is_number_float() ? &m_value.number_float : nullptr; + } + + /// get a pointer to the value (binary) + binary_t* get_impl_ptr(binary_t* /*unused*/) noexcept + { + return is_binary() ? m_value.binary : nullptr; + } + + /// get a pointer to the value (binary) + constexpr const binary_t* get_impl_ptr(const binary_t* /*unused*/) const noexcept + { + return is_binary() ? m_value.binary : nullptr; + } + + /*! 
+ @brief helper function to implement get_ref() + + This function helps to implement get_ref() without code duplication for + const and non-const overloads + + @tparam ThisType will be deduced as `basic_json` or `const basic_json` + + @throw type_error.303 if ReferenceType does not match underlying value + type of the current JSON + */ + template + static ReferenceType get_ref_impl(ThisType& obj) + { + // delegate the call to get_ptr<>() + auto ptr = obj.template get_ptr::type>(); + + if (JSON_HEDLEY_LIKELY(ptr != nullptr)) + { + return *ptr; + } + + JSON_THROW(type_error::create(303, "incompatible ReferenceType for get_ref, actual type is " + std::string(obj.type_name()))); + } + + public: + /// @name value access + /// Direct access to the stored value of a JSON value. + /// @{ + + /*! + @brief get special-case overload + + This overloads avoids a lot of template boilerplate, it can be seen as the + identity method + + @tparam BasicJsonType == @ref basic_json + + @return a copy of *this + + @complexity Constant. + + @since version 2.1.0 + */ + template::type, basic_json_t>::value, + int> = 0> + basic_json get() const + { + return *this; + } + + /*! + @brief get special-case overload + + This overloads converts the current @ref basic_json in a different + @ref basic_json type + + @tparam BasicJsonType == @ref basic_json + + @return a copy of *this, converted into @tparam BasicJsonType + + @complexity Depending on the implementation of the called `from_json()` + method. + + @since version 3.2.0 + */ + template < typename BasicJsonType, detail::enable_if_t < + !std::is_same::value&& + detail::is_basic_json::value, int > = 0 > + BasicJsonType get() const + { + return *this; + } + + /*! 
+ @brief get a value (explicit) + + Explicit type conversion between the JSON value and a compatible value + which is [CopyConstructible](https://en.cppreference.com/w/cpp/named_req/CopyConstructible) + and [DefaultConstructible](https://en.cppreference.com/w/cpp/named_req/DefaultConstructible). + The value is converted by calling the @ref json_serializer + `from_json()` method. + + The function is equivalent to executing + @code {.cpp} + ValueType ret; + JSONSerializer::from_json(*this, ret); + return ret; + @endcode + + This overloads is chosen if: + - @a ValueType is not @ref basic_json, + - @ref json_serializer has a `from_json()` method of the form + `void from_json(const basic_json&, ValueType&)`, and + - @ref json_serializer does not have a `from_json()` method of + the form `ValueType from_json(const basic_json&)` + + @tparam ValueTypeCV the provided value type + @tparam ValueType the returned value type + + @return copy of the JSON value, converted to @a ValueType + + @throw what @ref json_serializer `from_json()` method throws + + @liveexample{The example below shows several conversions from JSON values + to other types. 
There are a few things to note
+ + @tparam ValueTypeCV the provided value type + @tparam ValueType the returned value type + + @return copy of the JSON value, converted to @a ValueType + + @throw what @ref json_serializer `from_json()` method throws + + @since version 2.1.0 + */ + template < typename ValueTypeCV, typename ValueType = detail::uncvref_t, + detail::enable_if_t < !std::is_same::value && + detail::has_non_default_from_json::value, + int > = 0 > + ValueType get() const noexcept(noexcept( + JSONSerializer::from_json(std::declval()))) + { + static_assert(!std::is_reference::value, + "get() cannot be used with reference types, you might want to use get_ref()"); + return JSONSerializer::from_json(*this); + } + + /*! + @brief get a value (explicit) + + Explicit type conversion between the JSON value and a compatible value. + The value is filled into the input parameter by calling the @ref json_serializer + `from_json()` method. + + The function is equivalent to executing + @code {.cpp} + ValueType v; + JSONSerializer::from_json(*this, v); + @endcode + + This overloads is chosen if: + - @a ValueType is not @ref basic_json, + - @ref json_serializer has a `from_json()` method of the form + `void from_json(const basic_json&, ValueType&)`, and + + @tparam ValueType the input parameter type. + + @return the input parameter, allowing chaining calls. + + @throw what @ref json_serializer `from_json()` method throws + + @liveexample{The example below shows several conversions from JSON values + to other types. 
There a few things to note: (1) Floating-point numbers can + be converted to integers\, (2) A JSON array can be converted to a standard + `std::vector`\, (3) A JSON object can be converted to C++ + associative containers such as `std::unordered_map`.,get_to} + + @since version 3.3.0 + */ + template < typename ValueType, + detail::enable_if_t < + !detail::is_basic_json::value&& + detail::has_from_json::value, + int > = 0 > + ValueType & get_to(ValueType& v) const noexcept(noexcept( + JSONSerializer::from_json(std::declval(), v))) + { + JSONSerializer::from_json(*this, v); + return v; + } + + // specialization to allow to call get_to with a basic_json value + // see https://github.com/nlohmann/json/issues/2175 + template::value, + int> = 0> + ValueType & get_to(ValueType& v) const + { + v = *this; + return v; + } + + template < + typename T, std::size_t N, + typename Array = T (&)[N], + detail::enable_if_t < + detail::has_from_json::value, int > = 0 > + Array get_to(T (&v)[N]) const + noexcept(noexcept(JSONSerializer::from_json( + std::declval(), v))) + { + JSONSerializer::from_json(*this, v); + return v; + } + + + /*! + @brief get a pointer value (implicit) + + Implicit pointer access to the internally stored JSON value. No copies are + made. + + @warning Writing data to the pointee of the result yields an undefined + state. + + @tparam PointerType pointer type; must be a pointer to @ref array_t, @ref + object_t, @ref string_t, @ref boolean_t, @ref number_integer_t, + @ref number_unsigned_t, or @ref number_float_t. Enforced by a static + assertion. + + @return pointer to the internally stored JSON value if the requested + pointer type @a PointerType fits to the JSON value; `nullptr` otherwise + + @complexity Constant. + + @liveexample{The example below shows how pointers to internal values of a + JSON value can be requested. 
Note that no type conversions are made and a + `nullptr` is returned if the value and the requested pointer type does not + match.,get_ptr} + + @since version 1.0.0 + */ + template::value, int>::type = 0> + auto get_ptr() noexcept -> decltype(std::declval().get_impl_ptr(std::declval())) + { + // delegate the call to get_impl_ptr<>() + return get_impl_ptr(static_cast(nullptr)); + } + + /*! + @brief get a pointer value (implicit) + @copydoc get_ptr() + */ + template < typename PointerType, typename std::enable_if < + std::is_pointer::value&& + std::is_const::type>::value, int >::type = 0 > + constexpr auto get_ptr() const noexcept -> decltype(std::declval().get_impl_ptr(std::declval())) + { + // delegate the call to get_impl_ptr<>() const + return get_impl_ptr(static_cast(nullptr)); + } + + /*! + @brief get a pointer value (explicit) + + Explicit pointer access to the internally stored JSON value. No copies are + made. + + @warning The pointer becomes invalid if the underlying JSON object + changes. + + @tparam PointerType pointer type; must be a pointer to @ref array_t, @ref + object_t, @ref string_t, @ref boolean_t, @ref number_integer_t, + @ref number_unsigned_t, or @ref number_float_t. + + @return pointer to the internally stored JSON value if the requested + pointer type @a PointerType fits to the JSON value; `nullptr` otherwise + + @complexity Constant. + + @liveexample{The example below shows how pointers to internal values of a + JSON value can be requested. Note that no type conversions are made and a + `nullptr` is returned if the value and the requested pointer type does not + match.,get__PointerType} + + @sa @ref get_ptr() for explicit pointer-member access + + @since version 1.0.0 + */ + template::value, int>::type = 0> + auto get() noexcept -> decltype(std::declval().template get_ptr()) + { + // delegate the call to get_ptr + return get_ptr(); + } + + /*! 
+ @brief get a pointer value (explicit) + @copydoc get() + */ + template::value, int>::type = 0> + constexpr auto get() const noexcept -> decltype(std::declval().template get_ptr()) + { + // delegate the call to get_ptr + return get_ptr(); + } + + /*! + @brief get a reference value (implicit) + + Implicit reference access to the internally stored JSON value. No copies + are made. + + @warning Writing data to the referee of the result yields an undefined + state. + + @tparam ReferenceType reference type; must be a reference to @ref array_t, + @ref object_t, @ref string_t, @ref boolean_t, @ref number_integer_t, or + @ref number_float_t. Enforced by static assertion. + + @return reference to the internally stored JSON value if the requested + reference type @a ReferenceType fits to the JSON value; throws + type_error.303 otherwise + + @throw type_error.303 in case passed type @a ReferenceType is incompatible + with the stored JSON value; see example below + + @complexity Constant. + + @liveexample{The example shows several calls to `get_ref()`.,get_ref} + + @since version 1.1.0 + */ + template::value, int>::type = 0> + ReferenceType get_ref() + { + // delegate call to get_ref_impl + return get_ref_impl(*this); + } + + /*! + @brief get a reference value (implicit) + @copydoc get_ref() + */ + template < typename ReferenceType, typename std::enable_if < + std::is_reference::value&& + std::is_const::type>::value, int >::type = 0 > + ReferenceType get_ref() const + { + // delegate call to get_ref_impl + return get_ref_impl(*this); + } + + /*! + @brief get a value (implicit) + + Implicit type conversion between the JSON value and a compatible value. + The call is realized by calling @ref get() const. + + @tparam ValueType non-pointer type compatible to the JSON value, for + instance `int` for JSON integer numbers, `bool` for JSON booleans, or + `std::vector` types for JSON arrays. 
The character type of @ref string_t + as well as an initializer list of this type is excluded to avoid + ambiguities as these types implicitly convert to `std::string`. + + @return copy of the JSON value, converted to type @a ValueType + + @throw type_error.302 in case passed type @a ValueType is incompatible + to the JSON value type (e.g., the JSON value is of type boolean, but a + string is requested); see example below + + @complexity Linear in the size of the JSON value. + + @liveexample{The example below shows several conversions from JSON values + to other types. There a few things to note: (1) Floating-point numbers can + be converted to integers\, (2) A JSON array can be converted to a standard + `std::vector`\, (3) A JSON object can be converted to C++ + associative containers such as `std::unordered_map`.,operator__ValueType} + + @since version 1.0.0 + */ + template < typename ValueType, typename std::enable_if < + !std::is_pointer::value&& + !std::is_same>::value&& + !std::is_same::value&& + !detail::is_basic_json::value + && !std::is_same>::value +#if defined(JSON_HAS_CPP_17) && (defined(__GNUC__) || (defined(_MSC_VER) && _MSC_VER >= 1910 && _MSC_VER <= 1914)) + && !std::is_same::value +#endif + && detail::is_detected::value + , int >::type = 0 > + JSON_EXPLICIT operator ValueType() const + { + // delegate the call to get<>() const + return get(); + } + + /*! 
+ @return reference to the binary value + + @throw type_error.302 if the value is not binary + + @sa @ref is_binary() to check if the value is binary + + @since version 3.8.0 + */ + binary_t& get_binary() + { + if (!is_binary()) + { + JSON_THROW(type_error::create(302, "type must be binary, but is " + std::string(type_name()))); + } + + return *get_ptr(); + } + + /// @copydoc get_binary() + const binary_t& get_binary() const + { + if (!is_binary()) + { + JSON_THROW(type_error::create(302, "type must be binary, but is " + std::string(type_name()))); + } + + return *get_ptr(); + } + + /// @} + + + //////////////////// + // element access // + //////////////////// + + /// @name element access + /// Access to the JSON value. + /// @{ + + /*! + @brief access specified array element with bounds checking + + Returns a reference to the element at specified location @a idx, with + bounds checking. + + @param[in] idx index of the element to access + + @return reference to the element at index @a idx + + @throw type_error.304 if the JSON value is not an array; in this case, + calling `at` with an index makes no sense. See example below. + @throw out_of_range.401 if the index @a idx is out of range of the array; + that is, `idx >= size()`. See example below. + + @exceptionsafety Strong guarantee: if an exception is thrown, there are no + changes in the JSON value. + + @complexity Constant. + + @since version 1.0.0 + + @liveexample{The example below shows how array elements can be read and + written using `at()`. 
It also demonstrates the different exceptions that + can be thrown.,at__size_type} + */ + reference at(size_type idx) + { + // at only works for arrays + if (JSON_HEDLEY_LIKELY(is_array())) + { + JSON_TRY + { + return m_value.array->at(idx); + } + JSON_CATCH (std::out_of_range&) + { + // create better exception explanation + JSON_THROW(out_of_range::create(401, "array index " + std::to_string(idx) + " is out of range")); + } + } + else + { + JSON_THROW(type_error::create(304, "cannot use at() with " + std::string(type_name()))); + } + } + + /*! + @brief access specified array element with bounds checking + + Returns a const reference to the element at specified location @a idx, + with bounds checking. + + @param[in] idx index of the element to access + + @return const reference to the element at index @a idx + + @throw type_error.304 if the JSON value is not an array; in this case, + calling `at` with an index makes no sense. See example below. + @throw out_of_range.401 if the index @a idx is out of range of the array; + that is, `idx >= size()`. See example below. + + @exceptionsafety Strong guarantee: if an exception is thrown, there are no + changes in the JSON value. + + @complexity Constant. + + @since version 1.0.0 + + @liveexample{The example below shows how array elements can be read using + `at()`. It also demonstrates the different exceptions that can be thrown., + at__size_type_const} + */ + const_reference at(size_type idx) const + { + // at only works for arrays + if (JSON_HEDLEY_LIKELY(is_array())) + { + JSON_TRY + { + return m_value.array->at(idx); + } + JSON_CATCH (std::out_of_range&) + { + // create better exception explanation + JSON_THROW(out_of_range::create(401, "array index " + std::to_string(idx) + " is out of range")); + } + } + else + { + JSON_THROW(type_error::create(304, "cannot use at() with " + std::string(type_name()))); + } + } + + /*! 
+ @brief access specified object element with bounds checking + + Returns a reference to the element at with specified key @a key, with + bounds checking. + + @param[in] key key of the element to access + + @return reference to the element at key @a key + + @throw type_error.304 if the JSON value is not an object; in this case, + calling `at` with a key makes no sense. See example below. + @throw out_of_range.403 if the key @a key is is not stored in the object; + that is, `find(key) == end()`. See example below. + + @exceptionsafety Strong guarantee: if an exception is thrown, there are no + changes in the JSON value. + + @complexity Logarithmic in the size of the container. + + @sa @ref operator[](const typename object_t::key_type&) for unchecked + access by reference + @sa @ref value() for access by value with a default value + + @since version 1.0.0 + + @liveexample{The example below shows how object elements can be read and + written using `at()`. It also demonstrates the different exceptions that + can be thrown.,at__object_t_key_type} + */ + reference at(const typename object_t::key_type& key) + { + // at only works for objects + if (JSON_HEDLEY_LIKELY(is_object())) + { + JSON_TRY + { + return m_value.object->at(key); + } + JSON_CATCH (std::out_of_range&) + { + // create better exception explanation + JSON_THROW(out_of_range::create(403, "key '" + key + "' not found")); + } + } + else + { + JSON_THROW(type_error::create(304, "cannot use at() with " + std::string(type_name()))); + } + } + + /*! + @brief access specified object element with bounds checking + + Returns a const reference to the element at with specified key @a key, + with bounds checking. + + @param[in] key key of the element to access + + @return const reference to the element at key @a key + + @throw type_error.304 if the JSON value is not an object; in this case, + calling `at` with a key makes no sense. See example below. 
+ @throw out_of_range.403 if the key @a key is is not stored in the object; + that is, `find(key) == end()`. See example below. + + @exceptionsafety Strong guarantee: if an exception is thrown, there are no + changes in the JSON value. + + @complexity Logarithmic in the size of the container. + + @sa @ref operator[](const typename object_t::key_type&) for unchecked + access by reference + @sa @ref value() for access by value with a default value + + @since version 1.0.0 + + @liveexample{The example below shows how object elements can be read using + `at()`. It also demonstrates the different exceptions that can be thrown., + at__object_t_key_type_const} + */ + const_reference at(const typename object_t::key_type& key) const + { + // at only works for objects + if (JSON_HEDLEY_LIKELY(is_object())) + { + JSON_TRY + { + return m_value.object->at(key); + } + JSON_CATCH (std::out_of_range&) + { + // create better exception explanation + JSON_THROW(out_of_range::create(403, "key '" + key + "' not found")); + } + } + else + { + JSON_THROW(type_error::create(304, "cannot use at() with " + std::string(type_name()))); + } + } + + /*! + @brief access specified array element + + Returns a reference to the element at specified location @a idx. + + @note If @a idx is beyond the range of the array (i.e., `idx >= size()`), + then the array is silently filled up with `null` values to make `idx` a + valid reference to the last stored element. + + @param[in] idx index of the element to access + + @return reference to the element at index @a idx + + @throw type_error.305 if the JSON value is not an array or null; in that + cases, using the [] operator with an index makes no sense. + + @complexity Constant if @a idx is in the range of the array. Otherwise + linear in `idx - size()`. + + @liveexample{The example below shows how array elements can be read and + written using `[]` operator. 
Note the addition of `null` + values.,operatorarray__size_type} + + @since version 1.0.0 + */ + reference operator[](size_type idx) + { + // implicitly convert null value to an empty array + if (is_null()) + { + m_type = value_t::array; + m_value.array = create(); + assert_invariant(); + } + + // operator[] only works for arrays + if (JSON_HEDLEY_LIKELY(is_array())) + { + // fill up array with null values if given idx is outside range + if (idx >= m_value.array->size()) + { + m_value.array->insert(m_value.array->end(), + idx - m_value.array->size() + 1, + basic_json()); + } + + return m_value.array->operator[](idx); + } + + JSON_THROW(type_error::create(305, "cannot use operator[] with a numeric argument with " + std::string(type_name()))); + } + + /*! + @brief access specified array element + + Returns a const reference to the element at specified location @a idx. + + @param[in] idx index of the element to access + + @return const reference to the element at index @a idx + + @throw type_error.305 if the JSON value is not an array; in that case, + using the [] operator with an index makes no sense. + + @complexity Constant. + + @liveexample{The example below shows how array elements can be read using + the `[]` operator.,operatorarray__size_type_const} + + @since version 1.0.0 + */ + const_reference operator[](size_type idx) const + { + // const operator[] only works for arrays + if (JSON_HEDLEY_LIKELY(is_array())) + { + return m_value.array->operator[](idx); + } + + JSON_THROW(type_error::create(305, "cannot use operator[] with a numeric argument with " + std::string(type_name()))); + } + + /*! + @brief access specified object element + + Returns a reference to the element at with specified key @a key. + + @note If @a key is not found in the object, then it is silently added to + the object and filled with a `null` value to make `key` a valid reference. + In case the value was `null` before, it is converted to an object. 
+ + @param[in] key key of the element to access + + @return reference to the element at key @a key + + @throw type_error.305 if the JSON value is not an object or null; in that + cases, using the [] operator with a key makes no sense. + + @complexity Logarithmic in the size of the container. + + @liveexample{The example below shows how object elements can be read and + written using the `[]` operator.,operatorarray__key_type} + + @sa @ref at(const typename object_t::key_type&) for access by reference + with range checking + @sa @ref value() for access by value with a default value + + @since version 1.0.0 + */ + reference operator[](const typename object_t::key_type& key) + { + // implicitly convert null value to an empty object + if (is_null()) + { + m_type = value_t::object; + m_value.object = create(); + assert_invariant(); + } + + // operator[] only works for objects + if (JSON_HEDLEY_LIKELY(is_object())) + { + return m_value.object->operator[](key); + } + + JSON_THROW(type_error::create(305, "cannot use operator[] with a string argument with " + std::string(type_name()))); + } + + /*! + @brief read-only access specified object element + + Returns a const reference to the element at with specified key @a key. No + bounds checking is performed. + + @warning If the element with key @a key does not exist, the behavior is + undefined. + + @param[in] key key of the element to access + + @return const reference to the element at key @a key + + @pre The element with key @a key must exist. **This precondition is + enforced with an assertion.** + + @throw type_error.305 if the JSON value is not an object; in that case, + using the [] operator with a key makes no sense. + + @complexity Logarithmic in the size of the container. 
+ + @liveexample{The example below shows how object elements can be read using + the `[]` operator.,operatorarray__key_type_const} + + @sa @ref at(const typename object_t::key_type&) for access by reference + with range checking + @sa @ref value() for access by value with a default value + + @since version 1.0.0 + */ + const_reference operator[](const typename object_t::key_type& key) const + { + // const operator[] only works for objects + if (JSON_HEDLEY_LIKELY(is_object())) + { + JSON_ASSERT(m_value.object->find(key) != m_value.object->end()); + return m_value.object->find(key)->second; + } + + JSON_THROW(type_error::create(305, "cannot use operator[] with a string argument with " + std::string(type_name()))); + } + + /*! + @brief access specified object element + + Returns a reference to the element at with specified key @a key. + + @note If @a key is not found in the object, then it is silently added to + the object and filled with a `null` value to make `key` a valid reference. + In case the value was `null` before, it is converted to an object. + + @param[in] key key of the element to access + + @return reference to the element at key @a key + + @throw type_error.305 if the JSON value is not an object or null; in that + cases, using the [] operator with a key makes no sense. + + @complexity Logarithmic in the size of the container. 
+ + @liveexample{The example below shows how object elements can be read and + written using the `[]` operator.,operatorarray__key_type} + + @sa @ref at(const typename object_t::key_type&) for access by reference + with range checking + @sa @ref value() for access by value with a default value + + @since version 1.1.0 + */ + template + JSON_HEDLEY_NON_NULL(2) + reference operator[](T* key) + { + // implicitly convert null to object + if (is_null()) + { + m_type = value_t::object; + m_value = value_t::object; + assert_invariant(); + } + + // at only works for objects + if (JSON_HEDLEY_LIKELY(is_object())) + { + return m_value.object->operator[](key); + } + + JSON_THROW(type_error::create(305, "cannot use operator[] with a string argument with " + std::string(type_name()))); + } + + /*! + @brief read-only access specified object element + + Returns a const reference to the element at with specified key @a key. No + bounds checking is performed. + + @warning If the element with key @a key does not exist, the behavior is + undefined. + + @param[in] key key of the element to access + + @return const reference to the element at key @a key + + @pre The element with key @a key must exist. **This precondition is + enforced with an assertion.** + + @throw type_error.305 if the JSON value is not an object; in that case, + using the [] operator with a key makes no sense. + + @complexity Logarithmic in the size of the container. 
+ + @liveexample{The example below shows how object elements can be read using + the `[]` operator.,operatorarray__key_type_const} + + @sa @ref at(const typename object_t::key_type&) for access by reference + with range checking + @sa @ref value() for access by value with a default value + + @since version 1.1.0 + */ + template + JSON_HEDLEY_NON_NULL(2) + const_reference operator[](T* key) const + { + // at only works for objects + if (JSON_HEDLEY_LIKELY(is_object())) + { + JSON_ASSERT(m_value.object->find(key) != m_value.object->end()); + return m_value.object->find(key)->second; + } + + JSON_THROW(type_error::create(305, "cannot use operator[] with a string argument with " + std::string(type_name()))); + } + + /*! + @brief access specified object element with default value + + Returns either a copy of an object's element at the specified key @a key + or a given default value if no element with key @a key exists. + + The function is basically equivalent to executing + @code {.cpp} + try { + return at(key); + } catch(out_of_range) { + return default_value; + } + @endcode + + @note Unlike @ref at(const typename object_t::key_type&), this function + does not throw if the given key @a key was not found. + + @note Unlike @ref operator[](const typename object_t::key_type& key), this + function does not implicitly add an element to the position defined by @a + key. This function is furthermore also applicable to const objects. + + @param[in] key key of the element to access + @param[in] default_value the value to return if @a key is not found + + @tparam ValueType type compatible to JSON values, for instance `int` for + JSON integer numbers, `bool` for JSON booleans, or `std::vector` types for + JSON arrays. Note the type of the expected value at @a key and the default + value @a default_value must be compatible. 
+ + @return copy of the element at key @a key or @a default_value if @a key + is not found + + @throw type_error.302 if @a default_value does not match the type of the + value at @a key + @throw type_error.306 if the JSON value is not an object; in that case, + using `value()` with a key makes no sense. + + @complexity Logarithmic in the size of the container. + + @liveexample{The example below shows how object elements can be queried + with a default value.,basic_json__value} + + @sa @ref at(const typename object_t::key_type&) for access by reference + with range checking + @sa @ref operator[](const typename object_t::key_type&) for unchecked + access by reference + + @since version 1.0.0 + */ + // using std::is_convertible in a std::enable_if will fail when using explicit conversions + template < class ValueType, typename std::enable_if < + detail::is_getable::value + && !std::is_same::value, int >::type = 0 > + ValueType value(const typename object_t::key_type& key, const ValueType& default_value) const + { + // at only works for objects + if (JSON_HEDLEY_LIKELY(is_object())) + { + // if key is found, return value and given default value otherwise + const auto it = find(key); + if (it != end()) + { + return it->template get(); + } + + return default_value; + } + + JSON_THROW(type_error::create(306, "cannot use value() with " + std::string(type_name()))); + } + + /*! + @brief overload for a default value of type const char* + @copydoc basic_json::value(const typename object_t::key_type&, const ValueType&) const + */ + string_t value(const typename object_t::key_type& key, const char* default_value) const + { + return value(key, string_t(default_value)); + } + + /*! + @brief access specified object element via JSON Pointer with default value + + Returns either a copy of an object's element at the specified key @a key + or a given default value if no element with key @a key exists. 
+ + The function is basically equivalent to executing + @code {.cpp} + try { + return at(ptr); + } catch(out_of_range) { + return default_value; + } + @endcode + + @note Unlike @ref at(const json_pointer&), this function does not throw + if the given key @a key was not found. + + @param[in] ptr a JSON pointer to the element to access + @param[in] default_value the value to return if @a ptr found no value + + @tparam ValueType type compatible to JSON values, for instance `int` for + JSON integer numbers, `bool` for JSON booleans, or `std::vector` types for + JSON arrays. Note the type of the expected value at @a key and the default + value @a default_value must be compatible. + + @return copy of the element at key @a key or @a default_value if @a key + is not found + + @throw type_error.302 if @a default_value does not match the type of the + value at @a ptr + @throw type_error.306 if the JSON value is not an object; in that case, + using `value()` with a key makes no sense. + + @complexity Logarithmic in the size of the container. + + @liveexample{The example below shows how object elements can be queried + with a default value.,basic_json__value_ptr} + + @sa @ref operator[](const json_pointer&) for unchecked access by reference + + @since version 2.0.2 + */ + template::value, int>::type = 0> + ValueType value(const json_pointer& ptr, const ValueType& default_value) const + { + // at only works for objects + if (JSON_HEDLEY_LIKELY(is_object())) + { + // if pointer resolves a value, return it or use default value + JSON_TRY + { + return ptr.get_checked(this).template get(); + } + JSON_INTERNAL_CATCH (out_of_range&) + { + return default_value; + } + } + + JSON_THROW(type_error::create(306, "cannot use value() with " + std::string(type_name()))); + } + + /*! 
+ @brief overload for a default value of type const char* + @copydoc basic_json::value(const json_pointer&, ValueType) const + */ + JSON_HEDLEY_NON_NULL(3) + string_t value(const json_pointer& ptr, const char* default_value) const + { + return value(ptr, string_t(default_value)); + } + + /*! + @brief access the first element + + Returns a reference to the first element in the container. For a JSON + container `c`, the expression `c.front()` is equivalent to `*c.begin()`. + + @return In case of a structured type (array or object), a reference to the + first element is returned. In case of number, string, boolean, or binary + values, a reference to the value is returned. + + @complexity Constant. + + @pre The JSON value must not be `null` (would throw `std::out_of_range`) + or an empty array or object (undefined behavior, **guarded by + assertions**). + @post The JSON value remains unchanged. + + @throw invalid_iterator.214 when called on `null` value + + @liveexample{The following code shows an example for `front()`.,front} + + @sa @ref back() -- access the last element + + @since version 1.0.0 + */ + reference front() + { + return *begin(); + } + + /*! + @copydoc basic_json::front() + */ + const_reference front() const + { + return *cbegin(); + } + + /*! + @brief access the last element + + Returns a reference to the last element in the container. For a JSON + container `c`, the expression `c.back()` is equivalent to + @code {.cpp} + auto tmp = c.end(); + --tmp; + return *tmp; + @endcode + + @return In case of a structured type (array or object), a reference to the + last element is returned. In case of number, string, boolean, or binary + values, a reference to the value is returned. + + @complexity Constant. + + @pre The JSON value must not be `null` (would throw `std::out_of_range`) + or an empty array or object (undefined behavior, **guarded by + assertions**). + @post The JSON value remains unchanged. 
+ + @throw invalid_iterator.214 when called on a `null` value. See example + below. + + @liveexample{The following code shows an example for `back()`.,back} + + @sa @ref front() -- access the first element + + @since version 1.0.0 + */ + reference back() + { + auto tmp = end(); + --tmp; + return *tmp; + } + + /*! + @copydoc basic_json::back() + */ + const_reference back() const + { + auto tmp = cend(); + --tmp; + return *tmp; + } + + /*! + @brief remove element given an iterator + + Removes the element specified by iterator @a pos. The iterator @a pos must + be valid and dereferenceable. Thus the `end()` iterator (which is valid, + but is not dereferenceable) cannot be used as a value for @a pos. + + If called on a primitive type other than `null`, the resulting JSON value + will be `null`. + + @param[in] pos iterator to the element to remove + @return Iterator following the last removed element. If the iterator @a + pos refers to the last element, the `end()` iterator is returned. + + @tparam IteratorType an @ref iterator or @ref const_iterator + + @post Invalidates iterators and references at or after the point of the + erase, including the `end()` iterator. 
+ + @throw type_error.307 if called on a `null` value; example: `"cannot use + erase() with null"` + @throw invalid_iterator.202 if called on an iterator which does not belong + to the current JSON value; example: `"iterator does not fit current + value"` + @throw invalid_iterator.205 if called on a primitive type with invalid + iterator (i.e., any iterator which is not `begin()`); example: `"iterator + out of range"` + + @complexity The complexity depends on the type: + - objects: amortized constant + - arrays: linear in distance between @a pos and the end of the container + - strings and binary: linear in the length of the member + - other types: constant + + @liveexample{The example shows the result of `erase()` for different JSON + types.,erase__IteratorType} + + @sa @ref erase(IteratorType, IteratorType) -- removes the elements in + the given range + @sa @ref erase(const typename object_t::key_type&) -- removes the element + from an object at the given key + @sa @ref erase(const size_type) -- removes the element from an array at + the given index + + @since version 1.0.0 + */ + template < class IteratorType, typename std::enable_if < + std::is_same::value || + std::is_same::value, int >::type + = 0 > + IteratorType erase(IteratorType pos) + { + // make sure iterator fits the current value + if (JSON_HEDLEY_UNLIKELY(this != pos.m_object)) + { + JSON_THROW(invalid_iterator::create(202, "iterator does not fit current value")); + } + + IteratorType result = end(); + + switch (m_type) + { + case value_t::boolean: + case value_t::number_float: + case value_t::number_integer: + case value_t::number_unsigned: + case value_t::string: + case value_t::binary: + { + if (JSON_HEDLEY_UNLIKELY(!pos.m_it.primitive_iterator.is_begin())) + { + JSON_THROW(invalid_iterator::create(205, "iterator out of range")); + } + + if (is_string()) + { + AllocatorType alloc; + std::allocator_traits::destroy(alloc, m_value.string); + std::allocator_traits::deallocate(alloc, m_value.string, 
1); + m_value.string = nullptr; + } + else if (is_binary()) + { + AllocatorType alloc; + std::allocator_traits::destroy(alloc, m_value.binary); + std::allocator_traits::deallocate(alloc, m_value.binary, 1); + m_value.binary = nullptr; + } + + m_type = value_t::null; + assert_invariant(); + break; + } + + case value_t::object: + { + result.m_it.object_iterator = m_value.object->erase(pos.m_it.object_iterator); + break; + } + + case value_t::array: + { + result.m_it.array_iterator = m_value.array->erase(pos.m_it.array_iterator); + break; + } + + default: + JSON_THROW(type_error::create(307, "cannot use erase() with " + std::string(type_name()))); + } + + return result; + } + + /*! + @brief remove elements given an iterator range + + Removes the element specified by the range `[first; last)`. The iterator + @a first does not need to be dereferenceable if `first == last`: erasing + an empty range is a no-op. + + If called on a primitive type other than `null`, the resulting JSON value + will be `null`. + + @param[in] first iterator to the beginning of the range to remove + @param[in] last iterator past the end of the range to remove + @return Iterator following the last removed element. If the iterator @a + second refers to the last element, the `end()` iterator is returned. + + @tparam IteratorType an @ref iterator or @ref const_iterator + + @post Invalidates iterators and references at or after the point of the + erase, including the `end()` iterator. 
+ + @throw type_error.307 if called on a `null` value; example: `"cannot use + erase() with null"` + @throw invalid_iterator.203 if called on iterators which does not belong + to the current JSON value; example: `"iterators do not fit current value"` + @throw invalid_iterator.204 if called on a primitive type with invalid + iterators (i.e., if `first != begin()` and `last != end()`); example: + `"iterators out of range"` + + @complexity The complexity depends on the type: + - objects: `log(size()) + std::distance(first, last)` + - arrays: linear in the distance between @a first and @a last, plus linear + in the distance between @a last and end of the container + - strings and binary: linear in the length of the member + - other types: constant + + @liveexample{The example shows the result of `erase()` for different JSON + types.,erase__IteratorType_IteratorType} + + @sa @ref erase(IteratorType) -- removes the element at a given position + @sa @ref erase(const typename object_t::key_type&) -- removes the element + from an object at the given key + @sa @ref erase(const size_type) -- removes the element from an array at + the given index + + @since version 1.0.0 + */ + template < class IteratorType, typename std::enable_if < + std::is_same::value || + std::is_same::value, int >::type + = 0 > + IteratorType erase(IteratorType first, IteratorType last) + { + // make sure iterator fits the current value + if (JSON_HEDLEY_UNLIKELY(this != first.m_object || this != last.m_object)) + { + JSON_THROW(invalid_iterator::create(203, "iterators do not fit current value")); + } + + IteratorType result = end(); + + switch (m_type) + { + case value_t::boolean: + case value_t::number_float: + case value_t::number_integer: + case value_t::number_unsigned: + case value_t::string: + case value_t::binary: + { + if (JSON_HEDLEY_LIKELY(!first.m_it.primitive_iterator.is_begin() + || !last.m_it.primitive_iterator.is_end())) + { + JSON_THROW(invalid_iterator::create(204, "iterators out of 
range")); + } + + if (is_string()) + { + AllocatorType alloc; + std::allocator_traits::destroy(alloc, m_value.string); + std::allocator_traits::deallocate(alloc, m_value.string, 1); + m_value.string = nullptr; + } + else if (is_binary()) + { + AllocatorType alloc; + std::allocator_traits::destroy(alloc, m_value.binary); + std::allocator_traits::deallocate(alloc, m_value.binary, 1); + m_value.binary = nullptr; + } + + m_type = value_t::null; + assert_invariant(); + break; + } + + case value_t::object: + { + result.m_it.object_iterator = m_value.object->erase(first.m_it.object_iterator, + last.m_it.object_iterator); + break; + } + + case value_t::array: + { + result.m_it.array_iterator = m_value.array->erase(first.m_it.array_iterator, + last.m_it.array_iterator); + break; + } + + default: + JSON_THROW(type_error::create(307, "cannot use erase() with " + std::string(type_name()))); + } + + return result; + } + + /*! + @brief remove element from a JSON object given a key + + Removes elements from a JSON object with the key value @a key. + + @param[in] key value of the elements to remove + + @return Number of elements removed. If @a ObjectType is the default + `std::map` type, the return value will always be `0` (@a key was not + found) or `1` (@a key was found). + + @post References and iterators to the erased elements are invalidated. + Other references and iterators are not affected. 
+ + @throw type_error.307 when called on a type other than JSON object; + example: `"cannot use erase() with null"` + + @complexity `log(size()) + count(key)` + + @liveexample{The example shows the effect of `erase()`.,erase__key_type} + + @sa @ref erase(IteratorType) -- removes the element at a given position + @sa @ref erase(IteratorType, IteratorType) -- removes the elements in + the given range + @sa @ref erase(const size_type) -- removes the element from an array at + the given index + + @since version 1.0.0 + */ + size_type erase(const typename object_t::key_type& key) + { + // this erase only works for objects + if (JSON_HEDLEY_LIKELY(is_object())) + { + return m_value.object->erase(key); + } + + JSON_THROW(type_error::create(307, "cannot use erase() with " + std::string(type_name()))); + } + + /*! + @brief remove element from a JSON array given an index + + Removes element from a JSON array at the index @a idx. + + @param[in] idx index of the element to remove + + @throw type_error.307 when called on a type other than JSON object; + example: `"cannot use erase() with null"` + @throw out_of_range.401 when `idx >= size()`; example: `"array index 17 + is out of range"` + + @complexity Linear in distance between @a idx and the end of the container. 
+ + @liveexample{The example shows the effect of `erase()`.,erase__size_type} + + @sa @ref erase(IteratorType) -- removes the element at a given position + @sa @ref erase(IteratorType, IteratorType) -- removes the elements in + the given range + @sa @ref erase(const typename object_t::key_type&) -- removes the element + from an object at the given key + + @since version 1.0.0 + */ + void erase(const size_type idx) + { + // this erase only works for arrays + if (JSON_HEDLEY_LIKELY(is_array())) + { + if (JSON_HEDLEY_UNLIKELY(idx >= size())) + { + JSON_THROW(out_of_range::create(401, "array index " + std::to_string(idx) + " is out of range")); + } + + m_value.array->erase(m_value.array->begin() + static_cast(idx)); + } + else + { + JSON_THROW(type_error::create(307, "cannot use erase() with " + std::string(type_name()))); + } + } + + /// @} + + + //////////// + // lookup // + //////////// + + /// @name lookup + /// @{ + + /*! + @brief find an element in a JSON object + + Finds an element in a JSON object with key equivalent to @a key. If the + element is not found or the JSON value is not an object, end() is + returned. + + @note This method always returns @ref end() when executed on a JSON type + that is not an object. + + @param[in] key key value of the element to search for. + + @return Iterator to an element with key equivalent to @a key. If no such + element is found or the JSON value is not an object, past-the-end (see + @ref end()) iterator is returned. + + @complexity Logarithmic in the size of the JSON object. + + @liveexample{The example shows how `find()` is used.,find__key_type} + + @sa @ref contains(KeyT&&) const -- checks whether a key exists + + @since version 1.0.0 + */ + template + iterator find(KeyT&& key) + { + auto result = end(); + + if (is_object()) + { + result.m_it.object_iterator = m_value.object->find(std::forward(key)); + } + + return result; + } + + /*! 
+ @brief find an element in a JSON object + @copydoc find(KeyT&&) + */ + template + const_iterator find(KeyT&& key) const + { + auto result = cend(); + + if (is_object()) + { + result.m_it.object_iterator = m_value.object->find(std::forward(key)); + } + + return result; + } + + /*! + @brief returns the number of occurrences of a key in a JSON object + + Returns the number of elements with key @a key. If ObjectType is the + default `std::map` type, the return value will always be `0` (@a key was + not found) or `1` (@a key was found). + + @note This method always returns `0` when executed on a JSON type that is + not an object. + + @param[in] key key value of the element to count + + @return Number of elements with key @a key. If the JSON value is not an + object, the return value will be `0`. + + @complexity Logarithmic in the size of the JSON object. + + @liveexample{The example shows how `count()` is used.,count} + + @since version 1.0.0 + */ + template + size_type count(KeyT&& key) const + { + // return 0 for all nonobject types + return is_object() ? m_value.object->count(std::forward(key)) : 0; + } + + /*! + @brief check the existence of an element in a JSON object + + Check whether an element exists in a JSON object with key equivalent to + @a key. If the element is not found or the JSON value is not an object, + false is returned. + + @note This method always returns false when executed on a JSON type + that is not an object. + + @param[in] key key value to check its existence. + + @return true if an element with specified @a key exists. If no such + element with such key is found or the JSON value is not an object, + false is returned. + + @complexity Logarithmic in the size of the JSON object. 
+ + @liveexample{The following code shows an example for `contains()`.,contains} + + @sa @ref find(KeyT&&) -- returns an iterator to an object element + @sa @ref contains(const json_pointer&) const -- checks the existence for a JSON pointer + + @since version 3.6.0 + */ + template < typename KeyT, typename std::enable_if < + !std::is_same::type, json_pointer>::value, int >::type = 0 > + bool contains(KeyT && key) const + { + return is_object() && m_value.object->find(std::forward(key)) != m_value.object->end(); + } + + /*! + @brief check the existence of an element in a JSON object given a JSON pointer + + Check whether the given JSON pointer @a ptr can be resolved in the current + JSON value. + + @note This method can be executed on any JSON value type. + + @param[in] ptr JSON pointer to check its existence. + + @return true if the JSON pointer can be resolved to a stored value, false + otherwise. + + @post If `j.contains(ptr)` returns true, it is safe to call `j[ptr]`. + + @throw parse_error.106 if an array index begins with '0' + @throw parse_error.109 if an array index was not a number + + @complexity Logarithmic in the size of the JSON object. + + @liveexample{The following code shows an example for `contains()`.,contains_json_pointer} + + @sa @ref contains(KeyT &&) const -- checks the existence of a key + + @since version 3.7.0 + */ + bool contains(const json_pointer& ptr) const + { + return ptr.contains(this); + } + + /// @} + + + /////////////// + // iterators // + /////////////// + + /// @name iterators + /// @{ + + /*! + @brief returns an iterator to the first element + + Returns an iterator to the first element. + + @image html range-begin-end.svg "Illustration from cppreference.com" + + @return iterator to the first element + + @complexity Constant. + + @requirement This function helps `basic_json` satisfying the + [Container](https://en.cppreference.com/w/cpp/named_req/Container) + requirements: + - The complexity is constant. 
+ + @liveexample{The following code shows an example for `begin()`.,begin} + + @sa @ref cbegin() -- returns a const iterator to the beginning + @sa @ref end() -- returns an iterator to the end + @sa @ref cend() -- returns a const iterator to the end + + @since version 1.0.0 + */ + iterator begin() noexcept + { + iterator result(this); + result.set_begin(); + return result; + } + + /*! + @copydoc basic_json::cbegin() + */ + const_iterator begin() const noexcept + { + return cbegin(); + } + + /*! + @brief returns a const iterator to the first element + + Returns a const iterator to the first element. + + @image html range-begin-end.svg "Illustration from cppreference.com" + + @return const iterator to the first element + + @complexity Constant. + + @requirement This function helps `basic_json` satisfying the + [Container](https://en.cppreference.com/w/cpp/named_req/Container) + requirements: + - The complexity is constant. + - Has the semantics of `const_cast(*this).begin()`. + + @liveexample{The following code shows an example for `cbegin()`.,cbegin} + + @sa @ref begin() -- returns an iterator to the beginning + @sa @ref end() -- returns an iterator to the end + @sa @ref cend() -- returns a const iterator to the end + + @since version 1.0.0 + */ + const_iterator cbegin() const noexcept + { + const_iterator result(this); + result.set_begin(); + return result; + } + + /*! + @brief returns an iterator to one past the last element + + Returns an iterator to one past the last element. + + @image html range-begin-end.svg "Illustration from cppreference.com" + + @return iterator one past the last element + + @complexity Constant. + + @requirement This function helps `basic_json` satisfying the + [Container](https://en.cppreference.com/w/cpp/named_req/Container) + requirements: + - The complexity is constant. 
+ + @liveexample{The following code shows an example for `end()`.,end} + + @sa @ref cend() -- returns a const iterator to the end + @sa @ref begin() -- returns an iterator to the beginning + @sa @ref cbegin() -- returns a const iterator to the beginning + + @since version 1.0.0 + */ + iterator end() noexcept + { + iterator result(this); + result.set_end(); + return result; + } + + /*! + @copydoc basic_json::cend() + */ + const_iterator end() const noexcept + { + return cend(); + } + + /*! + @brief returns a const iterator to one past the last element + + Returns a const iterator to one past the last element. + + @image html range-begin-end.svg "Illustration from cppreference.com" + + @return const iterator one past the last element + + @complexity Constant. + + @requirement This function helps `basic_json` satisfying the + [Container](https://en.cppreference.com/w/cpp/named_req/Container) + requirements: + - The complexity is constant. + - Has the semantics of `const_cast(*this).end()`. + + @liveexample{The following code shows an example for `cend()`.,cend} + + @sa @ref end() -- returns an iterator to the end + @sa @ref begin() -- returns an iterator to the beginning + @sa @ref cbegin() -- returns a const iterator to the beginning + + @since version 1.0.0 + */ + const_iterator cend() const noexcept + { + const_iterator result(this); + result.set_end(); + return result; + } + + /*! + @brief returns an iterator to the reverse-beginning + + Returns an iterator to the reverse-beginning; that is, the last element. + + @image html range-rbegin-rend.svg "Illustration from cppreference.com" + + @complexity Constant. + + @requirement This function helps `basic_json` satisfying the + [ReversibleContainer](https://en.cppreference.com/w/cpp/named_req/ReversibleContainer) + requirements: + - The complexity is constant. + - Has the semantics of `reverse_iterator(end())`. 
+ + @liveexample{The following code shows an example for `rbegin()`.,rbegin} + + @sa @ref crbegin() -- returns a const reverse iterator to the beginning + @sa @ref rend() -- returns a reverse iterator to the end + @sa @ref crend() -- returns a const reverse iterator to the end + + @since version 1.0.0 + */ + reverse_iterator rbegin() noexcept + { + return reverse_iterator(end()); + } + + /*! + @copydoc basic_json::crbegin() + */ + const_reverse_iterator rbegin() const noexcept + { + return crbegin(); + } + + /*! + @brief returns an iterator to the reverse-end + + Returns an iterator to the reverse-end; that is, one before the first + element. + + @image html range-rbegin-rend.svg "Illustration from cppreference.com" + + @complexity Constant. + + @requirement This function helps `basic_json` satisfying the + [ReversibleContainer](https://en.cppreference.com/w/cpp/named_req/ReversibleContainer) + requirements: + - The complexity is constant. + - Has the semantics of `reverse_iterator(begin())`. + + @liveexample{The following code shows an example for `rend()`.,rend} + + @sa @ref crend() -- returns a const reverse iterator to the end + @sa @ref rbegin() -- returns a reverse iterator to the beginning + @sa @ref crbegin() -- returns a const reverse iterator to the beginning + + @since version 1.0.0 + */ + reverse_iterator rend() noexcept + { + return reverse_iterator(begin()); + } + + /*! + @copydoc basic_json::crend() + */ + const_reverse_iterator rend() const noexcept + { + return crend(); + } + + /*! + @brief returns a const reverse iterator to the last element + + Returns a const iterator to the reverse-beginning; that is, the last + element. + + @image html range-rbegin-rend.svg "Illustration from cppreference.com" + + @complexity Constant. + + @requirement This function helps `basic_json` satisfying the + [ReversibleContainer](https://en.cppreference.com/w/cpp/named_req/ReversibleContainer) + requirements: + - The complexity is constant. 
+ - Has the semantics of `const_cast(*this).rbegin()`. + + @liveexample{The following code shows an example for `crbegin()`.,crbegin} + + @sa @ref rbegin() -- returns a reverse iterator to the beginning + @sa @ref rend() -- returns a reverse iterator to the end + @sa @ref crend() -- returns a const reverse iterator to the end + + @since version 1.0.0 + */ + const_reverse_iterator crbegin() const noexcept + { + return const_reverse_iterator(cend()); + } + + /*! + @brief returns a const reverse iterator to one before the first + + Returns a const reverse iterator to the reverse-end; that is, one before + the first element. + + @image html range-rbegin-rend.svg "Illustration from cppreference.com" + + @complexity Constant. + + @requirement This function helps `basic_json` satisfying the + [ReversibleContainer](https://en.cppreference.com/w/cpp/named_req/ReversibleContainer) + requirements: + - The complexity is constant. + - Has the semantics of `const_cast(*this).rend()`. + + @liveexample{The following code shows an example for `crend()`.,crend} + + @sa @ref rend() -- returns a reverse iterator to the end + @sa @ref rbegin() -- returns a reverse iterator to the beginning + @sa @ref crbegin() -- returns a const reverse iterator to the beginning + + @since version 1.0.0 + */ + const_reverse_iterator crend() const noexcept + { + return const_reverse_iterator(cbegin()); + } + + public: + /*! + @brief wrapper to access iterator member functions in range-based for + + This function allows to access @ref iterator::key() and @ref + iterator::value() during range-based for loops. In these loops, a + reference to the JSON values is returned, so there is no access to the + underlying iterator. 
+ + For loop without iterator_wrapper: + + @code{cpp} + for (auto it = j_object.begin(); it != j_object.end(); ++it) + { + std::cout << "key: " << it.key() << ", value:" << it.value() << '\n'; + } + @endcode + + Range-based for loop without iterator proxy: + + @code{cpp} + for (auto it : j_object) + { + // "it" is of type json::reference and has no key() member + std::cout << "value: " << it << '\n'; + } + @endcode + + Range-based for loop with iterator proxy: + + @code{cpp} + for (auto it : json::iterator_wrapper(j_object)) + { + std::cout << "key: " << it.key() << ", value:" << it.value() << '\n'; + } + @endcode + + @note When iterating over an array, `key()` will return the index of the + element as string (see example). + + @param[in] ref reference to a JSON value + @return iteration proxy object wrapping @a ref with an interface to use in + range-based for loops + + @liveexample{The following code shows how the wrapper is used,iterator_wrapper} + + @exceptionsafety Strong guarantee: if an exception is thrown, there are no + changes in the JSON value. + + @complexity Constant. + + @note The name of this function is not yet final and may change in the + future. + + @deprecated This stream operator is deprecated and will be removed in + future 4.0.0 of the library. Please use @ref items() instead; + that is, replace `json::iterator_wrapper(j)` with `j.items()`. + */ + JSON_HEDLEY_DEPRECATED_FOR(3.1.0, items()) + static iteration_proxy iterator_wrapper(reference ref) noexcept + { + return ref.items(); + } + + /*! + @copydoc iterator_wrapper(reference) + */ + JSON_HEDLEY_DEPRECATED_FOR(3.1.0, items()) + static iteration_proxy iterator_wrapper(const_reference ref) noexcept + { + return ref.items(); + } + + /*! + @brief helper to access iterator member functions in range-based for + + This function allows to access @ref iterator::key() and @ref + iterator::value() during range-based for loops. 
In these loops, a + reference to the JSON values is returned, so there is no access to the + underlying iterator. + + For loop without `items()` function: + + @code{cpp} + for (auto it = j_object.begin(); it != j_object.end(); ++it) + { + std::cout << "key: " << it.key() << ", value:" << it.value() << '\n'; + } + @endcode + + Range-based for loop without `items()` function: + + @code{cpp} + for (auto it : j_object) + { + // "it" is of type json::reference and has no key() member + std::cout << "value: " << it << '\n'; + } + @endcode + + Range-based for loop with `items()` function: + + @code{cpp} + for (auto& el : j_object.items()) + { + std::cout << "key: " << el.key() << ", value:" << el.value() << '\n'; + } + @endcode + + The `items()` function also allows to use + [structured bindings](https://en.cppreference.com/w/cpp/language/structured_binding) + (C++17): + + @code{cpp} + for (auto& [key, val] : j_object.items()) + { + std::cout << "key: " << key << ", value:" << val << '\n'; + } + @endcode + + @note When iterating over an array, `key()` will return the index of the + element as string (see example). For primitive types (e.g., numbers), + `key()` returns an empty string. + + @warning Using `items()` on temporary objects is dangerous. Make sure the + object's lifetime exeeds the iteration. See + for more + information. + + @return iteration proxy object wrapping @a ref with an interface to use in + range-based for loops + + @liveexample{The following code shows how the function is used.,items} + + @exceptionsafety Strong guarantee: if an exception is thrown, there are no + changes in the JSON value. + + @complexity Constant. + + @since version 3.1.0, structured bindings support since 3.5.0. + */ + iteration_proxy items() noexcept + { + return iteration_proxy(*this); + } + + /*! 
+ @copydoc items() + */ + iteration_proxy items() const noexcept + { + return iteration_proxy(*this); + } + + /// @} + + + ////////////// + // capacity // + ////////////// + + /// @name capacity + /// @{ + + /*! + @brief checks whether the container is empty. + + Checks if a JSON value has no elements (i.e. whether its @ref size is `0`). + + @return The return value depends on the different types and is + defined as follows: + Value type | return value + ----------- | ------------- + null | `true` + boolean | `false` + string | `false` + number | `false` + binary | `false` + object | result of function `object_t::empty()` + array | result of function `array_t::empty()` + + @liveexample{The following code uses `empty()` to check if a JSON + object contains any elements.,empty} + + @complexity Constant, as long as @ref array_t and @ref object_t satisfy + the Container concept; that is, their `empty()` functions have constant + complexity. + + @iterators No changes. + + @exceptionsafety No-throw guarantee: this function never throws exceptions. + + @note This function does not return whether a string stored as JSON value + is empty - it returns whether the JSON container itself is empty which is + false in the case of a string. + + @requirement This function helps `basic_json` satisfying the + [Container](https://en.cppreference.com/w/cpp/named_req/Container) + requirements: + - The complexity is constant. + - Has the semantics of `begin() == end()`. + + @sa @ref size() -- returns the number of elements + + @since version 1.0.0 + */ + bool empty() const noexcept + { + switch (m_type) + { + case value_t::null: + { + // null values are empty + return true; + } + + case value_t::array: + { + // delegate call to array_t::empty() + return m_value.array->empty(); + } + + case value_t::object: + { + // delegate call to object_t::empty() + return m_value.object->empty(); + } + + default: + { + // all other types are nonempty + return false; + } + } + } + + /*! 
+ @brief returns the number of elements + + Returns the number of elements in a JSON value. + + @return The return value depends on the different types and is + defined as follows: + Value type | return value + ----------- | ------------- + null | `0` + boolean | `1` + string | `1` + number | `1` + binary | `1` + object | result of function object_t::size() + array | result of function array_t::size() + + @liveexample{The following code calls `size()` on the different value + types.,size} + + @complexity Constant, as long as @ref array_t and @ref object_t satisfy + the Container concept; that is, their size() functions have constant + complexity. + + @iterators No changes. + + @exceptionsafety No-throw guarantee: this function never throws exceptions. + + @note This function does not return the length of a string stored as JSON + value - it returns the number of elements in the JSON value which is 1 in + the case of a string. + + @requirement This function helps `basic_json` satisfying the + [Container](https://en.cppreference.com/w/cpp/named_req/Container) + requirements: + - The complexity is constant. + - Has the semantics of `std::distance(begin(), end())`. + + @sa @ref empty() -- checks whether the container is empty + @sa @ref max_size() -- returns the maximal number of elements + + @since version 1.0.0 + */ + size_type size() const noexcept + { + switch (m_type) + { + case value_t::null: + { + // null values are empty + return 0; + } + + case value_t::array: + { + // delegate call to array_t::size() + return m_value.array->size(); + } + + case value_t::object: + { + // delegate call to object_t::size() + return m_value.object->size(); + } + + default: + { + // all other types have size 1 + return 1; + } + } + } + + /*! + @brief returns the maximum possible number of elements + + Returns the maximum number of elements a JSON value is able to hold due to + system or library implementation limitations, i.e. `std::distance(begin(), + end())` for the JSON value. 
+ + @return The return value depends on the different types and is + defined as follows: + Value type | return value + ----------- | ------------- + null | `0` (same as `size()`) + boolean | `1` (same as `size()`) + string | `1` (same as `size()`) + number | `1` (same as `size()`) + binary | `1` (same as `size()`) + object | result of function `object_t::max_size()` + array | result of function `array_t::max_size()` + + @liveexample{The following code calls `max_size()` on the different value + types. Note the output is implementation specific.,max_size} + + @complexity Constant, as long as @ref array_t and @ref object_t satisfy + the Container concept; that is, their `max_size()` functions have constant + complexity. + + @iterators No changes. + + @exceptionsafety No-throw guarantee: this function never throws exceptions. + + @requirement This function helps `basic_json` satisfying the + [Container](https://en.cppreference.com/w/cpp/named_req/Container) + requirements: + - The complexity is constant. + - Has the semantics of returning `b.size()` where `b` is the largest + possible JSON value. + + @sa @ref size() -- returns the number of elements + + @since version 1.0.0 + */ + size_type max_size() const noexcept + { + switch (m_type) + { + case value_t::array: + { + // delegate call to array_t::max_size() + return m_value.array->max_size(); + } + + case value_t::object: + { + // delegate call to object_t::max_size() + return m_value.object->max_size(); + } + + default: + { + // all other types have max_size() == size() + return size(); + } + } + } + + /// @} + + + /////////////// + // modifiers // + /////////////// + + /// @name modifiers + /// @{ + + /*! 
+ @brief clears the contents + + Clears the content of a JSON value and resets it to the default value as + if @ref basic_json(value_t) would have been called with the current value + type from @ref type(): + + Value type | initial value + ----------- | ------------- + null | `null` + boolean | `false` + string | `""` + number | `0` + binary | An empty byte vector + object | `{}` + array | `[]` + + @post Has the same effect as calling + @code {.cpp} + *this = basic_json(type()); + @endcode + + @liveexample{The example below shows the effect of `clear()` to different + JSON types.,clear} + + @complexity Linear in the size of the JSON value. + + @iterators All iterators, pointers and references related to this container + are invalidated. + + @exceptionsafety No-throw guarantee: this function never throws exceptions. + + @sa @ref basic_json(value_t) -- constructor that creates an object with the + same value than calling `clear()` + + @since version 1.0.0 + */ + void clear() noexcept + { + switch (m_type) + { + case value_t::number_integer: + { + m_value.number_integer = 0; + break; + } + + case value_t::number_unsigned: + { + m_value.number_unsigned = 0; + break; + } + + case value_t::number_float: + { + m_value.number_float = 0.0; + break; + } + + case value_t::boolean: + { + m_value.boolean = false; + break; + } + + case value_t::string: + { + m_value.string->clear(); + break; + } + + case value_t::binary: + { + m_value.binary->clear(); + break; + } + + case value_t::array: + { + m_value.array->clear(); + break; + } + + case value_t::object: + { + m_value.object->clear(); + break; + } + + default: + break; + } + } + + /*! + @brief add an object to an array + + Appends the given element @a val to the end of the JSON value. If the + function is called on a JSON null value, an empty array is created before + appending @a val. 
+ + @param[in] val the value to add to the JSON array + + @throw type_error.308 when called on a type other than JSON array or + null; example: `"cannot use push_back() with number"` + + @complexity Amortized constant. + + @liveexample{The example shows how `push_back()` and `+=` can be used to + add elements to a JSON array. Note how the `null` value was silently + converted to a JSON array.,push_back} + + @since version 1.0.0 + */ + void push_back(basic_json&& val) + { + // push_back only works for null objects or arrays + if (JSON_HEDLEY_UNLIKELY(!(is_null() || is_array()))) + { + JSON_THROW(type_error::create(308, "cannot use push_back() with " + std::string(type_name()))); + } + + // transform null object into an array + if (is_null()) + { + m_type = value_t::array; + m_value = value_t::array; + assert_invariant(); + } + + // add element to array (move semantics) + m_value.array->push_back(std::move(val)); + // if val is moved from, basic_json move constructor marks it null so we do not call the destructor + } + + /*! + @brief add an object to an array + @copydoc push_back(basic_json&&) + */ + reference operator+=(basic_json&& val) + { + push_back(std::move(val)); + return *this; + } + + /*! + @brief add an object to an array + @copydoc push_back(basic_json&&) + */ + void push_back(const basic_json& val) + { + // push_back only works for null objects or arrays + if (JSON_HEDLEY_UNLIKELY(!(is_null() || is_array()))) + { + JSON_THROW(type_error::create(308, "cannot use push_back() with " + std::string(type_name()))); + } + + // transform null object into an array + if (is_null()) + { + m_type = value_t::array; + m_value = value_t::array; + assert_invariant(); + } + + // add element to array + m_value.array->push_back(val); + } + + /*! + @brief add an object to an array + @copydoc push_back(basic_json&&) + */ + reference operator+=(const basic_json& val) + { + push_back(val); + return *this; + } + + /*! 
+ @brief add an object to an object + + Inserts the given element @a val to the JSON object. If the function is + called on a JSON null value, an empty object is created before inserting + @a val. + + @param[in] val the value to add to the JSON object + + @throw type_error.308 when called on a type other than JSON object or + null; example: `"cannot use push_back() with number"` + + @complexity Logarithmic in the size of the container, O(log(`size()`)). + + @liveexample{The example shows how `push_back()` and `+=` can be used to + add elements to a JSON object. Note how the `null` value was silently + converted to a JSON object.,push_back__object_t__value} + + @since version 1.0.0 + */ + void push_back(const typename object_t::value_type& val) + { + // push_back only works for null objects or objects + if (JSON_HEDLEY_UNLIKELY(!(is_null() || is_object()))) + { + JSON_THROW(type_error::create(308, "cannot use push_back() with " + std::string(type_name()))); + } + + // transform null object into an object + if (is_null()) + { + m_type = value_t::object; + m_value = value_t::object; + assert_invariant(); + } + + // add element to array + m_value.object->insert(val); + } + + /*! + @brief add an object to an object + @copydoc push_back(const typename object_t::value_type&) + */ + reference operator+=(const typename object_t::value_type& val) + { + push_back(val); + return *this; + } + + /*! + @brief add an object to an object + + This function allows to use `push_back` with an initializer list. In case + + 1. the current value is an object, + 2. the initializer list @a init contains only two elements, and + 3. the first element of @a init is a string, + + @a init is converted into an object element and added using + @ref push_back(const typename object_t::value_type&). Otherwise, @a init + is converted to a JSON value and added using @ref push_back(basic_json&&). + + @param[in] init an initializer list + + @complexity Linear in the size of the initializer list @a init. 
+ + @note This function is required to resolve an ambiguous overload error, + because pairs like `{"key", "value"}` can be both interpreted as + `object_t::value_type` or `std::initializer_list`, see + https://github.com/nlohmann/json/issues/235 for more information. + + @liveexample{The example shows how initializer lists are treated as + objects when possible.,push_back__initializer_list} + */ + void push_back(initializer_list_t init) + { + if (is_object() && init.size() == 2 && (*init.begin())->is_string()) + { + basic_json&& key = init.begin()->moved_or_copied(); + push_back(typename object_t::value_type( + std::move(key.get_ref()), (init.begin() + 1)->moved_or_copied())); + } + else + { + push_back(basic_json(init)); + } + } + + /*! + @brief add an object to an object + @copydoc push_back(initializer_list_t) + */ + reference operator+=(initializer_list_t init) + { + push_back(init); + return *this; + } + + /*! + @brief add an object to an array + + Creates a JSON value from the passed parameters @a args to the end of the + JSON value. If the function is called on a JSON null value, an empty array + is created before appending the value created from @a args. + + @param[in] args arguments to forward to a constructor of @ref basic_json + @tparam Args compatible types to create a @ref basic_json object + + @return reference to the inserted element + + @throw type_error.311 when called on a type other than JSON array or + null; example: `"cannot use emplace_back() with number"` + + @complexity Amortized constant. + + @liveexample{The example shows how `push_back()` can be used to add + elements to a JSON array. Note how the `null` value was silently converted + to a JSON array.,emplace_back} + + @since version 2.0.8, returns reference since 3.7.0 + */ + template + reference emplace_back(Args&& ... 
args) + { + // emplace_back only works for null objects or arrays + if (JSON_HEDLEY_UNLIKELY(!(is_null() || is_array()))) + { + JSON_THROW(type_error::create(311, "cannot use emplace_back() with " + std::string(type_name()))); + } + + // transform null object into an array + if (is_null()) + { + m_type = value_t::array; + m_value = value_t::array; + assert_invariant(); + } + + // add element to array (perfect forwarding) +#ifdef JSON_HAS_CPP_17 + return m_value.array->emplace_back(std::forward(args)...); +#else + m_value.array->emplace_back(std::forward(args)...); + return m_value.array->back(); +#endif + } + + /*! + @brief add an object to an object if key does not exist + + Inserts a new element into a JSON object constructed in-place with the + given @a args if there is no element with the key in the container. If the + function is called on a JSON null value, an empty object is created before + appending the value created from @a args. + + @param[in] args arguments to forward to a constructor of @ref basic_json + @tparam Args compatible types to create a @ref basic_json object + + @return a pair consisting of an iterator to the inserted element, or the + already-existing element if no insertion happened, and a bool + denoting whether the insertion took place. + + @throw type_error.311 when called on a type other than JSON object or + null; example: `"cannot use emplace() with number"` + + @complexity Logarithmic in the size of the container, O(log(`size()`)). + + @liveexample{The example shows how `emplace()` can be used to add elements + to a JSON object. Note how the `null` value was silently converted to a + JSON object. Further note how no value is added if there was already one + value stored with the same key.,emplace} + + @since version 2.0.8 + */ + template + std::pair emplace(Args&& ... 
args) + { + // emplace only works for null objects or arrays + if (JSON_HEDLEY_UNLIKELY(!(is_null() || is_object()))) + { + JSON_THROW(type_error::create(311, "cannot use emplace() with " + std::string(type_name()))); + } + + // transform null object into an object + if (is_null()) + { + m_type = value_t::object; + m_value = value_t::object; + assert_invariant(); + } + + // add element to array (perfect forwarding) + auto res = m_value.object->emplace(std::forward(args)...); + // create result iterator and set iterator to the result of emplace + auto it = begin(); + it.m_it.object_iterator = res.first; + + // return pair of iterator and boolean + return {it, res.second}; + } + + /// Helper for insertion of an iterator + /// @note: This uses std::distance to support GCC 4.8, + /// see https://github.com/nlohmann/json/pull/1257 + template + iterator insert_iterator(const_iterator pos, Args&& ... args) + { + iterator result(this); + JSON_ASSERT(m_value.array != nullptr); + + auto insert_pos = std::distance(m_value.array->begin(), pos.m_it.array_iterator); + m_value.array->insert(pos.m_it.array_iterator, std::forward(args)...); + result.m_it.array_iterator = m_value.array->begin() + insert_pos; + + // This could have been written as: + // result.m_it.array_iterator = m_value.array->insert(pos.m_it.array_iterator, cnt, val); + // but the return value of insert is missing in GCC 4.8, so it is written this way instead. + + return result; + } + + /*! + @brief inserts element + + Inserts element @a val before iterator @a pos. + + @param[in] pos iterator before which the content will be inserted; may be + the end() iterator + @param[in] val element to insert + @return iterator pointing to the inserted @a val. 
+ + @throw type_error.309 if called on JSON values other than arrays; + example: `"cannot use insert() with string"` + @throw invalid_iterator.202 if @a pos is not an iterator of *this; + example: `"iterator does not fit current value"` + + @complexity Constant plus linear in the distance between @a pos and end of + the container. + + @liveexample{The example shows how `insert()` is used.,insert} + + @since version 1.0.0 + */ + iterator insert(const_iterator pos, const basic_json& val) + { + // insert only works for arrays + if (JSON_HEDLEY_LIKELY(is_array())) + { + // check if iterator pos fits to this JSON value + if (JSON_HEDLEY_UNLIKELY(pos.m_object != this)) + { + JSON_THROW(invalid_iterator::create(202, "iterator does not fit current value")); + } + + // insert to array and return iterator + return insert_iterator(pos, val); + } + + JSON_THROW(type_error::create(309, "cannot use insert() with " + std::string(type_name()))); + } + + /*! + @brief inserts element + @copydoc insert(const_iterator, const basic_json&) + */ + iterator insert(const_iterator pos, basic_json&& val) + { + return insert(pos, val); + } + + /*! + @brief inserts elements + + Inserts @a cnt copies of @a val before iterator @a pos. + + @param[in] pos iterator before which the content will be inserted; may be + the end() iterator + @param[in] cnt number of copies of @a val to insert + @param[in] val element to insert + @return iterator pointing to the first element inserted, or @a pos if + `cnt==0` + + @throw type_error.309 if called on JSON values other than arrays; example: + `"cannot use insert() with string"` + @throw invalid_iterator.202 if @a pos is not an iterator of *this; + example: `"iterator does not fit current value"` + + @complexity Linear in @a cnt plus linear in the distance between @a pos + and end of the container. 
+ + @liveexample{The example shows how `insert()` is used.,insert__count} + + @since version 1.0.0 + */ + iterator insert(const_iterator pos, size_type cnt, const basic_json& val) + { + // insert only works for arrays + if (JSON_HEDLEY_LIKELY(is_array())) + { + // check if iterator pos fits to this JSON value + if (JSON_HEDLEY_UNLIKELY(pos.m_object != this)) + { + JSON_THROW(invalid_iterator::create(202, "iterator does not fit current value")); + } + + // insert to array and return iterator + return insert_iterator(pos, cnt, val); + } + + JSON_THROW(type_error::create(309, "cannot use insert() with " + std::string(type_name()))); + } + + /*! + @brief inserts elements + + Inserts elements from range `[first, last)` before iterator @a pos. + + @param[in] pos iterator before which the content will be inserted; may be + the end() iterator + @param[in] first begin of the range of elements to insert + @param[in] last end of the range of elements to insert + + @throw type_error.309 if called on JSON values other than arrays; example: + `"cannot use insert() with string"` + @throw invalid_iterator.202 if @a pos is not an iterator of *this; + example: `"iterator does not fit current value"` + @throw invalid_iterator.210 if @a first and @a last do not belong to the + same JSON value; example: `"iterators do not fit"` + @throw invalid_iterator.211 if @a first or @a last are iterators into + container for which insert is called; example: `"passed iterators may not + belong to container"` + + @return iterator pointing to the first element inserted, or @a pos if + `first==last` + + @complexity Linear in `std::distance(first, last)` plus linear in the + distance between @a pos and end of the container. 
+ + @liveexample{The example shows how `insert()` is used.,insert__range} + + @since version 1.0.0 + */ + iterator insert(const_iterator pos, const_iterator first, const_iterator last) + { + // insert only works for arrays + if (JSON_HEDLEY_UNLIKELY(!is_array())) + { + JSON_THROW(type_error::create(309, "cannot use insert() with " + std::string(type_name()))); + } + + // check if iterator pos fits to this JSON value + if (JSON_HEDLEY_UNLIKELY(pos.m_object != this)) + { + JSON_THROW(invalid_iterator::create(202, "iterator does not fit current value")); + } + + // check if range iterators belong to the same JSON object + if (JSON_HEDLEY_UNLIKELY(first.m_object != last.m_object)) + { + JSON_THROW(invalid_iterator::create(210, "iterators do not fit")); + } + + if (JSON_HEDLEY_UNLIKELY(first.m_object == this)) + { + JSON_THROW(invalid_iterator::create(211, "passed iterators may not belong to container")); + } + + // insert to array and return iterator + return insert_iterator(pos, first.m_it.array_iterator, last.m_it.array_iterator); + } + + /*! + @brief inserts elements + + Inserts elements from initializer list @a ilist before iterator @a pos. + + @param[in] pos iterator before which the content will be inserted; may be + the end() iterator + @param[in] ilist initializer list to insert the values from + + @throw type_error.309 if called on JSON values other than arrays; example: + `"cannot use insert() with string"` + @throw invalid_iterator.202 if @a pos is not an iterator of *this; + example: `"iterator does not fit current value"` + + @return iterator pointing to the first element inserted, or @a pos if + `ilist` is empty + + @complexity Linear in `ilist.size()` plus linear in the distance between + @a pos and end of the container. 
+ + @liveexample{The example shows how `insert()` is used.,insert__ilist} + + @since version 1.0.0 + */ + iterator insert(const_iterator pos, initializer_list_t ilist) + { + // insert only works for arrays + if (JSON_HEDLEY_UNLIKELY(!is_array())) + { + JSON_THROW(type_error::create(309, "cannot use insert() with " + std::string(type_name()))); + } + + // check if iterator pos fits to this JSON value + if (JSON_HEDLEY_UNLIKELY(pos.m_object != this)) + { + JSON_THROW(invalid_iterator::create(202, "iterator does not fit current value")); + } + + // insert to array and return iterator + return insert_iterator(pos, ilist.begin(), ilist.end()); + } + + /*! + @brief inserts elements + + Inserts elements from range `[first, last)`. + + @param[in] first begin of the range of elements to insert + @param[in] last end of the range of elements to insert + + @throw type_error.309 if called on JSON values other than objects; example: + `"cannot use insert() with string"` + @throw invalid_iterator.202 if iterator @a first or @a last does does not + point to an object; example: `"iterators first and last must point to + objects"` + @throw invalid_iterator.210 if @a first and @a last do not belong to the + same JSON value; example: `"iterators do not fit"` + + @complexity Logarithmic: `O(N*log(size() + N))`, where `N` is the number + of elements to insert. 
+ + @liveexample{The example shows how `insert()` is used.,insert__range_object} + + @since version 3.0.0 + */ + void insert(const_iterator first, const_iterator last) + { + // insert only works for objects + if (JSON_HEDLEY_UNLIKELY(!is_object())) + { + JSON_THROW(type_error::create(309, "cannot use insert() with " + std::string(type_name()))); + } + + // check if range iterators belong to the same JSON object + if (JSON_HEDLEY_UNLIKELY(first.m_object != last.m_object)) + { + JSON_THROW(invalid_iterator::create(210, "iterators do not fit")); + } + + // passed iterators must belong to objects + if (JSON_HEDLEY_UNLIKELY(!first.m_object->is_object())) + { + JSON_THROW(invalid_iterator::create(202, "iterators first and last must point to objects")); + } + + m_value.object->insert(first.m_it.object_iterator, last.m_it.object_iterator); + } + + /*! + @brief updates a JSON object from another object, overwriting existing keys + + Inserts all values from JSON object @a j and overwrites existing keys. + + @param[in] j JSON object to read values from + + @throw type_error.312 if called on JSON values other than objects; example: + `"cannot use update() with string"` + + @complexity O(N*log(size() + N)), where N is the number of elements to + insert. 
+ + @liveexample{The example shows how `update()` is used.,update} + + @sa https://docs.python.org/3.6/library/stdtypes.html#dict.update + + @since version 3.0.0 + */ + void update(const_reference j) + { + // implicitly convert null value to an empty object + if (is_null()) + { + m_type = value_t::object; + m_value.object = create(); + assert_invariant(); + } + + if (JSON_HEDLEY_UNLIKELY(!is_object())) + { + JSON_THROW(type_error::create(312, "cannot use update() with " + std::string(type_name()))); + } + if (JSON_HEDLEY_UNLIKELY(!j.is_object())) + { + JSON_THROW(type_error::create(312, "cannot use update() with " + std::string(j.type_name()))); + } + + for (auto it = j.cbegin(); it != j.cend(); ++it) + { + m_value.object->operator[](it.key()) = it.value(); + } + } + + /*! + @brief updates a JSON object from another object, overwriting existing keys + + Inserts all values from from range `[first, last)` and overwrites existing + keys. + + @param[in] first begin of the range of elements to insert + @param[in] last end of the range of elements to insert + + @throw type_error.312 if called on JSON values other than objects; example: + `"cannot use update() with string"` + @throw invalid_iterator.202 if iterator @a first or @a last does does not + point to an object; example: `"iterators first and last must point to + objects"` + @throw invalid_iterator.210 if @a first and @a last do not belong to the + same JSON value; example: `"iterators do not fit"` + + @complexity O(N*log(size() + N)), where N is the number of elements to + insert. 
+ + @liveexample{The example shows how `update()` is used__range.,update} + + @sa https://docs.python.org/3.6/library/stdtypes.html#dict.update + + @since version 3.0.0 + */ + void update(const_iterator first, const_iterator last) + { + // implicitly convert null value to an empty object + if (is_null()) + { + m_type = value_t::object; + m_value.object = create(); + assert_invariant(); + } + + if (JSON_HEDLEY_UNLIKELY(!is_object())) + { + JSON_THROW(type_error::create(312, "cannot use update() with " + std::string(type_name()))); + } + + // check if range iterators belong to the same JSON object + if (JSON_HEDLEY_UNLIKELY(first.m_object != last.m_object)) + { + JSON_THROW(invalid_iterator::create(210, "iterators do not fit")); + } + + // passed iterators must belong to objects + if (JSON_HEDLEY_UNLIKELY(!first.m_object->is_object() + || !last.m_object->is_object())) + { + JSON_THROW(invalid_iterator::create(202, "iterators first and last must point to objects")); + } + + for (auto it = first; it != last; ++it) + { + m_value.object->operator[](it.key()) = it.value(); + } + } + + /*! + @brief exchanges the values + + Exchanges the contents of the JSON value with those of @a other. Does not + invoke any move, copy, or swap operations on individual elements. All + iterators and references remain valid. The past-the-end iterator is + invalidated. + + @param[in,out] other JSON value to exchange the contents with + + @complexity Constant. + + @liveexample{The example below shows how JSON values can be swapped with + `swap()`.,swap__reference} + + @since version 1.0.0 + */ + void swap(reference other) noexcept ( + std::is_nothrow_move_constructible::value&& + std::is_nothrow_move_assignable::value&& + std::is_nothrow_move_constructible::value&& + std::is_nothrow_move_assignable::value + ) + { + std::swap(m_type, other.m_type); + std::swap(m_value, other.m_value); + assert_invariant(); + } + + /*! 
+ @brief exchanges the values + + Exchanges the contents of the JSON value from @a left with those of @a right. Does not + invoke any move, copy, or swap operations on individual elements. All + iterators and references remain valid. The past-the-end iterator is + invalidated. implemented as a friend function callable via ADL. + + @param[in,out] left JSON value to exchange the contents with + @param[in,out] right JSON value to exchange the contents with + + @complexity Constant. + + @liveexample{The example below shows how JSON values can be swapped with + `swap()`.,swap__reference} + + @since version 1.0.0 + */ + friend void swap(reference left, reference right) noexcept ( + std::is_nothrow_move_constructible::value&& + std::is_nothrow_move_assignable::value&& + std::is_nothrow_move_constructible::value&& + std::is_nothrow_move_assignable::value + ) + { + left.swap(right); + } + + /*! + @brief exchanges the values + + Exchanges the contents of a JSON array with those of @a other. Does not + invoke any move, copy, or swap operations on individual elements. All + iterators and references remain valid. The past-the-end iterator is + invalidated. + + @param[in,out] other array to exchange the contents with + + @throw type_error.310 when JSON value is not an array; example: `"cannot + use swap() with string"` + + @complexity Constant. + + @liveexample{The example below shows how arrays can be swapped with + `swap()`.,swap__array_t} + + @since version 1.0.0 + */ + void swap(array_t& other) + { + // swap only works for arrays + if (JSON_HEDLEY_LIKELY(is_array())) + { + std::swap(*(m_value.array), other); + } + else + { + JSON_THROW(type_error::create(310, "cannot use swap() with " + std::string(type_name()))); + } + } + + /*! + @brief exchanges the values + + Exchanges the contents of a JSON object with those of @a other. Does not + invoke any move, copy, or swap operations on individual elements. All + iterators and references remain valid. 
The past-the-end iterator is + invalidated. + + @param[in,out] other object to exchange the contents with + + @throw type_error.310 when JSON value is not an object; example: + `"cannot use swap() with string"` + + @complexity Constant. + + @liveexample{The example below shows how objects can be swapped with + `swap()`.,swap__object_t} + + @since version 1.0.0 + */ + void swap(object_t& other) + { + // swap only works for objects + if (JSON_HEDLEY_LIKELY(is_object())) + { + std::swap(*(m_value.object), other); + } + else + { + JSON_THROW(type_error::create(310, "cannot use swap() with " + std::string(type_name()))); + } + } + + /*! + @brief exchanges the values + + Exchanges the contents of a JSON string with those of @a other. Does not + invoke any move, copy, or swap operations on individual elements. All + iterators and references remain valid. The past-the-end iterator is + invalidated. + + @param[in,out] other string to exchange the contents with + + @throw type_error.310 when JSON value is not a string; example: `"cannot + use swap() with boolean"` + + @complexity Constant. + + @liveexample{The example below shows how strings can be swapped with + `swap()`.,swap__string_t} + + @since version 1.0.0 + */ + void swap(string_t& other) + { + // swap only works for strings + if (JSON_HEDLEY_LIKELY(is_string())) + { + std::swap(*(m_value.string), other); + } + else + { + JSON_THROW(type_error::create(310, "cannot use swap() with " + std::string(type_name()))); + } + } + + /*! + @brief exchanges the values + + Exchanges the contents of a JSON string with those of @a other. Does not + invoke any move, copy, or swap operations on individual elements. All + iterators and references remain valid. The past-the-end iterator is + invalidated. + + @param[in,out] other binary to exchange the contents with + + @throw type_error.310 when JSON value is not a string; example: `"cannot + use swap() with boolean"` + + @complexity Constant. 
+ + @liveexample{The example below shows how strings can be swapped with + `swap()`.,swap__binary_t} + + @since version 3.8.0 + */ + void swap(binary_t& other) + { + // swap only works for strings + if (JSON_HEDLEY_LIKELY(is_binary())) + { + std::swap(*(m_value.binary), other); + } + else + { + JSON_THROW(type_error::create(310, "cannot use swap() with " + std::string(type_name()))); + } + } + + /// @copydoc swap(binary_t) + void swap(typename binary_t::container_type& other) + { + // swap only works for strings + if (JSON_HEDLEY_LIKELY(is_binary())) + { + std::swap(*(m_value.binary), other); + } + else + { + JSON_THROW(type_error::create(310, "cannot use swap() with " + std::string(type_name()))); + } + } + + /// @} + + public: + ////////////////////////////////////////// + // lexicographical comparison operators // + ////////////////////////////////////////// + + /// @name lexicographical comparison operators + /// @{ + + /*! + @brief comparison: equal + + Compares two JSON values for equality according to the following rules: + - Two JSON values are equal if (1) they are from the same type and (2) + their stored values are the same according to their respective + `operator==`. + - Integer and floating-point numbers are automatically converted before + comparison. Note that two NaN values are always treated as unequal. + - Two JSON null values are equal. + + @note Floating-point inside JSON values numbers are compared with + `json::number_float_t::operator==` which is `double::operator==` by + default. 
To compare floating-point while respecting an epsilon, an alternative + [comparison function](https://github.com/mariokonrad/marnav/blob/master/include/marnav/math/floatingpoint.hpp#L34-#L39) + could be used, for instance + @code {.cpp} + template::value, T>::type> + inline bool is_same(T a, T b, T epsilon = std::numeric_limits::epsilon()) noexcept + { + return std::abs(a - b) <= epsilon; + } + @endcode + Or you can self-defined operator equal function like this: + @code {.cpp} + bool my_equal(const_reference lhs, const_reference rhs) { + const auto lhs_type lhs.type(); + const auto rhs_type rhs.type(); + if (lhs_type == rhs_type) { + switch(lhs_type) + // self_defined case + case value_t::number_float: + return std::abs(lhs - rhs) <= std::numeric_limits::epsilon(); + // other cases remain the same with the original + ... + } + ... + } + @endcode + + @note NaN values never compare equal to themselves or to other NaN values. + + @param[in] lhs first JSON value to consider + @param[in] rhs second JSON value to consider + @return whether the values @a lhs and @a rhs are equal + + @exceptionsafety No-throw guarantee: this function never throws exceptions. + + @complexity Linear. 
+ + @liveexample{The example demonstrates comparing several JSON + types.,operator__equal} + + @since version 1.0.0 + */ + friend bool operator==(const_reference lhs, const_reference rhs) noexcept + { + const auto lhs_type = lhs.type(); + const auto rhs_type = rhs.type(); + + if (lhs_type == rhs_type) + { + switch (lhs_type) + { + case value_t::array: + return *lhs.m_value.array == *rhs.m_value.array; + + case value_t::object: + return *lhs.m_value.object == *rhs.m_value.object; + + case value_t::null: + return true; + + case value_t::string: + return *lhs.m_value.string == *rhs.m_value.string; + + case value_t::boolean: + return lhs.m_value.boolean == rhs.m_value.boolean; + + case value_t::number_integer: + return lhs.m_value.number_integer == rhs.m_value.number_integer; + + case value_t::number_unsigned: + return lhs.m_value.number_unsigned == rhs.m_value.number_unsigned; + + case value_t::number_float: + return lhs.m_value.number_float == rhs.m_value.number_float; + + case value_t::binary: + return *lhs.m_value.binary == *rhs.m_value.binary; + + default: + return false; + } + } + else if (lhs_type == value_t::number_integer && rhs_type == value_t::number_float) + { + return static_cast(lhs.m_value.number_integer) == rhs.m_value.number_float; + } + else if (lhs_type == value_t::number_float && rhs_type == value_t::number_integer) + { + return lhs.m_value.number_float == static_cast(rhs.m_value.number_integer); + } + else if (lhs_type == value_t::number_unsigned && rhs_type == value_t::number_float) + { + return static_cast(lhs.m_value.number_unsigned) == rhs.m_value.number_float; + } + else if (lhs_type == value_t::number_float && rhs_type == value_t::number_unsigned) + { + return lhs.m_value.number_float == static_cast(rhs.m_value.number_unsigned); + } + else if (lhs_type == value_t::number_unsigned && rhs_type == value_t::number_integer) + { + return static_cast(lhs.m_value.number_unsigned) == rhs.m_value.number_integer; + } + else if (lhs_type == 
value_t::number_integer && rhs_type == value_t::number_unsigned) + { + return lhs.m_value.number_integer == static_cast(rhs.m_value.number_unsigned); + } + + return false; + } + + /*! + @brief comparison: equal + @copydoc operator==(const_reference, const_reference) + */ + template::value, int>::type = 0> + friend bool operator==(const_reference lhs, const ScalarType rhs) noexcept + { + return lhs == basic_json(rhs); + } + + /*! + @brief comparison: equal + @copydoc operator==(const_reference, const_reference) + */ + template::value, int>::type = 0> + friend bool operator==(const ScalarType lhs, const_reference rhs) noexcept + { + return basic_json(lhs) == rhs; + } + + /*! + @brief comparison: not equal + + Compares two JSON values for inequality by calculating `not (lhs == rhs)`. + + @param[in] lhs first JSON value to consider + @param[in] rhs second JSON value to consider + @return whether the values @a lhs and @a rhs are not equal + + @complexity Linear. + + @exceptionsafety No-throw guarantee: this function never throws exceptions. + + @liveexample{The example demonstrates comparing several JSON + types.,operator__notequal} + + @since version 1.0.0 + */ + friend bool operator!=(const_reference lhs, const_reference rhs) noexcept + { + return !(lhs == rhs); + } + + /*! + @brief comparison: not equal + @copydoc operator!=(const_reference, const_reference) + */ + template::value, int>::type = 0> + friend bool operator!=(const_reference lhs, const ScalarType rhs) noexcept + { + return lhs != basic_json(rhs); + } + + /*! + @brief comparison: not equal + @copydoc operator!=(const_reference, const_reference) + */ + template::value, int>::type = 0> + friend bool operator!=(const ScalarType lhs, const_reference rhs) noexcept + { + return basic_json(lhs) != rhs; + } + + /*! 
+ @brief comparison: less than + + Compares whether one JSON value @a lhs is less than another JSON value @a + rhs according to the following rules: + - If @a lhs and @a rhs have the same type, the values are compared using + the default `<` operator. + - Integer and floating-point numbers are automatically converted before + comparison + - In case @a lhs and @a rhs have different types, the values are ignored + and the order of the types is considered, see + @ref operator<(const value_t, const value_t). + + @param[in] lhs first JSON value to consider + @param[in] rhs second JSON value to consider + @return whether @a lhs is less than @a rhs + + @complexity Linear. + + @exceptionsafety No-throw guarantee: this function never throws exceptions. + + @liveexample{The example demonstrates comparing several JSON + types.,operator__less} + + @since version 1.0.0 + */ + friend bool operator<(const_reference lhs, const_reference rhs) noexcept + { + const auto lhs_type = lhs.type(); + const auto rhs_type = rhs.type(); + + if (lhs_type == rhs_type) + { + switch (lhs_type) + { + case value_t::array: + // note parentheses are necessary, see + // https://github.com/nlohmann/json/issues/1530 + return (*lhs.m_value.array) < (*rhs.m_value.array); + + case value_t::object: + return (*lhs.m_value.object) < (*rhs.m_value.object); + + case value_t::null: + return false; + + case value_t::string: + return (*lhs.m_value.string) < (*rhs.m_value.string); + + case value_t::boolean: + return (lhs.m_value.boolean) < (rhs.m_value.boolean); + + case value_t::number_integer: + return (lhs.m_value.number_integer) < (rhs.m_value.number_integer); + + case value_t::number_unsigned: + return (lhs.m_value.number_unsigned) < (rhs.m_value.number_unsigned); + + case value_t::number_float: + return (lhs.m_value.number_float) < (rhs.m_value.number_float); + + case value_t::binary: + return (*lhs.m_value.binary) < (*rhs.m_value.binary); + + default: + return false; + } + } + else if (lhs_type == 
value_t::number_integer && rhs_type == value_t::number_float) + { + return static_cast(lhs.m_value.number_integer) < rhs.m_value.number_float; + } + else if (lhs_type == value_t::number_float && rhs_type == value_t::number_integer) + { + return lhs.m_value.number_float < static_cast(rhs.m_value.number_integer); + } + else if (lhs_type == value_t::number_unsigned && rhs_type == value_t::number_float) + { + return static_cast(lhs.m_value.number_unsigned) < rhs.m_value.number_float; + } + else if (lhs_type == value_t::number_float && rhs_type == value_t::number_unsigned) + { + return lhs.m_value.number_float < static_cast(rhs.m_value.number_unsigned); + } + else if (lhs_type == value_t::number_integer && rhs_type == value_t::number_unsigned) + { + return lhs.m_value.number_integer < static_cast(rhs.m_value.number_unsigned); + } + else if (lhs_type == value_t::number_unsigned && rhs_type == value_t::number_integer) + { + return static_cast(lhs.m_value.number_unsigned) < rhs.m_value.number_integer; + } + + // We only reach this line if we cannot compare values. In that case, + // we compare types. Note we have to call the operator explicitly, + // because MSVC has problems otherwise. + return operator<(lhs_type, rhs_type); + } + + /*! + @brief comparison: less than + @copydoc operator<(const_reference, const_reference) + */ + template::value, int>::type = 0> + friend bool operator<(const_reference lhs, const ScalarType rhs) noexcept + { + return lhs < basic_json(rhs); + } + + /*! + @brief comparison: less than + @copydoc operator<(const_reference, const_reference) + */ + template::value, int>::type = 0> + friend bool operator<(const ScalarType lhs, const_reference rhs) noexcept + { + return basic_json(lhs) < rhs; + } + + /*! + @brief comparison: less than or equal + + Compares whether one JSON value @a lhs is less than or equal to another + JSON value by calculating `not (rhs < lhs)`. 
+ + @param[in] lhs first JSON value to consider + @param[in] rhs second JSON value to consider + @return whether @a lhs is less than or equal to @a rhs + + @complexity Linear. + + @exceptionsafety No-throw guarantee: this function never throws exceptions. + + @liveexample{The example demonstrates comparing several JSON + types.,operator__greater} + + @since version 1.0.0 + */ + friend bool operator<=(const_reference lhs, const_reference rhs) noexcept + { + return !(rhs < lhs); + } + + /*! + @brief comparison: less than or equal + @copydoc operator<=(const_reference, const_reference) + */ + template::value, int>::type = 0> + friend bool operator<=(const_reference lhs, const ScalarType rhs) noexcept + { + return lhs <= basic_json(rhs); + } + + /*! + @brief comparison: less than or equal + @copydoc operator<=(const_reference, const_reference) + */ + template::value, int>::type = 0> + friend bool operator<=(const ScalarType lhs, const_reference rhs) noexcept + { + return basic_json(lhs) <= rhs; + } + + /*! + @brief comparison: greater than + + Compares whether one JSON value @a lhs is greater than another + JSON value by calculating `not (lhs <= rhs)`. + + @param[in] lhs first JSON value to consider + @param[in] rhs second JSON value to consider + @return whether @a lhs is greater than to @a rhs + + @complexity Linear. + + @exceptionsafety No-throw guarantee: this function never throws exceptions. + + @liveexample{The example demonstrates comparing several JSON + types.,operator__lessequal} + + @since version 1.0.0 + */ + friend bool operator>(const_reference lhs, const_reference rhs) noexcept + { + return !(lhs <= rhs); + } + + /*! + @brief comparison: greater than + @copydoc operator>(const_reference, const_reference) + */ + template::value, int>::type = 0> + friend bool operator>(const_reference lhs, const ScalarType rhs) noexcept + { + return lhs > basic_json(rhs); + } + + /*! 
+ @brief comparison: greater than + @copydoc operator>(const_reference, const_reference) + */ + template::value, int>::type = 0> + friend bool operator>(const ScalarType lhs, const_reference rhs) noexcept + { + return basic_json(lhs) > rhs; + } + + /*! + @brief comparison: greater than or equal + + Compares whether one JSON value @a lhs is greater than or equal to another + JSON value by calculating `not (lhs < rhs)`. + + @param[in] lhs first JSON value to consider + @param[in] rhs second JSON value to consider + @return whether @a lhs is greater than or equal to @a rhs + + @complexity Linear. + + @exceptionsafety No-throw guarantee: this function never throws exceptions. + + @liveexample{The example demonstrates comparing several JSON + types.,operator__greaterequal} + + @since version 1.0.0 + */ + friend bool operator>=(const_reference lhs, const_reference rhs) noexcept + { + return !(lhs < rhs); + } + + /*! + @brief comparison: greater than or equal + @copydoc operator>=(const_reference, const_reference) + */ + template::value, int>::type = 0> + friend bool operator>=(const_reference lhs, const ScalarType rhs) noexcept + { + return lhs >= basic_json(rhs); + } + + /*! + @brief comparison: greater than or equal + @copydoc operator>=(const_reference, const_reference) + */ + template::value, int>::type = 0> + friend bool operator>=(const ScalarType lhs, const_reference rhs) noexcept + { + return basic_json(lhs) >= rhs; + } + + /// @} + + /////////////////// + // serialization // + /////////////////// + + /// @name serialization + /// @{ + + /*! + @brief serialize to stream + + Serialize the given JSON value @a j to the output stream @a o. The JSON + value will be serialized using the @ref dump member function. + + - The indentation of the output can be controlled with the member variable + `width` of the output stream @a o. 
For instance, using the manipulator + `std::setw(4)` on @a o sets the indentation level to `4` and the + serialization result is the same as calling `dump(4)`. + + - The indentation character can be controlled with the member variable + `fill` of the output stream @a o. For instance, the manipulator + `std::setfill('\\t')` sets indentation to use a tab character rather than + the default space character. + + @param[in,out] o stream to serialize to + @param[in] j JSON value to serialize + + @return the stream @a o + + @throw type_error.316 if a string stored inside the JSON value is not + UTF-8 encoded + + @complexity Linear. + + @liveexample{The example below shows the serialization with different + parameters to `width` to adjust the indentation level.,operator_serialize} + + @since version 1.0.0; indentation character added in version 3.0.0 + */ + friend std::ostream& operator<<(std::ostream& o, const basic_json& j) + { + // read width member and use it as indentation parameter if nonzero + const bool pretty_print = o.width() > 0; + const auto indentation = pretty_print ? o.width() : 0; + + // reset width to 0 for subsequent calls to this stream + o.width(0); + + // do the actual serialization + serializer s(detail::output_adapter(o), o.fill()); + s.dump(j, pretty_print, false, static_cast(indentation)); + return o; + } + + /*! + @brief serialize to stream + @deprecated This stream operator is deprecated and will be removed in + future 4.0.0 of the library. Please use + @ref operator<<(std::ostream&, const basic_json&) + instead; that is, replace calls like `j >> o;` with `o << j;`. + @since version 1.0.0; deprecated since version 3.0.0 + */ + JSON_HEDLEY_DEPRECATED_FOR(3.0.0, operator<<(std::ostream&, const basic_json&)) + friend std::ostream& operator>>(const basic_json& j, std::ostream& o) + { + return o << j; + } + + /// @} + + + ///////////////////// + // deserialization // + ///////////////////// + + /// @name deserialization + /// @{ + + /*! 
+ @brief deserialize from a compatible input + + @tparam InputType A compatible input, for instance + - an std::istream object + - a FILE pointer + - a C-style array of characters + - a pointer to a null-terminated string of single byte characters + - an object obj for which begin(obj) and end(obj) produces a valid pair of + iterators. + + @param[in] i input to read from + @param[in] cb a parser callback function of type @ref parser_callback_t + which is used to control the deserialization by filtering unwanted values + (optional) + @param[in] allow_exceptions whether to throw exceptions in case of a + parse error (optional, true by default) + @param[in] ignore_comments whether comments should be ignored and treated + like whitespace (true) or yield a parse error (true); (optional, false by + default) + + @return deserialized JSON value; in case of a parse error and + @a allow_exceptions set to `false`, the return value will be + value_t::discarded. + + @throw parse_error.101 if a parse error occurs; example: `""unexpected end + of input; expected string literal""` + @throw parse_error.102 if to_unicode fails or surrogate error + @throw parse_error.103 if to_unicode fails + + @complexity Linear in the length of the input. The parser is a predictive + LL(1) parser. The complexity can be higher if the parser callback function + @a cb or reading from the input @a i has a super-linear complexity. + + @note A UTF-8 byte order mark is silently ignored. 
+ + @liveexample{The example below demonstrates the `parse()` function reading + from an array.,parse__array__parser_callback_t} + + @liveexample{The example below demonstrates the `parse()` function with + and without callback function.,parse__string__parser_callback_t} + + @liveexample{The example below demonstrates the `parse()` function with + and without callback function.,parse__istream__parser_callback_t} + + @liveexample{The example below demonstrates the `parse()` function reading + from a contiguous container.,parse__contiguouscontainer__parser_callback_t} + + @since version 2.0.3 (contiguous containers); version 3.9.0 allowed to + ignore comments. + */ + template + JSON_HEDLEY_WARN_UNUSED_RESULT + static basic_json parse(InputType&& i, + const parser_callback_t cb = nullptr, + const bool allow_exceptions = true, + const bool ignore_comments = false) + { + basic_json result; + parser(detail::input_adapter(std::forward(i)), cb, allow_exceptions, ignore_comments).parse(true, result); + return result; + } + + /*! + @brief deserialize from a pair of character iterators + + The value_type of the iterator must be a integral type with size of 1, 2 or + 4 bytes, which will be interpreted respectively as UTF-8, UTF-16 and UTF-32. + + @param[in] first iterator to start of character range + @param[in] last iterator to end of character range + @param[in] cb a parser callback function of type @ref parser_callback_t + which is used to control the deserialization by filtering unwanted values + (optional) + @param[in] allow_exceptions whether to throw exceptions in case of a + parse error (optional, true by default) + @param[in] ignore_comments whether comments should be ignored and treated + like whitespace (true) or yield a parse error (true); (optional, false by + default) + + @return deserialized JSON value; in case of a parse error and + @a allow_exceptions set to `false`, the return value will be + value_t::discarded. 
+ + @throw parse_error.101 if a parse error occurs; example: `""unexpected end + of input; expected string literal""` + @throw parse_error.102 if to_unicode fails or surrogate error + @throw parse_error.103 if to_unicode fails + */ + template + JSON_HEDLEY_WARN_UNUSED_RESULT + static basic_json parse(IteratorType first, + IteratorType last, + const parser_callback_t cb = nullptr, + const bool allow_exceptions = true, + const bool ignore_comments = false) + { + basic_json result; + parser(detail::input_adapter(std::move(first), std::move(last)), cb, allow_exceptions, ignore_comments).parse(true, result); + return result; + } + + JSON_HEDLEY_WARN_UNUSED_RESULT + JSON_HEDLEY_DEPRECATED_FOR(3.8.0, parse(ptr, ptr + len)) + static basic_json parse(detail::span_input_adapter&& i, + const parser_callback_t cb = nullptr, + const bool allow_exceptions = true, + const bool ignore_comments = false) + { + basic_json result; + parser(i.get(), cb, allow_exceptions, ignore_comments).parse(true, result); + return result; + } + + /*! + @brief check if the input is valid JSON + + Unlike the @ref parse(InputType&&, const parser_callback_t,const bool) + function, this function neither throws an exception in case of invalid JSON + input (i.e., a parse error) nor creates diagnostic information. + + @tparam InputType A compatible input, for instance + - an std::istream object + - a FILE pointer + - a C-style array of characters + - a pointer to a null-terminated string of single byte characters + - an object obj for which begin(obj) and end(obj) produces a valid pair of + iterators. + + @param[in] i input to read from + @param[in] ignore_comments whether comments should be ignored and treated + like whitespace (true) or yield a parse error (true); (optional, false by + default) + + @return Whether the input read from @a i is valid JSON. + + @complexity Linear in the length of the input. The parser is a predictive + LL(1) parser. + + @note A UTF-8 byte order mark is silently ignored. 
+ + @liveexample{The example below demonstrates the `accept()` function reading + from a string.,accept__string} + */ + template + static bool accept(InputType&& i, + const bool ignore_comments = false) + { + return parser(detail::input_adapter(std::forward(i)), nullptr, false, ignore_comments).accept(true); + } + + template + static bool accept(IteratorType first, IteratorType last, + const bool ignore_comments = false) + { + return parser(detail::input_adapter(std::move(first), std::move(last)), nullptr, false, ignore_comments).accept(true); + } + + JSON_HEDLEY_WARN_UNUSED_RESULT + JSON_HEDLEY_DEPRECATED_FOR(3.8.0, accept(ptr, ptr + len)) + static bool accept(detail::span_input_adapter&& i, + const bool ignore_comments = false) + { + return parser(i.get(), nullptr, false, ignore_comments).accept(true); + } + + /*! + @brief generate SAX events + + The SAX event lister must follow the interface of @ref json_sax. + + This function reads from a compatible input. Examples are: + - an std::istream object + - a FILE pointer + - a C-style array of characters + - a pointer to a null-terminated string of single byte characters + - an object obj for which begin(obj) and end(obj) produces a valid pair of + iterators. + + @param[in] i input to read from + @param[in,out] sax SAX event listener + @param[in] format the format to parse (JSON, CBOR, MessagePack, or UBJSON) + @param[in] strict whether the input has to be consumed completely + @param[in] ignore_comments whether comments should be ignored and treated + like whitespace (true) or yield a parse error (true); (optional, false by + default); only applies to the JSON file format. + + @return return value of the last processed SAX event + + @throw parse_error.101 if a parse error occurs; example: `""unexpected end + of input; expected string literal""` + @throw parse_error.102 if to_unicode fails or surrogate error + @throw parse_error.103 if to_unicode fails + + @complexity Linear in the length of the input. 
The parser is a predictive + LL(1) parser. The complexity can be higher if the SAX consumer @a sax has + a super-linear complexity. + + @note A UTF-8 byte order mark is silently ignored. + + @liveexample{The example below demonstrates the `sax_parse()` function + reading from string and processing the events with a user-defined SAX + event consumer.,sax_parse} + + @since version 3.2.0 + */ + template + JSON_HEDLEY_NON_NULL(2) + static bool sax_parse(InputType&& i, SAX* sax, + input_format_t format = input_format_t::json, + const bool strict = true, + const bool ignore_comments = false) + { + auto ia = detail::input_adapter(std::forward(i)); + return format == input_format_t::json + ? parser(std::move(ia), nullptr, true, ignore_comments).sax_parse(sax, strict) + : detail::binary_reader(std::move(ia)).sax_parse(format, sax, strict); + } + + template + JSON_HEDLEY_NON_NULL(3) + static bool sax_parse(IteratorType first, IteratorType last, SAX* sax, + input_format_t format = input_format_t::json, + const bool strict = true, + const bool ignore_comments = false) + { + auto ia = detail::input_adapter(std::move(first), std::move(last)); + return format == input_format_t::json + ? parser(std::move(ia), nullptr, true, ignore_comments).sax_parse(sax, strict) + : detail::binary_reader(std::move(ia)).sax_parse(format, sax, strict); + } + + template + JSON_HEDLEY_DEPRECATED_FOR(3.8.0, sax_parse(ptr, ptr + len, ...)) + JSON_HEDLEY_NON_NULL(2) + static bool sax_parse(detail::span_input_adapter&& i, SAX* sax, + input_format_t format = input_format_t::json, + const bool strict = true, + const bool ignore_comments = false) + { + auto ia = i.get(); + return format == input_format_t::json + ? parser(std::move(ia), nullptr, true, ignore_comments).sax_parse(sax, strict) + : detail::binary_reader(std::move(ia)).sax_parse(format, sax, strict); + } + + /*! + @brief deserialize from stream + @deprecated This stream operator is deprecated and will be removed in + version 4.0.0 of the library. 
Please use + @ref operator>>(std::istream&, basic_json&) + instead; that is, replace calls like `j << i;` with `i >> j;`. + @since version 1.0.0; deprecated since version 3.0.0 + */ + JSON_HEDLEY_DEPRECATED_FOR(3.0.0, operator>>(std::istream&, basic_json&)) + friend std::istream& operator<<(basic_json& j, std::istream& i) + { + return operator>>(i, j); + } + + /*! + @brief deserialize from stream + + Deserializes an input stream to a JSON value. + + @param[in,out] i input stream to read a serialized JSON value from + @param[in,out] j JSON value to write the deserialized input to + + @throw parse_error.101 in case of an unexpected token + @throw parse_error.102 if to_unicode fails or surrogate error + @throw parse_error.103 if to_unicode fails + + @complexity Linear in the length of the input. The parser is a predictive + LL(1) parser. + + @note A UTF-8 byte order mark is silently ignored. + + @liveexample{The example below shows how a JSON value is constructed by + reading a serialization from a stream.,operator_deserialize} + + @sa parse(std::istream&, const parser_callback_t) for a variant with a + parser callback function to filter values while parsing + + @since version 1.0.0 + */ + friend std::istream& operator>>(std::istream& i, basic_json& j) + { + parser(detail::input_adapter(i)).parse(false, j); + return i; + } + + /// @} + + /////////////////////////// + // convenience functions // + /////////////////////////// + + /*! + @brief return the type as string + + Returns the type name as string to be used in error messages - usually to + indicate that a function was called on a wrong JSON type. 
+ + @return a string representation of a the @a m_type member: + Value type | return value + ----------- | ------------- + null | `"null"` + boolean | `"boolean"` + string | `"string"` + number | `"number"` (for all number types) + object | `"object"` + array | `"array"` + binary | `"binary"` + discarded | `"discarded"` + + @exceptionsafety No-throw guarantee: this function never throws exceptions. + + @complexity Constant. + + @liveexample{The following code exemplifies `type_name()` for all JSON + types.,type_name} + + @sa @ref type() -- return the type of the JSON value + @sa @ref operator value_t() -- return the type of the JSON value (implicit) + + @since version 1.0.0, public since 2.1.0, `const char*` and `noexcept` + since 3.0.0 + */ + JSON_HEDLEY_RETURNS_NON_NULL + const char* type_name() const noexcept + { + { + switch (m_type) + { + case value_t::null: + return "null"; + case value_t::object: + return "object"; + case value_t::array: + return "array"; + case value_t::string: + return "string"; + case value_t::boolean: + return "boolean"; + case value_t::binary: + return "binary"; + case value_t::discarded: + return "discarded"; + default: + return "number"; + } + } + } + + + private: + ////////////////////// + // member variables // + ////////////////////// + + /// the type of the current element + value_t m_type = value_t::null; + + /// the value of the current element + json_value m_value = {}; + + ////////////////////////////////////////// + // binary serialization/deserialization // + ////////////////////////////////////////// + + /// @name binary serialization/deserialization support + /// @{ + + public: + /*! + @brief create a CBOR serialization of a given JSON value + + Serializes a given JSON value @a j to a byte vector using the CBOR (Concise + Binary Object Representation) serialization format. CBOR is a binary + serialization format which aims to be more compact than JSON itself, yet + more efficient to parse. 
+ + The library uses the following mapping from JSON values types to + CBOR types according to the CBOR specification (RFC 7049): + + JSON value type | value/range | CBOR type | first byte + --------------- | ------------------------------------------ | ---------------------------------- | --------------- + null | `null` | Null | 0xF6 + boolean | `true` | True | 0xF5 + boolean | `false` | False | 0xF4 + number_integer | -9223372036854775808..-2147483649 | Negative integer (8 bytes follow) | 0x3B + number_integer | -2147483648..-32769 | Negative integer (4 bytes follow) | 0x3A + number_integer | -32768..-129 | Negative integer (2 bytes follow) | 0x39 + number_integer | -128..-25 | Negative integer (1 byte follow) | 0x38 + number_integer | -24..-1 | Negative integer | 0x20..0x37 + number_integer | 0..23 | Integer | 0x00..0x17 + number_integer | 24..255 | Unsigned integer (1 byte follow) | 0x18 + number_integer | 256..65535 | Unsigned integer (2 bytes follow) | 0x19 + number_integer | 65536..4294967295 | Unsigned integer (4 bytes follow) | 0x1A + number_integer | 4294967296..18446744073709551615 | Unsigned integer (8 bytes follow) | 0x1B + number_unsigned | 0..23 | Integer | 0x00..0x17 + number_unsigned | 24..255 | Unsigned integer (1 byte follow) | 0x18 + number_unsigned | 256..65535 | Unsigned integer (2 bytes follow) | 0x19 + number_unsigned | 65536..4294967295 | Unsigned integer (4 bytes follow) | 0x1A + number_unsigned | 4294967296..18446744073709551615 | Unsigned integer (8 bytes follow) | 0x1B + number_float | *any value representable by a float* | Single-Precision Float | 0xFA + number_float | *any value NOT representable by a float* | Double-Precision Float | 0xFB + string | *length*: 0..23 | UTF-8 string | 0x60..0x77 + string | *length*: 23..255 | UTF-8 string (1 byte follow) | 0x78 + string | *length*: 256..65535 | UTF-8 string (2 bytes follow) | 0x79 + string | *length*: 65536..4294967295 | UTF-8 string (4 bytes follow) | 0x7A + string | *length*: 
4294967296..18446744073709551615 | UTF-8 string (8 bytes follow) | 0x7B + array | *size*: 0..23 | array | 0x80..0x97 + array | *size*: 23..255 | array (1 byte follow) | 0x98 + array | *size*: 256..65535 | array (2 bytes follow) | 0x99 + array | *size*: 65536..4294967295 | array (4 bytes follow) | 0x9A + array | *size*: 4294967296..18446744073709551615 | array (8 bytes follow) | 0x9B + object | *size*: 0..23 | map | 0xA0..0xB7 + object | *size*: 23..255 | map (1 byte follow) | 0xB8 + object | *size*: 256..65535 | map (2 bytes follow) | 0xB9 + object | *size*: 65536..4294967295 | map (4 bytes follow) | 0xBA + object | *size*: 4294967296..18446744073709551615 | map (8 bytes follow) | 0xBB + binary | *size*: 0..23 | byte string | 0x40..0x57 + binary | *size*: 23..255 | byte string (1 byte follow) | 0x58 + binary | *size*: 256..65535 | byte string (2 bytes follow) | 0x59 + binary | *size*: 65536..4294967295 | byte string (4 bytes follow) | 0x5A + binary | *size*: 4294967296..18446744073709551615 | byte string (8 bytes follow) | 0x5B + + @note The mapping is **complete** in the sense that any JSON value type + can be converted to a CBOR value. + + @note If NaN or Infinity are stored inside a JSON number, they are + serialized properly. This behavior differs from the @ref dump() + function which serializes NaN or Infinity to `null`. + + @note The following CBOR types are not used in the conversion: + - UTF-8 strings terminated by "break" (0x7F) + - arrays terminated by "break" (0x9F) + - maps terminated by "break" (0xBF) + - byte strings terminated by "break" (0x5F) + - date/time (0xC0..0xC1) + - bignum (0xC2..0xC3) + - decimal fraction (0xC4) + - bigfloat (0xC5) + - expected conversions (0xD5..0xD7) + - simple values (0xE0..0xF3, 0xF8) + - undefined (0xF7) + - half-precision floats (0xF9) + - break (0xFF) + + @param[in] j JSON value to serialize + @return CBOR serialization as byte vector + + @complexity Linear in the size of the JSON value @a j. 
+ + @liveexample{The example shows the serialization of a JSON value to a byte + vector in CBOR format.,to_cbor} + + @sa http://cbor.io + @sa @ref from_cbor(detail::input_adapter&&, const bool, const bool, const cbor_tag_handler_t) for the + analogous deserialization + @sa @ref to_msgpack(const basic_json&) for the related MessagePack format + @sa @ref to_ubjson(const basic_json&, const bool, const bool) for the + related UBJSON format + + @since version 2.0.9; compact representation of floating-point numbers + since version 3.8.0 + */ + static std::vector to_cbor(const basic_json& j) + { + std::vector result; + to_cbor(j, result); + return result; + } + + static void to_cbor(const basic_json& j, detail::output_adapter o) + { + binary_writer(o).write_cbor(j); + } + + static void to_cbor(const basic_json& j, detail::output_adapter o) + { + binary_writer(o).write_cbor(j); + } + + /*! + @brief create a MessagePack serialization of a given JSON value + + Serializes a given JSON value @a j to a byte vector using the MessagePack + serialization format. MessagePack is a binary serialization format which + aims to be more compact than JSON itself, yet more efficient to parse. 
+ + The library uses the following mapping from JSON values types to + MessagePack types according to the MessagePack specification: + + JSON value type | value/range | MessagePack type | first byte + --------------- | --------------------------------- | ---------------- | ---------- + null | `null` | nil | 0xC0 + boolean | `true` | true | 0xC3 + boolean | `false` | false | 0xC2 + number_integer | -9223372036854775808..-2147483649 | int64 | 0xD3 + number_integer | -2147483648..-32769 | int32 | 0xD2 + number_integer | -32768..-129 | int16 | 0xD1 + number_integer | -128..-33 | int8 | 0xD0 + number_integer | -32..-1 | negative fixint | 0xE0..0xFF + number_integer | 0..127 | positive fixint | 0x00..0x7F + number_integer | 128..255 | uint 8 | 0xCC + number_integer | 256..65535 | uint 16 | 0xCD + number_integer | 65536..4294967295 | uint 32 | 0xCE + number_integer | 4294967296..18446744073709551615 | uint 64 | 0xCF + number_unsigned | 0..127 | positive fixint | 0x00..0x7F + number_unsigned | 128..255 | uint 8 | 0xCC + number_unsigned | 256..65535 | uint 16 | 0xCD + number_unsigned | 65536..4294967295 | uint 32 | 0xCE + number_unsigned | 4294967296..18446744073709551615 | uint 64 | 0xCF + number_float | *any value representable by a float* | float 32 | 0xCA + number_float | *any value NOT representable by a float* | float 64 | 0xCB + string | *length*: 0..31 | fixstr | 0xA0..0xBF + string | *length*: 32..255 | str 8 | 0xD9 + string | *length*: 256..65535 | str 16 | 0xDA + string | *length*: 65536..4294967295 | str 32 | 0xDB + array | *size*: 0..15 | fixarray | 0x90..0x9F + array | *size*: 16..65535 | array 16 | 0xDC + array | *size*: 65536..4294967295 | array 32 | 0xDD + object | *size*: 0..15 | fix map | 0x80..0x8F + object | *size*: 16..65535 | map 16 | 0xDE + object | *size*: 65536..4294967295 | map 32 | 0xDF + binary | *size*: 0..255 | bin 8 | 0xC4 + binary | *size*: 256..65535 | bin 16 | 0xC5 + binary | *size*: 65536..4294967295 | bin 32 | 0xC6 + + @note The mapping 
is **complete** in the sense that any JSON value type + can be converted to a MessagePack value. + + @note The following values can **not** be converted to a MessagePack value: + - strings with more than 4294967295 bytes + - byte strings with more than 4294967295 bytes + - arrays with more than 4294967295 elements + - objects with more than 4294967295 elements + + @note Any MessagePack output created @ref to_msgpack can be successfully + parsed by @ref from_msgpack. + + @note If NaN or Infinity are stored inside a JSON number, they are + serialized properly. This behavior differs from the @ref dump() + function which serializes NaN or Infinity to `null`. + + @param[in] j JSON value to serialize + @return MessagePack serialization as byte vector + + @complexity Linear in the size of the JSON value @a j. + + @liveexample{The example shows the serialization of a JSON value to a byte + vector in MessagePack format.,to_msgpack} + + @sa http://msgpack.org + @sa @ref from_msgpack for the analogous deserialization + @sa @ref to_cbor(const basic_json& for the related CBOR format + @sa @ref to_ubjson(const basic_json&, const bool, const bool) for the + related UBJSON format + + @since version 2.0.9 + */ + static std::vector to_msgpack(const basic_json& j) + { + std::vector result; + to_msgpack(j, result); + return result; + } + + static void to_msgpack(const basic_json& j, detail::output_adapter o) + { + binary_writer(o).write_msgpack(j); + } + + static void to_msgpack(const basic_json& j, detail::output_adapter o) + { + binary_writer(o).write_msgpack(j); + } + + /*! + @brief create a UBJSON serialization of a given JSON value + + Serializes a given JSON value @a j to a byte vector using the UBJSON + (Universal Binary JSON) serialization format. UBJSON aims to be more compact + than JSON itself, yet more efficient to parse. 
+ + The library uses the following mapping from JSON values types to + UBJSON types according to the UBJSON specification: + + JSON value type | value/range | UBJSON type | marker + --------------- | --------------------------------- | ----------- | ------ + null | `null` | null | `Z` + boolean | `true` | true | `T` + boolean | `false` | false | `F` + number_integer | -9223372036854775808..-2147483649 | int64 | `L` + number_integer | -2147483648..-32769 | int32 | `l` + number_integer | -32768..-129 | int16 | `I` + number_integer | -128..127 | int8 | `i` + number_integer | 128..255 | uint8 | `U` + number_integer | 256..32767 | int16 | `I` + number_integer | 32768..2147483647 | int32 | `l` + number_integer | 2147483648..9223372036854775807 | int64 | `L` + number_unsigned | 0..127 | int8 | `i` + number_unsigned | 128..255 | uint8 | `U` + number_unsigned | 256..32767 | int16 | `I` + number_unsigned | 32768..2147483647 | int32 | `l` + number_unsigned | 2147483648..9223372036854775807 | int64 | `L` + number_unsigned | 2147483649..18446744073709551615 | high-precision | `H` + number_float | *any value* | float64 | `D` + string | *with shortest length indicator* | string | `S` + array | *see notes on optimized format* | array | `[` + object | *see notes on optimized format* | map | `{` + + @note The mapping is **complete** in the sense that any JSON value type + can be converted to a UBJSON value. + + @note The following values can **not** be converted to a UBJSON value: + - strings with more than 9223372036854775807 bytes (theoretical) + + @note The following markers are not used in the conversion: + - `Z`: no-op values are not created. + - `C`: single-byte strings are serialized with `S` markers. + + @note Any UBJSON output created @ref to_ubjson can be successfully parsed + by @ref from_ubjson. + + @note If NaN or Infinity are stored inside a JSON number, they are + serialized properly. 
This behavior differs from the @ref dump() + function which serializes NaN or Infinity to `null`. + + @note The optimized formats for containers are supported: Parameter + @a use_size adds size information to the beginning of a container and + removes the closing marker. Parameter @a use_type further checks + whether all elements of a container have the same type and adds the + type marker to the beginning of the container. The @a use_type + parameter must only be used together with @a use_size = true. Note + that @a use_size = true alone may result in larger representations - + the benefit of this parameter is that the receiving side is + immediately informed on the number of elements of the container. + + @note If the JSON data contains the binary type, the value stored is a list + of integers, as suggested by the UBJSON documentation. In particular, + this means that serialization and the deserialization of a JSON + containing binary values into UBJSON and back will result in a + different JSON object. + + @param[in] j JSON value to serialize + @param[in] use_size whether to add size annotations to container types + @param[in] use_type whether to add type annotations to container types + (must be combined with @a use_size = true) + @return UBJSON serialization as byte vector + + @complexity Linear in the size of the JSON value @a j. 
+ + @liveexample{The example shows the serialization of a JSON value to a byte + vector in UBJSON format.,to_ubjson} + + @sa http://ubjson.org + @sa @ref from_ubjson(detail::input_adapter&&, const bool, const bool) for the + analogous deserialization + @sa @ref to_cbor(const basic_json& for the related CBOR format + @sa @ref to_msgpack(const basic_json&) for the related MessagePack format + + @since version 3.1.0 + */ + static std::vector to_ubjson(const basic_json& j, + const bool use_size = false, + const bool use_type = false) + { + std::vector result; + to_ubjson(j, result, use_size, use_type); + return result; + } + + static void to_ubjson(const basic_json& j, detail::output_adapter o, + const bool use_size = false, const bool use_type = false) + { + binary_writer(o).write_ubjson(j, use_size, use_type); + } + + static void to_ubjson(const basic_json& j, detail::output_adapter o, + const bool use_size = false, const bool use_type = false) + { + binary_writer(o).write_ubjson(j, use_size, use_type); + } + + + /*! + @brief Serializes the given JSON object `j` to BSON and returns a vector + containing the corresponding BSON-representation. + + BSON (Binary JSON) is a binary format in which zero or more ordered key/value pairs are + stored as a single entity (a so-called document). 
+ + The library uses the following mapping from JSON values types to BSON types: + + JSON value type | value/range | BSON type | marker + --------------- | --------------------------------- | ----------- | ------ + null | `null` | null | 0x0A + boolean | `true`, `false` | boolean | 0x08 + number_integer | -9223372036854775808..-2147483649 | int64 | 0x12 + number_integer | -2147483648..2147483647 | int32 | 0x10 + number_integer | 2147483648..9223372036854775807 | int64 | 0x12 + number_unsigned | 0..2147483647 | int32 | 0x10 + number_unsigned | 2147483648..9223372036854775807 | int64 | 0x12 + number_unsigned | 9223372036854775808..18446744073709551615| -- | -- + number_float | *any value* | double | 0x01 + string | *any value* | string | 0x02 + array | *any value* | document | 0x04 + object | *any value* | document | 0x03 + binary | *any value* | binary | 0x05 + + @warning The mapping is **incomplete**, since only JSON-objects (and things + contained therein) can be serialized to BSON. + Also, integers larger than 9223372036854775807 cannot be serialized to BSON, + and the keys may not contain U+0000, since they are serialized a + zero-terminated c-strings. + + @throw out_of_range.407 if `j.is_number_unsigned() && j.get() > 9223372036854775807` + @throw out_of_range.409 if a key in `j` contains a NULL (U+0000) + @throw type_error.317 if `!j.is_object()` + + @pre The input `j` is required to be an object: `j.is_object() == true`. + + @note Any BSON output created via @ref to_bson can be successfully parsed + by @ref from_bson. + + @param[in] j JSON value to serialize + @return BSON serialization as byte vector + + @complexity Linear in the size of the JSON value @a j. 
+ + @liveexample{The example shows the serialization of a JSON value to a byte + vector in BSON format.,to_bson} + + @sa http://bsonspec.org/spec.html + @sa @ref from_bson(detail::input_adapter&&, const bool strict) for the + analogous deserialization + @sa @ref to_ubjson(const basic_json&, const bool, const bool) for the + related UBJSON format + @sa @ref to_cbor(const basic_json&) for the related CBOR format + @sa @ref to_msgpack(const basic_json&) for the related MessagePack format + */ + static std::vector to_bson(const basic_json& j) + { + std::vector result; + to_bson(j, result); + return result; + } + + /*! + @brief Serializes the given JSON object `j` to BSON and forwards the + corresponding BSON-representation to the given output_adapter `o`. + @param j The JSON object to convert to BSON. + @param o The output adapter that receives the binary BSON representation. + @pre The input `j` shall be an object: `j.is_object() == true` + @sa @ref to_bson(const basic_json&) + */ + static void to_bson(const basic_json& j, detail::output_adapter o) + { + binary_writer(o).write_bson(j); + } + + /*! + @copydoc to_bson(const basic_json&, detail::output_adapter) + */ + static void to_bson(const basic_json& j, detail::output_adapter o) + { + binary_writer(o).write_bson(j); + } + + + /*! + @brief create a JSON value from an input in CBOR format + + Deserializes a given input @a i to a JSON value using the CBOR (Concise + Binary Object Representation) serialization format. 
+ + The library maps CBOR types to JSON value types as follows: + + CBOR type | JSON value type | first byte + ---------------------- | --------------- | ---------- + Integer | number_unsigned | 0x00..0x17 + Unsigned integer | number_unsigned | 0x18 + Unsigned integer | number_unsigned | 0x19 + Unsigned integer | number_unsigned | 0x1A + Unsigned integer | number_unsigned | 0x1B + Negative integer | number_integer | 0x20..0x37 + Negative integer | number_integer | 0x38 + Negative integer | number_integer | 0x39 + Negative integer | number_integer | 0x3A + Negative integer | number_integer | 0x3B + Byte string | binary | 0x40..0x57 + Byte string | binary | 0x58 + Byte string | binary | 0x59 + Byte string | binary | 0x5A + Byte string | binary | 0x5B + UTF-8 string | string | 0x60..0x77 + UTF-8 string | string | 0x78 + UTF-8 string | string | 0x79 + UTF-8 string | string | 0x7A + UTF-8 string | string | 0x7B + UTF-8 string | string | 0x7F + array | array | 0x80..0x97 + array | array | 0x98 + array | array | 0x99 + array | array | 0x9A + array | array | 0x9B + array | array | 0x9F + map | object | 0xA0..0xB7 + map | object | 0xB8 + map | object | 0xB9 + map | object | 0xBA + map | object | 0xBB + map | object | 0xBF + False | `false` | 0xF4 + True | `true` | 0xF5 + Null | `null` | 0xF6 + Half-Precision Float | number_float | 0xF9 + Single-Precision Float | number_float | 0xFA + Double-Precision Float | number_float | 0xFB + + @warning The mapping is **incomplete** in the sense that not all CBOR + types can be converted to a JSON value. The following CBOR types + are not supported and will yield parse errors (parse_error.112): + - date/time (0xC0..0xC1) + - bignum (0xC2..0xC3) + - decimal fraction (0xC4) + - bigfloat (0xC5) + - expected conversions (0xD5..0xD7) + - simple values (0xE0..0xF3, 0xF8) + - undefined (0xF7) + + @warning CBOR allows map keys of any type, whereas JSON only allows + strings as keys in object values. 
Therefore, CBOR maps with keys + other than UTF-8 strings are rejected (parse_error.113). + + @note Any CBOR output created @ref to_cbor can be successfully parsed by + @ref from_cbor. + + @param[in] i an input in CBOR format convertible to an input adapter + @param[in] strict whether to expect the input to be consumed until EOF + (true by default) + @param[in] allow_exceptions whether to throw exceptions in case of a + parse error (optional, true by default) + @param[in] tag_handler how to treat CBOR tags (optional, error by default) + + @return deserialized JSON value; in case of a parse error and + @a allow_exceptions set to `false`, the return value will be + value_t::discarded. + + @throw parse_error.110 if the given input ends prematurely or the end of + file was not reached when @a strict was set to true + @throw parse_error.112 if unsupported features from CBOR were + used in the given input @a v or if the input is not valid CBOR + @throw parse_error.113 if a string was expected as map key, but not found + + @complexity Linear in the size of the input @a i. + + @liveexample{The example shows the deserialization of a byte vector in CBOR + format to a JSON value.,from_cbor} + + @sa http://cbor.io + @sa @ref to_cbor(const basic_json&) for the analogous serialization + @sa @ref from_msgpack(detail::input_adapter&&, const bool, const bool) for the + related MessagePack format + @sa @ref from_ubjson(detail::input_adapter&&, const bool, const bool) for the + related UBJSON format + + @since version 2.0.9; parameter @a start_index since 2.1.1; changed to + consume input adapters, removed start_index parameter, and added + @a strict parameter since 3.0.0; added @a allow_exceptions parameter + since 3.2.0; added @a tag_handler parameter since 3.9.0. 
+    */
+    template<typename InputType>
+    JSON_HEDLEY_WARN_UNUSED_RESULT
+    static basic_json from_cbor(InputType&& i,
+                                const bool strict = true,
+                                const bool allow_exceptions = true,
+                                const cbor_tag_handler_t tag_handler = cbor_tag_handler_t::error)
+    {
+        basic_json result;
+        // SAX consumer building `result` in place; suppresses throws when
+        // allow_exceptions == false so parse failures yield `discarded`
+        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
+        auto ia = detail::input_adapter(std::forward<InputType>(i));
+        const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::cbor, &sdp, strict, tag_handler);
+        return res ? result : basic_json(value_t::discarded);
+    }
+
+    /*!
+    @copydoc from_cbor(detail::input_adapter&&, const bool, const bool, const cbor_tag_handler_t)
+    */
+    template<typename IteratorType>
+    JSON_HEDLEY_WARN_UNUSED_RESULT
+    static basic_json from_cbor(IteratorType first, IteratorType last,
+                                const bool strict = true,
+                                const bool allow_exceptions = true,
+                                const cbor_tag_handler_t tag_handler = cbor_tag_handler_t::error)
+    {
+        basic_json result;
+        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
+        auto ia = detail::input_adapter(std::move(first), std::move(last));
+        const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::cbor, &sdp, strict, tag_handler);
+        return res ? result : basic_json(value_t::discarded);
+    }
+
+    // deprecated pointer+length overload; forwards to the iterator-pair overload
+    template<typename T>
+    JSON_HEDLEY_WARN_UNUSED_RESULT
+    JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_cbor(ptr, ptr + len))
+    static basic_json from_cbor(const T* ptr, std::size_t len,
+                                const bool strict = true,
+                                const bool allow_exceptions = true,
+                                const cbor_tag_handler_t tag_handler = cbor_tag_handler_t::error)
+    {
+        return from_cbor(ptr, ptr + len, strict, allow_exceptions, tag_handler);
+    }
+
+
+    JSON_HEDLEY_WARN_UNUSED_RESULT
+    JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_cbor(ptr, ptr + len))
+    static basic_json from_cbor(detail::span_input_adapter&& i,
+                                const bool strict = true,
+                                const bool allow_exceptions = true,
+                                const cbor_tag_handler_t tag_handler = cbor_tag_handler_t::error)
+    {
+        basic_json result;
+        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
+        auto ia = i.get();
+        const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::cbor, &sdp, strict, tag_handler);
+        return res ? result : basic_json(value_t::discarded);
+    }
+
+    /*!
+    @brief create a JSON value from an input in MessagePack format
+
+    Deserializes a given input @a i to a JSON value using the MessagePack
+    serialization format.
+ + The library maps MessagePack types to JSON value types as follows: + + MessagePack type | JSON value type | first byte + ---------------- | --------------- | ---------- + positive fixint | number_unsigned | 0x00..0x7F + fixmap | object | 0x80..0x8F + fixarray | array | 0x90..0x9F + fixstr | string | 0xA0..0xBF + nil | `null` | 0xC0 + false | `false` | 0xC2 + true | `true` | 0xC3 + float 32 | number_float | 0xCA + float 64 | number_float | 0xCB + uint 8 | number_unsigned | 0xCC + uint 16 | number_unsigned | 0xCD + uint 32 | number_unsigned | 0xCE + uint 64 | number_unsigned | 0xCF + int 8 | number_integer | 0xD0 + int 16 | number_integer | 0xD1 + int 32 | number_integer | 0xD2 + int 64 | number_integer | 0xD3 + str 8 | string | 0xD9 + str 16 | string | 0xDA + str 32 | string | 0xDB + array 16 | array | 0xDC + array 32 | array | 0xDD + map 16 | object | 0xDE + map 32 | object | 0xDF + bin 8 | binary | 0xC4 + bin 16 | binary | 0xC5 + bin 32 | binary | 0xC6 + ext 8 | binary | 0xC7 + ext 16 | binary | 0xC8 + ext 32 | binary | 0xC9 + fixext 1 | binary | 0xD4 + fixext 2 | binary | 0xD5 + fixext 4 | binary | 0xD6 + fixext 8 | binary | 0xD7 + fixext 16 | binary | 0xD8 + negative fixint | number_integer | 0xE0-0xFF + + @note Any MessagePack output created @ref to_msgpack can be successfully + parsed by @ref from_msgpack. + + @param[in] i an input in MessagePack format convertible to an input + adapter + @param[in] strict whether to expect the input to be consumed until EOF + (true by default) + @param[in] allow_exceptions whether to throw exceptions in case of a + parse error (optional, true by default) + + @return deserialized JSON value; in case of a parse error and + @a allow_exceptions set to `false`, the return value will be + value_t::discarded. 
+ + @throw parse_error.110 if the given input ends prematurely or the end of + file was not reached when @a strict was set to true + @throw parse_error.112 if unsupported features from MessagePack were + used in the given input @a i or if the input is not valid MessagePack + @throw parse_error.113 if a string was expected as map key, but not found + + @complexity Linear in the size of the input @a i. + + @liveexample{The example shows the deserialization of a byte vector in + MessagePack format to a JSON value.,from_msgpack} + + @sa http://msgpack.org + @sa @ref to_msgpack(const basic_json&) for the analogous serialization + @sa @ref from_cbor(detail::input_adapter&&, const bool, const bool, const cbor_tag_handler_t) for the + related CBOR format + @sa @ref from_ubjson(detail::input_adapter&&, const bool, const bool) for + the related UBJSON format + @sa @ref from_bson(detail::input_adapter&&, const bool, const bool) for + the related BSON format + + @since version 2.0.9; parameter @a start_index since 2.1.1; changed to + consume input adapters, removed start_index parameter, and added + @a strict parameter since 3.0.0; added @a allow_exceptions parameter + since 3.2.0 + */ + template + JSON_HEDLEY_WARN_UNUSED_RESULT + static basic_json from_msgpack(InputType&& i, + const bool strict = true, + const bool allow_exceptions = true) + { + basic_json result; + detail::json_sax_dom_parser sdp(result, allow_exceptions); + auto ia = detail::input_adapter(std::forward(i)); + const bool res = binary_reader(std::move(ia)).sax_parse(input_format_t::msgpack, &sdp, strict); + return res ? result : basic_json(value_t::discarded); + } + + /*! 
+ @copydoc from_msgpack(detail::input_adapter&&, const bool, const bool) + */ + template + JSON_HEDLEY_WARN_UNUSED_RESULT + static basic_json from_msgpack(IteratorType first, IteratorType last, + const bool strict = true, + const bool allow_exceptions = true) + { + basic_json result; + detail::json_sax_dom_parser sdp(result, allow_exceptions); + auto ia = detail::input_adapter(std::move(first), std::move(last)); + const bool res = binary_reader(std::move(ia)).sax_parse(input_format_t::msgpack, &sdp, strict); + return res ? result : basic_json(value_t::discarded); + } + + + template + JSON_HEDLEY_WARN_UNUSED_RESULT + JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_msgpack(ptr, ptr + len)) + static basic_json from_msgpack(const T* ptr, std::size_t len, + const bool strict = true, + const bool allow_exceptions = true) + { + return from_msgpack(ptr, ptr + len, strict, allow_exceptions); + } + + JSON_HEDLEY_WARN_UNUSED_RESULT + JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_msgpack(ptr, ptr + len)) + static basic_json from_msgpack(detail::span_input_adapter&& i, + const bool strict = true, + const bool allow_exceptions = true) + { + basic_json result; + detail::json_sax_dom_parser sdp(result, allow_exceptions); + auto ia = i.get(); + const bool res = binary_reader(std::move(ia)).sax_parse(input_format_t::msgpack, &sdp, strict); + return res ? result : basic_json(value_t::discarded); + } + + + /*! + @brief create a JSON value from an input in UBJSON format + + Deserializes a given input @a i to a JSON value using the UBJSON (Universal + Binary JSON) serialization format. 
+ + The library maps UBJSON types to JSON value types as follows: + + UBJSON type | JSON value type | marker + ----------- | --------------------------------------- | ------ + no-op | *no value, next value is read* | `N` + null | `null` | `Z` + false | `false` | `F` + true | `true` | `T` + float32 | number_float | `d` + float64 | number_float | `D` + uint8 | number_unsigned | `U` + int8 | number_integer | `i` + int16 | number_integer | `I` + int32 | number_integer | `l` + int64 | number_integer | `L` + high-precision number | number_integer, number_unsigned, or number_float - depends on number string | 'H' + string | string | `S` + char | string | `C` + array | array (optimized values are supported) | `[` + object | object (optimized values are supported) | `{` + + @note The mapping is **complete** in the sense that any UBJSON value can + be converted to a JSON value. + + @param[in] i an input in UBJSON format convertible to an input adapter + @param[in] strict whether to expect the input to be consumed until EOF + (true by default) + @param[in] allow_exceptions whether to throw exceptions in case of a + parse error (optional, true by default) + + @return deserialized JSON value; in case of a parse error and + @a allow_exceptions set to `false`, the return value will be + value_t::discarded. + + @throw parse_error.110 if the given input ends prematurely or the end of + file was not reached when @a strict was set to true + @throw parse_error.112 if a parse error occurs + @throw parse_error.113 if a string could not be parsed successfully + + @complexity Linear in the size of the input @a i. 
+ + @liveexample{The example shows the deserialization of a byte vector in + UBJSON format to a JSON value.,from_ubjson} + + @sa http://ubjson.org + @sa @ref to_ubjson(const basic_json&, const bool, const bool) for the + analogous serialization + @sa @ref from_cbor(detail::input_adapter&&, const bool, const bool, const cbor_tag_handler_t) for the + related CBOR format + @sa @ref from_msgpack(detail::input_adapter&&, const bool, const bool) for + the related MessagePack format + @sa @ref from_bson(detail::input_adapter&&, const bool, const bool) for + the related BSON format + + @since version 3.1.0; added @a allow_exceptions parameter since 3.2.0 + */ + template + JSON_HEDLEY_WARN_UNUSED_RESULT + static basic_json from_ubjson(InputType&& i, + const bool strict = true, + const bool allow_exceptions = true) + { + basic_json result; + detail::json_sax_dom_parser sdp(result, allow_exceptions); + auto ia = detail::input_adapter(std::forward(i)); + const bool res = binary_reader(std::move(ia)).sax_parse(input_format_t::ubjson, &sdp, strict); + return res ? result : basic_json(value_t::discarded); + } + + /*! + @copydoc from_ubjson(detail::input_adapter&&, const bool, const bool) + */ + template + JSON_HEDLEY_WARN_UNUSED_RESULT + static basic_json from_ubjson(IteratorType first, IteratorType last, + const bool strict = true, + const bool allow_exceptions = true) + { + basic_json result; + detail::json_sax_dom_parser sdp(result, allow_exceptions); + auto ia = detail::input_adapter(std::move(first), std::move(last)); + const bool res = binary_reader(std::move(ia)).sax_parse(input_format_t::ubjson, &sdp, strict); + return res ? 
result : basic_json(value_t::discarded); + } + + template + JSON_HEDLEY_WARN_UNUSED_RESULT + JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_ubjson(ptr, ptr + len)) + static basic_json from_ubjson(const T* ptr, std::size_t len, + const bool strict = true, + const bool allow_exceptions = true) + { + return from_ubjson(ptr, ptr + len, strict, allow_exceptions); + } + + JSON_HEDLEY_WARN_UNUSED_RESULT + JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_ubjson(ptr, ptr + len)) + static basic_json from_ubjson(detail::span_input_adapter&& i, + const bool strict = true, + const bool allow_exceptions = true) + { + basic_json result; + detail::json_sax_dom_parser sdp(result, allow_exceptions); + auto ia = i.get(); + const bool res = binary_reader(std::move(ia)).sax_parse(input_format_t::ubjson, &sdp, strict); + return res ? result : basic_json(value_t::discarded); + } + + + /*! + @brief Create a JSON value from an input in BSON format + + Deserializes a given input @a i to a JSON value using the BSON (Binary JSON) + serialization format. + + The library maps BSON record types to JSON value types as follows: + + BSON type | BSON marker byte | JSON value type + --------------- | ---------------- | --------------------------- + double | 0x01 | number_float + string | 0x02 | string + document | 0x03 | object + array | 0x04 | array + binary | 0x05 | still unsupported + undefined | 0x06 | still unsupported + ObjectId | 0x07 | still unsupported + boolean | 0x08 | boolean + UTC Date-Time | 0x09 | still unsupported + null | 0x0A | null + Regular Expr. | 0x0B | still unsupported + DB Pointer | 0x0C | still unsupported + JavaScript Code | 0x0D | still unsupported + Symbol | 0x0E | still unsupported + JavaScript Code | 0x0F | still unsupported + int32 | 0x10 | number_integer + Timestamp | 0x11 | still unsupported + 128-bit decimal float | 0x13 | still unsupported + Max Key | 0x7F | still unsupported + Min Key | 0xFF | still unsupported + + @warning The mapping is **incomplete**. 
The unsupported mappings + are indicated in the table above. + + @param[in] i an input in BSON format convertible to an input adapter + @param[in] strict whether to expect the input to be consumed until EOF + (true by default) + @param[in] allow_exceptions whether to throw exceptions in case of a + parse error (optional, true by default) + + @return deserialized JSON value; in case of a parse error and + @a allow_exceptions set to `false`, the return value will be + value_t::discarded. + + @throw parse_error.114 if an unsupported BSON record type is encountered + + @complexity Linear in the size of the input @a i. + + @liveexample{The example shows the deserialization of a byte vector in + BSON format to a JSON value.,from_bson} + + @sa http://bsonspec.org/spec.html + @sa @ref to_bson(const basic_json&) for the analogous serialization + @sa @ref from_cbor(detail::input_adapter&&, const bool, const bool, const cbor_tag_handler_t) for the + related CBOR format + @sa @ref from_msgpack(detail::input_adapter&&, const bool, const bool) for + the related MessagePack format + @sa @ref from_ubjson(detail::input_adapter&&, const bool, const bool) for the + related UBJSON format + */ + template + JSON_HEDLEY_WARN_UNUSED_RESULT + static basic_json from_bson(InputType&& i, + const bool strict = true, + const bool allow_exceptions = true) + { + basic_json result; + detail::json_sax_dom_parser sdp(result, allow_exceptions); + auto ia = detail::input_adapter(std::forward(i)); + const bool res = binary_reader(std::move(ia)).sax_parse(input_format_t::bson, &sdp, strict); + return res ? result : basic_json(value_t::discarded); + } + + /*! 
+ @copydoc from_bson(detail::input_adapter&&, const bool, const bool) + */ + template + JSON_HEDLEY_WARN_UNUSED_RESULT + static basic_json from_bson(IteratorType first, IteratorType last, + const bool strict = true, + const bool allow_exceptions = true) + { + basic_json result; + detail::json_sax_dom_parser sdp(result, allow_exceptions); + auto ia = detail::input_adapter(std::move(first), std::move(last)); + const bool res = binary_reader(std::move(ia)).sax_parse(input_format_t::bson, &sdp, strict); + return res ? result : basic_json(value_t::discarded); + } + + template + JSON_HEDLEY_WARN_UNUSED_RESULT + JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_bson(ptr, ptr + len)) + static basic_json from_bson(const T* ptr, std::size_t len, + const bool strict = true, + const bool allow_exceptions = true) + { + return from_bson(ptr, ptr + len, strict, allow_exceptions); + } + + JSON_HEDLEY_WARN_UNUSED_RESULT + JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_bson(ptr, ptr + len)) + static basic_json from_bson(detail::span_input_adapter&& i, + const bool strict = true, + const bool allow_exceptions = true) + { + basic_json result; + detail::json_sax_dom_parser sdp(result, allow_exceptions); + auto ia = i.get(); + const bool res = binary_reader(std::move(ia)).sax_parse(input_format_t::bson, &sdp, strict); + return res ? result : basic_json(value_t::discarded); + } + /// @} + + ////////////////////////// + // JSON Pointer support // + ////////////////////////// + + /// @name JSON Pointer functions + /// @{ + + /*! + @brief access specified element via JSON Pointer + + Uses a JSON pointer to retrieve a reference to the respective JSON value. + No bound checking is performed. Similar to @ref operator[](const typename + object_t::key_type&), `null` values are created in arrays and objects if + necessary. + + In particular: + - If the JSON pointer points to an object key that does not exist, it + is created an filled with a `null` value before a reference to it + is returned. 
+ - If the JSON pointer points to an array index that does not exist, it + is created an filled with a `null` value before a reference to it + is returned. All indices between the current maximum and the given + index are also filled with `null`. + - The special value `-` is treated as a synonym for the index past the + end. + + @param[in] ptr a JSON pointer + + @return reference to the element pointed to by @a ptr + + @complexity Constant. + + @throw parse_error.106 if an array index begins with '0' + @throw parse_error.109 if an array index was not a number + @throw out_of_range.404 if the JSON pointer can not be resolved + + @liveexample{The behavior is shown in the example.,operatorjson_pointer} + + @since version 2.0.0 + */ + reference operator[](const json_pointer& ptr) + { + return ptr.get_unchecked(this); + } + + /*! + @brief access specified element via JSON Pointer + + Uses a JSON pointer to retrieve a reference to the respective JSON value. + No bound checking is performed. The function does not change the JSON + value; no `null` values are created. In particular, the special value + `-` yields an exception. + + @param[in] ptr JSON pointer to the desired element + + @return const reference to the element pointed to by @a ptr + + @complexity Constant. + + @throw parse_error.106 if an array index begins with '0' + @throw parse_error.109 if an array index was not a number + @throw out_of_range.402 if the array index '-' is used + @throw out_of_range.404 if the JSON pointer can not be resolved + + @liveexample{The behavior is shown in the example.,operatorjson_pointer_const} + + @since version 2.0.0 + */ + const_reference operator[](const json_pointer& ptr) const + { + return ptr.get_unchecked(this); + } + + /*! + @brief access specified element via JSON Pointer + + Returns a reference to the element at with specified JSON pointer @a ptr, + with bounds checking. 
+ + @param[in] ptr JSON pointer to the desired element + + @return reference to the element pointed to by @a ptr + + @throw parse_error.106 if an array index in the passed JSON pointer @a ptr + begins with '0'. See example below. + + @throw parse_error.109 if an array index in the passed JSON pointer @a ptr + is not a number. See example below. + + @throw out_of_range.401 if an array index in the passed JSON pointer @a ptr + is out of range. See example below. + + @throw out_of_range.402 if the array index '-' is used in the passed JSON + pointer @a ptr. As `at` provides checked access (and no elements are + implicitly inserted), the index '-' is always invalid. See example below. + + @throw out_of_range.403 if the JSON pointer describes a key of an object + which cannot be found. See example below. + + @throw out_of_range.404 if the JSON pointer @a ptr can not be resolved. + See example below. + + @exceptionsafety Strong guarantee: if an exception is thrown, there are no + changes in the JSON value. + + @complexity Constant. + + @since version 2.0.0 + + @liveexample{The behavior is shown in the example.,at_json_pointer} + */ + reference at(const json_pointer& ptr) + { + return ptr.get_checked(this); + } + + /*! + @brief access specified element via JSON Pointer + + Returns a const reference to the element at with specified JSON pointer @a + ptr, with bounds checking. + + @param[in] ptr JSON pointer to the desired element + + @return reference to the element pointed to by @a ptr + + @throw parse_error.106 if an array index in the passed JSON pointer @a ptr + begins with '0'. See example below. + + @throw parse_error.109 if an array index in the passed JSON pointer @a ptr + is not a number. See example below. + + @throw out_of_range.401 if an array index in the passed JSON pointer @a ptr + is out of range. See example below. + + @throw out_of_range.402 if the array index '-' is used in the passed JSON + pointer @a ptr. 
As `at` provides checked access (and no elements are + implicitly inserted), the index '-' is always invalid. See example below. + + @throw out_of_range.403 if the JSON pointer describes a key of an object + which cannot be found. See example below. + + @throw out_of_range.404 if the JSON pointer @a ptr can not be resolved. + See example below. + + @exceptionsafety Strong guarantee: if an exception is thrown, there are no + changes in the JSON value. + + @complexity Constant. + + @since version 2.0.0 + + @liveexample{The behavior is shown in the example.,at_json_pointer_const} + */ + const_reference at(const json_pointer& ptr) const + { + return ptr.get_checked(this); + } + + /*! + @brief return flattened JSON value + + The function creates a JSON object whose keys are JSON pointers (see [RFC + 6901](https://tools.ietf.org/html/rfc6901)) and whose values are all + primitive. The original JSON value can be restored using the @ref + unflatten() function. + + @return an object that maps JSON pointers to primitive values + + @note Empty objects and arrays are flattened to `null` and will not be + reconstructed correctly by the @ref unflatten() function. + + @complexity Linear in the size the JSON value. + + @liveexample{The following code shows how a JSON object is flattened to an + object whose keys consist of JSON pointers.,flatten} + + @sa @ref unflatten() for the reverse function + + @since version 2.0.0 + */ + basic_json flatten() const + { + basic_json result(value_t::object); + json_pointer::flatten("", *this, result); + return result; + } + + /*! + @brief unflatten a previously flattened JSON value + + The function restores the arbitrary nesting of a JSON value that has been + flattened before using the @ref flatten() function. The JSON value must + meet certain constraints: + 1. The value must be an object. + 2. The keys must be JSON pointers (see + [RFC 6901](https://tools.ietf.org/html/rfc6901)) + 3. The mapped values must be primitive JSON types. 
+ + @return the original JSON from a flattened version + + @note Empty objects and arrays are flattened by @ref flatten() to `null` + values and can not unflattened to their original type. Apart from + this example, for a JSON value `j`, the following is always true: + `j == j.flatten().unflatten()`. + + @complexity Linear in the size the JSON value. + + @throw type_error.314 if value is not an object + @throw type_error.315 if object values are not primitive + + @liveexample{The following code shows how a flattened JSON object is + unflattened into the original nested JSON object.,unflatten} + + @sa @ref flatten() for the reverse function + + @since version 2.0.0 + */ + basic_json unflatten() const + { + return json_pointer::unflatten(*this); + } + + /// @} + + ////////////////////////// + // JSON Patch functions // + ////////////////////////// + + /// @name JSON Patch functions + /// @{ + + /*! + @brief applies a JSON patch + + [JSON Patch](http://jsonpatch.com) defines a JSON document structure for + expressing a sequence of operations to apply to a JSON) document. With + this function, a JSON Patch is applied to the current JSON value by + executing all operations from the patch. + + @param[in] json_patch JSON patch document + @return patched document + + @note The application of a patch is atomic: Either all operations succeed + and the patched document is returned or an exception is thrown. In + any case, the original value is not changed: the patch is applied + to a copy of the value. + + @throw parse_error.104 if the JSON patch does not consist of an array of + objects + + @throw parse_error.105 if the JSON patch is malformed (e.g., mandatory + attributes are missing); example: `"operation add must have member path"` + + @throw out_of_range.401 if an array index is out of range. 
+ + @throw out_of_range.403 if a JSON pointer inside the patch could not be + resolved successfully in the current JSON value; example: `"key baz not + found"` + + @throw out_of_range.405 if JSON pointer has no parent ("add", "remove", + "move") + + @throw other_error.501 if "test" operation was unsuccessful + + @complexity Linear in the size of the JSON value and the length of the + JSON patch. As usually only a fraction of the JSON value is affected by + the patch, the complexity can usually be neglected. + + @liveexample{The following code shows how a JSON patch is applied to a + value.,patch} + + @sa @ref diff -- create a JSON patch by comparing two JSON values + + @sa [RFC 6902 (JSON Patch)](https://tools.ietf.org/html/rfc6902) + @sa [RFC 6901 (JSON Pointer)](https://tools.ietf.org/html/rfc6901) + + @since version 2.0.0 + */ + basic_json patch(const basic_json& json_patch) const + { + // make a working copy to apply the patch to + basic_json result = *this; + + // the valid JSON Patch operations + enum class patch_operations {add, remove, replace, move, copy, test, invalid}; + + const auto get_op = [](const std::string & op) + { + if (op == "add") + { + return patch_operations::add; + } + if (op == "remove") + { + return patch_operations::remove; + } + if (op == "replace") + { + return patch_operations::replace; + } + if (op == "move") + { + return patch_operations::move; + } + if (op == "copy") + { + return patch_operations::copy; + } + if (op == "test") + { + return patch_operations::test; + } + + return patch_operations::invalid; + }; + + // wrapper for "add" operation; add value at ptr + const auto operation_add = [&result](json_pointer & ptr, basic_json val) + { + // adding to the root of the target document means replacing it + if (ptr.empty()) + { + result = val; + return; + } + + // make sure the top element of the pointer exists + json_pointer top_pointer = ptr.top(); + if (top_pointer != ptr) + { + result.at(top_pointer); + } + + // get reference to 
parent of JSON pointer ptr + const auto last_path = ptr.back(); + ptr.pop_back(); + basic_json& parent = result[ptr]; + + switch (parent.m_type) + { + case value_t::null: + case value_t::object: + { + // use operator[] to add value + parent[last_path] = val; + break; + } + + case value_t::array: + { + if (last_path == "-") + { + // special case: append to back + parent.push_back(val); + } + else + { + const auto idx = json_pointer::array_index(last_path); + if (JSON_HEDLEY_UNLIKELY(idx > parent.size())) + { + // avoid undefined behavior + JSON_THROW(out_of_range::create(401, "array index " + std::to_string(idx) + " is out of range")); + } + + // default case: insert add offset + parent.insert(parent.begin() + static_cast(idx), val); + } + break; + } + + // if there exists a parent it cannot be primitive + default: // LCOV_EXCL_LINE + JSON_ASSERT(false); // LCOV_EXCL_LINE + } + }; + + // wrapper for "remove" operation; remove value at ptr + const auto operation_remove = [&result](json_pointer & ptr) + { + // get reference to parent of JSON pointer ptr + const auto last_path = ptr.back(); + ptr.pop_back(); + basic_json& parent = result.at(ptr); + + // remove child + if (parent.is_object()) + { + // perform range check + auto it = parent.find(last_path); + if (JSON_HEDLEY_LIKELY(it != parent.end())) + { + parent.erase(it); + } + else + { + JSON_THROW(out_of_range::create(403, "key '" + last_path + "' not found")); + } + } + else if (parent.is_array()) + { + // note erase performs range check + parent.erase(json_pointer::array_index(last_path)); + } + }; + + // type check: top level value must be an array + if (JSON_HEDLEY_UNLIKELY(!json_patch.is_array())) + { + JSON_THROW(parse_error::create(104, 0, "JSON patch must be an array of objects")); + } + + // iterate and apply the operations + for (const auto& val : json_patch) + { + // wrapper to get a value for an operation + const auto get_value = [&val](const std::string & op, + const std::string & member, + bool 
string_type) -> basic_json & + { + // find value + auto it = val.m_value.object->find(member); + + // context-sensitive error message + const auto error_msg = (op == "op") ? "operation" : "operation '" + op + "'"; + + // check if desired value is present + if (JSON_HEDLEY_UNLIKELY(it == val.m_value.object->end())) + { + JSON_THROW(parse_error::create(105, 0, error_msg + " must have member '" + member + "'")); + } + + // check if result is of type string + if (JSON_HEDLEY_UNLIKELY(string_type && !it->second.is_string())) + { + JSON_THROW(parse_error::create(105, 0, error_msg + " must have string member '" + member + "'")); + } + + // no error: return value + return it->second; + }; + + // type check: every element of the array must be an object + if (JSON_HEDLEY_UNLIKELY(!val.is_object())) + { + JSON_THROW(parse_error::create(104, 0, "JSON patch must be an array of objects")); + } + + // collect mandatory members + const auto op = get_value("op", "op", true).template get(); + const auto path = get_value(op, "path", true).template get(); + json_pointer ptr(path); + + switch (get_op(op)) + { + case patch_operations::add: + { + operation_add(ptr, get_value("add", "value", false)); + break; + } + + case patch_operations::remove: + { + operation_remove(ptr); + break; + } + + case patch_operations::replace: + { + // the "path" location must exist - use at() + result.at(ptr) = get_value("replace", "value", false); + break; + } + + case patch_operations::move: + { + const auto from_path = get_value("move", "from", true).template get(); + json_pointer from_ptr(from_path); + + // the "from" location must exist - use at() + basic_json v = result.at(from_ptr); + + // The move operation is functionally identical to a + // "remove" operation on the "from" location, followed + // immediately by an "add" operation at the target + // location with the value that was just removed. 
+ operation_remove(from_ptr); + operation_add(ptr, v); + break; + } + + case patch_operations::copy: + { + const auto from_path = get_value("copy", "from", true).template get(); + const json_pointer from_ptr(from_path); + + // the "from" location must exist - use at() + basic_json v = result.at(from_ptr); + + // The copy is functionally identical to an "add" + // operation at the target location using the value + // specified in the "from" member. + operation_add(ptr, v); + break; + } + + case patch_operations::test: + { + bool success = false; + JSON_TRY + { + // check if "value" matches the one at "path" + // the "path" location must exist - use at() + success = (result.at(ptr) == get_value("test", "value", false)); + } + JSON_INTERNAL_CATCH (out_of_range&) + { + // ignore out of range errors: success remains false + } + + // throw an exception if test fails + if (JSON_HEDLEY_UNLIKELY(!success)) + { + JSON_THROW(other_error::create(501, "unsuccessful: " + val.dump())); + } + + break; + } + + default: + { + // op must be "add", "remove", "replace", "move", "copy", or + // "test" + JSON_THROW(parse_error::create(105, 0, "operation value '" + op + "' is invalid")); + } + } + } + + return result; + } + + /*! + @brief creates a diff as a JSON patch + + Creates a [JSON Patch](http://jsonpatch.com) so that value @a source can + be changed into the value @a target by calling @ref patch function. + + @invariant For two JSON values @a source and @a target, the following code + yields always `true`: + @code {.cpp} + source.patch(diff(source, target)) == target; + @endcode + + @note Currently, only `remove`, `add`, and `replace` operations are + generated. + + @param[in] source JSON value to compare from + @param[in] target JSON value to compare against + @param[in] path helper value to create JSON pointers + + @return a JSON patch to convert the @a source to @a target + + @complexity Linear in the lengths of @a source and @a target. 
+ + @liveexample{The following code shows how a JSON patch is created as a + diff for two JSON values.,diff} + + @sa @ref patch -- apply a JSON patch + @sa @ref merge_patch -- apply a JSON Merge Patch + + @sa [RFC 6902 (JSON Patch)](https://tools.ietf.org/html/rfc6902) + + @since version 2.0.0 + */ + JSON_HEDLEY_WARN_UNUSED_RESULT + static basic_json diff(const basic_json& source, const basic_json& target, + const std::string& path = "") + { + // the patch + basic_json result(value_t::array); + + // if the values are the same, return empty patch + if (source == target) + { + return result; + } + + if (source.type() != target.type()) + { + // different types: replace value + result.push_back( + { + {"op", "replace"}, {"path", path}, {"value", target} + }); + return result; + } + + switch (source.type()) + { + case value_t::array: + { + // first pass: traverse common elements + std::size_t i = 0; + while (i < source.size() && i < target.size()) + { + // recursive call to compare array values at index i + auto temp_diff = diff(source[i], target[i], path + "/" + std::to_string(i)); + result.insert(result.end(), temp_diff.begin(), temp_diff.end()); + ++i; + } + + // i now reached the end of at least one array + // in a second pass, traverse the remaining elements + + // remove my remaining elements + const auto end_index = static_cast(result.size()); + while (i < source.size()) + { + // add operations in reverse order to avoid invalid + // indices + result.insert(result.begin() + end_index, object( + { + {"op", "remove"}, + {"path", path + "/" + std::to_string(i)} + })); + ++i; + } + + // add other remaining elements + while (i < target.size()) + { + result.push_back( + { + {"op", "add"}, + {"path", path + "/-"}, + {"value", target[i]} + }); + ++i; + } + + break; + } + + case value_t::object: + { + // first pass: traverse this object's elements + for (auto it = source.cbegin(); it != source.cend(); ++it) + { + // escape the key name to be used in a JSON patch + const 
auto key = json_pointer::escape(it.key()); + + if (target.find(it.key()) != target.end()) + { + // recursive call to compare object values at key it + auto temp_diff = diff(it.value(), target[it.key()], path + "/" + key); + result.insert(result.end(), temp_diff.begin(), temp_diff.end()); + } + else + { + // found a key that is not in o -> remove it + result.push_back(object( + { + {"op", "remove"}, {"path", path + "/" + key} + })); + } + } + + // second pass: traverse other object's elements + for (auto it = target.cbegin(); it != target.cend(); ++it) + { + if (source.find(it.key()) == source.end()) + { + // found a key that is not in this -> add it + const auto key = json_pointer::escape(it.key()); + result.push_back( + { + {"op", "add"}, {"path", path + "/" + key}, + {"value", it.value()} + }); + } + } + + break; + } + + default: + { + // both primitive type: replace value + result.push_back( + { + {"op", "replace"}, {"path", path}, {"value", target} + }); + break; + } + } + + return result; + } + + /// @} + + //////////////////////////////// + // JSON Merge Patch functions // + //////////////////////////////// + + /// @name JSON Merge Patch functions + /// @{ + + /*! + @brief applies a JSON Merge Patch + + The merge patch format is primarily intended for use with the HTTP PATCH + method as a means of describing a set of modifications to a target + resource's content. This function applies a merge patch to the current + JSON value. 
+ + The function implements the following algorithm from Section 2 of + [RFC 7396 (JSON Merge Patch)](https://tools.ietf.org/html/rfc7396): + + ``` + define MergePatch(Target, Patch): + if Patch is an Object: + if Target is not an Object: + Target = {} // Ignore the contents and set it to an empty Object + for each Name/Value pair in Patch: + if Value is null: + if Name exists in Target: + remove the Name/Value pair from Target + else: + Target[Name] = MergePatch(Target[Name], Value) + return Target + else: + return Patch + ``` + + Thereby, `Target` is the current object; that is, the patch is applied to + the current value. + + @param[in] apply_patch the patch to apply + + @complexity Linear in the lengths of @a patch. + + @liveexample{The following code shows how a JSON Merge Patch is applied to + a JSON document.,merge_patch} + + @sa @ref patch -- apply a JSON patch + @sa [RFC 7396 (JSON Merge Patch)](https://tools.ietf.org/html/rfc7396) + + @since version 3.0.0 + */ + void merge_patch(const basic_json& apply_patch) + { + if (apply_patch.is_object()) + { + if (!is_object()) + { + *this = object(); + } + for (auto it = apply_patch.begin(); it != apply_patch.end(); ++it) + { + if (it.value().is_null()) + { + erase(it.key()); + } + else + { + operator[](it.key()).merge_patch(it.value()); + } + } + } + else + { + *this = apply_patch; + } + } + + /// @} +}; + +/*! +@brief user-defined to_string function for JSON values + +This function implements a user-defined to_string for JSON objects. + +@param[in] j a JSON object +@return a std::string object +*/ + +NLOHMANN_BASIC_JSON_TPL_DECLARATION +std::string to_string(const NLOHMANN_BASIC_JSON_TPL& j) +{ + return j.dump(); +} +} // namespace nlohmann + +/////////////////////// +// nonmember support // +/////////////////////// + +// specialization of std::swap, and std::hash +namespace std +{ + +/// hash value for JSON objects +template<> +struct hash +{ + /*! 
+ @brief return a hash value for a JSON object + + @since version 1.0.0 + */ + std::size_t operator()(const nlohmann::json& j) const + { + return nlohmann::detail::hash(j); + } +}; + +/// specialization for std::less +/// @note: do not remove the space after '<', +/// see https://github.com/nlohmann/json/pull/679 +template<> +struct less<::nlohmann::detail::value_t> +{ + /*! + @brief compare two value_t enum values + @since version 3.0.0 + */ + bool operator()(nlohmann::detail::value_t lhs, + nlohmann::detail::value_t rhs) const noexcept + { + return nlohmann::detail::operator<(lhs, rhs); + } +}; + +// C++20 prohibit function specialization in the std namespace. +#ifndef JSON_HAS_CPP_20 + +/*! +@brief exchanges the values of two JSON objects + +@since version 1.0.0 +*/ +template<> +inline void swap(nlohmann::json& j1, nlohmann::json& j2) noexcept( + is_nothrow_move_constructible::value&& + is_nothrow_move_assignable::value + ) +{ + j1.swap(j2); +} + +#endif + +} // namespace std + +/*! +@brief user-defined string literal for JSON values + +This operator implements a user-defined string literal for JSON objects. It +can be used by adding `"_json"` to a string literal and returns a JSON object +if no parse error occurred. + +@param[in] s a string representation of a JSON object +@param[in] n the length of string @a s +@return a JSON object + +@since version 1.0.0 +*/ +JSON_HEDLEY_NON_NULL(1) +inline nlohmann::json operator "" _json(const char* s, std::size_t n) +{ + return nlohmann::json::parse(s, s + n); +} + +/*! +@brief user-defined string literal for JSON pointer + +This operator implements a user-defined string literal for JSON Pointers. It +can be used by adding `"_json_pointer"` to a string literal and returns a JSON pointer +object if no parse error occurred. 
+ +@param[in] s a string representation of a JSON Pointer +@param[in] n the length of string @a s +@return a JSON pointer object + +@since version 2.0.0 +*/ +JSON_HEDLEY_NON_NULL(1) +inline nlohmann::json::json_pointer operator "" _json_pointer(const char* s, std::size_t n) +{ + return nlohmann::json::json_pointer(std::string(s, n)); +} + +// #include + + +// restore GCC/clang diagnostic settings +#if defined(__clang__) || defined(__GNUC__) || defined(__GNUG__) + #pragma GCC diagnostic pop +#endif +#if defined(__clang__) + #pragma GCC diagnostic pop +#endif + +// clean up +#undef JSON_ASSERT +#undef JSON_INTERNAL_CATCH +#undef JSON_CATCH +#undef JSON_THROW +#undef JSON_TRY +#undef JSON_HAS_CPP_14 +#undef JSON_HAS_CPP_17 +#undef NLOHMANN_BASIC_JSON_TPL_DECLARATION +#undef NLOHMANN_BASIC_JSON_TPL +#undef JSON_EXPLICIT + +// #include +#undef JSON_HEDLEY_ALWAYS_INLINE +#undef JSON_HEDLEY_ARM_VERSION +#undef JSON_HEDLEY_ARM_VERSION_CHECK +#undef JSON_HEDLEY_ARRAY_PARAM +#undef JSON_HEDLEY_ASSUME +#undef JSON_HEDLEY_BEGIN_C_DECLS +#undef JSON_HEDLEY_CLANG_HAS_ATTRIBUTE +#undef JSON_HEDLEY_CLANG_HAS_BUILTIN +#undef JSON_HEDLEY_CLANG_HAS_CPP_ATTRIBUTE +#undef JSON_HEDLEY_CLANG_HAS_DECLSPEC_DECLSPEC_ATTRIBUTE +#undef JSON_HEDLEY_CLANG_HAS_EXTENSION +#undef JSON_HEDLEY_CLANG_HAS_FEATURE +#undef JSON_HEDLEY_CLANG_HAS_WARNING +#undef JSON_HEDLEY_COMPCERT_VERSION +#undef JSON_HEDLEY_COMPCERT_VERSION_CHECK +#undef JSON_HEDLEY_CONCAT +#undef JSON_HEDLEY_CONCAT3 +#undef JSON_HEDLEY_CONCAT3_EX +#undef JSON_HEDLEY_CONCAT_EX +#undef JSON_HEDLEY_CONST +#undef JSON_HEDLEY_CONSTEXPR +#undef JSON_HEDLEY_CONST_CAST +#undef JSON_HEDLEY_CPP_CAST +#undef JSON_HEDLEY_CRAY_VERSION +#undef JSON_HEDLEY_CRAY_VERSION_CHECK +#undef JSON_HEDLEY_C_DECL +#undef JSON_HEDLEY_DEPRECATED +#undef JSON_HEDLEY_DEPRECATED_FOR +#undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL +#undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_ +#undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED +#undef 
JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES +#undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS +#undef JSON_HEDLEY_DIAGNOSTIC_POP +#undef JSON_HEDLEY_DIAGNOSTIC_PUSH +#undef JSON_HEDLEY_DMC_VERSION +#undef JSON_HEDLEY_DMC_VERSION_CHECK +#undef JSON_HEDLEY_EMPTY_BASES +#undef JSON_HEDLEY_EMSCRIPTEN_VERSION +#undef JSON_HEDLEY_EMSCRIPTEN_VERSION_CHECK +#undef JSON_HEDLEY_END_C_DECLS +#undef JSON_HEDLEY_FLAGS +#undef JSON_HEDLEY_FLAGS_CAST +#undef JSON_HEDLEY_GCC_HAS_ATTRIBUTE +#undef JSON_HEDLEY_GCC_HAS_BUILTIN +#undef JSON_HEDLEY_GCC_HAS_CPP_ATTRIBUTE +#undef JSON_HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE +#undef JSON_HEDLEY_GCC_HAS_EXTENSION +#undef JSON_HEDLEY_GCC_HAS_FEATURE +#undef JSON_HEDLEY_GCC_HAS_WARNING +#undef JSON_HEDLEY_GCC_NOT_CLANG_VERSION_CHECK +#undef JSON_HEDLEY_GCC_VERSION +#undef JSON_HEDLEY_GCC_VERSION_CHECK +#undef JSON_HEDLEY_GNUC_HAS_ATTRIBUTE +#undef JSON_HEDLEY_GNUC_HAS_BUILTIN +#undef JSON_HEDLEY_GNUC_HAS_CPP_ATTRIBUTE +#undef JSON_HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE +#undef JSON_HEDLEY_GNUC_HAS_EXTENSION +#undef JSON_HEDLEY_GNUC_HAS_FEATURE +#undef JSON_HEDLEY_GNUC_HAS_WARNING +#undef JSON_HEDLEY_GNUC_VERSION +#undef JSON_HEDLEY_GNUC_VERSION_CHECK +#undef JSON_HEDLEY_HAS_ATTRIBUTE +#undef JSON_HEDLEY_HAS_BUILTIN +#undef JSON_HEDLEY_HAS_CPP_ATTRIBUTE +#undef JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS +#undef JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE +#undef JSON_HEDLEY_HAS_EXTENSION +#undef JSON_HEDLEY_HAS_FEATURE +#undef JSON_HEDLEY_HAS_WARNING +#undef JSON_HEDLEY_IAR_VERSION +#undef JSON_HEDLEY_IAR_VERSION_CHECK +#undef JSON_HEDLEY_IBM_VERSION +#undef JSON_HEDLEY_IBM_VERSION_CHECK +#undef JSON_HEDLEY_IMPORT +#undef JSON_HEDLEY_INLINE +#undef JSON_HEDLEY_INTEL_VERSION +#undef JSON_HEDLEY_INTEL_VERSION_CHECK +#undef JSON_HEDLEY_IS_CONSTANT +#undef JSON_HEDLEY_IS_CONSTEXPR_ +#undef JSON_HEDLEY_LIKELY +#undef JSON_HEDLEY_MALLOC +#undef JSON_HEDLEY_MESSAGE +#undef JSON_HEDLEY_MSVC_VERSION +#undef JSON_HEDLEY_MSVC_VERSION_CHECK +#undef 
JSON_HEDLEY_NEVER_INLINE +#undef JSON_HEDLEY_NON_NULL +#undef JSON_HEDLEY_NO_ESCAPE +#undef JSON_HEDLEY_NO_RETURN +#undef JSON_HEDLEY_NO_THROW +#undef JSON_HEDLEY_NULL +#undef JSON_HEDLEY_PELLES_VERSION +#undef JSON_HEDLEY_PELLES_VERSION_CHECK +#undef JSON_HEDLEY_PGI_VERSION +#undef JSON_HEDLEY_PGI_VERSION_CHECK +#undef JSON_HEDLEY_PREDICT +#undef JSON_HEDLEY_PRINTF_FORMAT +#undef JSON_HEDLEY_PRIVATE +#undef JSON_HEDLEY_PUBLIC +#undef JSON_HEDLEY_PURE +#undef JSON_HEDLEY_REINTERPRET_CAST +#undef JSON_HEDLEY_REQUIRE +#undef JSON_HEDLEY_REQUIRE_CONSTEXPR +#undef JSON_HEDLEY_REQUIRE_MSG +#undef JSON_HEDLEY_RESTRICT +#undef JSON_HEDLEY_RETURNS_NON_NULL +#undef JSON_HEDLEY_SENTINEL +#undef JSON_HEDLEY_STATIC_ASSERT +#undef JSON_HEDLEY_STATIC_CAST +#undef JSON_HEDLEY_STRINGIFY +#undef JSON_HEDLEY_STRINGIFY_EX +#undef JSON_HEDLEY_SUNPRO_VERSION +#undef JSON_HEDLEY_SUNPRO_VERSION_CHECK +#undef JSON_HEDLEY_TINYC_VERSION +#undef JSON_HEDLEY_TINYC_VERSION_CHECK +#undef JSON_HEDLEY_TI_ARMCL_VERSION +#undef JSON_HEDLEY_TI_ARMCL_VERSION_CHECK +#undef JSON_HEDLEY_TI_CL2000_VERSION +#undef JSON_HEDLEY_TI_CL2000_VERSION_CHECK +#undef JSON_HEDLEY_TI_CL430_VERSION +#undef JSON_HEDLEY_TI_CL430_VERSION_CHECK +#undef JSON_HEDLEY_TI_CL6X_VERSION +#undef JSON_HEDLEY_TI_CL6X_VERSION_CHECK +#undef JSON_HEDLEY_TI_CL7X_VERSION +#undef JSON_HEDLEY_TI_CL7X_VERSION_CHECK +#undef JSON_HEDLEY_TI_CLPRU_VERSION +#undef JSON_HEDLEY_TI_CLPRU_VERSION_CHECK +#undef JSON_HEDLEY_TI_VERSION +#undef JSON_HEDLEY_TI_VERSION_CHECK +#undef JSON_HEDLEY_UNAVAILABLE +#undef JSON_HEDLEY_UNLIKELY +#undef JSON_HEDLEY_UNPREDICTABLE +#undef JSON_HEDLEY_UNREACHABLE +#undef JSON_HEDLEY_UNREACHABLE_RETURN +#undef JSON_HEDLEY_VERSION +#undef JSON_HEDLEY_VERSION_DECODE_MAJOR +#undef JSON_HEDLEY_VERSION_DECODE_MINOR +#undef JSON_HEDLEY_VERSION_DECODE_REVISION +#undef JSON_HEDLEY_VERSION_ENCODE +#undef JSON_HEDLEY_WARNING +#undef JSON_HEDLEY_WARN_UNUSED_RESULT +#undef JSON_HEDLEY_WARN_UNUSED_RESULT_MSG +#undef 
JSON_HEDLEY_FALL_THROUGH + + + +#endif // INCLUDE_NLOHMANN_JSON_HPP_ diff --git a/source/c906_opt/Kconfig b/source/c906_opt/Kconfig index e78db7c4..d456b8ce 100644 --- a/source/c906_opt/Kconfig +++ b/source/c906_opt/Kconfig @@ -110,14 +110,14 @@ config C906_DEPTHWISE_CONVOLUTION_FP32 bool "Layer depthwise convolution fp32" default y help - Select SHL build v extension optimized convolution + Select SHL build v extension optimized depthwise_convolution config C906_DEPTHWISE_CONVOLUTION_FP16 depends on C906_SOURCE bool "Layer depthwise convolution fp16" default y help - Select SHL build v extension optimized convolution + Select SHL build v extension optimized depthwise_convolution config C906_CONVOLUTION1D_FP32 depends on C906_SOURCE @@ -138,21 +138,21 @@ config C906_DEPTHWISE_CONVOLUTION1D_FP16 bool "Layer depthwise convolution1d fp16" default y help - Select SHL build v extension optimized convolution1d + Select SHL build v extension optimized depthwise_convolution1d config C906_DIV_FP32 depends on C906_SOURCE bool "Layer div fp32" default y help - Select SHL build v extension optimized convolution1d + Select SHL build v extension optimized div config C906_DIV_FP16 depends on C906_SOURCE bool "Layer div fp16" default y help - Select SHL build v extension optimized convolution1d + Select SHL build v extension optimized div config C906_FULLYCONNECTED_FP16 depends on C906_SOURCE @@ -166,21 +166,21 @@ config C906_GEMM_FP32 bool "Layer GEMM fp32" default y help - Select SHL build v extension optimized fullyconnected + Select SHL build v extension optimized gemm config C906_GEMM_FP16 depends on C906_SOURCE bool "Layer GEMM fp16" default y help - Select SHL build v extension optimized fullyconnected + Select SHL build v extension optimized gemm config C906_GEMV_FP16 depends on C906_SOURCE bool "Layer GEMV fp16" default y help - Select SHL build v extension optimized fullyconnected + Select SHL build v extension optimized gemv config C906_GLOBAL_AVERAGEPOOL_FP32 depends 
on C906_SOURCE @@ -327,14 +327,14 @@ config C906_RELU1_FP32 bool "Layer relu1 fp32" default y help - Select SHL build v extension optimized relu + Select SHL build v extension optimized relu1 config C906_RELU1_FP16 depends on C906_SOURCE bool "Layer relu1 fp16" default y help - Select SHL build v extension optimized relu + Select SHL build v extension optimized relu1 config C906_RELU6_FP32 depends on C906_SOURCE diff --git a/source/c906_opt/capability.c b/source/c906_opt/capability.c index 2bc90d65..9472c1d2 100644 --- a/source/c906_opt/capability.c +++ b/source/c906_opt/capability.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" static int common_all_support(struct csinn_tensor *input, struct csinn_params_base *base) { @@ -84,9 +84,9 @@ int shl_c906_conv1d_cap(struct csinn_tensor *input, struct csinn_tensor *output, { int32_t kernel_w = kernel->dim[2]; int32_t stride_w = params->stride_width; - int32_t dalition_w = params->dilation_width; + int32_t dilation_w = params->dilation_width; if (input->dtype == CSINN_DTYPE_FLOAT16) { - if (kernel_w == 1 && stride_w == 1 && dalition_w == 1) { + if (kernel_w == 1 && stride_w == 1 && dilation_w == 1) { if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { return CSINN_OPT_INTRINSIC; } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { @@ -98,7 +98,7 @@ int shl_c906_conv1d_cap(struct csinn_tensor *input, struct csinn_tensor *output, return CSINN_OPT_C_REFERENCE; } } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - if (kernel_w == 1 && stride_w == 1 && dalition_w == 1) { + if (kernel_w == 1 && stride_w == 1 && dilation_w == 1) { return CSINN_OPT_ASM; } else { return CSINN_OPT_C_REFERENCE; @@ -450,25 +450,24 @@ int shl_c906_lrn_cap(struct csinn_tensor *input, struct csinn_tensor *output, int shl_c906_matmul_cap(struct csinn_tensor *mat0, struct csinn_tensor *mat1, struct csinn_tensor *output, struct csinn_matmul_params *params) { - const int dims_count = 
mat0->dim_count; int batches_a = 1; int batches_b = 1; /* compute the outer size */ - for (int i = 0; i < dims_count - 2; i++) { + for (int i = 0; i < mat0->dim_count - 2; i++) { batches_a *= mat0->dim[i]; + } + for (int i = 0; i < mat1->dim_count - 2; i++) { batches_b *= mat1->dim[i]; } if (mat0->dtype == CSINN_DTYPE_FLOAT32 && mat1->dtype == CSINN_DTYPE_FLOAT32 || mat0->dtype == CSINN_DTYPE_FLOAT16 && (mat1->dtype == CSINN_DTYPE_FLOAT16 || mat1->dtype == CSINN_DTYPE_INT8)) { - if (batches_a == batches_b) { - if (!params->trans_a && !params->trans_b) { + if (!params->trans_a && !params->trans_b) { + if (batches_a == batches_b) { return CSINN_OPT_INTRINSIC; - } - } else if (batches_a > 1 && batches_b == 1) { - if (!params->trans_a && !params->trans_b) { + } else if (batches_a > 1 && batches_b == 1) { return CSINN_OPT_INTRINSIC; } } diff --git a/source/c906_opt/fp16/abs.c b/source/c906_opt/fp16/abs.c index f55ecdbb..c4a9300f 100644 --- a/source/c906_opt/fp16/abs.c +++ b/source/c906_opt/fp16/abs.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" int shl_c906_abs_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_siso_params *params) diff --git a/source/c906_opt/fp16/add.c b/source/c906_opt/fp16/add.c index e7efe9f1..c654d1f2 100644 --- a/source/c906_opt/fp16/add.c +++ b/source/c906_opt/fp16/add.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" static int tail_coincide(struct csinn_tensor *input0, struct csinn_tensor *input1) { diff --git a/source/c906_opt/fp16/avgpool.c b/source/c906_opt/fp16/avgpool.c index ebf9e84b..e5111535 100644 --- a/source/c906_opt/fp16/avgpool.c +++ b/source/c906_opt/fp16/avgpool.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_c906.h" +#include "c906/c906.h" /* pad_left = pad_top = 0 diff --git a/source/c906_opt/fp16/cache_conv1d.c b/source/c906_opt/fp16/cache_conv1d.c index 474e69f4..9ad9fa1b 100644 --- a/source/c906_opt/fp16/cache_conv1d.c +++ b/source/c906_opt/fp16/cache_conv1d.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" int shl_c906_cache_conv1d_init(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *weight, struct csinn_tensor *bias, diff --git a/source/c906_opt/fp16/cache_matmul.c b/source/c906_opt/fp16/cache_matmul.c index 9e09338d..31e7556d 100644 --- a/source/c906_opt/fp16/cache_matmul.c +++ b/source/c906_opt/fp16/cache_matmul.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" #include "shl_memory.h" // asr data buffer diff --git a/source/c906_opt/fp16/clip.c b/source/c906_opt/fp16/clip.c index 35fcd371..2b25f7f3 100644 --- a/source/c906_opt/fp16/clip.c +++ b/source/c906_opt/fp16/clip.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" int shl_c906_clip_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_clip_params *params) diff --git a/source/c906_opt/fp16/concat.c b/source/c906_opt/fp16/concat.c index 336abcaf..f878a6a1 100644 --- a/source/c906_opt/fp16/concat.c +++ b/source/c906_opt/fp16/concat.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" /* XXX:量化信息传播,输入输出量化信息一致? */ int shl_c906_concat_fp16(struct csinn_tensor **input, struct csinn_tensor *output, diff --git a/source/c906_opt/fp16/convolution.c b/source/c906_opt/fp16/convolution.c index c7bbb31b..136401e0 100644 --- a/source/c906_opt/fp16/convolution.c +++ b/source/c906_opt/fp16/convolution.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_c906.h" +#include "c906/c906.h" /* only support layout:NCHW @@ -37,8 +37,8 @@ int shl_c906_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *o int32_t kernel_w = kernel->dim[3]; int32_t stride_h = params->stride_height; int32_t stride_w = params->stride_width; - int32_t dalition_h = params->dilation_height; - int32_t dalition_w = params->dilation_width; + int32_t dilation_h = params->dilation_height; + int32_t dilation_w = params->dilation_width; struct csinn_callback *cb = params->base.cb; if (input->sess->base_run_mode == CSINN_RM_CPU_GRAPH) { @@ -59,8 +59,8 @@ int shl_c906_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *o /* if recommend GEMM, all conv2d use GEMM */ if (params->conv_extra.conv_mode == CSINN_GEMM) { - if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && - dalition_w == 1) { + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && + dilation_w == 1) { cb->exec = shl_c906_conv1x1s1_sgemm_fp16; } else { cb->exec = shl_c906_conv_im2col_sgemm_fp16; @@ -68,17 +68,25 @@ int shl_c906_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *o return CSINN_TRUE; } - if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && - dalition_w == 1) { + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && + dilation_w == 1) { params->conv_extra.conv_mode = CSINN_GEMM; - shl_c906_conv1x1s1_sgemm_transform_kernel_fp16(kernel, params); + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_c906_conv1x1s1_sgemm_transform_kernel_fp16_w_int8(kernel, params); + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + shl_c906_conv1x1s1_sgemm_transform_kernel_fp16(kernel, params); + } cb->exec = shl_c906_conv1x1s1_sgemm_fp16; // winograd convolution condition: } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && - dalition_h == 1 && 
dalition_w == 1) { - if (params->group > 1) { + dilation_h == 1 && dilation_w == 1) { + if (params->group > 1 || (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8)) { params->conv_extra.conv_mode = CSINN_GEMM; - shl_c906_conv_im2col_sgemm_transform_kernel_fp16(kernel, params); + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_c906_conv_im2col_sgemm_transform_kernel_fp16_w_int8(kernel, params); + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + shl_c906_conv_im2col_sgemm_transform_kernel_fp16(kernel, params); + } cb->exec = shl_c906_conv_im2col_sgemm_fp16; return CSINN_TRUE; } @@ -91,12 +99,20 @@ int shl_c906_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *o cb->exec = shl_c906_conv3x3s1_winograd64_pack8_fp16; } else { params->conv_extra.conv_mode = CSINN_GEMM; - shl_c906_conv_im2col_sgemm_transform_kernel_fp16(kernel, params); + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_c906_conv_im2col_sgemm_transform_kernel_fp16_w_int8(kernel, params); + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + shl_c906_conv_im2col_sgemm_transform_kernel_fp16(kernel, params); + } cb->exec = shl_c906_conv_im2col_sgemm_fp16; } } else { params->conv_extra.conv_mode = CSINN_GEMM; - shl_c906_conv_im2col_sgemm_transform_kernel_fp16(kernel, params); + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_c906_conv_im2col_sgemm_transform_kernel_fp16_w_int8(kernel, params); + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + shl_c906_conv_im2col_sgemm_transform_kernel_fp16(kernel, params); + } cb->exec = shl_c906_conv_im2col_sgemm_fp16; } return CSINN_TRUE; diff --git a/source/c906_opt/fp16/convolution1d.c b/source/c906_opt/fp16/convolution1d.c index 3244e6f8..57021a88 100644 --- a/source/c906_opt/fp16/convolution1d.c +++ b/source/c906_opt/fp16/convolution1d.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_c906.h" +#include "c906/c906.h" int shl_c906_conv1d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, struct csinn_tensor *bias, @@ -27,10 +27,10 @@ int shl_c906_conv1d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *o int32_t in_w = input->dim[2]; int32_t kernel_w = kernel->dim[2]; int32_t stride_w = params->stride_width; - int32_t dalition_w = params->dilation_width; + int32_t dilation_w = params->dilation_width; struct csinn_callback *cb = params->base.cb; - if (kernel_w == 1 && stride_w == 1 && dalition_w == 1) { + if (kernel_w == 1 && stride_w == 1 && dilation_w == 1) { if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { shl_c906_conv1x1s1_sgemm_transform_kernel_fp16_w_int8( kernel, (struct csinn_conv2d_params *)params); diff --git a/source/c906_opt/fp16/convolution_1x1_fp16.c b/source/c906_opt/fp16/convolution_1x1_fp16.c index 466f4331..da99006f 100644 --- a/source/c906_opt/fp16/convolution_1x1_fp16.c +++ b/source/c906_opt/fp16/convolution_1x1_fp16.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_c906.h" +#include "c906/c906.h" void shl_c906_conv1x1s1_sgemm_transform_kernel_fp16(struct csinn_tensor *kernel, struct csinn_conv2d_params *params) @@ -66,13 +66,16 @@ int shl_c906_conv1x1s1_sgemm_fp16(struct csinn_tensor *input, struct csinn_tenso __fp16 *kernel_fp16 = NULL; if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { - // TODO: support per-channel quantization - int32_t zp = kernel->qinfo->zero_point; - float scale = kernel->qinfo->scale; int size = csinn_tensor_size(kernel); - int8_t *kernel_int8 = (int8_t *)kernel->data; kernel_fp16 = (__fp16 *)shl_mem_alloc(size * sizeof(__fp16)); - shl_rvv_dequantize_i8_to_f16(kernel_int8, kernel_fp16, size, zp, scale); + if (kernel->quant_channel > 1) { + shl_rvv_conv_im2col_gemm_dequantize_per_channel_i8_to_f16(kernel, params, kernel_fp16); + } else { + int8_t *kernel_int8 = (int8_t *)kernel->data; + int32_t zp = kernel->qinfo->zero_point; + float scale = kernel->qinfo->scale; + shl_rvv_dequantize_i8_to_f16(kernel_int8, kernel_fp16, size, zp, scale); + } kernel_data = kernel_fp16; } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { kernel_data = (__fp16 *)kernel->data; @@ -113,6 +116,7 @@ int shl_c906_conv1x1s1_sgemm_fp16(struct csinn_tensor *input, struct csinn_tenso shl_mem_free(pb_reorder); if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { shl_mem_free(kernel_fp16); + return CSINN_TRUE; } // requantize shl_rvv_sidcso_op_requantize_fp16(input, output, kernel); diff --git a/source/c906_opt/fp16/convolution_3x3_fp16.c b/source/c906_opt/fp16/convolution_3x3_fp16.c index 3d06b11d..3a4bb547 100644 --- a/source/c906_opt/fp16/convolution_3x3_fp16.c +++ b/source/c906_opt/fp16/convolution_3x3_fp16.c @@ -24,7 +24,7 @@ input_width <= 120 */ -#include "shl_c906.h" +#include "c906/c906.h" /* padding input for winograd input transform , and change memory layout to [n c/8 h w 8] diff --git a/source/c906_opt/fp16/convolution_gemm_fp16.c b/source/c906_opt/fp16/convolution_gemm_fp16.c index 
07299930..dbd9ef62 100644 --- a/source/c906_opt/fp16/convolution_gemm_fp16.c +++ b/source/c906_opt/fp16/convolution_gemm_fp16.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" /************************************************************************************* * reorder kernel_data inplace, means the origin kernel_data be destoried. @@ -39,13 +39,35 @@ void shl_c906_conv_im2col_sgemm_transform_kernel_fp16(struct csinn_tensor *kerne shl_mem_free(pa_reorder); } +/************************************************************************************* + * reorder kernel_data inplace, means the origin kernel_data be destoried. + * The reason to do this is that the packaging process must not consume more memory. + **************************************************************************************/ +void shl_c906_conv_im2col_sgemm_transform_kernel_fp16_w_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + int8_t *kernel_data = (int8_t *)kernel->data; + int group = params->group; + + int m = kernel->dim[0] / group; // m = out_ch / group + int k = kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; + + int8_t *pa_reorder = (int8_t *)shl_mem_alloc(group * m * k * sizeof(int8_t)); + for (int g = 0; g < group; g++) { + shl_rvv_reorder_kernel_n8_fp16_w_int8(kernel_data + g * m * k, pa_reorder + g * m * k, m, k, + k); + } + memcpy(kernel_data, pa_reorder, group * m * k * sizeof(int8_t)); + shl_mem_free(pa_reorder); +} + int shl_c906_conv_im2col_sgemm_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, struct csinn_tensor *bias, struct csinn_conv2d_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; - __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *kernel_data = NULL; __fp16 *bias_data = (__fp16 *)bias->data; int32_t group = params->group; @@ -62,7 +84,7 @@ int shl_c906_conv_im2col_sgemm_fp16(struct 
csinn_tensor *input, struct csinn_ten int32_t stride_w = params->stride_width; int32_t pad_left = params->pad_left; int32_t pad_top = params->pad_top; - int32_t pad_if_zero = pad_left + pad_top + params->pad_right + params->pad_down; + int32_t pad_non_zero = pad_left + pad_top + params->pad_right + params->pad_down; int32_t dilation_h = params->dilation_height; int32_t dilation_w = params->dilation_width; int channel_col = in_ch / group * ksize_h * ksize_w; @@ -71,56 +93,30 @@ int shl_c906_conv_im2col_sgemm_fp16(struct csinn_tensor *input, struct csinn_ten int32_t k = in_ch / group * ksize_h * ksize_w; int32_t n = out_height * out_width; + __fp16 *kernel_fp16 = NULL; + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + int size = csinn_tensor_size(kernel); + kernel_fp16 = (__fp16 *)shl_mem_alloc(size * sizeof(__fp16)); + if (kernel->quant_channel > 1) { + shl_rvv_conv_im2col_gemm_dequantize_per_channel_i8_to_f16(kernel, params, kernel_fp16); + } else { + int8_t *kernel_int8 = (int8_t *)kernel->data; + int32_t zp = kernel->qinfo->zero_point; + float scale = kernel->qinfo->scale; + shl_rvv_dequantize_i8_to_f16(kernel_int8, kernel_fp16, size, zp, scale); + } + kernel_data = kernel_fp16; + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + kernel_data = (__fp16 *)kernel->data; + } else { + shl_debug_error("kernel unsupport dtype: %d\n", kernel->dtype); + return CSINN_FALSE; + } + __fp16 *im2col_data = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); __fp16 *pb_reorder = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); - if (pad_if_zero) { - for (int i = 0; i < batch; i++) { - for (int g = 0; g < group; g++) { - // im2col - __fp16 *data_col = im2col_data; - __fp16 *channel_data = input_data; - for (int c = 0; c < in_ch / group; c++) { - for (int kh = 0; kh < ksize_h; kh++) { - for (int kw = 0; kw < ksize_w; kw++) { - int in_row = -pad_top + kh * dilation_h; - for (int oh = 0; oh < out_height; oh++) { - if (in_row >= in_height || in_row < 0) { - for (int ow 
= 0; ow < out_width; ow++) { - *data_col++ = 0.0f; - } - } else { - int in_col = -pad_left + kw * dilation_w; - for (int ow1 = 0; ow1 < out_width; ow1++) { - int col_idx = (c * out_height + oh) * out_width + ow1; - if (in_col < in_width && in_col >= 0) { - *data_col++ = channel_data[in_row * in_width + in_col]; - } else { - *data_col++ = 0.0f; - } - in_col += stride_w; - } - } - in_row += stride_h; - } - } - } - channel_data += in_height * in_width; - } - - __fp16 *pa = kernel_data + g * m * k; - __fp16 *pb = pb_reorder; - __fp16 *pc = output_data; - - // pack - shl_c906_reorder_input_fp16_1(im2col_data, pb, k, n, n); - // GEMM - shl_c906_sgemm_kernel_fp16(pc, pa, pb, m, k, n, n, bias_data + g * m); - input_data += in_ch / group * in_height * in_width; - output_data += m * n; - } - } - } else { + if (!pad_non_zero && dilation_h == 1 && dilation_w == 1) { for (int i = 0; i < batch; i++) { for (int g = 0; g < group; g++) { // im2col @@ -178,10 +174,60 @@ int shl_c906_conv_im2col_sgemm_fp16(struct csinn_tensor *input, struct csinn_ten output_data += m * n; } } + } else { + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + // im2col + __fp16 *data_col = im2col_data; + __fp16 *channel_data = input_data; + for (int c = 0; c < in_ch / group; c++) { + for (int kh = 0; kh < ksize_h; kh++) { + for (int kw = 0; kw < ksize_w; kw++) { + int in_row = -pad_top + kh * dilation_h; + for (int oh = 0; oh < out_height; oh++) { + if (in_row >= in_height || in_row < 0) { + for (int ow = 0; ow < out_width; ow++) { + *data_col++ = 0.0f; + } + } else { + int in_col = -pad_left + kw * dilation_w; + for (int ow1 = 0; ow1 < out_width; ow1++) { + int col_idx = (c * out_height + oh) * out_width + ow1; + if (in_col < in_width && in_col >= 0) { + *data_col++ = channel_data[in_row * in_width + in_col]; + } else { + *data_col++ = 0.0f; + } + in_col += stride_w; + } + } + in_row += stride_h; + } + } + } + channel_data += in_height * in_width; + } + + __fp16 *pa = kernel_data + 
g * m * k; + __fp16 *pb = pb_reorder; + __fp16 *pc = output_data; + + // pack + shl_c906_reorder_input_fp16_1(im2col_data, pb, k, n, n); + // GEMM + shl_c906_sgemm_kernel_fp16(pc, pa, pb, m, k, n, n, bias_data + g * m); + input_data += in_ch / group * in_height * in_width; + output_data += m * n; + } + } } shl_mem_free(pb_reorder); shl_mem_free(im2col_data); + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_mem_free(kernel_fp16); + return CSINN_TRUE; + } // requantize shl_rvv_sidcso_op_requantize_fp16(input, output, kernel); return CSINN_TRUE; diff --git a/source/c906_opt/fp16/depthwise_convolution.c b/source/c906_opt/fp16/depthwise_convolution.c index eb636a4a..189bac34 100644 --- a/source/c906_opt/fp16/depthwise_convolution.c +++ b/source/c906_opt/fp16/depthwise_convolution.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" int shl_c906_depthwise_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, struct csinn_tensor *bias, diff --git a/source/c906_opt/fp16/depthwise_convolution1d.c b/source/c906_opt/fp16/depthwise_convolution1d.c index 4bdac106..e7e10cac 100644 --- a/source/c906_opt/fp16/depthwise_convolution1d.c +++ b/source/c906_opt/fp16/depthwise_convolution1d.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_c906.h" +#include "c906/c906.h" int shl_c906_dwconv8s1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, struct csinn_tensor *bias, @@ -27,15 +27,31 @@ int shl_c906_dwconv8s1_fp16(struct csinn_tensor *input, struct csinn_tensor *out __fp16 *kernel_data = NULL; __fp16 *bias_data = (__fp16 *)bias->data; + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; // group = in_channel + int32_t in_w = input->dim[2]; + + int32_t out_c = output->dim[1]; + int32_t out_w = output->dim[2]; + __fp16 *kernel_fp16 = NULL; if (kernel->dtype == CSINN_DTYPE_INT8) { - // TODO: support per-channel quantization - int32_t zp = kernel->qinfo->zero_point; - float scale = kernel->qinfo->scale; - int kernel_size = csinn_tensor_size(kernel); + int size = csinn_tensor_size(kernel); int8_t *kernel_int8 = (int8_t *)kernel->data; - kernel_fp16 = (__fp16 *)shl_mem_alloc(kernel_size * sizeof(__fp16)); - shl_rvv_dequantize_i8_to_f16(kernel_int8, kernel_fp16, kernel_size, zp, scale); + kernel_fp16 = (__fp16 *)shl_mem_alloc(size * sizeof(__fp16)); + if (kernel->quant_channel > 1) { + const int maxk = kernel->dim[2]; + for (int c = 0; c < in_c; c++) { + int32_t zp = kernel->qinfo[c].zero_point; + float scale = kernel->qinfo[c].scale; + shl_rvv_dequantize_i8_to_f16(kernel_int8 + c * maxk, kernel_fp16 + c * maxk, maxk, + zp, scale); + } + } else { + int32_t zp = kernel->qinfo->zero_point; + float scale = kernel->qinfo->scale; + shl_rvv_dequantize_i8_to_f16(kernel_int8, kernel_fp16, size, zp, scale); + } kernel_data = kernel_fp16; } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { kernel_data = (__fp16 *)kernel->data; @@ -44,13 +60,6 @@ int shl_c906_dwconv8s1_fp16(struct csinn_tensor *input, struct csinn_tensor *out return CSINN_FALSE; } - int32_t batch = input->dim[0]; - int32_t in_c = input->dim[1]; // group = in_channel - int32_t in_w = input->dim[2]; - - int32_t out_c = output->dim[1]; - int32_t out_w = output->dim[2]; - if 
(params->pad_left == 0 && params->pad_right == 0) { for (int c = 0; c < in_c; c++) { __fp16 *out0 = output_data + c * out_w; @@ -97,6 +106,7 @@ int shl_c906_dwconv8s1_fp16(struct csinn_tensor *input, struct csinn_tensor *out vfloat16m1_t _acc0_tmp = vfredusum_vs_f16m1_f16m1(vundefined_f16m1(), _acc0, _tmp, vl); __fp16 res0 = vfmv_f_s_f16m1_f16(_acc0_tmp); + img0 += 1; *out0++ = res0; } } diff --git a/source/c906_opt/fp16/depthwise_convolution_3x3_fp16.c b/source/c906_opt/fp16/depthwise_convolution_3x3_fp16.c index cb42cd40..3cfcd1e1 100644 --- a/source/c906_opt/fp16/depthwise_convolution_3x3_fp16.c +++ b/source/c906_opt/fp16/depthwise_convolution_3x3_fp16.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" /* (1) Algorithm works as follows: @@ -63,7 +63,7 @@ int shl_c906_dwconv3x3s1_fp16(struct csinn_tensor *input, struct csinn_tensor *o { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; - __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *kernel_data = NULL; __fp16 *bias_data = (__fp16 *)bias->data; int32_t batch = input->dim[0]; @@ -75,6 +75,32 @@ int shl_c906_dwconv3x3s1_fp16(struct csinn_tensor *input, struct csinn_tensor *o int32_t out_h = output->dim[2]; int32_t out_w = output->dim[3]; + __fp16 *kernel_fp16 = NULL; + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + int size = csinn_tensor_size(kernel); + int8_t *kernel_int8 = (int8_t *)kernel->data; + kernel_fp16 = (__fp16 *)shl_mem_alloc(size * sizeof(__fp16)); + if (kernel->quant_channel > 1) { + const int maxk = kernel->dim[2] * kernel->dim[3]; + for (int c = 0; c < in_c; c++) { + int32_t zp = kernel->qinfo[c].zero_point; + float scale = kernel->qinfo[c].scale; + shl_rvv_dequantize_i8_to_f16(kernel_int8 + c * maxk, kernel_fp16 + c * maxk, maxk, + zp, scale); + } + } else { + int32_t zp = kernel->qinfo->zero_point; + float scale = kernel->qinfo->scale; + shl_rvv_dequantize_i8_to_f16(kernel_int8, 
kernel_fp16, size, zp, scale); + } + kernel_data = kernel_fp16; + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + kernel_data = (__fp16 *)kernel->data; + } else { + shl_debug_error("kernel unsupport dtype: %d\n", kernel->dtype); + return CSINN_FALSE; + } + __fp16 *input_padd_buf = (__fp16 *)shl_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * (in_w + params->pad_left + params->pad_right) * sizeof(__fp16)); @@ -559,6 +585,10 @@ int shl_c906_dwconv3x3s1_fp16(struct csinn_tensor *input, struct csinn_tensor *o } shl_mem_free(input_padd_buf); + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_mem_free(kernel_fp16); + return CSINN_TRUE; + } // requantize shl_rvv_sidcso_op_requantize_fp16(input, output, kernel); return CSINN_TRUE; @@ -590,7 +620,7 @@ int shl_c906_dwconv3x3s2_fp16(struct csinn_tensor *input, struct csinn_tensor *o { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; - __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *kernel_data = NULL; __fp16 *bias_data = (__fp16 *)bias->data; int32_t batch = input->dim[0]; @@ -602,6 +632,32 @@ int shl_c906_dwconv3x3s2_fp16(struct csinn_tensor *input, struct csinn_tensor *o int32_t out_h = output->dim[2]; int32_t out_w = output->dim[3]; + __fp16 *kernel_fp16 = NULL; + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + int size = csinn_tensor_size(kernel); + int8_t *kernel_int8 = (int8_t *)kernel->data; + kernel_fp16 = (__fp16 *)shl_mem_alloc(size * sizeof(__fp16)); + if (kernel->quant_channel > 1) { + const int maxk = kernel->dim[2] * kernel->dim[3]; + for (int c = 0; c < in_c; c++) { + int32_t zp = kernel->qinfo[c].zero_point; + float scale = kernel->qinfo[c].scale; + shl_rvv_dequantize_i8_to_f16(kernel_int8 + c * maxk, kernel_fp16 + c * maxk, maxk, + zp, scale); + } + } else { + int32_t zp = kernel->qinfo->zero_point; + float scale = kernel->qinfo->scale; + shl_rvv_dequantize_i8_to_f16(kernel_int8, kernel_fp16, size, zp, 
scale); + } + kernel_data = kernel_fp16; + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + kernel_data = (__fp16 *)kernel->data; + } else { + shl_debug_error("kernel unsupport dtype: %d\n", kernel->dtype); + return CSINN_FALSE; + } + __fp16 *input_padd_buf = (__fp16 *)shl_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * (in_w + params->pad_left + params->pad_right) * sizeof(__fp16)); @@ -814,6 +870,10 @@ int shl_c906_dwconv3x3s2_fp16(struct csinn_tensor *input, struct csinn_tensor *o } shl_mem_free(input_padd_buf); + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_mem_free(kernel_fp16); + return CSINN_TRUE; + } // requantize shl_rvv_sidcso_op_requantize_fp16(input, output, kernel); return CSINN_TRUE; diff --git a/source/c906_opt/fp16/depthwise_convolution_3x3_pack8_fp16.c b/source/c906_opt/fp16/depthwise_convolution_3x3_pack8_fp16.c index 8747ab39..c5b62635 100644 --- a/source/c906_opt/fp16/depthwise_convolution_3x3_pack8_fp16.c +++ b/source/c906_opt/fp16/depthwise_convolution_3x3_pack8_fp16.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" /************************************************************************************************************ c906 vlen = 128, 128/16 = 8 --> pack8, if vlen = 256 256/16 = 16 --> pack16 diff --git a/source/c906_opt/fp16/depthwise_convolution_fp16.c b/source/c906_opt/fp16/depthwise_convolution_fp16.c index 69964808..4c2f9761 100644 --- a/source/c906_opt/fp16/depthwise_convolution_fp16.c +++ b/source/c906_opt/fp16/depthwise_convolution_fp16.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_c906.h" +#include "c906/c906.h" int shl_c906_dwconv2d_s1_pad0_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, struct csinn_tensor *bias, diff --git a/source/c906_opt/fp16/div.c b/source/c906_opt/fp16/div.c index 1e5dca32..788bbd7d 100644 --- a/source/c906_opt/fp16/div.c +++ b/source/c906_opt/fp16/div.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" int shl_c906_div_init_fp16(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, struct csinn_diso_params *params) diff --git a/source/c906_opt/fp16/fullyconnected.c b/source/c906_opt/fp16/fullyconnected.c index 688006ee..40c39356 100644 --- a/source/c906_opt/fp16/fullyconnected.c +++ b/source/c906_opt/fp16/fullyconnected.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" /* change memory layout for weight matrix [out_nodes * in_nodes] by N(8) shape @@ -781,13 +781,23 @@ int shl_c906_fullyconnected_pack16_fp16(struct csinn_tensor *input, struct csinn __fp16 *weights_fp16 = NULL; if (weights->is_const && weights->dtype == CSINN_DTYPE_INT8) { - // TODO: support per-channel quantization - int32_t zp = weights->qinfo->zero_point; - float scale = weights->qinfo->scale; int size = csinn_tensor_size(weights); int8_t *weights_int8 = (int8_t *)weights->data; weights_fp16 = (__fp16 *)shl_mem_alloc(size * sizeof(__fp16)); - shl_rvv_dequantize_i8_to_f16(weights_int8, weights_fp16, size, zp, scale); + if (weights->quant_channel == 1) { + int32_t zp = weights->qinfo->zero_point; + float scale = weights->qinfo->scale; + shl_rvv_dequantize_i8_to_f16(weights_int8, weights_fp16, size, zp, scale); + } else if (weights->quant_channel == output_depth) { + // support channel quantization + for (int c = 0; c < output_depth; c++) { + int32_t zp = weights->qinfo[c].zero_point; + float scale = weights->qinfo[c].scale; + 
shl_rvv_dequantize_i8_to_f16(weights_int8 + c * accum_depth, + weights_fp16 + c * accum_depth, accum_depth, zp, + scale); + } + } weights_data = weights_fp16; } else if (weights->dtype == CSINN_DTYPE_FLOAT16) { weights_data = (__fp16 *)weights->data; @@ -933,12 +943,23 @@ int shl_c906_fullyconnected_pack16_output16_fp16(struct csinn_tensor *input, __fp16 *weights_fp16 = NULL; if (weights->is_const && weights->dtype == CSINN_DTYPE_INT8) { - int32_t zp = weights->qinfo->zero_point; - float scale = weights->qinfo->scale; int size = csinn_tensor_size(weights); int8_t *weights_int8 = (int8_t *)weights->data; weights_fp16 = (__fp16 *)shl_mem_alloc(size * sizeof(__fp16)); - shl_rvv_dequantize_i8_to_f16(weights_int8, weights_fp16, size, zp, scale); + if (weights->quant_channel == 1) { + int32_t zp = weights->qinfo->zero_point; + float scale = weights->qinfo->scale; + shl_rvv_dequantize_i8_to_f16(weights_int8, weights_fp16, size, zp, scale); + } else if (weights->quant_channel == output_depth) { + // support channel quantization + for (int c = 0; c < output_depth; c++) { + int32_t zp = weights->qinfo[c].zero_point; + float scale = weights->qinfo[c].scale; + shl_rvv_dequantize_i8_to_f16(weights_int8 + c * accum_depth, + weights_fp16 + c * accum_depth, accum_depth, zp, + scale); + } + } weights_data = weights_fp16; } else if (weights->dtype == CSINN_DTYPE_FLOAT16) { weights_data = (__fp16 *)weights->data; diff --git a/source/c906_opt/fp16/gemm_fp16.c b/source/c906_opt/fp16/gemm_fp16.c index b171a459..4003a3be 100644 --- a/source/c906_opt/fp16/gemm_fp16.c +++ b/source/c906_opt/fp16/gemm_fp16.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" /* (1) Algorithm works as follows: diff --git a/source/c906_opt/fp16/gemv_fp16.c b/source/c906_opt/fp16/gemv_fp16.c index ebb80532..a18717e0 100644 --- a/source/c906_opt/fp16/gemv_fp16.c +++ b/source/c906_opt/fp16/gemv_fp16.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_c906.h" +#include "c906/c906.h" /* change memory layout for matrix [k * n] by Z shape diff --git a/source/c906_opt/fp16/global_avgpool.c b/source/c906_opt/fp16/global_avgpool.c index a0d2058a..4f38becd 100644 --- a/source/c906_opt/fp16/global_avgpool.c +++ b/source/c906_opt/fp16/global_avgpool.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" int shl_c906_global_avgpool2d_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_pool_params *params) diff --git a/source/c906_opt/fp16/global_maxpool.c b/source/c906_opt/fp16/global_maxpool.c index f4d5f795..75e0beca 100644 --- a/source/c906_opt/fp16/global_maxpool.c +++ b/source/c906_opt/fp16/global_maxpool.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" int shl_c906_global_maxpool2d_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_pool_params *params) diff --git a/source/c906_opt/fp16/leaky_relu.c b/source/c906_opt/fp16/leaky_relu.c index 24759549..17e1ed6e 100644 --- a/source/c906_opt/fp16/leaky_relu.c +++ b/source/c906_opt/fp16/leaky_relu.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" int shl_c906_leaky_relu_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_relu_params *params) diff --git a/source/c906_opt/fp16/lrn.c b/source/c906_opt/fp16/lrn.c index 726e5f0b..aa6ebf08 100644 --- a/source/c906_opt/fp16/lrn.c +++ b/source/c906_opt/fp16/lrn.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_c906.h" +#include "c906/c906.h" int shl_c906_lrn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_lrn_params *params) diff --git a/source/c906_opt/fp16/matmul.c b/source/c906_opt/fp16/matmul.c index 19404b16..40d1e489 100644 --- a/source/c906_opt/fp16/matmul.c +++ b/source/c906_opt/fp16/matmul.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" /************************************************************* Matmul fp16_w_int8 performance on C906@1GHz @@ -208,15 +208,17 @@ static int matmul_fp16(struct csinn_tensor *mat0, struct csinn_tensor *mat1, /* compute the outer size */ for (int i = 0; i < dims_count - 2; i++) { batches_a *= mat0->dim[i]; + } + for (int i = 0; i < mat1->dim_count - 2; i++) { batches_b *= mat1->dim[i]; } const int dim_m = mat0->dim[dims_count - (params->trans_a ? 1 : 2)]; const int dim_k = mat0->dim[dims_count - (params->trans_a ? 2 : 1)]; - const int dim_n = mat1->dim[dims_count - (params->trans_b ? 2 : 1)]; + const int dim_n = mat1->dim[mat1->dim_count - (params->trans_b ? 
2 : 1)]; - if (batches_a == batches_b) { - if (!params->trans_a && !params->trans_b) { + if (!params->trans_a && !params->trans_b) { + if (batches_a == batches_b) { __fp16 *in0 = (__fp16 *)shl_mem_alloc(dim_m * dim_k * sizeof(__fp16)); __fp16 *in1 = (__fp16 *)shl_mem_alloc(dim_k * dim_n * sizeof(__fp16)); @@ -234,11 +236,7 @@ static int matmul_fp16(struct csinn_tensor *mat0, struct csinn_tensor *mat1, shl_mem_free(in1); // requantize shl_rvv_sidcso_op_requantize_fp16(mat0, output, mat1); - } else { - shl_ref_matmul_quant(mat0, mat1, output, params); - } - } else if (batches_a > 1 && batches_b == 1) { - if (!params->trans_a && !params->trans_b) { + } else if (batches_a > 1 && batches_b == 1) { __fp16 *in0 = (__fp16 *)shl_mem_alloc(dim_m * dim_k * sizeof(__fp16)); __fp16 *in1 = (__fp16 *)shl_mem_alloc(dim_k * dim_n * sizeof(__fp16)); @@ -260,8 +258,7 @@ static int matmul_fp16(struct csinn_tensor *mat0, struct csinn_tensor *mat1, return CSINN_FALSE; } } else { - shl_debug_error("matmul unsupport this broadcast\n"); - return CSINN_FALSE; + return shl_ref_matmul_quant(mat0, mat1, output, params); } return CSINN_TRUE; @@ -387,18 +384,20 @@ static int matmul_fp16_w_int8(struct csinn_tensor *mat0, struct csinn_tensor *ma /* compute the outer size */ for (int i = 0; i < dims_count - 2; i++) { batches_a *= mat0->dim[i]; + } + for (int i = 0; i < mat1->dim_count - 2; i++) { batches_b *= mat1->dim[i]; } const int dim_m = mat0->dim[dims_count - (params->trans_a ? 1 : 2)]; const int dim_k = mat0->dim[dims_count - (params->trans_a ? 2 : 1)]; - const int dim_n = mat1->dim[dims_count - (params->trans_b ? 2 : 1)]; + const int dim_n = mat1->dim[mat1->dim_count - (params->trans_b ? 
2 : 1)]; int32_t zp = mat1->qinfo->zero_point; float scale = mat1->qinfo->scale; - if (batches_a == batches_b) { - if (!params->trans_a && !params->trans_b) { + if (!params->trans_a && !params->trans_b) { + if (batches_a == batches_b) { for (int b = 0; b < batches_a; b++) { shl_c906_matmul_4x32_fp16_w_int8(output_data, mat0_data, mat1_data, dim_m, dim_k, dim_n, zp, scale); @@ -407,11 +406,7 @@ static int matmul_fp16_w_int8(struct csinn_tensor *mat0, struct csinn_tensor *ma mat1_data += dim_n * dim_k; output_data += dim_m * dim_n; } - } else { - shl_ref_matmul_quant(mat0, mat1, output, params); - } - } else if (batches_a > 1 && batches_b == 1) { - if (!params->trans_a && !params->trans_b) { + } else if (batches_a > 1 && batches_b == 1) { for (int b = 0; b < batches_a; b++) { /* TODO: mat1_data dequantize once */ shl_c906_matmul_4x32_fp16_w_int8(output_data, mat0_data, mat1_data, dim_m, dim_k, @@ -424,8 +419,7 @@ static int matmul_fp16_w_int8(struct csinn_tensor *mat0, struct csinn_tensor *ma return CSINN_FALSE; } } else { - shl_debug_error("matmul unsupport this broadcast\n"); - return CSINN_FALSE; + return shl_ref_matmul_quant(mat0, mat1, output, params); } return CSINN_TRUE; @@ -499,23 +493,25 @@ int shl_c906_matmul_init_fp16(struct csinn_tensor *mat0, struct csinn_tensor *ma { struct csinn_callback *cb = params->base.cb; const int dim_k = mat1->dim[mat1->dim_count - (params->trans_b ? 
1 : 2)]; - if (mat0->dtype == CSINN_DTYPE_FLOAT16) { - if (mat1->is_const && mat1->dtype == CSINN_DTYPE_INT8) { - shl_c906_matmul_reorder_weight_z32_int8(mat1); - } else if (mat1->dtype == CSINN_DTYPE_FLOAT16) { - if (dim_k > MATMUL_K_BLK) { - if (mat1->is_const) { - shl_rvv_matmul_reorder_weight_fp16(mat1, MATMUL_K_BLK, MATMUL_N_BLK); + if (!params->trans_a && !params->trans_b) { + if (mat0->dtype == CSINN_DTYPE_FLOAT16) { + if (mat1->is_const && mat1->dtype == CSINN_DTYPE_INT8) { + shl_c906_matmul_reorder_weight_z32_int8(mat1); + } else if (mat1->dtype == CSINN_DTYPE_FLOAT16) { + if (dim_k > MATMUL_K_BLK) { + if (mat1->is_const) { + shl_rvv_matmul_reorder_weight_fp16(mat1, MATMUL_K_BLK, MATMUL_N_BLK); + } } } - } else { - shl_debug_error("mat1 unsupported dtype: %d\n", mat1->dtype); - return CSINN_FALSE; + cb->exec = shl_c906_matmul_fp16; } - cb->exec = shl_c906_matmul_fp16; - } else { - shl_debug_error("mat0 unsupport dtype: %d\n", mat0->dtype); - return CSINN_FALSE; + } + if (cb->exec == NULL) { + shl_debug_warning( + "matmul is not optimized to achieve under this condition, call reference func " + "replaced.\n"); + cb->exec = shl_ref_matmul_quant; } return CSINN_TRUE; } diff --git a/source/c906_opt/fp16/maxpool.c b/source/c906_opt/fp16/maxpool.c index 3d053d2b..c5db8514 100644 --- a/source/c906_opt/fp16/maxpool.c +++ b/source/c906_opt/fp16/maxpool.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" /* pad_left = pad_top = 0 diff --git a/source/c906_opt/fp16/minimum.c b/source/c906_opt/fp16/minimum.c index 61f5bb87..7ee6a754 100644 --- a/source/c906_opt/fp16/minimum.c +++ b/source/c906_opt/fp16/minimum.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_c906.h" +#include "c906/c906.h" static void element_minimum_fp16(__fp16 *input0, __fp16 *input1, __fp16 *output, int size) { diff --git a/source/c906_opt/fp16/mul.c b/source/c906_opt/fp16/mul.c index 3c0bda74..5d02e0bb 100644 --- a/source/c906_opt/fp16/mul.c +++ b/source/c906_opt/fp16/mul.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" static void element_mul_fp16(__fp16 *input0, __fp16 *input1, __fp16 *output, int size) { diff --git a/source/c906_opt/fp16/pad.c b/source/c906_opt/fp16/pad.c index a011f951..765c4ff1 100644 --- a/source/c906_opt/fp16/pad.c +++ b/source/c906_opt/fp16/pad.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" int shl_c906_pad_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_pad_params *params) diff --git a/source/c906_opt/fp16/prelu.c b/source/c906_opt/fp16/prelu.c index 53155e6b..1f41fcea 100644 --- a/source/c906_opt/fp16/prelu.c +++ b/source/c906_opt/fp16/prelu.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" // nchw layout int shl_c906_prelu_fp16(struct csinn_tensor *input, struct csinn_tensor *alpha, diff --git a/source/c906_opt/fp16/reduce_sum.c b/source/c906_opt/fp16/reduce_sum.c index 4362a7fc..43659680 100644 --- a/source/c906_opt/fp16/reduce_sum.c +++ b/source/c906_opt/fp16/reduce_sum.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" // reduce_sum int shl_c906_reduce_sum_fp16(struct csinn_tensor *input, struct csinn_tensor *output, diff --git a/source/c906_opt/fp16/relu.c b/source/c906_opt/fp16/relu.c index 27690c8d..37706797 100644 --- a/source/c906_opt/fp16/relu.c +++ b/source/c906_opt/fp16/relu.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_c906.h" +#include "c906/c906.h" int shl_c906_relu_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_relu_params *params) diff --git a/source/c906_opt/fp16/relu1.c b/source/c906_opt/fp16/relu1.c index 0f891658..416a1160 100644 --- a/source/c906_opt/fp16/relu1.c +++ b/source/c906_opt/fp16/relu1.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" int shl_c906_relu1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_relu_params *params) diff --git a/source/c906_opt/fp16/relu6.c b/source/c906_opt/fp16/relu6.c index 5091074a..a583c72b 100644 --- a/source/c906_opt/fp16/relu6.c +++ b/source/c906_opt/fp16/relu6.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" int shl_c906_relu6_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_relu_params *params) diff --git a/source/c906_opt/fp16/reshape.c b/source/c906_opt/fp16/reshape.c index 059e4fa3..9ed172b8 100644 --- a/source/c906_opt/fp16/reshape.c +++ b/source/c906_opt/fp16/reshape.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" int shl_c906_reshape_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_reshape_params *params) diff --git a/source/c906_opt/fp16/split.c b/source/c906_opt/fp16/split.c index 5a6a7869..050f5951 100644 --- a/source/c906_opt/fp16/split.c +++ b/source/c906_opt/fp16/split.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" /* XXX:量化信息传播,输入输出量化信息一致? */ int shl_c906_split_fp16(struct csinn_tensor *input, struct csinn_tensor **output, diff --git a/source/c906_opt/fp16/sub.c b/source/c906_opt/fp16/sub.c index 8b11c590..9910da98 100644 --- a/source/c906_opt/fp16/sub.c +++ b/source/c906_opt/fp16/sub.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_c906.h" +#include "c906/c906.h" static void element_sub_fp16(__fp16 *input0, __fp16 *input1, __fp16 *output, int size) { diff --git a/source/c906_opt/fp32/abs.c b/source/c906_opt/fp32/abs.c index 1b3bf4d1..11bc3b36 100644 --- a/source/c906_opt/fp32/abs.c +++ b/source/c906_opt/fp32/abs.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" int shl_c906_abs_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_siso_params *params) diff --git a/source/c906_opt/fp32/add.c b/source/c906_opt/fp32/add.c index 04769c77..96efe568 100644 --- a/source/c906_opt/fp32/add.c +++ b/source/c906_opt/fp32/add.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" static int tail_coincide(struct csinn_tensor *input0, struct csinn_tensor *input1) { diff --git a/source/c906_opt/fp32/avgpool.c b/source/c906_opt/fp32/avgpool.c index c66aab8a..91402c0c 100644 --- a/source/c906_opt/fp32/avgpool.c +++ b/source/c906_opt/fp32/avgpool.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" /* pad_left = pad_top = 0 diff --git a/source/c906_opt/fp32/broadcast_to.c b/source/c906_opt/fp32/broadcast_to.c index 52623505..af2eb8fe 100644 --- a/source/c906_opt/fp32/broadcast_to.c +++ b/source/c906_opt/fp32/broadcast_to.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" int shl_c906_broadcast_to_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_broadcast_to_params *params) diff --git a/source/c906_opt/fp32/clip.c b/source/c906_opt/fp32/clip.c index e5d84833..36f3750d 100644 --- a/source/c906_opt/fp32/clip.c +++ b/source/c906_opt/fp32/clip.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_c906.h" +#include "c906/c906.h" int shl_c906_clip_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_clip_params *params) diff --git a/source/c906_opt/fp32/concat.c b/source/c906_opt/fp32/concat.c index e7a2f08b..87ca2187 100644 --- a/source/c906_opt/fp32/concat.c +++ b/source/c906_opt/fp32/concat.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" int shl_c906_concat_f32(struct csinn_tensor **input, struct csinn_tensor *output, struct csinn_concat_params *params) diff --git a/source/c906_opt/fp32/convolution.c b/source/c906_opt/fp32/convolution.c index 1450d744..f9ef4af3 100644 --- a/source/c906_opt/fp32/convolution.c +++ b/source/c906_opt/fp32/convolution.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" /* only support layout:NCHW @@ -36,8 +36,8 @@ int shl_c906_conv2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *o int32_t kernel_w = kernel->dim[3]; int32_t stride_h = params->stride_height; int32_t stride_w = params->stride_width; - int32_t dalition_h = params->dilation_height; - int32_t dalition_w = params->dilation_width; + int32_t dilation_h = params->dilation_height; + int32_t dilation_w = params->dilation_width; struct csinn_callback *cb = params->base.cb; if (input->sess->base_run_mode == CSINN_RM_CPU_GRAPH) { @@ -56,14 +56,14 @@ int shl_c906_conv2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *o return CSINN_FALSE; } - if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && - dalition_w == 1) { + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && + dilation_w == 1) { params->conv_extra.conv_mode = CSINN_GEMM; shl_c906_conv1x1s1_sgemm_transform_kernel(kernel, params); cb->exec = shl_c906_conv1x1s1_sgemm; // winograd convolution condition: } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 
&& - dalition_h == 1 && dalition_w == 1) { + dilation_h == 1 && dilation_w == 1) { if (params->group > 1) { params->conv_extra.conv_mode = CSINN_GEMM; shl_c906_conv_im2col_sgemm_transform_kernel(kernel, params); diff --git a/source/c906_opt/fp32/convolution1d.c b/source/c906_opt/fp32/convolution1d.c index b4ea9204..c1428079 100644 --- a/source/c906_opt/fp32/convolution1d.c +++ b/source/c906_opt/fp32/convolution1d.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" int shl_c906_conv1d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, struct csinn_tensor *bias, @@ -27,10 +27,10 @@ int shl_c906_conv1d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *o int32_t in_w = input->dim[2]; int32_t kernel_w = kernel->dim[2]; int32_t stride_w = params->stride_width; - int32_t dalition_w = params->dilation_width; + int32_t dilation_w = params->dilation_width; struct csinn_callback *cb = params->base.cb; - if (kernel_w == 1 && stride_w == 1 && dalition_w == 1) { + if (kernel_w == 1 && stride_w == 1 && dilation_w == 1) { shl_c906_conv1x1s1_sgemm_transform_kernel(kernel, (struct csinn_conv2d_params *)params); cb->exec = shl_c906_conv1x1s1_sgemm; } else { diff --git a/source/c906_opt/fp32/convolution_1x1_fp32.c b/source/c906_opt/fp32/convolution_1x1_fp32.c index 941ed62f..deedfdf4 100644 --- a/source/c906_opt/fp32/convolution_1x1_fp32.c +++ b/source/c906_opt/fp32/convolution_1x1_fp32.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_c906.h" +#include "c906/c906.h" void shl_c906_conv1x1s1_sgemm_transform_kernel(struct csinn_tensor *kernel, struct csinn_conv2d_params *params) diff --git a/source/c906_opt/fp32/convolution_3x3_fp32.c b/source/c906_opt/fp32/convolution_3x3_fp32.c index bccf01c4..7ecb4e5d 100644 --- a/source/c906_opt/fp32/convolution_3x3_fp32.c +++ b/source/c906_opt/fp32/convolution_3x3_fp32.c @@ -24,7 +24,7 @@ input_width <= 120 */ -#include "shl_c906.h" +#include "c906/c906.h" void shl_c906_conv3x3s1_winograd23_transform_kernel(struct csinn_tensor *o_kernel, struct csinn_tensor *t_kernel) diff --git a/source/c906_opt/fp32/convolution_sgemm_fp32.c b/source/c906_opt/fp32/convolution_sgemm_fp32.c index e2ce0b60..9215847d 100644 --- a/source/c906_opt/fp32/convolution_sgemm_fp32.c +++ b/source/c906_opt/fp32/convolution_sgemm_fp32.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" /************************************************************************************* * reorder kernel_data inplace, means the origin kernel_data be destoried. diff --git a/source/c906_opt/fp32/depthwise_convolution.c b/source/c906_opt/fp32/depthwise_convolution.c index 22054782..490fb4d9 100644 --- a/source/c906_opt/fp32/depthwise_convolution.c +++ b/source/c906_opt/fp32/depthwise_convolution.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" int shl_c906_depthwise_conv2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, struct csinn_tensor *bias, diff --git a/source/c906_opt/fp32/depthwise_convolution_3x3_fp32.c b/source/c906_opt/fp32/depthwise_convolution_3x3_fp32.c index 4a6f9be4..017075d4 100644 --- a/source/c906_opt/fp32/depthwise_convolution_3x3_fp32.c +++ b/source/c906_opt/fp32/depthwise_convolution_3x3_fp32.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_c906.h" +#include "c906/c906.h" #ifndef DWCONV3X3S1 #define DWCONV3X3S1 shl_c906_dwconv3x3s1 diff --git a/source/c906_opt/fp32/depthwise_convolution_3x3_pack4_fp32.c b/source/c906_opt/fp32/depthwise_convolution_3x3_pack4_fp32.c index 3bb6166c..9c5fdaac 100644 --- a/source/c906_opt/fp32/depthwise_convolution_3x3_pack4_fp32.c +++ b/source/c906_opt/fp32/depthwise_convolution_3x3_pack4_fp32.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" #ifndef DWCONV3X3S1_PACK4 #define DWCONV3X3S1_PACK4 shl_c906_dwconv3x3s1_pack4 diff --git a/source/c906_opt/fp32/depthwise_convolution_5x5_fp32.c b/source/c906_opt/fp32/depthwise_convolution_5x5_fp32.c index 520fe732..cfbb3b02 100644 --- a/source/c906_opt/fp32/depthwise_convolution_5x5_fp32.c +++ b/source/c906_opt/fp32/depthwise_convolution_5x5_fp32.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" #ifndef DWCONV5X5S1 #define DWCONV5X5S1 shl_c906_dwconv5x5s1 diff --git a/source/c906_opt/fp32/div.c b/source/c906_opt/fp32/div.c index 23c31df8..aa73c86a 100644 --- a/source/c906_opt/fp32/div.c +++ b/source/c906_opt/fp32/div.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" int shl_c906_div_init_fp32(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, struct csinn_diso_params *params) { diff --git a/source/c906_opt/fp32/gemm_fp32.c b/source/c906_opt/fp32/gemm_fp32.c index 8385d2e0..16a4d8a2 100644 --- a/source/c906_opt/fp32/gemm_fp32.c +++ b/source/c906_opt/fp32/gemm_fp32.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_c906.h" +#include "c906/c906.h" /* The matrices are stored in row-major order */ #define A(i, j) a[(i)*lda + (j)] diff --git a/source/c906_opt/fp32/global_avgpool.c b/source/c906_opt/fp32/global_avgpool.c index 5296adad..940f2c87 100644 --- a/source/c906_opt/fp32/global_avgpool.c +++ b/source/c906_opt/fp32/global_avgpool.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" int shl_c906_global_avgpool2d_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_pool_params *params) diff --git a/source/c906_opt/fp32/global_maxpool.c b/source/c906_opt/fp32/global_maxpool.c index fb7d16e7..5654fa76 100644 --- a/source/c906_opt/fp32/global_maxpool.c +++ b/source/c906_opt/fp32/global_maxpool.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" int shl_c906_global_maxpool2d_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_pool_params *params) diff --git a/source/c906_opt/fp32/leaky_relu.c b/source/c906_opt/fp32/leaky_relu.c index 11a20699..4b8aee98 100644 --- a/source/c906_opt/fp32/leaky_relu.c +++ b/source/c906_opt/fp32/leaky_relu.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" int shl_c906_leaky_relu_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_relu_params *params) diff --git a/source/c906_opt/fp32/matmul.c b/source/c906_opt/fp32/matmul.c index b504913e..706a8bdb 100644 --- a/source/c906_opt/fp32/matmul.c +++ b/source/c906_opt/fp32/matmul.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_c906.h" +#include "c906/c906.h" /************************************************************* Matmul fp32 performance on C906@1GHz @@ -46,19 +46,21 @@ int shl_rvv_matmul_init_fp32(struct csinn_tensor *mat0, struct csinn_tensor *mat struct csinn_tensor *output, struct csinn_matmul_params *params) { struct csinn_callback *cb = params->base.cb; - if (mat0->dtype == CSINN_DTYPE_FLOAT32) { - if (mat1->dtype == CSINN_DTYPE_FLOAT32) { - if (mat1->is_const) { - shl_rvv_matmul_reorder_weight_fp32(mat1, MATMUL_K_BLK, MATMUL_N_BLK); + if (!params->trans_a && !params->trans_b) { + if (mat0->dtype == CSINN_DTYPE_FLOAT32) { + if (mat1->dtype == CSINN_DTYPE_FLOAT32) { + if (mat1->is_const) { + shl_rvv_matmul_reorder_weight_fp32(mat1, MATMUL_K_BLK, MATMUL_N_BLK); + } + cb->exec = shl_c906_matmul_fp32; } - cb->exec = shl_c906_matmul_fp32; - } else { - shl_debug_error("mat1 unsupported dtype: %d\n", mat1->dtype); - return CSINN_FALSE; } - } else { - shl_debug_error("mat0 unsupported dtype: %d\n", mat0->dtype); - return CSINN_FALSE; + } + if (cb->exec == NULL) { + shl_debug_warning( + "matmul is not optimized to achieve under this condition, call reference func " + "replaced.\n"); + cb->exec = shl_ref_matmul_quant; } return CSINN_TRUE; } diff --git a/source/c906_opt/fp32/maxpool.c b/source/c906_opt/fp32/maxpool.c index 085917d9..4a9d1709 100644 --- a/source/c906_opt/fp32/maxpool.c +++ b/source/c906_opt/fp32/maxpool.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" /* pad_left = pad_top = 0 diff --git a/source/c906_opt/fp32/minimum.c b/source/c906_opt/fp32/minimum.c index 89a47c26..1d2c0478 100644 --- a/source/c906_opt/fp32/minimum.c +++ b/source/c906_opt/fp32/minimum.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_c906.h" +#include "c906/c906.h" static void element_minimum_f32(float *input0, float *input1, float *output, int size) { diff --git a/source/c906_opt/fp32/mul.c b/source/c906_opt/fp32/mul.c index ee7c425f..cbcbc77f 100644 --- a/source/c906_opt/fp32/mul.c +++ b/source/c906_opt/fp32/mul.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" static void element_mul_f32(float *input0, float *input1, float *output, int size) { diff --git a/source/c906_opt/fp32/pad.c b/source/c906_opt/fp32/pad.c index 96ecca73..347e2018 100644 --- a/source/c906_opt/fp32/pad.c +++ b/source/c906_opt/fp32/pad.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" // constrain: only support pad on h and w dim // pad_mode: constant diff --git a/source/c906_opt/fp32/prelu.c b/source/c906_opt/fp32/prelu.c index e04519ba..0795a60e 100644 --- a/source/c906_opt/fp32/prelu.c +++ b/source/c906_opt/fp32/prelu.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" static int shl_c906_prelu_nhwc_f32(struct csinn_tensor *input, struct csinn_tensor *alpha, struct csinn_tensor *output, struct csinn_prelu_params *params) diff --git a/source/c906_opt/fp32/relu.c b/source/c906_opt/fp32/relu.c index d079a3ce..3efcf0df 100644 --- a/source/c906_opt/fp32/relu.c +++ b/source/c906_opt/fp32/relu.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" int shl_c906_relu_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_relu_params *params) diff --git a/source/c906_opt/fp32/relu1.c b/source/c906_opt/fp32/relu1.c index b9229d33..ba0132d7 100644 --- a/source/c906_opt/fp32/relu1.c +++ b/source/c906_opt/fp32/relu1.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_c906.h" +#include "c906/c906.h" int shl_c906_relu1_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_relu_params *params) diff --git a/source/c906_opt/fp32/relu6.c b/source/c906_opt/fp32/relu6.c index e9f482e0..758b1868 100644 --- a/source/c906_opt/fp32/relu6.c +++ b/source/c906_opt/fp32/relu6.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" int shl_c906_relu6_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_relu_params *params) diff --git a/source/c906_opt/fp32/split.c b/source/c906_opt/fp32/split.c index 429f28f4..9beb5272 100644 --- a/source/c906_opt/fp32/split.c +++ b/source/c906_opt/fp32/split.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" int shl_c906_split_f32(struct csinn_tensor *input, struct csinn_tensor **output, struct csinn_split_params *params) diff --git a/source/c906_opt/fp32/sub.c b/source/c906_opt/fp32/sub.c index 650cfd16..70fb5427 100644 --- a/source/c906_opt/fp32/sub.c +++ b/source/c906_opt/fp32/sub.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c906.h" +#include "c906/c906.h" static void element_sub_f32(float *input0, float *input1, float *output, int size) { diff --git a/source/c906_opt/hpm.c b/source/c906_opt/hpm.c index 6af31a88..06075e81 100644 --- a/source/c906_opt/hpm.c +++ b/source/c906_opt/hpm.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_c906.h" +#include "c906/c906.h" /* hpm: hardware performance monitor @@ -26,37 +26,28 @@ struct shl_c906_hpm shl_c906_get_hw_perf() { struct shl_c906_hpm tmp; asm volatile( - "csrr %0, instret\n\t" - "csrr %1, cycle\n\t" - "csrr %2, hpmcounter3\n\t" - "csrr %3, hpmcounter4\n\t" - "csrr %4, hpmcounter13\n\t" - "csrr %5, hpmcounter14\n\t" - "csrr %6, hpmcounter15\n\t" - "csrr %7, hpmcounter16\n\t" - "csrr %8, hpmcounter17\n\t" - - : "=r"(tmp.inst), - "=r"(tmp.cycle), - "=r"(tmp.l1_icache_access), - "=r"(tmp.l1_icache_miss), - "=r"(tmp.store_inst), - "=r"(tmp.l1_dcache_raccess), - "=r"(tmp.l1_dcache_rmiss), - "=r"(tmp.l1_dcache_waccess), - "=r"(tmp.l1_dcache_wmiss) - : - : "memory"); + "csrr %0, instret\n\t" + "csrr %1, cycle\n\t" + "csrr %2, hpmcounter3\n\t" + "csrr %3, hpmcounter4\n\t" + "csrr %4, hpmcounter13\n\t" + "csrr %5, hpmcounter14\n\t" + "csrr %6, hpmcounter15\n\t" + "csrr %7, hpmcounter16\n\t" + "csrr %8, hpmcounter17\n\t" + + : "=r"(tmp.inst), "=r"(tmp.cycle), "=r"(tmp.l1_icache_access), "=r"(tmp.l1_icache_miss), + "=r"(tmp.store_inst), "=r"(tmp.l1_dcache_raccess), "=r"(tmp.l1_dcache_rmiss), + "=r"(tmp.l1_dcache_waccess), "=r"(tmp.l1_dcache_wmiss) + : + : "memory"); return tmp; } uint64_t shl_c906_get_inst() { uint64_t inst = 0; - asm volatile("csrr %0, instret" - : "=r"(inst) - : - : "memory"); + asm volatile("csrr %0, instret" : "=r"(inst) : : "memory"); // asm volatile("csrr %[inst], minstret" // : [inst]"=r"(inst) // : @@ -67,14 +58,10 @@ uint64_t shl_c906_get_inst() uint64_t shl_c906_get_cycle() { uint64_t a = 0; - asm volatile("csrr %0, cycle" - : "=r"(a) - : - : "memory"); + asm volatile("csrr %0, cycle" : "=r"(a) : : "memory"); return a; } - /* index event counter 0x1 L1 ICache Access Counter mhpmcounter3 @@ -96,89 +83,62 @@ uint64_t shl_c906_get_cycle() uint64_t shl_c906_get_l1_icache_access() { uint64_t a = 0; - asm volatile("csrr %0, hpmcounter3" - : "=r"(a) - : - : "memory"); + asm volatile("csrr %0, hpmcounter3" : "=r"(a) : 
: "memory"); return a; } uint64_t shl_c906_get_l1_icache_miss() { uint64_t a = 0; - asm volatile("csrr %0, hpmcounter4" - : "=r"(a) - : - : "memory"); + asm volatile("csrr %0, hpmcounter4" : "=r"(a) : : "memory"); return a; } uint64_t shl_c906_get_cb_miss() { uint64_t a = 0; - asm volatile("csrr %0, hpmcounter8" - : "=r"(a) - : - : "memory"); + asm volatile("csrr %0, hpmcounter8" : "=r"(a) : : "memory"); return a; } uint64_t shl_c906_get_cb_inst() { uint64_t a = 0; - asm volatile("csrr %0, hpmcounter9" - : "=r"(a) - : - : "memory"); + asm volatile("csrr %0, hpmcounter9" : "=r"(a) : : "memory"); return a; } uint64_t shl_c906_get_store_inst() { uint64_t a = 0; - asm volatile("csrr %0, hpmcounter13" - : "=r"(a) - : - : "memory"); + asm volatile("csrr %0, hpmcounter13" : "=r"(a) : : "memory"); return a; } uint64_t shl_c906_get_l1_dcache_raccess() { uint64_t a = 0; - asm volatile("csrr %0, hpmcounter14" - : "=r"(a) - : - : "memory"); + asm volatile("csrr %0, hpmcounter14" : "=r"(a) : : "memory"); return a; } uint64_t shl_c906_get_l1_dcache_rmiss() { uint64_t a = 0; - asm volatile("csrr %0, hpmcounter15" - : "=r"(a) - : - : "memory"); + asm volatile("csrr %0, hpmcounter15" : "=r"(a) : : "memory"); return a; } uint64_t shl_c906_get_l1_dcache_waccess() { uint64_t a = 0; - asm volatile("csrr %0, hpmcounter16" - : "=r"(a) - : - : "memory"); + asm volatile("csrr %0, hpmcounter16" : "=r"(a) : : "memory"); return a; } uint64_t shl_c906_get_l1_dcache_wmiss() { uint64_t a = 0; - asm volatile("csrr %0, hpmcounter17" - : "=r"(a) - : - : "memory"); + asm volatile("csrr %0, hpmcounter17" : "=r"(a) : : "memory"); return a; } diff --git a/source/c906_opt/setup.c b/source/c906_opt/setup.c index d72a8141..a7d2e4c2 100644 --- a/source/c906_opt/setup.c +++ b/source/c906_opt/setup.c @@ -16,8 +16,8 @@ * limitations under the License. 
*/ -#include "shl_c906.h" -#include "shl_c906_cap.h" +#include "c906/c906.h" +#include "c906/cap.h" static struct shl_cb_op_list shl_c906_cb_op_list; @@ -152,12 +152,6 @@ void __attribute__((weak)) shl_target_init_c906() shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CONV2D, shl_c906_conv2d_init_fp32, NULL); shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_GROUP_CONV2D, shl_c906_conv2d_init_fp32, NULL); #endif -#ifndef CONFIG_C906_CONVOLUTION1D_FP16_DISABLED - shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CONV1D, shl_c906_conv1d_init_fp16, NULL); -#endif -#ifndef CONFIG_C906_CONVOLUTION1D_FP32_DISABLED - shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CONV1D, shl_c906_conv1d_init_fp32, NULL); -#endif #ifndef CONFIG_C906_MAXPOOL_FP16_DISABLED shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_MAXPOOL2D, shl_c906_maxpool2d_init_fp16, NULL); #endif diff --git a/source/c906_opt/utils.c b/source/c906_opt/utils.c index d041fcf0..5c952459 100644 --- a/source/c906_opt/utils.c +++ b/source/c906_opt/utils.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_c906.h" +#include "c906/c906.h" // constrains: The destination address and source address copy do not overlap // notice: riscv gnu compiler tool-chain c-library memcpy may not use vector inst @@ -24,25 +24,22 @@ void shl_c906_memcpy(void *dst, const void *src, size_t n) { asm volatile( - "1:\n\t" - "vsetvli t0, %3, e8, m4\n\t" - "vle.v v4, (%2)\n\t" - "add %2, %2, t0\n\t" - "sub %3, %3, t0\n\t" - "vse.v v4, (%0)\n\t" - "add %0, %0, t0\n\t" - "bnez %3, 1b\n\t" - - :"=r"(dst) // %0 - :"0"(dst), // %1 - "r"(src), // %2 - "r"(n) // %3 - : "t0", "v4", "v5", "v6", "v7" - ); + "1:\n\t" + "vsetvli t0, %3, e8, m4\n\t" + "vle.v v4, (%2)\n\t" + "add %2, %2, t0\n\t" + "sub %3, %3, t0\n\t" + "vse.v v4, (%0)\n\t" + "add %0, %0, t0\n\t" + "bnez %3, 1b\n\t" + + : "=r"(dst) // %0 + : "0"(dst), // %1 + "r"(src), // %2 + "r"(n) // %3 + : "t0", "v4", "v5", "v6", "v7"); } - - /* params: input: origin input data input_padded: input data after pad @@ -61,109 +58,99 @@ void shl_c906_pad_input(const float *input, float *input_padded, int inc, int in float *pad_ptr = input_padded; float *inp_ptr = (float *)input; - int resi_h = padded_h - pad_top - inh; // remain to pad on h (pad_down) - int resi_w = padded_w - pad_left - inw; // remain to pad on w (pad_right) + int resi_h = padded_h - pad_top - inh; // remain to pad on h (pad_down) + int resi_w = padded_w - pad_left - inw; // remain to pad on w (pad_right) #if __riscv_vector == 128 asm volatile( "vsetvli zero, zero, e32, m2\n\t" - "vmv.v.x v2, zero\n\t" // clear v2 - "mulw t1, %6, %7\n\t" // pad_top * padded_w - "mulw t2, %6, %9\n\t" // pad_down * padded_w - - "1:\n\t" // channel loop - "mv t5, %3\n\t" // t5 = in_h - "beqz %7, 3f\n\t" // if pad_top = 0 - "mv t3, t1\n\t" // t3 = num to memset 0 - - "2:\n\t" // pad h_top - "vsetvli t0, t3, e32, m2\n\t" - "vsw.v v2, (%1)\n\t" - "sub t3, t3, t0\n\t" - "slli t0, t0, 2\n\t" - "add %1, %1, t0\n\t" - "bnez t3, 2b\n\t" - - "3:\n\t" // pad h_mid - - "mv t4, %4\n\t" // t4 = in_w - 
"beqz %8, 5f\n\t" // if pad_left = 0 - "mv t3, %8\n\t" // t3 = pad_left - - "4:\n\t" // pad w_left - "vsetvli t0, t3, e32, m2\n\t" - "vsw.v v2, (%1)\n\t" - "sub t3, t3, t0\n\t" - "slli t0, t0, 2\n\t" - "add %1, %1, t0\n\t" - "bnez t3, 4b\n\t" - - "5:\n\t" // pad w_mid - "vsetvli t0, t4, e32, m2\n\t" - "vlw.v v4, (%0)\n\t" // load from input_data - "sub t4, t4, t0\n\t" - "slli t0, t0, 2\n\t" - "add %0, %0, t0\n\t" - "vsw.v v4, (%1)\n\t" // store to padded_buf - "add %1, %1, t0\n\t" - "bnez t4, 5b\n\t" - - "beqz %10, 7f\n\t" // if pad_right = 0 - "mv t3, %10\n\t" - - "6:\n\t" // pad w_right - "vsetvli t0, t3, e32, m2\n\t" - "vsw.v v2, (%1)\n\t" - "sub t3, t3, t0\n\t" - "slli t0, t0, 2\n\t" - "add %1, %1, t0\n\t" - "bnez t3, 6b\n\t" + "vmv.v.x v2, zero\n\t" // clear v2 + "mulw t1, %6, %7\n\t" // pad_top * padded_w + "mulw t2, %6, %9\n\t" // pad_down * padded_w + + "1:\n\t" // channel loop + "mv t5, %3\n\t" // t5 = in_h + "beqz %7, 3f\n\t" // if pad_top = 0 + "mv t3, t1\n\t" // t3 = num to memset 0 + + "2:\n\t" // pad h_top + "vsetvli t0, t3, e32, m2\n\t" + "vsw.v v2, (%1)\n\t" + "sub t3, t3, t0\n\t" + "slli t0, t0, 2\n\t" + "add %1, %1, t0\n\t" + "bnez t3, 2b\n\t" + + "3:\n\t" // pad h_mid + + "mv t4, %4\n\t" // t4 = in_w + "beqz %8, 5f\n\t" // if pad_left = 0 + "mv t3, %8\n\t" // t3 = pad_left + + "4:\n\t" // pad w_left + "vsetvli t0, t3, e32, m2\n\t" + "vsw.v v2, (%1)\n\t" + "sub t3, t3, t0\n\t" + "slli t0, t0, 2\n\t" + "add %1, %1, t0\n\t" + "bnez t3, 4b\n\t" + + "5:\n\t" // pad w_mid + "vsetvli t0, t4, e32, m2\n\t" + "vlw.v v4, (%0)\n\t" // load from input_data + "sub t4, t4, t0\n\t" + "slli t0, t0, 2\n\t" + "add %0, %0, t0\n\t" + "vsw.v v4, (%1)\n\t" // store to padded_buf + "add %1, %1, t0\n\t" + "bnez t4, 5b\n\t" + + "beqz %10, 7f\n\t" // if pad_right = 0 + "mv t3, %10\n\t" + + "6:\n\t" // pad w_right + "vsetvli t0, t3, e32, m2\n\t" + "vsw.v v2, (%1)\n\t" + "sub t3, t3, t0\n\t" + "slli t0, t0, 2\n\t" + "add %1, %1, t0\n\t" + "bnez t3, 6b\n\t" "7:\n\t" - "addi 
t5, t5, -1\n\t" - "bnez t5, 3b\n\t" + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" - "beqz %9, 9f\n\t" // if pad_down = 0 - "mv t3, t2\n\t" // t3 = num to memset 0 + "beqz %9, 9f\n\t" // if pad_down = 0 + "mv t3, t2\n\t" // t3 = num to memset 0 - "8:\n\t" // pad h_down - "vsetvli t0, t3, e32, m2\n\t" - "vsw.v v2, (%1)\n\t" - "sub t3, t3, t0\n\t" - "slli t0, t0, 2\n\t" - "add %1, %1, t0\n\t" - "bnez t3, 8b\n\t" + "8:\n\t" // pad h_down + "vsetvli t0, t3, e32, m2\n\t" + "vsw.v v2, (%1)\n\t" + "sub t3, t3, t0\n\t" + "slli t0, t0, 2\n\t" + "add %1, %1, t0\n\t" + "bnez t3, 8b\n\t" - "9:\n\t" + "9:\n\t" "addi %2, %2, -1\n\t" "bnez %2, 1b\n\t" - :"=r"(inp_ptr), // %0 - "=r"(pad_ptr), // %1 - "=r"(inc), // %2 - "=r"(inh), // %3 - "=r"(inw), // %4 - "=r"(padded_hw), // %5 - "=r"(padded_w), // %6 - "=r"(pad_top), // %7 - "=r"(pad_left), // %8 - "=r"(resi_h), // %9 - "=r"(resi_w) // %10 - :"0"(inp_ptr), - "1"(pad_ptr), - "2"(inc), - "3"(inh), - "4"(inw), - "5"(padded_hw), - "6"(padded_w), - "7"(pad_top), - "8"(pad_left), - "9"(resi_h), - "10"(resi_w) - :"cc", "memory", "v2", "v3", "v4", "v5", - "t0", "t1", "t2", "t3", "t4", "t5" + : "=r"(inp_ptr), // %0 + "=r"(pad_ptr), // %1 + "=r"(inc), // %2 + "=r"(inh), // %3 + "=r"(inw), // %4 + "=r"(padded_hw), // %5 + "=r"(padded_w), // %6 + "=r"(pad_top), // %7 + "=r"(pad_left), // %8 + "=r"(resi_h), // %9 + "=r"(resi_w) // %10 + : "0"(inp_ptr), "1"(pad_ptr), "2"(inc), "3"(inh), "4"(inw), "5"(padded_hw), "6"(padded_w), + "7"(pad_top), "8"(pad_left), "9"(resi_h), "10"(resi_w) + : "cc", "memory", "v2", "v3", "v4", "v5", "t0", "t1", "t2", "t3", "t4", "t5" ); #else @@ -197,112 +184,101 @@ void shl_c906_pad_input_fp16(const __fp16 *input, __fp16 *input_padded, int inc, __fp16 *pad_ptr = input_padded; __fp16 *inp_ptr = (__fp16 *)input; - int resi_h = padded_h - pad_top - inh; // remain to pad on h (pad_down) - int resi_w = padded_w - pad_left - inw; // remain to pad on w (pad_right) + int resi_h = padded_h - pad_top - inh; // remain to pad on 
h (pad_down) + int resi_w = padded_w - pad_left - inw; // remain to pad on w (pad_right) asm volatile( "vsetvli zero, zero, e16, m2\n\t" - "vmv.v.x v2, zero\n\t" // clear v2 - "mulw t1, %6, %7\n\t" // pad_top * padded_w - "mulw t2, %6, %9\n\t" // pad_down * padded_w - - "1:\n\t" // channel loop - "mv t5, %3\n\t" // t5 = in_h - "beqz %7, 3f\n\t" // if pad_top = 0 - "mv t3, t1\n\t" // t3 = num to memset 0 - - "2:\n\t" // pad h_top - "vsetvli t0, t3, e16, m2\n\t" - "vse.v v2, (%1)\n\t" - "sub t3, t3, t0\n\t" - "slli t0, t0, 1\n\t" - "add %1, %1, t0\n\t" - "bnez t3, 2b\n\t" - - "3:\n\t" // pad h_mid - - "mv t4, %4\n\t" // t4 = in_w - "beqz %8, 5f\n\t" // if pad_left = 0 - "mv t3, %8\n\t" // t3 = pad_left - - "4:\n\t" // pad w_left - "vsetvli t0, t3, e16, m2\n\t" - "vse.v v2, (%1)\n\t" - "sub t3, t3, t0\n\t" - "slli t0, t0, 1\n\t" - "add %1, %1, t0\n\t" - "bnez t3, 4b\n\t" - - "5:\n\t" // pad w_mid - "vsetvli t0, t4, e16, m2\n\t" - "vle.v v4, (%0)\n\t" // load from input_data - "sub t4, t4, t0\n\t" - "slli t0, t0, 1\n\t" - "add %0, %0, t0\n\t" - "vse.v v4, (%1)\n\t" // store to padded_buf - "add %1, %1, t0\n\t" - "bnez t4, 5b\n\t" - - "beqz %10, 7f\n\t" // if pad_right = 0 - "mv t3, %10\n\t" - - "6:\n\t" // pad w_right - "vsetvli t0, t3, e16, m2\n\t" - "vse.v v2, (%1)\n\t" - "sub t3, t3, t0\n\t" - "slli t0, t0, 1\n\t" - "add %1, %1, t0\n\t" - "bnez t3, 6b\n\t" + "vmv.v.x v2, zero\n\t" // clear v2 + "mulw t1, %6, %7\n\t" // pad_top * padded_w + "mulw t2, %6, %9\n\t" // pad_down * padded_w + + "1:\n\t" // channel loop + "mv t5, %3\n\t" // t5 = in_h + "beqz %7, 3f\n\t" // if pad_top = 0 + "mv t3, t1\n\t" // t3 = num to memset 0 + + "2:\n\t" // pad h_top + "vsetvli t0, t3, e16, m2\n\t" + "vse.v v2, (%1)\n\t" + "sub t3, t3, t0\n\t" + "slli t0, t0, 1\n\t" + "add %1, %1, t0\n\t" + "bnez t3, 2b\n\t" + + "3:\n\t" // pad h_mid + + "mv t4, %4\n\t" // t4 = in_w + "beqz %8, 5f\n\t" // if pad_left = 0 + "mv t3, %8\n\t" // t3 = pad_left + + "4:\n\t" // pad w_left + "vsetvli t0, t3, 
e16, m2\n\t" + "vse.v v2, (%1)\n\t" + "sub t3, t3, t0\n\t" + "slli t0, t0, 1\n\t" + "add %1, %1, t0\n\t" + "bnez t3, 4b\n\t" + + "5:\n\t" // pad w_mid + "vsetvli t0, t4, e16, m2\n\t" + "vle.v v4, (%0)\n\t" // load from input_data + "sub t4, t4, t0\n\t" + "slli t0, t0, 1\n\t" + "add %0, %0, t0\n\t" + "vse.v v4, (%1)\n\t" // store to padded_buf + "add %1, %1, t0\n\t" + "bnez t4, 5b\n\t" + + "beqz %10, 7f\n\t" // if pad_right = 0 + "mv t3, %10\n\t" + + "6:\n\t" // pad w_right + "vsetvli t0, t3, e16, m2\n\t" + "vse.v v2, (%1)\n\t" + "sub t3, t3, t0\n\t" + "slli t0, t0, 1\n\t" + "add %1, %1, t0\n\t" + "bnez t3, 6b\n\t" "7:\n\t" - "addi t5, t5, -1\n\t" - "bnez t5, 3b\n\t" + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" - "beqz %9, 9f\n\t" // if pad_down = 0 - "mv t3, t2\n\t" // t4 = num to memset 0 + "beqz %9, 9f\n\t" // if pad_down = 0 + "mv t3, t2\n\t" // t4 = num to memset 0 - "8:\n\t" // pad h_down - "vsetvli t0, t3, e16, m2\n\t" - "vse.v v2, (%1)\n\t" - "sub t3, t3, t0\n\t" - "slli t0, t0, 1\n\t" - "add %1, %1, t0\n\t" - "bnez t3, 8b\n\t" + "8:\n\t" // pad h_down + "vsetvli t0, t3, e16, m2\n\t" + "vse.v v2, (%1)\n\t" + "sub t3, t3, t0\n\t" + "slli t0, t0, 1\n\t" + "add %1, %1, t0\n\t" + "bnez t3, 8b\n\t" - "9:\n\t" + "9:\n\t" "addi %2, %2, -1\n\t" "bnez %2, 1b\n\t" - :"=r"(inp_ptr), // %0 - "=r"(pad_ptr), // %1 - "=r"(inc), // %2 - "=r"(inh), // %3 - "=r"(inw), // %4 - "=r"(padded_hw), // %5 - "=r"(padded_w), // %6 - "=r"(pad_top), // %7 - "=r"(pad_left), // %8 - "=r"(resi_h), // %9 - "=r"(resi_w) // %10 - :"0"(inp_ptr), - "1"(pad_ptr), - "2"(inc), - "3"(inh), - "4"(inw), - "5"(padded_hw), - "6"(padded_w), - "7"(pad_top), - "8"(pad_left), - "9"(resi_h), - "10"(resi_w) - :"cc", "memory", "v2", "v3", "v4", "v5", - "t0", "t1", "t2", "t3", "t4", "t5" + : "=r"(inp_ptr), // %0 + "=r"(pad_ptr), // %1 + "=r"(inc), // %2 + "=r"(inh), // %3 + "=r"(inw), // %4 + "=r"(padded_hw), // %5 + "=r"(padded_w), // %6 + "=r"(pad_top), // %7 + "=r"(pad_left), // %8 + "=r"(resi_h), // %9 + 
"=r"(resi_w) // %10 + : "0"(inp_ptr), "1"(pad_ptr), "2"(inc), "3"(inh), "4"(inw), "5"(padded_hw), "6"(padded_w), + "7"(pad_top), "8"(pad_left), "9"(resi_h), "10"(resi_w) + : "cc", "memory", "v2", "v3", "v4", "v5", "t0", "t1", "t2", "t3", "t4", "t5" ); } - /* params: output_trans: transflorm output after dot output: final output data @@ -318,11 +294,10 @@ void shl_c906_crop_output(float *output_trans, float *output, int out_c, int out int resi_h = wino_h - out_h; int resi_w = wino_w - out_w; float *out_ptr = output; - for(int c = 0; c < out_c; c++) { - + for (int c = 0; c < out_c; c++) { float *crop_ptr = output_trans + c * wino_h * wino_w; - for(int h = 0; h < out_h; h++) { + for (int h = 0; h < out_h; h++) { memcpy(out_ptr, crop_ptr, out_w * sizeof(float)); out_ptr += out_w; crop_ptr += wino_w; @@ -336,11 +311,10 @@ void shl_c906_crop_output_fp16(__fp16 *output_trans, __fp16 *output, int out_c, int resi_h = wino_h - out_h; int resi_w = wino_w - out_w; __fp16 *out_ptr = output; - for(int c = 0; c < out_c; c++) { - + for (int c = 0; c < out_c; c++) { __fp16 *crop_ptr = output_trans + c * wino_h * wino_w; - for(int h = 0; h < out_h; h++) { + for (int h = 0; h < out_h; h++) { memcpy(out_ptr, crop_ptr, out_w * sizeof(__fp16)); out_ptr += out_w; crop_ptr += wino_w; @@ -348,7 +322,6 @@ void shl_c906_crop_output_fp16(__fp16 *output_trans, __fp16 *output, int out_c, } } - /* fcsr: float control status register bit region: @@ -372,12 +345,10 @@ void shl_c906_reset_fcsr() { asm volatile("csrrw x0, fcsr, zero\n\t" : : : "memo int shl_c906_get_fcsr() { int f_flag = 0; - asm volatile( - "csrrs %0, fcsr, zero\n\t" + asm volatile("csrrs %0, fcsr, zero\n\t" - :"=r"(f_flag) - : - :"memory" - ); + : "=r"(f_flag) + : + : "memory"); return f_flag; } diff --git a/source/c908_opt/CMakeLists.txt b/source/c908_opt/CMakeLists.txt index fb8ceab4..c1d8193f 100644 --- a/source/c908_opt/CMakeLists.txt +++ b/source/c908_opt/CMakeLists.txt @@ -34,37 +34,37 @@ if(CONFIG_C908_CONVOLUTION_FP32) 
endif() if(CONFIG_C908_CONVOLUTION_FP16) -list(APPEND C908_SRCS source/c908_opt/fp16/convolution_1x1_fp16_pack1ton.c) -list(APPEND C908_SRCS source/c908_opt/fp16/convolution_1x1_fp16_packn.c) -list(APPEND C908_SRCS source/c908_opt/fp16/convolution_1x1_fp16_packnto1.c) -list(APPEND C908_SRCS source/c908_opt/fp16/convolution_1x1_fp16.c) -list(APPEND C908_SRCS source/c908_opt/fp16/convolution_3x3_fp16_packn_1.c) -list(APPEND C908_SRCS source/c908_opt/fp16/convolution_3x3_fp16_packn.c) -list(APPEND C908_SRCS source/c908_opt/fp16/convolution_3x3_fp16.c) -list(APPEND C908_SRCS source/c908_opt/fp16/convolution_gemm_fp16_pack1ton.c) -list(APPEND C908_SRCS source/c908_opt/fp16/convolution_gemm_fp16_packn.c) -list(APPEND C908_SRCS source/c908_opt/fp16/convolution_gemm_fp16_packnto1.c) -list(APPEND C908_SRCS source/c908_opt/fp16/convolution_gemm_fp16.c) -list(APPEND C908_SRCS source/c908_opt/fp16/convolution.c) + list(APPEND C908_SRCS source/c908_opt/fp16/convolution_1x1_fp16_pack1ton.c) + list(APPEND C908_SRCS source/c908_opt/fp16/convolution_1x1_fp16_packn.c) + list(APPEND C908_SRCS source/c908_opt/fp16/convolution_1x1_fp16_packnto1.c) + list(APPEND C908_SRCS source/c908_opt/fp16/convolution_1x1_fp16.c) + list(APPEND C908_SRCS source/c908_opt/fp16/convolution_3x3_fp16_packn_1.c) + list(APPEND C908_SRCS source/c908_opt/fp16/convolution_3x3_fp16_packn.c) + list(APPEND C908_SRCS source/c908_opt/fp16/convolution_3x3_fp16.c) + list(APPEND C908_SRCS source/c908_opt/fp16/convolution_gemm_fp16_pack1ton.c) + list(APPEND C908_SRCS source/c908_opt/fp16/convolution_gemm_fp16_packn.c) + list(APPEND C908_SRCS source/c908_opt/fp16/convolution_gemm_fp16_packnto1.c) + list(APPEND C908_SRCS source/c908_opt/fp16/convolution_gemm_fp16.c) + list(APPEND C908_SRCS source/c908_opt/fp16/convolution.c) endif() if(CONFIG_C908_CONVOLUTION_INT8) -list(APPEND C908_SRCS source/c908_opt/int8/convolution_1x1_int8_pack1ton.c) -list(APPEND C908_SRCS source/c908_opt/int8/convolution_1x1_int8_packn.c) 
-list(APPEND C908_SRCS source/c908_opt/int8/convolution_1x1_int8_packnto1.c) -list(APPEND C908_SRCS source/c908_opt/int8/convolution_1x1_int8.c) -list(APPEND C908_SRCS source/c908_opt/int8/convolution_3x3_int8_packn_1.c) -list(APPEND C908_SRCS source/c908_opt/int8/convolution_3x3_int8_packn.c) -list(APPEND C908_SRCS source/c908_opt/int8/convolution_3x3_int8.c) -list(APPEND C908_SRCS source/c908_opt/int8/convolution_gemm_int8_pack1ton.c) -list(APPEND C908_SRCS source/c908_opt/int8/convolution_gemm_int8_packn.c) -list(APPEND C908_SRCS source/c908_opt/int8/convolution_gemm_int8_packnto1.c) -list(APPEND C908_SRCS source/c908_opt/int8/convolution_gemm_int8.c) -list(APPEND C908_SRCS source/c908_opt/int8/convolution.c) + list(APPEND C908_SRCS source/c908_opt/int8/convolution_1x1_int8_pack1ton.c) + list(APPEND C908_SRCS source/c908_opt/int8/convolution_1x1_int8_packn.c) + list(APPEND C908_SRCS source/c908_opt/int8/convolution_1x1_int8_packnto1.c) + list(APPEND C908_SRCS source/c908_opt/int8/convolution_1x1_int8.c) + list(APPEND C908_SRCS source/c908_opt/int8/convolution_3x3_int8_packn_1.c) + list(APPEND C908_SRCS source/c908_opt/int8/convolution_3x3_int8_packn.c) + list(APPEND C908_SRCS source/c908_opt/int8/convolution_3x3_int8.c) + list(APPEND C908_SRCS source/c908_opt/int8/convolution_gemm_int8_pack1ton.c) + list(APPEND C908_SRCS source/c908_opt/int8/convolution_gemm_int8_packn.c) + list(APPEND C908_SRCS source/c908_opt/int8/convolution_gemm_int8_packnto1.c) + list(APPEND C908_SRCS source/c908_opt/int8/convolution_gemm_int8.c) + list(APPEND C908_SRCS source/c908_opt/int8/convolution.c) endif() if(CONFIG_C908_CONVOLUTION_INT4) -list(APPEND C908_SRCS source/c908_opt/int4/convolution.c) + list(APPEND C908_SRCS source/c908_opt/int4/convolution.c) endif() if(CONFIG_C908_DEPTHWISE_CONVOLUTION_FP32) diff --git a/source/c908_opt/fp16/avgpool.c b/source/c908_opt/fp16/avgpool.c index e8520166..aa1d4d58 100644 --- a/source/c908_opt/fp16/avgpool.c +++ 
b/source/c908_opt/fp16/avgpool.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c908.h" +#include "c908/c908.h" int shl_c908_avgpool2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_pool_params *params) @@ -47,6 +47,8 @@ int shl_c908_avgpool2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor if (shl_is_first_layer_input(input, sess)) { elempack = 1; } + } else if (sess->base_run_mode == CSINN_RM_LAYER) { + elempack = in_c % packn == 0 ? packn : 1; } // global avgpool2d diff --git a/source/c908_opt/fp16/convolution.c b/source/c908_opt/fp16/convolution.c index 60ced41c..8d4339f9 100644 --- a/source/c908_opt/fp16/convolution.c +++ b/source/c908_opt/fp16/convolution.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c908.h" +#include "c908/c908.h" int shl_c908_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, struct csinn_tensor *bias, @@ -30,8 +30,8 @@ int shl_c908_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *o int32_t kernel_w = kernel->dim[3]; int32_t stride_h = params->stride_height; int32_t stride_w = params->stride_width; - int32_t dalition_h = params->dilation_height; - int32_t dalition_w = params->dilation_width; + int32_t dilation_h = params->dilation_height; + int32_t dilation_w = params->dilation_width; struct csinn_callback *cb = params->base.cb; const int packn = csrr_vlenb() / sizeof(__fp16); @@ -48,20 +48,31 @@ int shl_c908_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *o if (shl_is_first_layer_input(input, sess)) { in_elempack = 1; } + } else if (sess->base_run_mode == CSINN_RM_LAYER) { + in_elempack = in_c % packn == 0 ? packn : 1; + out_elempack = out_c % packn == 0 ? 
packn : 1; } // packn if (in_elempack % packn == 0 && out_elempack % packn == 0) { - if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && - dalition_w == 1) { + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && + dilation_w == 1) { params->conv_extra.conv_mode = CSINN_GEMM; - shl_c908_conv1x1s1_gemm_reorder_kernel_packn_fp16(kernel, params); + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_fp16_w_int8(kernel, params); + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + shl_c908_conv1x1s1_gemm_reorder_kernel_packn_fp16(kernel, params); + } cb->exec = shl_c908_conv1x1s1_gemm_packn_fp16; } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && - dalition_h == 1 && dalition_w == 1) { - if (params->group > 1) { + dilation_h == 1 && dilation_w == 1) { + if (params->group > 1 || (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8)) { params->conv_extra.conv_mode = CSINN_GEMM; - shl_c908_conv_im2col_gemm_reorder_kernel_packn_fp16(kernel, params); + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp16_w_int8(kernel, params); + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + shl_c908_conv_im2col_gemm_reorder_kernel_packn_fp16(kernel, params); + } cb->exec = shl_c908_conv_im2col_gemm_packn_fp16; return CSINN_TRUE; } else { @@ -78,7 +89,11 @@ int shl_c908_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *o } } else { params->conv_extra.conv_mode = CSINN_GEMM; - shl_c908_conv_im2col_gemm_reorder_kernel_packn_fp16(kernel, params); + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp16_w_int8(kernel, params); + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + shl_c908_conv_im2col_gemm_reorder_kernel_packn_fp16(kernel, params); + } cb->exec = 
shl_c908_conv_im2col_gemm_packn_fp16; } } @@ -86,12 +101,20 @@ int shl_c908_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *o // pack1ton if (in_elempack % packn != 0 && out_elempack % packn == 0) { params->conv_extra.conv_mode = CSINN_GEMM; - if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && - dalition_w == 1) { - shl_c908_conv1x1s1_gemm_reorder_kernel_pack1ton_fp16(kernel, params); + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && + dilation_w == 1) { + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_fp16_w_int8(kernel, params); + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + shl_c908_conv1x1s1_gemm_reorder_kernel_pack1ton_fp16(kernel, params); + } cb->exec = shl_c908_conv1x1s1_gemm_pack1ton_fp16; } else { - shl_c908_conv_im2col_gemm_reorder_kernel_pack1ton_fp16(kernel, params); + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp16_w_int8(kernel, params); + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + shl_c908_conv_im2col_gemm_reorder_kernel_pack1ton_fp16(kernel, params); + } cb->exec = shl_c908_conv_im2col_gemm_pack1ton_fp16; } } @@ -99,12 +122,20 @@ int shl_c908_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *o // packnto1 if (in_elempack % packn == 0 && out_elempack % packn != 0) { params->conv_extra.conv_mode = CSINN_GEMM; - if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && - dalition_w == 1) { - shl_c908_conv1x1s1_gemm_reorder_kernel_packnto1_fp16(kernel, params); + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && + dilation_w == 1) { + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_fp16_w_int8(kernel, params); + } else if (kernel->dtype == 
CSINN_DTYPE_FLOAT16) { + shl_c908_conv1x1s1_gemm_reorder_kernel_packnto1_fp16(kernel, params); + } cb->exec = shl_c908_conv1x1s1_gemm_packnto1_fp16; } else { - shl_c908_conv_im2col_gemm_reorder_kernel_packnto1_fp16(kernel, params); + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp16_w_int8(kernel, params); + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + shl_c908_conv_im2col_gemm_reorder_kernel_packnto1_fp16(kernel, params); + } cb->exec = shl_c908_conv_im2col_gemm_packnto1_fp16; } } @@ -112,12 +143,20 @@ int shl_c908_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *o // pack1 if (in_elempack % packn != 0 && out_elempack % packn != 0) { params->conv_extra.conv_mode = CSINN_GEMM; - if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && - dalition_w == 1) { - shl_c908_conv1x1s1_gemm_reorder_kernel_fp16(kernel, params); + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && + dilation_w == 1) { + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_fp16_w_int8(kernel, params); + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + shl_c908_conv1x1s1_gemm_reorder_kernel_fp16(kernel, params); + } cb->exec = shl_c908_conv1x1s1_gemm_fp16; } else { - shl_c908_conv_im2col_gemm_reorder_kernel_fp16(kernel, params); + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_rvv_conv_im2col_gemm_reorder_kernel_fp16_w_int8(kernel, params); + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + shl_c908_conv_im2col_gemm_reorder_kernel_fp16(kernel, params); + } cb->exec = shl_c908_conv_im2col_gemm_fp16; } } diff --git a/source/c908_opt/fp16/convolution_1x1_fp16.c b/source/c908_opt/fp16/convolution_1x1_fp16.c index 0c80f2d3..8a613149 100644 --- a/source/c908_opt/fp16/convolution_1x1_fp16.c +++ b/source/c908_opt/fp16/convolution_1x1_fp16.c @@ -16,7 +16,7 @@ * 
limitations under the License. */ -#include "shl_c908.h" +#include "c908/c908.h" void shl_c908_conv1x1s1_gemm_reorder_kernel_fp16(struct csinn_tensor *kernel, struct csinn_conv2d_params *params) @@ -39,52 +39,14 @@ int shl_c908_conv1x1s1_gemm_fp16(struct csinn_tensor *input, struct csinn_tensor struct csinn_tensor *kernel, struct csinn_tensor *bias, struct csinn_conv2d_params *params) { - if (input->layout == CSINN_LAYOUT_NC1HWC0) { - shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp16(input); - } - __fp16 *input_data = (__fp16 *)input->data; - __fp16 *output_data = (__fp16 *)output->data; - __fp16 *kernel_data = (__fp16 *)kernel->data; - __fp16 *bias_data = (__fp16 *)bias->data; - - int32_t group = params->group; - int32_t batch = input->dim[0]; // assert(batch == 1); - int32_t in_ch = input->dim[1]; - int32_t out_ch = kernel->dim[0]; - int32_t out_h = output->dim[2]; - int32_t out_w = output->dim[3]; - - int32_t m = out_ch / group; - int32_t k = in_ch / group; - int32_t n = out_h * out_w; - - __fp16 *pb_reorder = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); - const int vlen = csrr_vlenb() * 8; - - for (int i = 0; i < batch; i++) { - for (int g = 0; g < group; g++) { - __fp16 *pa = kernel_data + g * m * k; - __fp16 *pb = pb_reorder; - __fp16 *pc = output_data; - if (vlen == 128) { - // pack - shl_c908_reorder_input_z24_fp16(input_data, pb, k, n, n); - // GEMM - shl_c908_gemm_8x24_fp16(pc, pa, pb, bias_data + g * m, m, k, n, n); - } else if (vlen >= 256) { - // pack - shl_c908_reorder_input_z32_fp16_v256(input_data, pb, k, n, n); - // GEMM - shl_c908_gemm_8x32_fp16_v256(pc, pa, pb, bias_data + g * m, m, k, n, n); - } - - input_data += k * n; - output_data += m * n; - } + if (vlen == 128) { + return shl_rvv_common_conv1x1_gemm_fp16(input, output, kernel, bias, params, + shl_c908_reorder_input_z24_fp16, + shl_c908_gemm_8x24_fp16); + } else if (vlen >= 256) { + return shl_rvv_common_conv1x1_gemm_fp16(input, output, kernel, bias, params, + 
shl_c908_reorder_input_z32_fp16_v256, + shl_c908_gemm_8x32_fp16_v256); } - shl_mem_free(pb_reorder); - // requantize - shl_rvv_sidcso_op_requantize_fp16(input, output, kernel); - return CSINN_TRUE; } diff --git a/source/c908_opt/fp16/convolution_1x1_fp16_pack1ton.c b/source/c908_opt/fp16/convolution_1x1_fp16_pack1ton.c index a29d15c3..84a1192d 100644 --- a/source/c908_opt/fp16/convolution_1x1_fp16_pack1ton.c +++ b/source/c908_opt/fp16/convolution_1x1_fp16_pack1ton.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c908.h" +#include "c908/c908.h" /************************************************************************************* * reorder kernel_data inplace, means the origin kernel_data be destoried. @@ -32,60 +32,7 @@ int shl_c908_conv1x1s1_gemm_pack1ton_fp16(struct csinn_tensor *input, struct csi struct csinn_tensor *kernel, struct csinn_tensor *bias, struct csinn_conv2d_params *params) { - if (input->layout == CSINN_LAYOUT_NC1HWC0) { - shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp16(input); - } - if (output->layout == CSINN_LAYOUT_NCHW) { - const int packn = csrr_vlenb() / sizeof(__fp16); - output->dim[1] /= packn; - output->dim[4] = packn; - output->dim_count = 5; - output->layout = CSINN_LAYOUT_NC1HWC0; - } - __fp16 *input_data = (__fp16 *)input->data; - __fp16 *output_data = (__fp16 *)output->data; - __fp16 *kernel_data = (__fp16 *)kernel->data; - __fp16 *bias_data = (__fp16 *)bias->data; - - int32_t group = params->group; - int32_t batch = input->dim[0]; - int32_t in_c = input->dim[1]; - int32_t in_h = input->dim[2]; - int32_t in_w = input->dim[3]; - int32_t out_c = kernel->dim[0]; - int32_t out_h = output->dim[2]; - int32_t out_w = output->dim[3]; - - int32_t m = out_c / group; - int32_t k = in_c / group; - int32_t n = out_h * out_w; - - __fp16 *pb_reorder = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); - __fp16 *input_ncxhwx = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); - - for (int i = 0; i < batch; i++) { - for (int g = 0; g 
< group; g++) { - __fp16 *kernel_ptr = kernel_data + g * m * k; - __fp16 *in_ptr = pb_reorder; - __fp16 *out_ptr = output_data; - __fp16 *bias_ptr = bias_data ? (bias_data + g * m) : NULL; - - shl_rvv_reorder_input_pack1ton_fp16(input_data, input_ncxhwx, k, out_h, out_w); - - // reorder(pack) - shl_rvv_reorder_input_z12_pack1ton_fp16(input_ncxhwx, in_ptr, k, 1, n, n); - - // gemm - shl_c908_ncxhwx_gemm_12xpack2n_fp16(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k, n, - false); - - input_data += k * n; - output_data += m * n; - } - } - shl_mem_free(pb_reorder); - shl_mem_free(input_ncxhwx); - // requantize - shl_rvv_siso_op_requantize_fp16(input, output); - return CSINN_TRUE; + return shl_rvv_common_conv1x1_gemm_pack1ton_fp16(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_pack1ton_fp16, + shl_c908_ncxhwx_gemm_12xpack2n_fp16); } diff --git a/source/c908_opt/fp16/convolution_1x1_fp16_packn.c b/source/c908_opt/fp16/convolution_1x1_fp16_packn.c index 92c44baf..4074dd63 100644 --- a/source/c908_opt/fp16/convolution_1x1_fp16_packn.c +++ b/source/c908_opt/fp16/convolution_1x1_fp16_packn.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_c908.h" +#include "c908/c908.h" void shl_c908_conv1x1s1_gemm_reorder_kernel_packn_fp16(struct csinn_tensor *kernel, struct csinn_conv2d_params *params) @@ -28,52 +28,7 @@ int shl_c908_conv1x1s1_gemm_packn_fp16(struct csinn_tensor *input, struct csinn_ struct csinn_tensor *kernel, struct csinn_tensor *bias, struct csinn_conv2d_params *params) { - if (input->layout == CSINN_LAYOUT_NCHW) { - shl_rvv_tensor_ndarray_to_nc1xc0_replace_fp16(input); - } - if (output->layout == CSINN_LAYOUT_NCHW) { - const int packn = csrr_vlenb() / sizeof(__fp16); - output->dim[1] /= packn; - output->dim[4] = packn; - output->dim_count = 5; - output->layout = CSINN_LAYOUT_NC1HWC0; - } - __fp16 *input_data = (__fp16 *)input->data; - __fp16 *output_data = (__fp16 *)output->data; - __fp16 *kernel_data = (__fp16 *)kernel->data; - __fp16 *bias_data = (__fp16 *)bias->data; - - int32_t group = params->group; - int32_t batch = input->dim[0]; // assert(batch == 1); - int32_t in_ch = input->dim[1] * input->dim[4]; - int32_t out_ch = kernel->dim[0]; - int32_t out_h = output->dim[2]; - int32_t out_w = output->dim[3]; - - int32_t m = out_ch / group; - int32_t k = in_ch / group; - int32_t n = out_h * out_w; - - __fp16 *pb_reorder = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); - - for (int i = 0; i < batch; i++) { - for (int g = 0; g < group; g++) { - __fp16 *kernel_ptr = kernel_data + g * m * k; - __fp16 *in_ptr = pb_reorder; - __fp16 *out_ptr = output_data; - __fp16 *bias_ptr = bias_data ? 
(bias_data + g * m) : NULL; - // pack - shl_rvv_reorder_input_z12_packn_fp16(input_data, in_ptr, k, n, n); - // GEMM - shl_c908_ncxhwx_gemm_12xpack2n_fp16(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k, n, - false); - - input_data += k * n; - output_data += m * n; - } - } - shl_mem_free(pb_reorder); - // requantize - shl_rvv_sidcso_op_requantize_fp16(input, output, kernel); - return CSINN_TRUE; + return shl_rvv_common_conv1x1_gemm_packn_fp16(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_packn_fp16, + shl_c908_ncxhwx_gemm_12xpack2n_fp16); } diff --git a/source/c908_opt/fp16/convolution_1x1_fp16_packnto1.c b/source/c908_opt/fp16/convolution_1x1_fp16_packnto1.c index 81d3f0fb..754cea1a 100644 --- a/source/c908_opt/fp16/convolution_1x1_fp16_packnto1.c +++ b/source/c908_opt/fp16/convolution_1x1_fp16_packnto1.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c908.h" +#include "c908/c908.h" void shl_c908_conv1x1s1_gemm_reorder_kernel_packnto1_fp16(struct csinn_tensor *kernel, struct csinn_conv2d_params *params) @@ -28,50 +28,7 @@ int shl_c908_conv1x1s1_gemm_packnto1_fp16(struct csinn_tensor *input, struct csi struct csinn_tensor *kernel, struct csinn_tensor *bias, struct csinn_conv2d_params *params) { - if (input->layout == CSINN_LAYOUT_NCHW) { - shl_rvv_tensor_ndarray_to_nc1xc0_replace_fp16(input); - } - __fp16 *input_data = (__fp16 *)input->data; - __fp16 *output_data = (__fp16 *)output->data; - __fp16 *kernel_data = (__fp16 *)kernel->data; - __fp16 *bias_data = (__fp16 *)bias->data; - - int32_t group = params->group; - int32_t batch = input->dim[0]; // assert(batch == 1); - int32_t in_ch = input->dim[1] * input->dim[4]; - int32_t out_ch = kernel->dim[0]; - int32_t out_h = output->dim[2]; - int32_t out_w = output->dim[3]; - - int32_t m = out_ch / group; - int32_t k = in_ch / group; - int32_t n = out_h * out_w; - - __fp16 *pb_reorder = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); - __fp16 *output_ncxhwx = (__fp16 *)shl_mem_alloc(m 
* n * sizeof(__fp16)); - - for (int i = 0; i < batch; i++) { - for (int g = 0; g < group; g++) { - __fp16 *kernel_ptr = kernel_data + g * m * k; - __fp16 *in_ptr = pb_reorder; - __fp16 *out_ptr = output_data; - __fp16 *bias_ptr = bias_data ? (bias_data + g * m) : NULL; - - // pack - shl_rvv_reorder_input_z12_packn_fp16(input_data, in_ptr, k, n, n); - // GEMM - shl_c908_ncxhwx_gemm_12xpack2n_fp16(output_ncxhwx, kernel_ptr, in_ptr, bias_ptr, m, k, - n, false); - - shl_rvv_reorder_input_packnto1_fp16(output_ncxhwx, output_data, m, out_h, out_w); - - input_data += k * n; - output_data += m * n; - } - } - shl_mem_free(pb_reorder); - shl_mem_free(output_ncxhwx); - // requantize - shl_rvv_sidcso_op_requantize_fp16(input, output, kernel); - return CSINN_TRUE; + return shl_rvv_common_conv1x1_gemm_packnto1_fp16(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_packn_fp16, + shl_c908_ncxhwx_gemm_12xpack2n_fp16); } diff --git a/source/c908_opt/fp16/convolution_3x3_fp16.c b/source/c908_opt/fp16/convolution_3x3_fp16.c index e8fd6aa2..8a09883d 100644 --- a/source/c908_opt/fp16/convolution_3x3_fp16.c +++ b/source/c908_opt/fp16/convolution_3x3_fp16.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_c908.h" +#include "c908/c908.h" /************************************************************* note: VLEN = 128 diff --git a/source/c908_opt/fp16/convolution_3x3_fp16_packn.c b/source/c908_opt/fp16/convolution_3x3_fp16_packn.c index 04f35a43..3029862c 100644 --- a/source/c908_opt/fp16/convolution_3x3_fp16_packn.c +++ b/source/c908_opt/fp16/convolution_3x3_fp16_packn.c @@ -17,7 +17,7 @@ */ #ifdef NNN -#include "shl_c908.h" +#include "c908/c908.h" /************************************************************* * note: support flexible vlen diff --git a/source/c908_opt/fp16/convolution_3x3_fp16_packn_1.c b/source/c908_opt/fp16/convolution_3x3_fp16_packn_1.c index 30aa5bdd..a5822315 100644 --- a/source/c908_opt/fp16/convolution_3x3_fp16_packn_1.c +++ b/source/c908_opt/fp16/convolution_3x3_fp16_packn_1.c @@ -17,7 +17,7 @@ */ // #ifdef NNN -#include "shl_c908.h" +#include "c908/c908.h" /************************************************************* * note: support flexible vlen diff --git a/source/c908_opt/fp16/convolution_gemm_fp16.c b/source/c908_opt/fp16/convolution_gemm_fp16.c index 1e4a942a..9ab6efa8 100644 --- a/source/c908_opt/fp16/convolution_gemm_fp16.c +++ b/source/c908_opt/fp16/convolution_gemm_fp16.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c908.h" +#include "c908/c908.h" /************************************************************************************* * reorder kernel_data inplace, means the origin kernel_data be destoried. 
@@ -43,94 +43,14 @@ int shl_c908_conv_im2col_gemm_fp16(struct csinn_tensor *input, struct csinn_tens struct csinn_tensor *kernel, struct csinn_tensor *bias, struct csinn_conv2d_params *params) { - if (input->layout == CSINN_LAYOUT_NC1HWC0) { - shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp16(input); - } - __fp16 *input_data = (__fp16 *)input->data; - __fp16 *output_data = (__fp16 *)output->data; - __fp16 *kernel_data = (__fp16 *)kernel->data; - __fp16 *bias_data = (__fp16 *)bias->data; - - int32_t group = params->group; - int32_t batch = input->dim[0]; - int32_t in_ch = input->dim[1]; - int32_t in_height = input->dim[2]; - int32_t in_width = input->dim[3]; - int32_t out_ch = kernel->dim[0]; - int32_t out_height = output->dim[2]; - int32_t out_width = output->dim[3]; - int32_t ksize_h = kernel->dim[2]; - int32_t ksize_w = kernel->dim[3]; - int32_t stride_h = params->stride_height; - int32_t stride_w = params->stride_width; - int32_t pad_left = params->pad_left; - int32_t pad_top = params->pad_top; - int32_t dilation_h = params->dilation_height; - int32_t dilation_w = params->dilation_width; - - int32_t m = out_ch / group; - int32_t k = in_ch / group * ksize_h * ksize_w; - int32_t n = out_height * out_width; - - __fp16 *im2col_data = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); - __fp16 *pb_reorder = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); - const int vlen = csrr_vlenb() * 8; - - for (int i = 0; i < batch; i++) { - for (int g = 0; g < group; g++) { - // im2col - __fp16 *data_col = im2col_data; - __fp16 *channel_data = input_data; - for (int c = 0; c < in_ch / group; c++) { - for (int kh = 0; kh < ksize_h; kh++) { - for (int kw = 0; kw < ksize_w; kw++) { - int in_row = -pad_top + kh * dilation_h; - for (int oh = 0; oh < out_height; oh++) { - if (in_row >= in_height || in_row < 0) { - for (int ow = 0; ow < out_width; ow++) { - *data_col++ = 0.0f; - } - } else { - int in_col = -pad_left + kw * dilation_w; - for (int ow1 = 0; ow1 < out_width; ow1++) { - int 
col_idx = (c * out_height + oh) * out_width + ow1; - if (in_col < in_width && in_col >= 0) { - *data_col++ = channel_data[in_row * in_width + in_col]; - } else { - *data_col++ = 0.0f; - } - in_col += stride_w; - } - } - in_row += stride_h; - } - } - } - channel_data += in_height * in_width; - } - - __fp16 *pa = kernel_data + g * m * k; - __fp16 *pb = pb_reorder; - __fp16 *pc = output_data; - if (vlen == 128) { - // pack - shl_c908_reorder_input_z24_fp16(im2col_data, pb, k, n, n); - // GEMM - shl_c908_gemm_8x24_fp16(pc, pa, pb, bias_data + g * m, m, k, n, n); - } else if (vlen >= 256) { - // pack - shl_c908_reorder_input_z32_fp16_v256(im2col_data, pb, k, n, n); - // GEMM - shl_c908_gemm_8x32_fp16_v256(pc, pa, pb, bias_data + g * m, m, k, n, n); - } - input_data += in_ch / group * in_height * in_width; - output_data += m * n; - } + if (vlen == 128) { + return shl_rvv_common_conv_gemm_fp16(input, output, kernel, bias, params, + shl_c908_reorder_input_z24_fp16, + shl_c908_gemm_8x24_fp16); + } else if (vlen >= 256) { + return shl_rvv_common_conv_gemm_fp16(input, output, kernel, bias, params, + shl_c908_reorder_input_z32_fp16_v256, + shl_c908_gemm_8x32_fp16_v256); } - shl_mem_free(pb_reorder); - shl_mem_free(im2col_data); - // requantize - shl_rvv_sidcso_op_requantize_fp16(input, output, kernel); - return CSINN_TRUE; } diff --git a/source/c908_opt/fp16/convolution_gemm_fp16_pack1ton.c b/source/c908_opt/fp16/convolution_gemm_fp16_pack1ton.c index cc673706..994d9ac1 100644 --- a/source/c908_opt/fp16/convolution_gemm_fp16_pack1ton.c +++ b/source/c908_opt/fp16/convolution_gemm_fp16_pack1ton.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c908.h" +#include "c908/c908.h" /************************************************************************************* * reorder kernel_data inplace, means the origin kernel_data be destoried. 
@@ -32,109 +32,7 @@ int shl_c908_conv_im2col_gemm_pack1ton_fp16(struct csinn_tensor *input, struct c struct csinn_tensor *kernel, struct csinn_tensor *bias, struct csinn_conv2d_params *params) { - if (input->layout == CSINN_LAYOUT_NC1HWC0) { - shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp16(input); - } - if (output->layout == CSINN_LAYOUT_NCHW) { - const int packn = csrr_vlenb() / sizeof(__fp16); - output->dim[1] /= packn; - output->dim[4] = packn; - output->dim_count = 5; - output->layout = CSINN_LAYOUT_NC1HWC0; - } - __fp16 *input_data = (__fp16 *)input->data; - __fp16 *output_data = (__fp16 *)output->data; - __fp16 *kernel_data = (__fp16 *)kernel->data; - __fp16 *bias_data = (__fp16 *)bias->data; - - int32_t group = params->group; - int32_t batch = input->dim[0]; - int32_t in_c = input->dim[1]; - int32_t in_h = input->dim[2]; - int32_t in_w = input->dim[3]; - int32_t out_c = kernel->dim[0]; - int32_t out_h = output->dim[2]; - int32_t out_w = output->dim[3]; - - int32_t ksize_h = kernel->dim[2]; - int32_t ksize_w = kernel->dim[3]; - int32_t stride_h = params->stride_height; - int32_t stride_w = params->stride_width; - int32_t dilation_h = params->dilation_height; - int32_t dilation_w = params->dilation_width; - - int32_t m = out_c / group; - int32_t in_cp = in_c / group; - int32_t maxk = ksize_h * ksize_w; - int32_t n = out_h * out_w; - - for (int i = 0; i < batch; i++) { - for (int g = 0; g < group; g++) { - // padding - int padded_in_h = in_h + params->pad_top + params->pad_down; - int padded_in_w = in_w + params->pad_left + params->pad_right; - int padded_in_hw = padded_in_h * padded_in_w; - __fp16 *input_pad_buf = (__fp16 *)shl_mem_alloc(in_cp * padded_in_hw * sizeof(__fp16)); - shl_rvv_pad_input_pack1ton_fp16(input_data, input_pad_buf, in_cp, in_h, in_w, - padded_in_h, padded_in_w, params->pad_top, - params->pad_left); - - // im2col - const int packn = csrr_vlenb() / sizeof(__fp16); - int vl = vsetvl_e16m1(packn); - - // [in_c/packn, maxk, out_h, out_w, packn] 
+ [maxk, out_h, out_w, in_c%packn] - __fp16 *im2col_buf = (__fp16 *)shl_mem_alloc(in_cp * maxk * n * sizeof(__fp16)); - const int tailstep = (padded_in_w * stride_h - out_w * stride_w); - - const __fp16 *img0 = input_pad_buf; - __fp16 *dst_ptr = im2col_buf; - - int loop_c = in_cp; - while (loop_c > 0) { - vl = vsetvl_e16m1(loop_c); - - for (int a = 0; a < ksize_h; a++) { - for (int b = 0; b < ksize_w; b++) { - const __fp16 *img1 = - img0 + a * dilation_h * padded_in_w * vl + b * dilation_w * vl; - - for (int p = 0; p < out_h; p++) { - for (int q = 0; q < out_w; q++) { - vfloat16m1_t _tmp = vle16_v_f16m1(img1, vl); - img1 += stride_w * vl; - vse16_v_f16m1(dst_ptr, _tmp, vl); - dst_ptr += vl; - } - img1 += tailstep * vl; - } - } - } - img0 += padded_in_hw * vl; - // dst_ptr += maxk * out_h * out_w * vl; - loop_c -= vl; - } - shl_mem_free(input_pad_buf); - - // reorder(pack) - __fp16 *reorder_buf = (__fp16 *)shl_mem_alloc(in_cp * maxk * n * sizeof(__fp16)); - shl_rvv_reorder_input_z12_pack1ton_fp16(im2col_buf, reorder_buf, in_cp, maxk, n, n); - shl_mem_free(im2col_buf); - - // gemm - __fp16 *ker_ptr = kernel_data + g * m * maxk * in_cp; - __fp16 *bias_ptr = bias_data ? 
(bias_data + g * m) : NULL; - // shl_rvv_ncxhwx_gemm_12xpack2n_fp16(output_data, ker_ptr, reorder_buf, bias_ptr, m, - // in_cp * maxk, n, n); - shl_c908_ncxhwx_gemm_12xpack2n_fp16(output_data, ker_ptr, reorder_buf, bias_ptr, m, - in_cp * maxk, n, false); - shl_mem_free(reorder_buf); - - input_data += in_cp * in_h * in_w; - output_data += m * n; - } - } - // requantize - shl_rvv_sidcso_op_requantize_fp16(input, output, kernel); - return CSINN_TRUE; + return shl_rvv_common_conv_gemm_pack1ton_fp16(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_pack1ton_fp16, + shl_c908_ncxhwx_gemm_12xpack2n_fp16); } diff --git a/source/c908_opt/fp16/convolution_gemm_fp16_packn.c b/source/c908_opt/fp16/convolution_gemm_fp16_packn.c index 32d81e5f..60c09c3f 100644 --- a/source/c908_opt/fp16/convolution_gemm_fp16_packn.c +++ b/source/c908_opt/fp16/convolution_gemm_fp16_packn.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c908.h" +#include "c908/c908.h" /************************************************************************************* * packn = vlenb / sizeof(__fp16) @@ -37,102 +37,7 @@ int shl_c908_conv_im2col_gemm_packn_fp16(struct csinn_tensor *input, struct csin struct csinn_tensor *kernel, struct csinn_tensor *bias, struct csinn_conv2d_params *params) { - if (input->layout == CSINN_LAYOUT_NCHW) { - shl_rvv_tensor_ndarray_to_nc1xc0_replace_fp16(input); - } - if (output->layout == CSINN_LAYOUT_NCHW) { - const int packn = csrr_vlenb() / sizeof(__fp16); - output->dim[1] /= packn; - output->dim[4] = packn; - output->dim_count = 5; - output->layout = CSINN_LAYOUT_NC1HWC0; - } - __fp16 *input_data = (__fp16 *)input->data; - __fp16 *output_data = (__fp16 *)output->data; - __fp16 *kernel_data = (__fp16 *)kernel->data; - __fp16 *bias_data = (__fp16 *)bias->data; - - int32_t group = params->group; - int32_t batch = input->dim[0]; - int32_t in_c = input->dim[1] * input->dim[4]; - int32_t in_h = input->dim[2]; - int32_t in_w = input->dim[3]; - int32_t 
out_c = kernel->dim[0]; - int32_t out_h = output->dim[2]; - int32_t out_w = output->dim[3]; - - int32_t ksize_h = kernel->dim[2]; - int32_t ksize_w = kernel->dim[3]; - int32_t stride_h = params->stride_height; - int32_t stride_w = params->stride_width; - int32_t dilation_h = params->dilation_height; - int32_t dilation_w = params->dilation_width; - - int32_t m = out_c / group; - int32_t in_cp = in_c / group; - int32_t maxk = ksize_h * ksize_w; - int32_t n = out_h * out_w; - - for (int i = 0; i < batch; i++) { - for (int g = 0; g < group; g++) { - // padding - int padded_in_h = in_h + params->pad_top + params->pad_down; - int padded_in_w = in_w + params->pad_left + params->pad_right; - int padded_in_hw = padded_in_w * padded_in_h; - __fp16 *input_pad_buf = (__fp16 *)shl_mem_alloc(in_cp * padded_in_hw * sizeof(__fp16)); - shl_rvv_pad_input_packn_fp16(input_data, input_pad_buf, in_cp, in_h, in_w, padded_in_h, - padded_in_w, params->pad_top, params->pad_left); - - // im2col - const int packn = csrr_vlenb() / sizeof(__fp16); - const int vl = vsetvl_e16m1(packn); - - __fp16 *im2col_buf = (__fp16 *)shl_mem_alloc(in_cp / packn * maxk * out_h * out_w * - packn * sizeof(__fp16)); - const int tailstep = (padded_in_w * stride_h - out_w * stride_w) * packn; - - for (int c = 0; c + packn - 1 < in_cp; c += packn) { - const __fp16 *img0 = input_pad_buf + c * padded_in_hw; - __fp16 *dst_ptr = im2col_buf + c * maxk * out_h * out_w; - - for (int a = 0; a < ksize_h; a++) { - for (int b = 0; b < ksize_w; b++) { - const __fp16 *img1 = - img0 + a * dilation_h * padded_in_w * packn + b * dilation_w * packn; - - for (int p = 0; p < out_h; p++) { - for (int q = 0; q < out_w; q++) { - vfloat16m1_t _tmp = vle16_v_f16m1(img1, vl); - img1 += stride_w * packn; - vse16_v_f16m1(dst_ptr, _tmp, vl); - dst_ptr += packn; - } - img1 += tailstep; - } - } - } - } - shl_mem_free(input_pad_buf); - - // reorder(pack) - __fp16 *reorder_buf = - (__fp16 *)shl_mem_alloc(in_cp * maxk * out_h * out_w * 
sizeof(__fp16)); - shl_rvv_reorder_input_z12_packn_fp16(im2col_buf, reorder_buf, in_cp * maxk, n, n); - shl_mem_free(im2col_buf); - - // gemm - __fp16 *ker_ptr = kernel_data + g * m * maxk * in_cp; - __fp16 *bias_ptr = bias_data ? (bias_data + g * m) : NULL; - shl_c908_ncxhwx_gemm_12xpack2n_fp16(output_data, ker_ptr, reorder_buf, bias_ptr, m, - in_cp * maxk, n, false); - - shl_mem_free(reorder_buf); - - input_data += in_cp * in_h * in_w; - output_data += m * n; - } - } - // requantize - shl_rvv_sidcso_op_requantize_fp16(input, output, kernel); - return CSINN_TRUE; + return shl_rvv_common_conv_gemm_packn_fp16(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_packn_fp16, + shl_c908_ncxhwx_gemm_12xpack2n_fp16); } diff --git a/source/c908_opt/fp16/convolution_gemm_fp16_packnto1.c b/source/c908_opt/fp16/convolution_gemm_fp16_packnto1.c index 1e48ea08..6283b47b 100644 --- a/source/c908_opt/fp16/convolution_gemm_fp16_packnto1.c +++ b/source/c908_opt/fp16/convolution_gemm_fp16_packnto1.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c908.h" +#include "c908/c908.h" /************************************************************************************* * reorder kernel_data inplace, means the origin kernel_data be destoried. 
@@ -32,96 +32,7 @@ int shl_c908_conv_im2col_gemm_packnto1_fp16(struct csinn_tensor *input, struct c struct csinn_tensor *kernel, struct csinn_tensor *bias, struct csinn_conv2d_params *params) { - if (input->layout == CSINN_LAYOUT_NCHW) { - shl_rvv_tensor_ndarray_to_nc1xc0_replace_fp16(input); - } - __fp16 *input_data = (__fp16 *)input->data; - __fp16 *output_data = (__fp16 *)output->data; - __fp16 *kernel_data = (__fp16 *)kernel->data; - __fp16 *bias_data = (__fp16 *)bias->data; - - int32_t group = params->group; - int32_t batch = input->dim[0]; - int32_t in_c = input->dim[1] * input->dim[4]; - int32_t in_h = input->dim[2]; - int32_t in_w = input->dim[3]; - int32_t out_c = kernel->dim[0]; - int32_t out_h = output->dim[2]; - int32_t out_w = output->dim[3]; - int32_t ksize_h = kernel->dim[2]; - int32_t ksize_w = kernel->dim[3]; - int32_t stride_h = params->stride_height; - int32_t stride_w = params->stride_width; - int32_t dilation_h = params->dilation_height; - int32_t dilation_w = params->dilation_width; - - int32_t m = out_c / group; - int32_t in_cp = in_c / group; - int32_t maxk = ksize_h * ksize_w; - int32_t n = out_h * out_w; - - __fp16 *output_ncxhwx = (__fp16 *)shl_mem_alloc(m * n * sizeof(__fp16)); - - for (int i = 0; i < batch; i++) { - for (int g = 0; g < group; g++) { - // padding - int padded_in_h = in_h + params->pad_top + params->pad_down; - int padded_in_w = in_w + params->pad_left + params->pad_right; - int padded_in_hw = padded_in_h * padded_in_w; - __fp16 *input_pad_buf = (__fp16 *)shl_mem_alloc(in_cp * padded_in_hw * sizeof(__fp16)); - shl_rvv_pad_input_packn_fp16(input_data, input_pad_buf, in_cp, in_h, in_w, padded_in_h, - padded_in_w, params->pad_top, params->pad_left); - - // im2col - const int packn = csrr_vlenb() / sizeof(__fp16); - const int vl = vsetvl_e16m1(packn); - - __fp16 *im2col_buf = (__fp16 *)shl_mem_alloc(in_cp / packn * maxk * out_h * out_w * - packn * sizeof(__fp16)); - const int tailstep = (padded_in_w * stride_h - out_w * 
stride_w) * packn; - - for (int c = 0; c + packn - 1 < in_cp; c += packn) { - const __fp16 *img0 = input_pad_buf + c * padded_in_hw; - __fp16 *dst_ptr = im2col_buf + c * maxk * out_h * out_w; - - for (int a = 0; a < ksize_h; a++) { - for (int b = 0; b < ksize_w; b++) { - const __fp16 *img1 = - img0 + a * dilation_h * padded_in_w * packn + b * dilation_w * packn; - - for (int p = 0; p < out_h; p++) { - for (int q = 0; q < out_w; q++) { - vfloat16m1_t _tmp = vle16_v_f16m1(img1, vl); - img1 += stride_w * packn; - vse16_v_f16m1(dst_ptr, _tmp, vl); - dst_ptr += packn; - } - img1 += tailstep; - } - } - } - } - shl_mem_free(input_pad_buf); - - // reorder(pack) - __fp16 *reorder_buf = (__fp16 *)shl_mem_alloc(in_cp * maxk * n * sizeof(__fp16)); - shl_rvv_reorder_input_z12_packn_fp16(im2col_buf, reorder_buf, in_cp * maxk, n, n); - shl_mem_free(im2col_buf); - - // gemm - __fp16 *ker_ptr = kernel_data + g * m * maxk * in_cp; - __fp16 *bias_ptr = bias_data ? (bias_data + g * m) : NULL; - shl_c908_ncxhwx_gemm_12xpack2n_fp16(output_ncxhwx, ker_ptr, reorder_buf, bias_ptr, m, - in_cp * maxk, n, false); - shl_rvv_reorder_input_packnto1_fp16(output_ncxhwx, output_data, m, out_h, out_w); - shl_mem_free(reorder_buf); - - input_data += in_cp * in_h * in_w; - output_data += m * n; - } - } - shl_mem_free(output_ncxhwx); - // requantize - shl_rvv_sidcso_op_requantize_fp16(input, output, kernel); - return CSINN_TRUE; + return shl_rvv_common_conv_gemm_packnto1_fp16(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_packn_fp16, + shl_c908_ncxhwx_gemm_12xpack2n_fp16); } diff --git a/source/c908_opt/fp16/depthwise_convolution.c b/source/c908_opt/fp16/depthwise_convolution.c index ea3c7d81..ac6d87d8 100644 --- a/source/c908_opt/fp16/depthwise_convolution.c +++ b/source/c908_opt/fp16/depthwise_convolution.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_c908.h" +#include "c908/c908.h" int shl_c908_depthwise_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, struct csinn_tensor *bias, @@ -46,6 +46,9 @@ int shl_c908_depthwise_conv2d_init_fp16(struct csinn_tensor *input, struct csinn in_elempack = 1; out_elempack = 1; // dwconv2d out_channel pack is same as in_channel } + } else if (sess->base_run_mode == CSINN_RM_LAYER) { + in_elempack = in_c % packn == 0 ? packn : 1; + out_elempack = out_c % packn == 0 ? packn : 1; } if (in_elempack % packn == 0 && out_elempack % packn == 0) { diff --git a/source/c908_opt/fp16/fullyconnected.c b/source/c908_opt/fp16/fullyconnected.c index 437aa91d..fe4b4a23 100644 --- a/source/c908_opt/fp16/fullyconnected.c +++ b/source/c908_opt/fp16/fullyconnected.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c908.h" +#include "c908/c908.h" int shl_c908_fullyconnected_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *weights, struct csinn_tensor *bias, diff --git a/source/c908_opt/fp16/gemm_fp16.c b/source/c908_opt/fp16/gemm_fp16.c index edb6820a..db4a8fd1 100644 --- a/source/c908_opt/fp16/gemm_fp16.c +++ b/source/c908_opt/fp16/gemm_fp16.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_c908.h" +#include "c908/c908.h" /************************************************************* * note: VLEN = 128 diff --git a/source/c908_opt/fp16/gemm_fp16_ncxhwx.S b/source/c908_opt/fp16/gemm_fp16_ncxhwx.S index 412354f2..4cbc4ca8 100644 --- a/source/c908_opt/fp16/gemm_fp16_ncxhwx.S +++ b/source/c908_opt/fp16/gemm_fp16_ncxhwx.S @@ -132,6 +132,8 @@ pack2nx12_start: srai t2, a5, 1 // k2 beqz t2, pack2nx12_k1 + addi t2, t2, -1 // k2_end + beqz t2, pack2nx12_k2_end pack2nx12_k2: vle16.v v5, (t3) @@ -222,10 +224,98 @@ pack2nx12_k2: addi t2, t2, -1 bnez t2, pack2nx12_k2 -pack2nx12_k1: +pack2nx12_k2_end: + vle16.v v5, (t3) + add t3, t3, t0 // +packn + vle16.v v6, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + flh fa0, 12(a2) + vfmacc.vf v9, ft1, v3 + vfmacc.vf v21, ft1, v4 + flh fa1, 14(a2) + vfmacc.vf v10, ft2, v3 + vfmacc.vf v22, ft2, v4 + flh fa2, 16(a2) + vfmacc.vf v11, ft3, v3 + vfmacc.vf v23, ft3, v4 + flh fa3, 18(a2) + vfmacc.vf v12, ft4, v3 + vfmacc.vf v24, ft4, v4 + flh fa4, 20(a2) + vfmacc.vf v13, ft5, v3 + vfmacc.vf v25, ft5, v4 + flh fa5, 22(a2) + vfmacc.vf v14, fa0, v3 + vfmacc.vf v26, fa0, v4 + flh ft0, 24(a2) + vfmacc.vf v15, fa1, v3 + vfmacc.vf v27, fa1, v4 + flh ft1, 26(a2) + vfmacc.vf v16, fa2, v3 + vfmacc.vf v28, fa2, v4 + flh ft2, 28(a2) + vfmacc.vf v17, fa3, v3 + vfmacc.vf v29, fa3, v4 + flh ft3, 30(a2) + vfmacc.vf v18, fa4, v3 + vfmacc.vf v30, fa4, v4 + flh ft4, 32(a2) + vfmacc.vf v19, fa5, v3 + vfmacc.vf v31, fa5, v4 + flh ft5, 34(a2) + + vfmacc.vf v8, ft0, v5 + vfmacc.vf v20, ft0, v6 + flh fa0, 36(a2) + vfmacc.vf v9, ft1, v5 + vfmacc.vf v21, ft1, v6 + flh fa1, 38(a2) + vfmacc.vf v10, ft2, v5 + vfmacc.vf v22, ft2, v6 + flh fa2, 40(a2) + vfmacc.vf v11, ft3, v5 + vfmacc.vf v23, ft3, v6 + flh fa3, 42(a2) + vfmacc.vf v12, ft4, v5 + vfmacc.vf v24, ft4, v6 + flh fa4, 44(a2) + vfmacc.vf v13, ft5, v5 + vfmacc.vf v25, ft5, v6 + flh fa5, 46(a2) + addi a2, a2, 48 + vfmacc.vf v14, fa0, v5 + vfmacc.vf 
v26, fa0, v6 + vfmacc.vf v15, fa1, v5 + vfmacc.vf v27, fa1, v6 + vfmacc.vf v16, fa2, v5 + vfmacc.vf v28, fa2, v6 + vfmacc.vf v17, fa3, v5 + vfmacc.vf v29, fa3, v6 + vfmacc.vf v18, fa4, v5 + vfmacc.vf v30, fa4, v6 + vfmacc.vf v19, fa5, v5 + vfmacc.vf v31, fa5, v6 + andi t2, a5, 1 // k1 beqz t2, pack2nx12_relu + // pre-load kernel_data + vle16.v v3, (t3) + add t3, t3, t0 // +packn + vle16.v v4, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flh ft0, 0(a2) + flh ft1, 2(a2) + flh ft2, 4(a2) + flh ft3, 6(a2) + flh ft4, 8(a2) + flh ft5, 10(a2) + +pack2nx12_k1: vfmacc.vf v8, ft0, v3 vfmacc.vf v20, ft0, v4 flh fa0, 12(a2) @@ -377,6 +467,8 @@ pack2nx8_start: srai t2, a5, 1 // k2 beqz t2, pack2nx8_k1 + addi t2, t2, -1 // k2_end + beqz t2, pack2nx8_k2_end pack2nx8_k2: vle16.v v5, (t3) @@ -443,10 +535,74 @@ pack2nx8_k2: addi t2, t2, -1 bnez t2, pack2nx8_k2 -pack2nx8_k1: +pack2nx8_k2_end: + vle16.v v5, (t3) + add t3, t3, t0 // +packn + vle16.v v6, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + flh fa0, 8(a2) + vfmacc.vf v9, ft1, v3 + vfmacc.vf v21, ft1, v4 + flh fa1, 10(a2) + vfmacc.vf v10, ft2, v3 + vfmacc.vf v22, ft2, v4 + flh fa2, 12(a2) + vfmacc.vf v11, ft3, v3 + vfmacc.vf v23, ft3, v4 + flh fa3, 14(a2) + vfmacc.vf v12, fa0, v3 + vfmacc.vf v24, fa0, v4 + flh ft0, 16(a2) + vfmacc.vf v13, fa1, v3 + vfmacc.vf v25, fa1, v4 + flh ft1, 18(a2) + vfmacc.vf v14, fa2, v3 + vfmacc.vf v26, fa2, v4 + flh ft2, 20(a2) + vfmacc.vf v15, fa3, v3 + vfmacc.vf v27, fa3, v4 + flh ft3, 22(a2) + + vfmacc.vf v8, ft0, v5 + vfmacc.vf v20, ft0, v6 + flh fa0, 24(a2) + vfmacc.vf v9, ft1, v5 + vfmacc.vf v21, ft1, v6 + flh fa1, 26(a2) + vfmacc.vf v10, ft2, v5 + vfmacc.vf v22, ft2, v6 + flh fa2, 28(a2) + vfmacc.vf v11, ft3, v5 + vfmacc.vf v23, ft3, v6 + flh fa3, 30(a2) + addi a2, a2, 32 + vfmacc.vf v12, fa0, v5 + vfmacc.vf v24, fa0, v6 + vfmacc.vf v13, fa1, v5 + vfmacc.vf v25, fa1, v6 + vfmacc.vf v14, fa2, v5 + vfmacc.vf v26, fa2, v6 + vfmacc.vf v15, fa3, 
v5 + vfmacc.vf v27, fa3, v6 + andi t2, a5, 1 // k1 beqz t2, pack2nx8_relu + // pre-load kernel_data + vle16.v v3, (t3) + add t3, t3, t0 // +packn + vle16.v v4, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flh ft0, 0(a2) + flh ft1, 2(a2) + flh ft2, 4(a2) + flh ft3, 6(a2) + +pack2nx8_k1: vfmacc.vf v8, ft0, v3 vfmacc.vf v20, ft0, v4 flh fa0, 8(a2) @@ -551,6 +707,8 @@ pack2nx4_start: srai t2, a5, 1 // k2 beqz t2, pack2nx4_k1 + addi t2, t2, -1 // k2_end + beqz t2, pack2nx4_k2_end pack2nx4_k2: vle16.v v5, (t3) @@ -593,10 +751,55 @@ pack2nx4_k2: addi t2, t2, -1 bnez t2, pack2nx4_k2 -pack2nx4_k1: +pack2nx4_k2_end: + vle16.v v5, (t3) + add t3, t3, t0 // +packn + vle16.v v6, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + flh fa0, 8(a2) + vfmacc.vf v9, ft1, v3 + vfmacc.vf v21, ft1, v4 + flh fa1, 10(a2) + vfmacc.vf v10, ft2, v3 + vfmacc.vf v22, ft2, v4 + flh fa2, 12(a2) + vfmacc.vf v11, ft3, v3 + vfmacc.vf v23, ft3, v4 + flh fa3, 14(a2) + addi a2, a2, 16 + + vle16.v v3, (t3) + add t3, t3, t0 // +packn + vle16.v v4, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, fa0, v5 + vfmacc.vf v20, fa0, v6 + vfmacc.vf v9, fa1, v5 + vfmacc.vf v21, fa1, v6 + vfmacc.vf v10, fa2, v5 + vfmacc.vf v22, fa2, v6 + vfmacc.vf v11, fa3, v5 + vfmacc.vf v23, fa3, v6 + andi t2, a5, 1 // k1 beqz t2, pack2nx4_relu + // pre-load kernel_data + vle16.v v3, (t3) + add t3, t3, t0 // +packn + vle16.v v4, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flh ft0, 0(a2) + flh ft1, 2(a2) + flh ft2, 4(a2) + flh ft3, 6(a2) + +pack2nx4_k1: vfmacc.vf v8, ft0, v3 vfmacc.vf v20, ft0, v4 vfmacc.vf v9, ft1, v3 @@ -660,6 +863,8 @@ pack2nx2_start: srai t2, a5, 1 // k2 beqz t2, pack2nx2_k1 + addi t2, t2, -1 // k2_end + beqz t2, pack2nx2_k2_end pack2nx2_k2: vle16.v v5, (t3) @@ -690,10 +895,38 @@ pack2nx2_k2: addi t2, t2, -1 bnez t2, pack2nx2_k2 -pack2nx2_k1: +pack2nx2_k2_end: + vle16.v v5, (t3) + add t3, t3, t0 // +packn + vle16.v v6, (t3) + add t3, t3, t0 // 
+packn + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + flh fa0, 4(a2) + vfmacc.vf v9, ft1, v3 + vfmacc.vf v21, ft1, v4 + flh fa1, 6(a2) + addi a2, a2, 8 + + vfmacc.vf v8, fa0, v5 + vfmacc.vf v20, fa0, v6 + vfmacc.vf v9, fa1, v5 + vfmacc.vf v21, fa1, v6 + andi t2, a5, 1 // k1 beqz t2, pack2nx2_relu + // pre-load kernel_data + vle16.v v3, (t3) + add t3, t3, t0 // +packn + vle16.v v4, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flh ft0, 0(a2) + flh ft1, 2(a2) + +pack2nx2_k1: vfmacc.vf v8, ft0, v3 vfmacc.vf v20, ft0, v4 vfmacc.vf v9, ft1, v3 @@ -736,6 +969,8 @@ pack2nx1_start: srai t2, a5, 1 // k2 beqz t2, pack2nx1_k1 + addi t2, t2, -1 // k2_end + beqz t2, pack2nx1_k2_end pack2nx1_k2: vle16.v v5, (t3) @@ -760,10 +995,32 @@ pack2nx1_k2: addi t2, t2, -1 bnez t2, pack2nx1_k2 -pack2nx1_k1: +pack2nx1_k2_end: + vle16.v v5, (t3) + add t3, t3, t0 // +packn + vle16.v v6, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + flh fa0, 2(a2) + addi a2, a2, 4 + + vfmacc.vf v8, fa0, v5 + vfmacc.vf v20, fa0, v6 + andi t2, a5, 1 // k1 beqz t2, pack2nx1_relu + // pre-load kernel_data + vle16.v v3, (t3) + add t3, t3, t0 // +packn + vle16.v v4, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flh ft0, 0(a2) + +pack2nx1_k1: vfmacc.vf v8, ft0, v3 vfmacc.vf v20, ft0, v4 addi a2, a2, 2 @@ -870,6 +1127,8 @@ packnx12_start: srai t2, a5, 1 // k2 beqz t2, packnx12_k1 + addi t2, t2, -1 // k2_end + beqz t2, packnx12_k2_end packnx12_k2: vle16.v v5, (t3) @@ -932,10 +1191,70 @@ packnx12_k2: addi t2, t2, -1 bnez t2, packnx12_k2 -packnx12_k1: +packnx12_k2_end: + vle16.v v5, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + flh fa0, 12(a2) + vfmacc.vf v9, ft1, v3 + flh fa1, 14(a2) + vfmacc.vf v10, ft2, v3 + flh fa2, 16(a2) + vfmacc.vf v11, ft3, v3 + flh fa3, 18(a2) + vfmacc.vf v12, ft4, v3 + flh fa4, 20(a2) + vfmacc.vf v13, ft5, v3 + flh fa5, 22(a2) + vfmacc.vf v14, fa0, v3 + flh ft0, 24(a2) + vfmacc.vf v15, fa1, v3 + flh ft1, 26(a2) 
+ vfmacc.vf v16, fa2, v3 + flh ft2, 28(a2) + vfmacc.vf v17, fa3, v3 + flh ft3, 30(a2) + vfmacc.vf v18, fa4, v3 + flh ft4, 32(a2) + vfmacc.vf v19, fa5, v3 + flh ft5, 34(a2) + + vfmacc.vf v8, ft0, v5 + flh fa0, 36(a2) + vfmacc.vf v9, ft1, v5 + flh fa1, 38(a2) + vfmacc.vf v10, ft2, v5 + flh fa2, 40(a2) + vfmacc.vf v11, ft3, v5 + flh fa3, 42(a2) + vfmacc.vf v12, ft4, v5 + flh fa4, 44(a2) + vfmacc.vf v13, ft5, v5 + flh fa5, 46(a2) + addi a2, a2, 48 + vfmacc.vf v14, fa0, v5 + vfmacc.vf v15, fa1, v5 + vfmacc.vf v16, fa2, v5 + vfmacc.vf v17, fa3, v5 + vfmacc.vf v18, fa4, v5 + vfmacc.vf v19, fa5, v5 + andi t2, a5, 1 // k1 beqz t2, packnx12_relu + // pre-load kernel_data + vle16.v v3, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flh ft0, 0(a2) + flh ft1, 2(a2) + flh ft2, 4(a2) + flh ft3, 6(a2) + flh ft4, 8(a2) + flh ft5, 10(a2) + +packnx12_k1: vfmacc.vf v8, ft0, v3 flh fa0, 12(a2) vfmacc.vf v9, ft1, v3 @@ -1026,6 +1345,8 @@ packnx8_start: srai t2, a5, 1 // k2 beqz t2, packnx8_k1 + addi t2, t2, -1 // k2_end + beqz t2, packnx8_k2_end packnx8_k2: vle16.v v5, (t3) @@ -1072,10 +1393,54 @@ packnx8_k2: addi t2, t2, -1 bnez t2, packnx8_k2 -packnx8_k1: +packnx8_k2_end: + vle16.v v5, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + flh fa0, 8(a2) + vfmacc.vf v9, ft1, v3 + flh fa1, 10(a2) + vfmacc.vf v10, ft2, v3 + flh fa2, 12(a2) + vfmacc.vf v11, ft3, v3 + flh fa3, 14(a2) + vfmacc.vf v12, fa0, v3 + flh ft0, 16(a2) + vfmacc.vf v13, fa1, v3 + flh ft1, 18(a2) + vfmacc.vf v14, fa2, v3 + flh ft2, 20(a2) + vfmacc.vf v15, fa3, v3 + flh ft3, 22(a2) + + vfmacc.vf v8, ft0, v5 + flh fa0, 24(a2) + vfmacc.vf v9, ft1, v5 + flh fa1, 26(a2) + vfmacc.vf v10, ft2, v5 + flh fa2, 28(a2) + vfmacc.vf v11, ft3, v5 + flh fa3, 30(a2) + addi a2, a2, 32 + vfmacc.vf v12, fa0, v5 + vfmacc.vf v13, fa1, v5 + vfmacc.vf v14, fa2, v5 + vfmacc.vf v15, fa3, v5 + andi t2, a5, 1 // k1 beqz t2, packnx8_relu + // pre-load kernel_data + vle16.v v3, (t3) + add t3, t3, t0 // +packn + // pre-load 
input_data + flh ft0, 0(a2) + flh ft1, 2(a2) + flh ft2, 4(a2) + flh ft3, 6(a2) + +packnx8_k1: vfmacc.vf v8, ft0, v3 flh fa0, 8(a2) vfmacc.vf v9, ft1, v3 @@ -1141,6 +1506,8 @@ packnx4_start: srai t2, a5, 1 // k2 beqz t2, packnx4_k1 + addi t2, t2, -1 // k2_end + beqz t2, packnx4_k2_end packnx4_k2: vle16.v v5, (t3) @@ -1171,10 +1538,38 @@ packnx4_k2: addi t2, t2, -1 bnez t2, packnx4_k2 -packnx4_k1: +packnx4_k2_end: + vle16.v v5, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + flh fa0, 8(a2) + vfmacc.vf v9, ft1, v3 + flh fa1, 10(a2) + vfmacc.vf v10, ft2, v3 + flh fa2, 12(a2) + vfmacc.vf v11, ft3, v3 + flh fa3, 14(a2) + addi a2, a2, 16 + + vfmacc.vf v8, fa0, v5 + vfmacc.vf v9, fa1, v5 + vfmacc.vf v10, fa2, v5 + vfmacc.vf v11, fa3, v5 + andi t2, a5, 1 // k1 beqz t2, packnx4_relu + // pre-load kernel_data + vle16.v v3, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flh ft0, 0(a2) + flh ft1, 2(a2) + flh ft2, 4(a2) + flh ft3, 6(a2) + +packnx4_k1: vfmacc.vf v8, ft0, v3 vfmacc.vf v9, ft1, v3 vfmacc.vf v10, ft2, v3 @@ -1215,6 +1610,8 @@ packnx2_start: srai t2, a5, 1 // k2 beqz t2, packnx2_k1 + addi t2, t2, -1 // k2_end + beqz t2, packnx2_k2_end packnx2_k2: vle16.v v5, (t3) @@ -1237,10 +1634,30 @@ packnx2_k2: addi t2, t2, -1 bnez t2, packnx2_k2 -packnx2_k1: +packnx2_k2_end: + vle16.v v5, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + flh fa0, 4(a2) + vfmacc.vf v9, ft1, v3 + flh fa1, 6(a2) + addi a2, a2, 8 + + vfmacc.vf v8, fa0, v5 + vfmacc.vf v9, fa1, v5 + andi t2, a5, 1 // k1 beqz t2, packnx2_relu + // pre-load kernel_data + vle16.v v3, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flh ft0, 0(a2) + flh ft1, 2(a2) + +packnx2_k1: vfmacc.vf v8, ft0, v3 vfmacc.vf v9, ft1, v3 addi a2, a2, 4 @@ -1271,6 +1688,8 @@ packnx1_start: srai t2, a5, 1 // k2 beqz t2, packnx1_k1 + addi t2, t2, -1 // k2_end + beqz t2, packnx1_k2_end packnx1_k2: vle16.v v5, (t3) @@ -1287,10 +1706,25 @@ packnx1_k2: addi t2, t2, -1 bnez t2, packnx1_k2 -packnx1_k1: 
+packnx1_k2_end: + vle16.v v5, (t3) + add t3, t3, t0 // +packn + vfmacc.vf v8, ft0, v3 + flh fa0, 2(a2) + addi a2, a2, 4 + + vfmacc.vf v8, fa0, v5 + andi t2, a5, 1 // k1 beqz t2, packnx1_relu + // pre-load kernel_data + vle16.v v3, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flh ft0, 0(a2) + +packnx1_k1: vfmacc.vf v8, ft0, v3 addi a2, a2, 2 diff --git a/source/c908_opt/fp16/gemm_fp16_packn.c b/source/c908_opt/fp16/gemm_fp16_packn.c index 41c4564c..3130cec0 100644 --- a/source/c908_opt/fp16/gemm_fp16_packn.c +++ b/source/c908_opt/fp16/gemm_fp16_packn.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c908.h" +#include "c908/c908.h" void gemm_fp16_ncxhwx_12xpack2n(__fp16 *output, const __fp16 *kernel, const __fp16 *input, const __fp16 *bias, int m, int k, int n, bool fuse_relu); @@ -24,7 +24,7 @@ void gemm_fp16_ncxhwx_12xpackn(__fp16 *output, const __fp16 *kernel, const __fp1 const __fp16 *bias, int m, int k, int n, bool fuse_relu); void shl_c908_ncxhwx_gemm_12xpack2n_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, - const __fp16 *bias, int m, int k, int n, bool fuse_relu) + __fp16 *bias, int m, int k, int n, bool fuse_relu) { const int packn = csrr_vlenb() / sizeof(__fp16); const int pack2n = packn * 2; diff --git a/source/c908_opt/fp16/gemm_fp16_v256.c b/source/c908_opt/fp16/gemm_fp16_v256.c index 0f2bb781..9f26a8a0 100644 --- a/source/c908_opt/fp16/gemm_fp16_v256.c +++ b/source/c908_opt/fp16/gemm_fp16_v256.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c908.h" +#include "c908/c908.h" /************************************************************* * note: VLEN = 256 diff --git a/source/c908_opt/fp16/maxpool.c b/source/c908_opt/fp16/maxpool.c index 8d3b0473..38ebbfa9 100644 --- a/source/c908_opt/fp16/maxpool.c +++ b/source/c908_opt/fp16/maxpool.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_c908.h" +#include "c908/c908.h" int shl_c908_maxpool2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_pool_params *params) @@ -47,6 +47,8 @@ int shl_c908_maxpool2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor if (shl_is_first_layer_input(input, sess)) { elempack = 1; } + } else if (sess->base_run_mode == CSINN_RM_LAYER) { + elempack = in_c % packn == 0 ? packn : 1; } // global maxpool2d // TODO: remove diff --git a/source/c908_opt/fp32/avgpool.c b/source/c908_opt/fp32/avgpool.c index 789bc6d3..29ec72f1 100644 --- a/source/c908_opt/fp32/avgpool.c +++ b/source/c908_opt/fp32/avgpool.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c908.h" +#include "c908/c908.h" int shl_c908_avgpool2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_pool_params *params) @@ -47,6 +47,8 @@ int shl_c908_avgpool2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor if (shl_is_first_layer_input(input, sess)) { elempack = 1; } + } else if (sess->base_run_mode == CSINN_RM_LAYER) { + elempack = in_c % packn == 0 ? packn : 1; } // global avgpool2d diff --git a/source/c908_opt/fp32/convolution.c b/source/c908_opt/fp32/convolution.c index 1f845ae1..231e61a8 100644 --- a/source/c908_opt/fp32/convolution.c +++ b/source/c908_opt/fp32/convolution.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_c908.h" +#include "c908/c908.h" int shl_c908_conv2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, struct csinn_tensor *bias, @@ -30,8 +30,8 @@ int shl_c908_conv2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *o int32_t kernel_w = kernel->dim[3]; int32_t stride_h = params->stride_height; int32_t stride_w = params->stride_width; - int32_t dalition_h = params->dilation_height; - int32_t dalition_w = params->dilation_width; + int32_t dilation_h = params->dilation_height; + int32_t dilation_w = params->dilation_width; struct csinn_callback *cb = params->base.cb; const int packn = csrr_vlenb() / sizeof(float); @@ -48,17 +48,20 @@ int shl_c908_conv2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *o if (shl_is_first_layer_input(input, sess)) { in_elempack = 1; } + } else if (sess->base_run_mode == CSINN_RM_LAYER) { + in_elempack = in_c % packn == 0 ? packn : 1; + out_elempack = out_c % packn == 0 ? packn : 1; } // packn if (in_elempack % packn == 0 && out_elempack % packn == 0) { - if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && - dalition_w == 1) { + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && + dilation_w == 1) { params->conv_extra.conv_mode = CSINN_GEMM; shl_c908_conv1x1s1_gemm_reorder_kernel_packn_fp32(kernel, params); cb->exec = shl_c908_conv1x1s1_gemm_packn_fp32; } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && - dalition_h == 1 && dalition_w == 1) { + dilation_h == 1 && dilation_w == 1) { if (params->group > 1) { params->conv_extra.conv_mode = CSINN_GEMM; shl_c908_conv_im2col_gemm_reorder_kernel_packn_fp32(kernel, params); @@ -86,8 +89,8 @@ int shl_c908_conv2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *o // pack1ton if (in_elempack % packn != 0 && out_elempack % packn == 0) { params->conv_extra.conv_mode = CSINN_GEMM; - if (kernel_h == 1 && 
kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && - dalition_w == 1) { + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && + dilation_w == 1) { shl_c908_conv1x1s1_gemm_reorder_kernel_pack1ton_fp32(kernel, params); cb->exec = shl_c908_conv1x1s1_gemm_pack1ton_fp32; } else { @@ -99,8 +102,8 @@ int shl_c908_conv2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *o // packnto1 if (in_elempack % packn == 0 && out_elempack % packn != 0) { params->conv_extra.conv_mode = CSINN_GEMM; - if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && - dalition_w == 1) { + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && + dilation_w == 1) { shl_c908_conv1x1s1_gemm_reorder_kernel_packnto1_fp32(kernel, params); cb->exec = shl_c908_conv1x1s1_gemm_packnto1_fp32; } else { @@ -112,8 +115,8 @@ int shl_c908_conv2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *o // pack1 if (in_elempack % packn != 0 && out_elempack % packn != 0) { params->conv_extra.conv_mode = CSINN_GEMM; - if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && - dalition_w == 1) { + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && + dilation_w == 1) { shl_c908_conv1x1s1_gemm_reorder_kernel_fp32(kernel, params); cb->exec = shl_c908_conv1x1s1_gemm_fp32; } else { diff --git a/source/c908_opt/fp32/convolution_1x1_fp32.c b/source/c908_opt/fp32/convolution_1x1_fp32.c index 910ba293..c819d723 100644 --- a/source/c908_opt/fp32/convolution_1x1_fp32.c +++ b/source/c908_opt/fp32/convolution_1x1_fp32.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_c908.h" +#include "c908/c908.h" void shl_c908_conv1x1s1_gemm_reorder_kernel_fp32(struct csinn_tensor *kernel, struct csinn_conv2d_params *params) @@ -39,50 +39,14 @@ int shl_c908_conv1x1s1_gemm_fp32(struct csinn_tensor *input, struct csinn_tensor struct csinn_tensor *kernel, struct csinn_tensor *bias, struct csinn_conv2d_params *params) { - if (input->layout == CSINN_LAYOUT_NC1HWC0) { - shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp32(input); - } - float *input_data = (float *)input->data; - float *output_data = (float *)output->data; - float *kernel_data = (float *)kernel->data; - float *bias_data = (float *)bias->data; - - int32_t group = params->group; - int32_t batch = input->dim[0]; // assert(batch == 1); - int32_t in_ch = input->dim[1]; - int32_t out_ch = kernel->dim[0]; - int32_t out_h = output->dim[2]; - int32_t out_w = output->dim[3]; - - int32_t m = out_ch / group; - int32_t k = in_ch / group; - int32_t n = out_h * out_w; - - float *pb_reorder = (float *)shl_mem_alloc(k * n * sizeof(float)); - const int vlen = csrr_vlenb() * 8; - - for (int i = 0; i < batch; i++) { - for (int g = 0; g < group; g++) { - float *pa = kernel_data + g * m * k; - float *pb = pb_reorder; - float *pc = output_data; - if (vlen == 128) { - // pack - shl_c908_reorder_input_z12_fp32(input_data, pb, k, n, n); - // GEMM - shl_c908_gemm_8x12_fp32(pc, pa, pb, bias_data + g * m, m, k, n, n); - } else if (vlen >= 256) { - // pack - shl_c908_reorder_input_z16_fp32_v256(input_data, pb, k, n, n); - // GEMM - shl_c908_gemm_8x16_fp32_v256(pc, pa, pb, bias_data + g * m, m, k, n, n); - } - - input_data += k * n; - output_data += m * n; - } + if (vlen == 128) { + return shl_rvv_common_conv1x1_gemm_fp32(input, output, kernel, bias, params, + shl_c908_reorder_input_z12_fp32, + shl_c908_gemm_8x12_fp32); + } else if (vlen >= 256) { + return shl_rvv_common_conv1x1_gemm_fp32(input, output, kernel, bias, params, + shl_c908_reorder_input_z16_fp32_v256, + shl_c908_gemm_8x16_fp32_v256); } - 
shl_mem_free(pb_reorder); - return CSINN_TRUE; } diff --git a/source/c908_opt/fp32/convolution_1x1_fp32_pack1ton.c b/source/c908_opt/fp32/convolution_1x1_fp32_pack1ton.c index 9f95e610..caaa6620 100644 --- a/source/c908_opt/fp32/convolution_1x1_fp32_pack1ton.c +++ b/source/c908_opt/fp32/convolution_1x1_fp32_pack1ton.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c908.h" +#include "c908/c908.h" /************************************************************************************* * reorder kernel_data inplace, means the origin kernel_data be destoried. @@ -32,60 +32,7 @@ int shl_c908_conv1x1s1_gemm_pack1ton_fp32(struct csinn_tensor *input, struct csi struct csinn_tensor *kernel, struct csinn_tensor *bias, struct csinn_conv2d_params *params) { - if (input->layout == CSINN_LAYOUT_NC1HWC0) { - shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp32(input); - } - if (output->layout == CSINN_LAYOUT_NCHW) { - const int packn = csrr_vlenb() / sizeof(float); - output->dim[1] /= packn; - output->dim[4] = packn; - output->dim_count = 5; - output->layout = CSINN_LAYOUT_NC1HWC0; - } - float *input_data = (float *)input->data; - float *output_data = (float *)output->data; - float *kernel_data = (float *)kernel->data; - float *bias_data = (float *)bias->data; - - int32_t group = params->group; - int32_t batch = input->dim[0]; - int32_t in_c = input->dim[1]; - int32_t in_h = input->dim[2]; - int32_t in_w = input->dim[3]; - int32_t out_c = kernel->dim[0]; - int32_t out_h = output->dim[2]; - int32_t out_w = output->dim[3]; - - int32_t m = out_c / group; - int32_t k = in_c / group; - int32_t n = out_h * out_w; - - float *pb_reorder = (float *)shl_mem_alloc(k * n * sizeof(float)); - float *input_ncxhwx = (float *)shl_mem_alloc(k * n * sizeof(float)); - - for (int i = 0; i < batch; i++) { - for (int g = 0; g < group; g++) { - float *kernel_ptr = kernel_data + g * m * k; - float *in_ptr = pb_reorder; - float *out_ptr = output_data; - float *bias_ptr = bias_data ? 
(bias_data + g * m) : NULL; - - shl_rvv_reorder_input_pack1ton_fp32(input_data, input_ncxhwx, k, out_h, out_w); - - // reorder(pack) - shl_rvv_reorder_input_z12_pack1ton_fp32(input_ncxhwx, in_ptr, k, 1, n, n); - - // gemm - // shl_rvv_ncxhwx_gemm_12xpack2n_fp32(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k, n, - // n); - shl_c908_ncxhwx_gemm_12xpack2n_fp32(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k, n, - false); - - input_data += k * n; - output_data += m * n; - } - } - shl_mem_free(pb_reorder); - shl_mem_free(input_ncxhwx); - return CSINN_TRUE; + return shl_rvv_common_conv1x1_gemm_pack1ton_fp32(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_pack1ton_fp32, + shl_c908_ncxhwx_gemm_12xpack2n_fp32); } diff --git a/source/c908_opt/fp32/convolution_1x1_fp32_packn.c b/source/c908_opt/fp32/convolution_1x1_fp32_packn.c index 2e7046d8..331960f2 100644 --- a/source/c908_opt/fp32/convolution_1x1_fp32_packn.c +++ b/source/c908_opt/fp32/convolution_1x1_fp32_packn.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_c908.h" +#include "c908/c908.h" void shl_c908_conv1x1s1_gemm_reorder_kernel_packn_fp32(struct csinn_tensor *kernel, struct csinn_conv2d_params *params) @@ -28,51 +28,7 @@ int shl_c908_conv1x1s1_gemm_packn_fp32(struct csinn_tensor *input, struct csinn_ struct csinn_tensor *kernel, struct csinn_tensor *bias, struct csinn_conv2d_params *params) { - if (input->layout == CSINN_LAYOUT_NCHW) { - shl_rvv_tensor_ndarray_to_nc1xc0_replace_fp32(input); - } - if (output->layout == CSINN_LAYOUT_NCHW) { - const int packn = csrr_vlenb() / sizeof(float); - output->dim[1] /= packn; - output->dim[4] = packn; - output->dim_count = 5; - output->layout = CSINN_LAYOUT_NC1HWC0; - } - float *input_data = (float *)input->data; - float *output_data = (float *)output->data; - float *kernel_data = (float *)kernel->data; - float *bias_data = (float *)bias->data; - - int32_t group = params->group; - int32_t batch = input->dim[0]; // assert(batch == 1); - int32_t in_ch = input->dim[1] * input->dim[4]; - int32_t out_ch = kernel->dim[0]; - int32_t out_h = output->dim[2]; - int32_t out_w = output->dim[3]; - - int32_t m = out_ch / group; - int32_t k = in_ch / group; - int32_t n = out_h * out_w; - - float *pb_reorder = (float *)shl_mem_alloc(k * n * sizeof(float)); - - for (int i = 0; i < batch; i++) { - for (int g = 0; g < group; g++) { - float *kernel_ptr = kernel_data + g * m * k; - float *in_ptr = pb_reorder; - float *out_ptr = output_data; - float *bias_ptr = bias_data ? 
(bias_data + g * m) : NULL; - - // pack - shl_rvv_reorder_input_z12_packn_fp32(input_data, in_ptr, k, n, n); - // GEMM - shl_c908_ncxhwx_gemm_12xpack2n_fp32(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k, n, - false); - - input_data += k * n; - output_data += m * n; - } - } - shl_mem_free(pb_reorder); - return CSINN_TRUE; + return shl_rvv_common_conv1x1_gemm_packn_fp32(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_packn_fp32, + shl_c908_ncxhwx_gemm_12xpack2n_fp32); } diff --git a/source/c908_opt/fp32/convolution_1x1_fp32_packnto1.c b/source/c908_opt/fp32/convolution_1x1_fp32_packnto1.c index 097d31d9..7ae566d7 100644 --- a/source/c908_opt/fp32/convolution_1x1_fp32_packnto1.c +++ b/source/c908_opt/fp32/convolution_1x1_fp32_packnto1.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c908.h" +#include "c908/c908.h" void shl_c908_conv1x1s1_gemm_reorder_kernel_packnto1_fp32(struct csinn_tensor *kernel, struct csinn_conv2d_params *params) @@ -28,48 +28,7 @@ int shl_c908_conv1x1s1_gemm_packnto1_fp32(struct csinn_tensor *input, struct csi struct csinn_tensor *kernel, struct csinn_tensor *bias, struct csinn_conv2d_params *params) { - if (input->layout == CSINN_LAYOUT_NCHW) { - shl_rvv_tensor_ndarray_to_nc1xc0_replace_fp32(input); - } - float *input_data = (float *)input->data; - float *output_data = (float *)output->data; - float *kernel_data = (float *)kernel->data; - float *bias_data = (float *)bias->data; - - int32_t group = params->group; - int32_t batch = input->dim[0]; // assert(batch == 1); - int32_t in_ch = input->dim[1] * input->dim[4]; - int32_t out_ch = kernel->dim[0]; - int32_t out_h = output->dim[2]; - int32_t out_w = output->dim[3]; - - int32_t m = out_ch / group; - int32_t k = in_ch / group; - int32_t n = out_h * out_w; - - float *pb_reorder = (float *)shl_mem_alloc(k * n * sizeof(float)); - float *output_ncxhwx = (float *)shl_mem_alloc(m * n * sizeof(float)); - - for (int i = 0; i < batch; i++) { - for (int g = 0; g < 
group; g++) { - float *kernel_ptr = kernel_data + g * m * k; - float *in_ptr = pb_reorder; - float *out_ptr = output_data; - float *bias_ptr = bias_data ? (bias_data + g * m) : NULL; - - // pack - shl_rvv_reorder_input_z12_packn_fp32(input_data, in_ptr, k, n, n); - // GEMM - shl_c908_ncxhwx_gemm_12xpack2n_fp32(output_ncxhwx, kernel_ptr, in_ptr, bias_ptr, m, k, - n, false); - - shl_rvv_reorder_input_packnto1_fp32(output_ncxhwx, output_data, m, out_h, out_w); - - input_data += k * n; - output_data += m * n; - } - } - shl_mem_free(pb_reorder); - shl_mem_free(output_ncxhwx); - return CSINN_TRUE; + return shl_rvv_common_conv1x1_gemm_packnto1_fp32(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_packn_fp32, + shl_c908_ncxhwx_gemm_12xpack2n_fp32); } diff --git a/source/c908_opt/fp32/convolution_3x3_fp32.c b/source/c908_opt/fp32/convolution_3x3_fp32.c index 6d5f3602..d0e885c5 100644 --- a/source/c908_opt/fp32/convolution_3x3_fp32.c +++ b/source/c908_opt/fp32/convolution_3x3_fp32.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c908.h" +#include "c908/c908.h" /************************************************************* note: VLEN = 128 diff --git a/source/c908_opt/fp32/convolution_3x3_fp32_packn.c b/source/c908_opt/fp32/convolution_3x3_fp32_packn.c index bc597520..f41ce511 100644 --- a/source/c908_opt/fp32/convolution_3x3_fp32_packn.c +++ b/source/c908_opt/fp32/convolution_3x3_fp32_packn.c @@ -16,7 +16,7 @@ * limitations under the License. */ #ifdef NNN -#include "shl_c908.h" +#include "c908/c908.h" /************************************************************* * note: support flexible vlen diff --git a/source/c908_opt/fp32/convolution_3x3_fp32_packn_1.c b/source/c908_opt/fp32/convolution_3x3_fp32_packn_1.c index 2181e4ff..3766733a 100644 --- a/source/c908_opt/fp32/convolution_3x3_fp32_packn_1.c +++ b/source/c908_opt/fp32/convolution_3x3_fp32_packn_1.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_c908.h" +#include "c908/c908.h" /************************************************************* * note: support flexible vlen diff --git a/source/c908_opt/fp32/convolution_gemm_fp32.c b/source/c908_opt/fp32/convolution_gemm_fp32.c index 6486ccb5..8c2dbda8 100644 --- a/source/c908_opt/fp32/convolution_gemm_fp32.c +++ b/source/c908_opt/fp32/convolution_gemm_fp32.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c908.h" +#include "c908/c908.h" /************************************************************************************* * reorder kernel_data inplace, means the origin kernel_data be destoried. @@ -43,92 +43,14 @@ int shl_c908_conv_im2col_gemm_fp32(struct csinn_tensor *input, struct csinn_tens struct csinn_tensor *kernel, struct csinn_tensor *bias, struct csinn_conv2d_params *params) { - if (input->layout == CSINN_LAYOUT_NC1HWC0) { - shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp32(input); - } - float *input_data = (float *)input->data; - float *output_data = (float *)output->data; - float *kernel_data = (float *)kernel->data; - float *bias_data = (float *)bias->data; - - int32_t group = params->group; - int32_t batch = input->dim[0]; - int32_t in_ch = input->dim[1]; - int32_t in_height = input->dim[2]; - int32_t in_width = input->dim[3]; - int32_t out_ch = kernel->dim[0]; - int32_t out_height = output->dim[2]; - int32_t out_width = output->dim[3]; - int32_t ksize_h = kernel->dim[2]; - int32_t ksize_w = kernel->dim[3]; - int32_t stride_h = params->stride_height; - int32_t stride_w = params->stride_width; - int32_t pad_left = params->pad_left; - int32_t pad_top = params->pad_top; - int32_t dilation_h = params->dilation_height; - int32_t dilation_w = params->dilation_width; - - int32_t m = out_ch / group; - int32_t k = in_ch / group * ksize_h * ksize_w; - int32_t n = out_height * out_width; - - float *im2col_data = (float *)shl_mem_alloc(k * n * sizeof(float)); - float *pb_reorder = (float *)shl_mem_alloc(k * n * sizeof(float)); 
- const int vlen = csrr_vlenb() * 8; - - for (int i = 0; i < batch; i++) { - for (int g = 0; g < group; g++) { - // im2col - float *data_col = im2col_data; - float *channel_data = input_data; - for (int c = 0; c < in_ch / group; c++) { - for (int kh = 0; kh < ksize_h; kh++) { - for (int kw = 0; kw < ksize_w; kw++) { - int in_row = -pad_top + kh * dilation_h; - for (int oh = 0; oh < out_height; oh++) { - if (in_row >= in_height || in_row < 0) { - for (int ow = 0; ow < out_width; ow++) { - *data_col++ = 0.0f; - } - } else { - int in_col = -pad_left + kw * dilation_w; - for (int ow1 = 0; ow1 < out_width; ow1++) { - int col_idx = (c * out_height + oh) * out_width + ow1; - if (in_col < in_width && in_col >= 0) { - *data_col++ = channel_data[in_row * in_width + in_col]; - } else { - *data_col++ = 0.0f; - } - in_col += stride_w; - } - } - in_row += stride_h; - } - } - } - channel_data += in_height * in_width; - } - - float *pa = kernel_data + g * m * k; - float *pb = pb_reorder; - float *pc = output_data; - if (vlen == 128) { - // pack - shl_c908_reorder_input_z12_fp32(im2col_data, pb, k, n, n); - // GEMM - shl_c908_gemm_8x12_fp32(pc, pa, pb, bias_data + g * m, m, k, n, n); - } else if (vlen >= 256) { - // pack - shl_c908_reorder_input_z16_fp32_v256(im2col_data, pb, k, n, n); - // GEMM - shl_c908_gemm_8x16_fp32_v256(pc, pa, pb, bias_data + g * m, m, k, n, n); - } - input_data += in_ch / group * in_height * in_width; - output_data += m * n; - } + if (vlen == 128) { + return shl_rvv_common_conv_gemm_fp32(input, output, kernel, bias, params, + shl_c908_reorder_input_z12_fp32, + shl_c908_gemm_8x12_fp32); + } else if (vlen >= 256) { + return shl_rvv_common_conv_gemm_fp32(input, output, kernel, bias, params, + shl_c908_reorder_input_z16_fp32_v256, + shl_c908_gemm_8x16_fp32_v256); } - shl_mem_free(pb_reorder); - shl_mem_free(im2col_data); - return CSINN_TRUE; } diff --git a/source/c908_opt/fp32/convolution_gemm_fp32_pack1ton.c 
b/source/c908_opt/fp32/convolution_gemm_fp32_pack1ton.c index dd8ac83e..0bcf9a78 100644 --- a/source/c908_opt/fp32/convolution_gemm_fp32_pack1ton.c +++ b/source/c908_opt/fp32/convolution_gemm_fp32_pack1ton.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c908.h" +#include "c908/c908.h" /************************************************************************************* * reorder kernel_data inplace, means the origin kernel_data be destoried. @@ -32,107 +32,7 @@ int shl_c908_conv_im2col_gemm_pack1ton_fp32(struct csinn_tensor *input, struct c struct csinn_tensor *kernel, struct csinn_tensor *bias, struct csinn_conv2d_params *params) { - if (input->layout == CSINN_LAYOUT_NC1HWC0) { - shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp32(input); - } - if (output->layout == CSINN_LAYOUT_NCHW) { - const int packn = csrr_vlenb() / sizeof(float); - output->dim[1] /= packn; - output->dim[4] = packn; - output->dim_count = 5; - output->layout = CSINN_LAYOUT_NC1HWC0; - } - float *input_data = (float *)input->data; - float *output_data = (float *)output->data; - float *kernel_data = (float *)kernel->data; - float *bias_data = (float *)bias->data; - - int32_t group = params->group; - int32_t batch = input->dim[0]; - int32_t in_c = input->dim[1]; - int32_t in_h = input->dim[2]; - int32_t in_w = input->dim[3]; - int32_t out_c = kernel->dim[0]; - int32_t out_h = output->dim[2]; - int32_t out_w = output->dim[3]; - - int32_t ksize_h = kernel->dim[2]; - int32_t ksize_w = kernel->dim[3]; - int32_t stride_h = params->stride_height; - int32_t stride_w = params->stride_width; - int32_t dilation_h = params->dilation_height; - int32_t dilation_w = params->dilation_width; - - int32_t m = out_c / group; - int32_t in_cp = in_c / group; - int32_t maxk = ksize_h * ksize_w; - int32_t n = out_h * out_w; - - for (int i = 0; i < batch; i++) { - for (int g = 0; g < group; g++) { - // padding - int padded_in_h = in_h + params->pad_top + params->pad_down; - int padded_in_w = in_w + 
params->pad_left + params->pad_right; - int padded_in_hw = padded_in_h * padded_in_w; - float *input_pad_buf = (float *)shl_mem_alloc(in_cp * padded_in_hw * sizeof(float)); - shl_rvv_pad_input_pack1ton_fp32(input_data, input_pad_buf, in_cp, in_h, in_w, - padded_in_h, padded_in_w, params->pad_top, - params->pad_left); - - // im2col - const int packn = csrr_vlenb() / sizeof(float); - int vl = vsetvl_e32m1(packn); - - // [in_c/packn, maxk, out_h, out_w, packn] + [maxk, out_h, out_w, in_c%packn] - float *im2col_buf = (float *)shl_mem_alloc(in_cp * maxk * n * sizeof(float)); - const int tailstep = (padded_in_w * stride_h - out_w * stride_w); - - const float *img0 = input_pad_buf; - float *dst_ptr = im2col_buf; - - int loop_c = in_cp; - while (loop_c > 0) { - vl = vsetvl_e32m1(loop_c); - - for (int a = 0; a < ksize_h; a++) { - for (int b = 0; b < ksize_w; b++) { - const float *img1 = - img0 + a * dilation_h * padded_in_w * vl + b * dilation_w * vl; - - for (int p = 0; p < out_h; p++) { - for (int q = 0; q < out_w; q++) { - vfloat32m1_t _tmp = vle32_v_f32m1(img1, vl); - img1 += stride_w * vl; - vse32_v_f32m1(dst_ptr, _tmp, vl); - dst_ptr += vl; - } - img1 += tailstep * vl; - } - } - } - img0 += padded_in_hw * vl; - // dst_ptr += maxk * out_h * out_w * vl; - loop_c -= vl; - } - shl_mem_free(input_pad_buf); - - // reorder(pack) - float *reorder_buf = (float *)shl_mem_alloc(in_cp * maxk * n * sizeof(float)); - shl_rvv_reorder_input_z12_pack1ton_fp32(im2col_buf, reorder_buf, in_cp, maxk, n, n); - shl_mem_free(im2col_buf); - - // gemm - float *ker_ptr = kernel_data + g * m * maxk * in_cp; - float *bias_ptr = bias_data ? 
(bias_data + g * m) : NULL; - // shl_rvv_ncxhwx_gemm_12xpack2n_fp32(output_data, ker_ptr, reorder_buf, bias_ptr, m, - // in_cp * maxk, n, n); - shl_c908_ncxhwx_gemm_12xpack2n_fp32(output_data, ker_ptr, reorder_buf, bias_ptr, m, - in_cp * maxk, n, false); - shl_mem_free(reorder_buf); - - input_data += in_cp * in_h * in_w; - output_data += m * n; - } - } - return CSINN_TRUE; + return shl_rvv_common_conv_gemm_pack1ton_fp32(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_pack1ton_fp32, + shl_c908_ncxhwx_gemm_12xpack2n_fp32); } diff --git a/source/c908_opt/fp32/convolution_gemm_fp32_packn.c b/source/c908_opt/fp32/convolution_gemm_fp32_packn.c index 42536704..fe005024 100644 --- a/source/c908_opt/fp32/convolution_gemm_fp32_packn.c +++ b/source/c908_opt/fp32/convolution_gemm_fp32_packn.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c908.h" +#include "c908/c908.h" /************************************************************************************* * packn = vlenb / sizeof(float) @@ -37,103 +37,7 @@ int shl_c908_conv_im2col_gemm_packn_fp32(struct csinn_tensor *input, struct csin struct csinn_tensor *kernel, struct csinn_tensor *bias, struct csinn_conv2d_params *params) { - if (input->layout == CSINN_LAYOUT_NCHW) { - shl_rvv_tensor_ndarray_to_nc1xc0_replace_fp32(input); - } - if (output->layout == CSINN_LAYOUT_NCHW) { - const int packn = csrr_vlenb() / sizeof(float); - output->dim[1] /= packn; - output->dim[4] = packn; - output->dim_count = 5; - output->layout = CSINN_LAYOUT_NC1HWC0; - } - float *input_data = (float *)input->data; - float *output_data = (float *)output->data; - float *kernel_data = (float *)kernel->data; - float *bias_data = (float *)bias->data; - - int32_t group = params->group; - int32_t batch = input->dim[0]; - int32_t in_c = input->dim[1] * input->dim[4]; - int32_t in_h = input->dim[2]; - int32_t in_w = input->dim[3]; - int32_t out_c = kernel->dim[0]; - int32_t out_h = output->dim[2]; - int32_t out_w = 
output->dim[3]; - - int32_t ksize_h = kernel->dim[2]; - int32_t ksize_w = kernel->dim[3]; - int32_t stride_h = params->stride_height; - int32_t stride_w = params->stride_width; - int32_t dilation_h = params->dilation_height; - int32_t dilation_w = params->dilation_width; - - int32_t m = out_c / group; - int32_t in_cp = in_c / group; - int32_t maxk = ksize_h * ksize_w; - int32_t n = out_h * out_w; - - float *output_ncxhwx = (float *)shl_mem_alloc(m * n * sizeof(float)); - - for (int i = 0; i < batch; i++) { - for (int g = 0; g < group; g++) { - // padding - int padded_in_h = in_h + params->pad_top + params->pad_down; - int padded_in_w = in_w + params->pad_left + params->pad_right; - int padded_in_hw = padded_in_w * padded_in_h; - float *input_pad_buf = (float *)shl_mem_alloc(in_cp * padded_in_hw * sizeof(float)); - shl_rvv_pad_input_packn_fp32(input_data, input_pad_buf, in_cp, in_h, in_w, padded_in_h, - padded_in_w, params->pad_top, params->pad_left); - - // im2col - const int packn = csrr_vlenb() / sizeof(float); - const int vl = vsetvl_e32m1(packn); - - // [in_c/packn, maxk, out_h, out_w, packn] - float *im2col_buf = (float *)shl_mem_alloc(in_cp / packn * maxk * out_h * out_w * - packn * sizeof(float)); - const int tailstep = (padded_in_w * stride_h - out_w * stride_w) * packn; - - for (int c = 0; c + packn - 1 < in_cp; c += packn) { - const float *img0 = input_pad_buf + c * padded_in_hw; - float *dst_ptr = im2col_buf + c * maxk * out_h * out_w; - - for (int a = 0; a < ksize_h; a++) { - for (int b = 0; b < ksize_w; b++) { - const float *img1 = - img0 + a * dilation_h * padded_in_w * packn + b * dilation_w * packn; - - for (int p = 0; p < out_h; p++) { - for (int q = 0; q < out_w; q++) { - vfloat32m1_t _tmp = vle32_v_f32m1(img1, vl); - img1 += stride_w * packn; - vse32_v_f32m1(dst_ptr, _tmp, vl); - dst_ptr += packn; - } - img1 += tailstep; - } - } - } - } - shl_mem_free(input_pad_buf); - - // reorder(pack) - float *reorder_buf = - (float *)shl_mem_alloc(in_cp * 
maxk * out_h * out_w * sizeof(float)); - shl_rvv_reorder_input_z12_packn_fp32(im2col_buf, reorder_buf, in_cp * maxk, n, n); - shl_mem_free(im2col_buf); - - // gemm - float *ker_ptr = kernel_data + g * m * maxk * in_cp; - float *bias_ptr = bias_data ? (bias_data + g * m) : NULL; - shl_c908_ncxhwx_gemm_12xpack2n_fp32(output_data, ker_ptr, reorder_buf, bias_ptr, m, - in_cp * maxk, n, false); - - shl_mem_free(reorder_buf); - - input_data += in_cp * in_h * in_w; - output_data += m * n; - } - } - return CSINN_TRUE; + return shl_rvv_common_conv_gemm_packn_fp32(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_packn_fp32, + shl_c908_ncxhwx_gemm_12xpack2n_fp32); } diff --git a/source/c908_opt/fp32/convolution_gemm_fp32_packnto1.c b/source/c908_opt/fp32/convolution_gemm_fp32_packnto1.c index f5f611e2..45849399 100644 --- a/source/c908_opt/fp32/convolution_gemm_fp32_packnto1.c +++ b/source/c908_opt/fp32/convolution_gemm_fp32_packnto1.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c908.h" +#include "c908/c908.h" /************************************************************************************* * reorder kernel_data inplace, means the origin kernel_data be destoried. 
@@ -32,95 +32,7 @@ int shl_c908_conv_im2col_gemm_packnto1_fp32(struct csinn_tensor *input, struct c struct csinn_tensor *kernel, struct csinn_tensor *bias, struct csinn_conv2d_params *params) { - if (input->layout == CSINN_LAYOUT_NCHW) { - shl_rvv_tensor_ndarray_to_nc1xc0_replace_fp32(input); - } - float *input_data = (float *)input->data; - float *output_data = (float *)output->data; - float *kernel_data = (float *)kernel->data; - float *bias_data = (float *)bias->data; - - int32_t group = params->group; - int32_t batch = input->dim[0]; - int32_t in_c = input->dim[1] * input->dim[4]; - int32_t in_h = input->dim[2]; - int32_t in_w = input->dim[3]; - int32_t out_c = kernel->dim[0]; - int32_t out_h = output->dim[2]; - int32_t out_w = output->dim[3]; - int32_t ksize_h = kernel->dim[2]; - int32_t ksize_w = kernel->dim[3]; - int32_t stride_h = params->stride_height; - int32_t stride_w = params->stride_width; - int32_t dilation_h = params->dilation_height; - int32_t dilation_w = params->dilation_width; - - int32_t m = out_c / group; - int32_t in_cp = in_c / group; - int32_t maxk = ksize_h * ksize_w; - int32_t n = out_h * out_w; - - float *output_ncxhwx = (float *)shl_mem_alloc(m * n * sizeof(float)); - - for (int i = 0; i < batch; i++) { - for (int g = 0; g < group; g++) { - // padding - int padded_in_h = in_h + params->pad_top + params->pad_down; - int padded_in_w = in_w + params->pad_left + params->pad_right; - int padded_in_hw = padded_in_h * padded_in_w; - float *input_pad_buf = (float *)shl_mem_alloc(in_cp * padded_in_hw * sizeof(float)); - shl_rvv_pad_input_packn_fp32(input_data, input_pad_buf, in_cp, in_h, in_w, padded_in_h, - padded_in_w, params->pad_top, params->pad_left); - - // im2col - const int packn = csrr_vlenb() / sizeof(float); - const int vl = vsetvl_e32m1(packn); - - // [in_c/packn, maxk, out_h, out_w, packn] - float *im2col_buf = (float *)shl_mem_alloc(in_cp / packn * maxk * out_h * out_w * - packn * sizeof(float)); - const int tailstep = (padded_in_w 
* stride_h - out_w * stride_w) * packn; - - for (int c = 0; c + packn - 1 < in_cp; c += packn) { - const float *img0 = input_pad_buf + c * padded_in_hw; - float *dst_ptr = im2col_buf + c * maxk * out_h * out_w; - - for (int a = 0; a < ksize_h; a++) { - for (int b = 0; b < ksize_w; b++) { - const float *img1 = - img0 + a * dilation_h * padded_in_w * packn + b * dilation_w * packn; - - for (int p = 0; p < out_h; p++) { - for (int q = 0; q < out_w; q++) { - vfloat32m1_t _tmp = vle32_v_f32m1(img1, vl); - img1 += stride_w * packn; - vse32_v_f32m1(dst_ptr, _tmp, vl); - dst_ptr += packn; - } - img1 += tailstep; - } - } - } - } - shl_mem_free(input_pad_buf); - - // reorder(pack) - float *reorder_buf = (float *)shl_mem_alloc(in_cp * maxk * n * sizeof(float)); - shl_rvv_reorder_input_z12_packn_fp32(im2col_buf, reorder_buf, in_cp * maxk, n, n); - shl_mem_free(im2col_buf); - - // gemm - float *ker_ptr = kernel_data + g * m * maxk * in_cp; - float *bias_ptr = bias_data ? (bias_data + g * m) : NULL; - shl_c908_ncxhwx_gemm_12xpack2n_fp32(output_ncxhwx, ker_ptr, reorder_buf, bias_ptr, m, - in_cp * maxk, n, false); - shl_rvv_reorder_input_packnto1_fp32(output_ncxhwx, output_data, m, out_h, out_w); - shl_mem_free(reorder_buf); - - input_data += in_cp * in_h * in_w; - output_data += m * n; - } - } - shl_mem_free(output_ncxhwx); - return CSINN_TRUE; + return shl_rvv_common_conv_gemm_packnto1_fp32(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_packn_fp32, + shl_c908_ncxhwx_gemm_12xpack2n_fp32); } diff --git a/source/c908_opt/fp32/depthwise_convolution.c b/source/c908_opt/fp32/depthwise_convolution.c index c7196991..89d8e744 100644 --- a/source/c908_opt/fp32/depthwise_convolution.c +++ b/source/c908_opt/fp32/depthwise_convolution.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_c908.h" +#include "c908/c908.h" int shl_c908_depthwise_conv2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, struct csinn_tensor *bias, @@ -46,6 +46,9 @@ int shl_c908_depthwise_conv2d_init_fp32(struct csinn_tensor *input, struct csinn in_elempack = 1; out_elempack = 1; // dwconv2d out_channel pack is same as in_channel } + } else if (sess->base_run_mode == CSINN_RM_LAYER) { + in_elempack = in_c % packn == 0 ? packn : 1; + out_elempack = out_c % packn == 0 ? packn : 1; } if (in_elempack % packn == 0 && out_elempack % packn == 0) { diff --git a/source/c908_opt/fp32/fullyconnected.c b/source/c908_opt/fp32/fullyconnected.c index 899f20e9..461301e9 100644 --- a/source/c908_opt/fp32/fullyconnected.c +++ b/source/c908_opt/fp32/fullyconnected.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c908.h" +#include "c908/c908.h" int shl_c908_fullyconnected_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *weights, struct csinn_tensor *bias, diff --git a/source/c908_opt/fp32/gemm_fp32.c b/source/c908_opt/fp32/gemm_fp32.c index d735e14a..4f0bfd85 100644 --- a/source/c908_opt/fp32/gemm_fp32.c +++ b/source/c908_opt/fp32/gemm_fp32.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_c908.h" +#include "c908/c908.h" /************************************************************* * note: VLEN = 128 diff --git a/source/c908_opt/fp32/gemm_fp32_ncxhwx.S b/source/c908_opt/fp32/gemm_fp32_ncxhwx.S index 5b4e9ae9..2f451d15 100644 --- a/source/c908_opt/fp32/gemm_fp32_ncxhwx.S +++ b/source/c908_opt/fp32/gemm_fp32_ncxhwx.S @@ -132,6 +132,8 @@ pack2nx12_start: srai t2, a5, 1 // k2 beqz t2, pack2nx12_k1 + addi t2, t2, -1 // k2_end + beqz t2, pack2nx12_k2_end pack2nx12_k2: vle32.v v5, (t3) @@ -222,10 +224,98 @@ pack2nx12_k2: addi t2, t2, -1 bnez t2, pack2nx12_k2 -pack2nx12_k1: +pack2nx12_k2_end: + vle32.v v5, (t3) + add t3, t3, t0 // +packn + vle32.v v6, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + flw fa0, 24(a2) + vfmacc.vf v9, ft1, v3 + vfmacc.vf v21, ft1, v4 + flw fa1, 28(a2) + vfmacc.vf v10, ft2, v3 + vfmacc.vf v22, ft2, v4 + flw fa2, 32(a2) + vfmacc.vf v11, ft3, v3 + vfmacc.vf v23, ft3, v4 + flw fa3, 36(a2) + vfmacc.vf v12, ft4, v3 + vfmacc.vf v24, ft4, v4 + flw fa4, 40(a2) + vfmacc.vf v13, ft5, v3 + vfmacc.vf v25, ft5, v4 + flw fa5, 44(a2) + vfmacc.vf v14, fa0, v3 + vfmacc.vf v26, fa0, v4 + flw ft0, 48(a2) + vfmacc.vf v15, fa1, v3 + vfmacc.vf v27, fa1, v4 + flw ft1, 52(a2) + vfmacc.vf v16, fa2, v3 + vfmacc.vf v28, fa2, v4 + flw ft2, 56(a2) + vfmacc.vf v17, fa3, v3 + vfmacc.vf v29, fa3, v4 + flw ft3, 60(a2) + vfmacc.vf v18, fa4, v3 + vfmacc.vf v30, fa4, v4 + flw ft4, 64(a2) + vfmacc.vf v19, fa5, v3 + vfmacc.vf v31, fa5, v4 + flw ft5, 68(a2) + + vfmacc.vf v8, ft0, v5 + vfmacc.vf v20, ft0, v6 + flw fa0, 72(a2) + vfmacc.vf v9, ft1, v5 + vfmacc.vf v21, ft1, v6 + flw fa1, 76(a2) + vfmacc.vf v10, ft2, v5 + vfmacc.vf v22, ft2, v6 + flw fa2, 80(a2) + vfmacc.vf v11, ft3, v5 + vfmacc.vf v23, ft3, v6 + flw fa3, 84(a2) + vfmacc.vf v12, ft4, v5 + vfmacc.vf v24, ft4, v6 + flw fa4, 88(a2) + vfmacc.vf v13, ft5, v5 + vfmacc.vf v25, ft5, v6 + flw fa5, 92(a2) + addi a2, a2, 96 + vfmacc.vf v14, fa0, v5 + vfmacc.vf 
v26, fa0, v6 + vfmacc.vf v15, fa1, v5 + vfmacc.vf v27, fa1, v6 + vfmacc.vf v16, fa2, v5 + vfmacc.vf v28, fa2, v6 + vfmacc.vf v17, fa3, v5 + vfmacc.vf v29, fa3, v6 + vfmacc.vf v18, fa4, v5 + vfmacc.vf v30, fa4, v6 + vfmacc.vf v19, fa5, v5 + vfmacc.vf v31, fa5, v6 + andi t2, a5, 1 // k1 beqz t2, pack2nx12_relu + // pre-load kernel_data + vle32.v v3, (t3) + add t3, t3, t0 // +packn + vle32.v v4, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flw ft0, 0(a2) + flw ft1, 4(a2) + flw ft2, 8(a2) + flw ft3, 12(a2) + flw ft4, 16(a2) + flw ft5, 20(a2) + +pack2nx12_k1: vfmacc.vf v8, ft0, v3 vfmacc.vf v20, ft0, v4 flw fa0, 24(a2) @@ -377,6 +467,8 @@ pack2nx8_start: srai t2, a5, 1 // k2 beqz t2, pack2nx8_k1 + addi t2, t2, -1 // k2_end + beqz t2, pack2nx8_k2_end pack2nx8_k2: vle32.v v5, (t3) @@ -443,10 +535,74 @@ pack2nx8_k2: addi t2, t2, -1 bnez t2, pack2nx8_k2 -pack2nx8_k1: +pack2nx8_k2_end: + vle32.v v5, (t3) + add t3, t3, t0 // +packn + vle32.v v6, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + flw fa0, 16(a2) + vfmacc.vf v9, ft1, v3 + vfmacc.vf v21, ft1, v4 + flw fa1, 20(a2) + vfmacc.vf v10, ft2, v3 + vfmacc.vf v22, ft2, v4 + flw fa2, 24(a2) + vfmacc.vf v11, ft3, v3 + vfmacc.vf v23, ft3, v4 + flw fa3, 28(a2) + vfmacc.vf v12, fa0, v3 + vfmacc.vf v24, fa0, v4 + flw ft0, 32(a2) + vfmacc.vf v13, fa1, v3 + vfmacc.vf v25, fa1, v4 + flw ft1, 36(a2) + vfmacc.vf v14, fa2, v3 + vfmacc.vf v26, fa2, v4 + flw ft2, 40(a2) + vfmacc.vf v15, fa3, v3 + vfmacc.vf v27, fa3, v4 + flw ft3, 44(a2) + + vfmacc.vf v8, ft0, v5 + vfmacc.vf v20, ft0, v6 + flw fa0, 48(a2) + vfmacc.vf v9, ft1, v5 + vfmacc.vf v21, ft1, v6 + flw fa1, 52(a2) + vfmacc.vf v10, ft2, v5 + vfmacc.vf v22, ft2, v6 + flw fa2, 56(a2) + vfmacc.vf v11, ft3, v5 + vfmacc.vf v23, ft3, v6 + flw fa3, 60(a2) + addi a2, a2, 64 + vfmacc.vf v12, fa0, v5 + vfmacc.vf v24, fa0, v6 + vfmacc.vf v13, fa1, v5 + vfmacc.vf v25, fa1, v6 + vfmacc.vf v14, fa2, v5 + vfmacc.vf v26, fa2, v6 + vfmacc.vf v15, 
fa3, v5 + vfmacc.vf v27, fa3, v6 + andi t2, a5, 1 // k1 beqz t2, pack2nx8_relu + // pre-load kernel_data + vle32.v v3, (t3) + add t3, t3, t0 // +packn + vle32.v v4, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flw ft0, 0(a2) + flw ft1, 4(a2) + flw ft2, 8(a2) + flw ft3, 12(a2) + +pack2nx8_k1: vfmacc.vf v8, ft0, v3 vfmacc.vf v20, ft0, v4 flw fa0, 16(a2) @@ -551,6 +707,8 @@ pack2nx4_start: srai t2, a5, 1 // k2 beqz t2, pack2nx4_k1 + addi t2, t2, -1 // k2_end + beqz t2, pack2nx4_k2_end pack2nx4_k2: vle32.v v5, (t3) @@ -593,10 +751,54 @@ pack2nx4_k2: addi t2, t2, -1 bnez t2, pack2nx4_k2 -pack2nx4_k1: +pack2nx4_k2_end: + vle32.v v5, (t3) + add t3, t3, t0 // +packn + vle32.v v6, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + flw fa0, 16(a2) + vfmacc.vf v9, ft1, v3 + vfmacc.vf v21, ft1, v4 + flw fa1, 20(a2) + vfmacc.vf v10, ft2, v3 + vfmacc.vf v22, ft2, v4 + flw fa2, 24(a2) + vfmacc.vf v11, ft3, v3 + vfmacc.vf v23, ft3, v4 + flw fa3, 28(a2) + addi a2, a2, 32 + + vfmacc.vf v8, fa0, v5 + vfmacc.vf v20, fa0, v6 + flw ft0, 0(a2) + vfmacc.vf v9, fa1, v5 + vfmacc.vf v21, fa1, v6 + flw ft1, 4(a2) + vfmacc.vf v10, fa2, v5 + vfmacc.vf v22, fa2, v6 + flw ft2, 8(a2) + vfmacc.vf v11, fa3, v5 + vfmacc.vf v23, fa3, v6 + flw ft3, 12(a2) + andi t2, a5, 1 // k1 beqz t2, pack2nx4_relu + // pre-load kernel_data + vle32.v v3, (t3) + add t3, t3, t0 // +packn + vle32.v v4, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flw ft0, 0(a2) + flw ft1, 4(a2) + flw ft2, 8(a2) + flw ft3, 12(a2) + +pack2nx4_k1: vfmacc.vf v8, ft0, v3 vfmacc.vf v20, ft0, v4 vfmacc.vf v9, ft1, v3 @@ -660,6 +862,8 @@ pack2nx2_start: srai t2, a5, 1 // k2 beqz t2, pack2nx2_k1 + addi t2, t2, -1 // k2_end + beqz t2, pack2nx2_k2_end pack2nx2_k2: vle32.v v5, (t3) @@ -690,10 +894,38 @@ pack2nx2_k2: addi t2, t2, -1 bnez t2, pack2nx2_k2 -pack2nx2_k1: +pack2nx2_k2_end: + vle32.v v5, (t3) + add t3, t3, t0 // +packn + vle32.v v6, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf 
v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + flw fa0, 8(a2) + vfmacc.vf v9, ft1, v3 + vfmacc.vf v21, ft1, v4 + flw fa1, 12(a2) + addi a2, a2, 16 + + vfmacc.vf v8, fa0, v5 + vfmacc.vf v20, fa0, v6 + vfmacc.vf v9, fa1, v5 + vfmacc.vf v21, fa1, v6 + andi t2, a5, 1 // k1 beqz t2, pack2nx2_relu + // pre-load kernel_data + vle32.v v3, (t3) + add t3, t3, t0 // +packn + vle32.v v4, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flw ft0, 0(a2) + flw ft1, 4(a2) + +pack2nx2_k1: vfmacc.vf v8, ft0, v3 vfmacc.vf v20, ft0, v4 vfmacc.vf v9, ft1, v3 @@ -736,6 +968,8 @@ pack2nx1_start: srai t2, a5, 1 // k2 beqz t2, pack2nx1_k1 + addi t2, t2, -1 // k2_end + beqz t2, pack2nx1_k2_end pack2nx1_k2: vle32.v v5, (t3) @@ -760,10 +994,32 @@ pack2nx1_k2: addi t2, t2, -1 bnez t2, pack2nx1_k2 -pack2nx1_k1: +pack2nx1_k2_end: + vle32.v v5, (t3) + add t3, t3, t0 // +packn + vle32.v v6, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + flw fa0, 4(a2) + addi a2, a2, 8 + + vfmacc.vf v8, fa0, v5 + vfmacc.vf v20, fa0, v6 + andi t2, a5, 1 // k1 beqz t2, pack2nx1_relu + // pre-load kernel_data + vle32.v v3, (t3) + add t3, t3, t0 // +packn + vle32.v v4, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flw ft0, 0(a2) + +pack2nx1_k1: vfmacc.vf v8, ft0, v3 vfmacc.vf v20, ft0, v4 addi a2, a2, 4 @@ -871,6 +1127,8 @@ packnx12_start: srai t2, a5, 1 // k2 beqz t2, packnx12_k1 + addi t2, t2, -1 // k2_end + beqz t2, packnx12_k2_end packnx12_k2: vle32.v v5, (t3) @@ -933,10 +1191,70 @@ packnx12_k2: addi t2, t2, -1 bnez t2, packnx12_k2 -packnx12_k1: +packnx12_k2_end: + vle32.v v5, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + flw fa0, 24(a2) + vfmacc.vf v9, ft1, v3 + flw fa1, 28(a2) + vfmacc.vf v10, ft2, v3 + flw fa2, 32(a2) + vfmacc.vf v11, ft3, v3 + flw fa3, 36(a2) + vfmacc.vf v12, ft4, v3 + flw fa4, 40(a2) + vfmacc.vf v13, ft5, v3 + flw fa5, 44(a2) + vfmacc.vf v14, fa0, v3 + flw ft0, 48(a2) + vfmacc.vf v15, fa1, v3 + flw ft1, 52(a2) + vfmacc.vf v16, 
fa2, v3 + flw ft2, 56(a2) + vfmacc.vf v17, fa3, v3 + flw ft3, 60(a2) + vfmacc.vf v18, fa4, v3 + flw ft4, 64(a2) + vfmacc.vf v19, fa5, v3 + flw ft5, 68(a2) + + vfmacc.vf v8, ft0, v5 + flw fa0, 72(a2) + vfmacc.vf v9, ft1, v5 + flw fa1, 76(a2) + vfmacc.vf v10, ft2, v5 + flw fa2, 80(a2) + vfmacc.vf v11, ft3, v5 + flw fa3, 84(a2) + vfmacc.vf v12, ft4, v5 + flw fa4, 88(a2) + vfmacc.vf v13, ft5, v5 + flw fa5, 92(a2) + addi a2, a2, 96 + vfmacc.vf v14, fa0, v5 + vfmacc.vf v15, fa1, v5 + vfmacc.vf v16, fa2, v5 + vfmacc.vf v17, fa3, v5 + vfmacc.vf v18, fa4, v5 + vfmacc.vf v19, fa5, v5 + andi t2, a5, 1 // k1 beqz t2, packnx12_relu + // pre-load kernel_data + vle32.v v3, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flw ft0, 0(a2) + flw ft1, 4(a2) + flw ft2, 8(a2) + flw ft3, 12(a2) + flw ft4, 16(a2) + flw ft5, 20(a2) + +packnx12_k1: vfmacc.vf v8, ft0, v3 flw fa0, 24(a2) vfmacc.vf v9, ft1, v3 @@ -1027,6 +1345,8 @@ packnx8_start: srai t2, a5, 1 // k2 beqz t2, packnx8_k1 + addi t2, t2, -1 // k2_end + beqz t2, packnx8_k2_end packnx8_k2: vle32.v v5, (t3) @@ -1073,10 +1393,58 @@ packnx8_k2: addi t2, t2, -1 bnez t2, packnx8_k2 -packnx8_k1: +packnx8_k2_end: + vle32.v v5, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + flw fa0, 16(a2) + vfmacc.vf v9, ft1, v3 + flw fa1, 20(a2) + vfmacc.vf v10, ft2, v3 + flw fa2, 24(a2) + vfmacc.vf v11, ft3, v3 + flw fa3, 28(a2) + vfmacc.vf v12, fa0, v3 + flw ft0, 32(a2) + vfmacc.vf v13, fa1, v3 + flw ft1, 36(a2) + vfmacc.vf v14, fa2, v3 + flw ft2, 40(a2) + vfmacc.vf v15, fa3, v3 + flw ft3, 44(a2) + + vfmacc.vf v8, ft0, v5 + flw fa0, 48(a2) + vfmacc.vf v9, ft1, v5 + flw fa1, 52(a2) + vfmacc.vf v10, ft2, v5 + flw fa2, 56(a2) + vfmacc.vf v11, ft3, v5 + flw fa3, 60(a2) + addi a2, a2, 64 + vfmacc.vf v12, fa0, v5 + flw ft0, 0(a2) + vfmacc.vf v13, fa1, v5 + flw ft1, 4(a2) + vfmacc.vf v14, fa2, v5 + flw ft2, 8(a2) + vfmacc.vf v15, fa3, v5 + flw ft3, 12(a2) + andi t2, a5, 1 // k2 beqz t2, packnx8_relu + // pre-load kernel_data + vle32.v 
v3, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flw ft0, 0(a2) + flw ft1, 4(a2) + flw ft2, 8(a2) + flw ft3, 12(a2) + +packnx8_k1: vfmacc.vf v8, ft0, v3 flw fa0, 16(a2) vfmacc.vf v9, ft1, v3 @@ -1142,6 +1510,8 @@ packnx4_start: srai t2, a5, 1 // k2 beqz t2, packnx4_k1 + addi t2, t2, -1 // k2_end + beqz t2, packnx4_k2_end packnx4_k2: vle32.v v5, (t3) @@ -1172,10 +1542,42 @@ packnx4_k2: addi t2, t2, -1 bnez t2, packnx4_k2 -packnx4_k1: +packnx4_k2_end: + vle32.v v5, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + flw fa0, 16(a2) + vfmacc.vf v9, ft1, v3 + flw fa1, 20(a2) + vfmacc.vf v10, ft2, v3 + flw fa2, 24(a2) + vfmacc.vf v11, ft3, v3 + flw fa3, 28(a2) + addi a2, a2, 32 + + vfmacc.vf v8, fa0, v5 + flw ft0, 0(a2) + vfmacc.vf v9, fa1, v5 + flw ft1, 4(a2) + vfmacc.vf v10, fa2, v5 + flw ft2, 8(a2) + vfmacc.vf v11, fa3, v5 + flw ft3, 12(a2) + andi t2, a5, 1 // k1 beqz t2, packnx4_relu + // pre-load kernel_data + vle32.v v3, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flw ft0, 0(a2) + flw ft1, 4(a2) + flw ft2, 8(a2) + flw ft3, 12(a2) + +packnx4_k1: vfmacc.vf v8, ft0, v3 vfmacc.vf v9, ft1, v3 vfmacc.vf v10, ft2, v3 @@ -1216,6 +1618,8 @@ packnx2_start: srai t2, a5, 1 // k2 beqz t2, packnx2_k1 + addi t2, t2, -1 // k2_end + beqz t2, packnx2_k2_end packnx2_k2: vle32.v v5, (t3) @@ -1238,10 +1642,32 @@ packnx2_k2: addi t2, t2, -1 bnez t2, packnx2_k2 -packnx2_k1: +packnx2_k2_end: + vle32.v v5, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + flw fa0, 8(a2) + vfmacc.vf v9, ft1, v3 + flw fa1, 12(a2) + addi a2, a2, 16 + + vfmacc.vf v8, fa0, v5 + flw ft0, 0(a2) + vfmacc.vf v9, fa1, v5 + flw ft1, 4(a2) + andi t2, a5, 1 // k1 beqz t2, packnx2_relu + // pre-load kernel_data + vle32.v v3, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flw ft0, 0(a2) + flw ft1, 4(a2) + +packnx2_k1: vfmacc.vf v8, ft0, v3 vfmacc.vf v9, ft1, v3 addi a2, a2, 8 @@ -1272,6 +1698,8 @@ packnx1_start: srai t2, a5, 1 // k2 beqz t2, packnx1_k1 + addi 
t2, t2, -1 // k2_end + beqz t2, packnx1_k2_end packnx1_k2: vle32.v v5, (t3) @@ -1288,10 +1716,26 @@ packnx1_k2: addi t2, t2, -1 bnez t2, packnx1_k2 -packnx1_k1: +packnx1_k2_end: + vle32.v v5, (t3) + add t3, t3, t0 // +packn + vfmacc.vf v8, ft0, v3 + flw fa0, 4(a2) + addi a2, a2, 8 + + add t3, t3, t0 // +packn + vfmacc.vf v8, fa0, v5 + andi t2, a5, 1 // k2 beqz t2, packnx1_relu + // pre-load kernel_data + vle32.v v3, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flw ft0, 0(a2) + +packnx1_k1: vfmacc.vf v8, ft0, v3 addi a2, a2, 4 diff --git a/source/c908_opt/fp32/gemm_fp32_packn.c b/source/c908_opt/fp32/gemm_fp32_packn.c index 6b27911f..61599ac1 100644 --- a/source/c908_opt/fp32/gemm_fp32_packn.c +++ b/source/c908_opt/fp32/gemm_fp32_packn.c @@ -16,15 +16,15 @@ * limitations under the License. */ -#include "shl_c908.h" +#include "c908/c908.h" void gemm_fp32_ncxhwx_12xpack2n(float *output, const float *kernel, const float *input, const float *bias, int m, int k, int n, bool fuse_relu); void gemm_fp32_ncxhwx_12xpackn(float *output, const float *kernel, const float *input, const float *bias, int m, int k, int n, bool fuse_relu); -void shl_c908_ncxhwx_gemm_12xpack2n_fp32(float *dst, const float *sa, const float *sb, - const float *bias, int m, int k, int n, bool fuse_relu) +void shl_c908_ncxhwx_gemm_12xpack2n_fp32(float *dst, const float *sa, const float *sb, float *bias, + int m, int k, int n, bool fuse_relu) { const int packn = csrr_vlenb() / sizeof(float); const int pack2n = packn * 2; diff --git a/source/c908_opt/fp32/gemm_fp32_v256.c b/source/c908_opt/fp32/gemm_fp32_v256.c index 0c383466..a8bfa4d7 100644 --- a/source/c908_opt/fp32/gemm_fp32_v256.c +++ b/source/c908_opt/fp32/gemm_fp32_v256.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_c908.h" +#include "c908/c908.h" /************************************************************* * note: VLEN = 256 diff --git a/source/c908_opt/fp32/maxpool.c b/source/c908_opt/fp32/maxpool.c index 9f9add1a..cba58f56 100644 --- a/source/c908_opt/fp32/maxpool.c +++ b/source/c908_opt/fp32/maxpool.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c908.h" +#include "c908/c908.h" int shl_c908_maxpool2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_pool_params *params) @@ -47,6 +47,8 @@ int shl_c908_maxpool2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor if (shl_is_first_layer_input(input, sess)) { elempack = 1; } + } else if (sess->base_run_mode == CSINN_RM_LAYER) { + elempack = in_c % packn == 0 ? packn : 1; } // global maxpool2d // TODO: remove diff --git a/source/c908_opt/int4/convolution.c b/source/c908_opt/int4/convolution.c index e4384695..bf84d942 100644 --- a/source/c908_opt/int4/convolution.c +++ b/source/c908_opt/int4/convolution.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_c908.h" +#include "c908/c908.h" #ifdef SHL_USE_DOT_INT4 int shl_c908_conv2d_init_int4(struct csinn_tensor *input, struct csinn_tensor *output, @@ -31,8 +31,8 @@ int shl_c908_conv2d_init_int4(struct csinn_tensor *input, struct csinn_tensor *o int32_t kernel_w = kernel->dim[3]; int32_t stride_h = params->stride_height; int32_t stride_w = params->stride_width; - int32_t dalition_h = params->dilation_height; - int32_t dalition_w = params->dilation_width; + int32_t dilation_h = params->dilation_height; + int32_t dilation_w = params->dilation_width; struct csinn_callback *cb = params->base.cb; // xxx: only int4 support nhwc layout now @@ -43,8 +43,8 @@ int shl_c908_conv2d_init_int4(struct csinn_tensor *input, struct csinn_tensor *o in_w = input->dim[2]; kernel_h = kernel->dim[1]; kernel_w = kernel->dim[2]; - if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && - dalition_w == 1) { + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && + dilation_w == 1) { params->conv_extra.conv_mode = CSINN_GEMM; if (input->dtype == CSINN_DTYPE_INT4) { params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); diff --git a/source/c908_opt/int4/depthwise_convolution.c b/source/c908_opt/int4/depthwise_convolution.c index 0fbd14b9..ce8a0ae2 100644 --- a/source/c908_opt/int4/depthwise_convolution.c +++ b/source/c908_opt/int4/depthwise_convolution.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c908.h" +#include "c908/c908.h" #ifdef SHL_USE_DOT_INT4 int shl_c908_depthwise_conv2d_init_int4(struct csinn_tensor *input, struct csinn_tensor *output, diff --git a/source/c908_opt/int4/fullyconnected.c b/source/c908_opt/int4/fullyconnected.c index 0dc9ba52..3b5e7f2f 100644 --- a/source/c908_opt/int4/fullyconnected.c +++ b/source/c908_opt/int4/fullyconnected.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_c908.h" +#include "c908/c908.h" int shl_c908_fullyconnected_init_int4(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *weights, struct csinn_tensor *bias, diff --git a/source/c908_opt/int4/gemm_int4_dot_ncxhwx.S b/source/c908_opt/int4/gemm_int4_dot_ncxhwx.S index 88156f47..1ec58a70 100644 --- a/source/c908_opt/int4/gemm_int4_dot_ncxhwx.S +++ b/source/c908_opt/int4/gemm_int4_dot_ncxhwx.S @@ -134,6 +134,8 @@ packnx12_start: lwd t5, t6, 16(a2) srai s8, a4, 3 // k8(k2) + addi s8, s8, -1 // k8(k2)_end + beqz s8, packnx12_k2_end packnx12_k2: vle32.v v6, (s9) @@ -186,6 +188,48 @@ packnx12_k2: addi s8, s8, -1 bnez s8, packnx12_k2 +packnx12_k2_end: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + vmaqa.vx v10, t2, v4 + vmaqa.vx v12, t3, v4 + lwd s1, s2, 24(a2) + addi a2, a2, 32 + vmaqa.vx v14, t4, v4 + vmaqa.vx v16, t5, v4 + lwd s3, s4, 0(a2) + lwd s5, s6, 8(a2) + vmaqa.vx v18, t6, v4 + vmaqa.vx v20, s1, v4 + vmaqa.vx v22, s2, v4 + lwd t1, t2, 16(a2) + lwd t3, t4, 24(a2) + addi a2, a2, 32 + vmaqa.vx v24, s3, v4 + vmaqa.vx v26, s4, v4 + lwd t5, t6, 0(a2) + vmaqa.vx v28, s5, v4 + vmaqa.vx v30, s6, v4 + + vmaqa.vx v8, t1, v6 + vmaqa.vx v10, t2, v6 + lwd s1, s2, 8(a2) + lwd s3, s4, 16(a2) + vmaqa.vx v12, t3, v6 + vmaqa.vx v14, t4, v6 + lwd s5, s6, 24(a2) + addi a2, a2, 32 + vmaqa.vx v16, t5, v6 + vmaqa.vx v18, t6, v6 + vmaqa.vx v20, s1, v6 + vmaqa.vx v22, s2, v6 + vmaqa.vx v24, s3, v6 + vmaqa.vx v26, s4, v6 + vmaqa.vx v28, s5, v6 + vmaqa.vx v30, s6, v6 + packnx12_post: srai s7, t0, 2 vsetvli zero, s7, e32, m2 // set vl = 8 @@ -259,6 +303,8 @@ packnx8_start: lwd t3, t4, 8(a2) srai s8, a4, 3 // k2 + addi s8, s8, -1 // k8(k2)_end + beqz s8, packnx8_k2_end packnx8_k2: vle32.v v6, (s9) @@ -298,6 +344,37 @@ packnx8_k2: addi s8, s8, -1 bnez s8, packnx8_k2 +packnx8_k2_end: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + vmaqa.vx v10, t2, v4 + lwd s1, s2, 16(a2) + lwd s3, s4, 24(a2) + vmaqa.vx 
v12, t3, v4 + vmaqa.vx v14, t4, v4 + vmaqa.vx v16, s1, v4 + addi a2, a2, 32 + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + vmaqa.vx v18, s2, v4 + vmaqa.vx v20, s3, v4 + vmaqa.vx v22, s4, v4 + + vmaqa.vx v8, t1, v6 + vmaqa.vx v10, t2, v6 + lwd s1, s2, 16(a2) + lwd s3, s4, 24(a2) + vmaqa.vx v12, t3, v6 + vmaqa.vx v14, t4, v6 + vmaqa.vx v16, s1, v6 + addi a2, a2, 32 + vmaqa.vx v18, s2, v6 + vmaqa.vx v20, s3, v6 + vmaqa.vx v22, s4, v6 + + packnx8_post: srai s7, t0, 2 vsetvli zero, s7, e32, m2 // set vl = 8 @@ -351,6 +428,8 @@ packnx4_start: lwd t3, t4, 8(a2) srai s8, a4, 3 // k2 + addi s8, s8, -1 // k8(k2)_end + beqz s8, packnx4_k2_end packnx4_k2: vle32.v v6, (s9) @@ -377,6 +456,23 @@ packnx4_k2: addi s8, s8, -1 bnez s8, packnx4_k2 +packnx4_k2_end: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + vmaqa.vx v10, t2, v4 + lwd s1, s2, 16(a2) + lwd s3, s4, 24(a2) + vmaqa.vx v12, t3, v4 + vmaqa.vx v14, t4, v4 + addi a2, a2, 32 + + vmaqa.vx v8, s1, v6 + vmaqa.vx v10, s2, v6 + vmaqa.vx v12, s3, v6 + vmaqa.vx v14, s4, v6 + packnx4_post: srai s7, t0, 2 vsetvli zero, s7, e32, m2 // set vl = 8 @@ -416,6 +512,8 @@ packnx2_start: lwd t1, t2, 0(a2) srai s8, a4, 3 // k2 + addi s8, s8, -1 // k8(k2)_end + beqz s8, packnx2_k2_end packnx2_k2: vle32.v v6, (s9) @@ -435,6 +533,19 @@ packnx2_k2: addi s8, s8, -1 bnez s8, packnx2_k2 + +packnx2_k2_end: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + lwd s1, s2, 8(a2) + vmaqa.vx v10, t2, v4 + addi a2, a2, 16 + + vmaqa.vx v8, s1, v6 + vmaqa.vx v10, s2, v6 + packnx2_post: srai s7, t0, 2 vsetvli zero, s7, e32, m2 // set vl = 8 @@ -467,6 +578,8 @@ packnx1_start: lw t1, 0(a2) srai s8, a4, 3 // k2 + addi s8, s8, -1 // k8(k2)_end + beqz s8, packnx1_k2_end packnx1_k2: vle32.v v6, (s9) @@ -485,6 +598,16 @@ packnx1_k2: addi s8, s8, -1 bnez s8, packnx1_k2 +packnx1_k2_end: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + lw s1, 4(a2) + addi a2, a2, 8 + + vmaqa.vx v8, s1, v6 + packnx1_post: srai 
s7, t0, 2 vsetvli zero, s7, e32, m2 // set vl = 8 @@ -610,6 +733,8 @@ packnx8_start_1: lwd t3, t4, 8(a2) srai s8, a4, 3 // k2 + addi s8, s8, -1 // k2_end + beqz s8, packnx8_k2_end_1 packnx8_k2_1: vle32.v v6, (s9) @@ -649,6 +774,36 @@ packnx8_k2_1: addi s8, s8, -1 bnez s8, packnx8_k2_1 +packnx8_k2_end_1: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + vmaqa.vx v10, t2, v4 + lwd s1, s2, 16(a2) + lwd s3, s4, 24(a2) + vmaqa.vx v12, t3, v4 + vmaqa.vx v14, t4, v4 + vmaqa.vx v16, s1, v4 + addi a2, a2, 32 + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + vmaqa.vx v18, s2, v4 + vmaqa.vx v20, s3, v4 + vmaqa.vx v22, s4, v4 + + vmaqa.vx v8, t1, v6 + vmaqa.vx v10, t2, v6 + lwd s1, s2, 16(a2) + lwd s3, s4, 24(a2) + vmaqa.vx v12, t3, v6 + vmaqa.vx v14, t4, v6 + vmaqa.vx v16, s1, v6 + addi a2, a2, 32 + vmaqa.vx v18, s2, v6 + vmaqa.vx v20, s3, v6 + vmaqa.vx v22, s4, v6 + packnx8_post_1: vsetvli zero, s7, e32, m2 // set vl = 8 vle32.v v4, (a7) // mult @@ -706,6 +861,8 @@ packnx4_start_1: lwd t3, t4, 8(a2) srai s8, a4, 3 // k2 + addi s8, s8, -1 // k2_end + beqz s8, packnx4_k2_end_1 packnx4_k2_1: vle32.v v6, (s9) @@ -732,6 +889,23 @@ packnx4_k2_1: addi s8, s8, -1 bnez s8, packnx4_k2_1 +packnx4_k2_end_1: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + vmaqa.vx v10, t2, v4 + lwd s1, s2, 16(a2) + lwd s3, s4, 24(a2) + vmaqa.vx v12, t3, v4 + vmaqa.vx v14, t4, v4 + addi a2, a2, 32 + + vmaqa.vx v8, s1, v6 + vmaqa.vx v10, s2, v6 + vmaqa.vx v12, s3, v6 + vmaqa.vx v14, s4, v6 + packnx4_post_1: vsetvli zero, s7, e32, m2 // set vl = 8 vle32.v v4, (a7) // mult @@ -770,6 +944,8 @@ packnx2_start_1: lwd t1, t2, 0(a2) srai s8, a4, 3 // k2 + addi s8, s8, -1 // k2_end + beqz s8, packnx2_k2_end_1 packnx2_k2_1: vle32.v v6, (s9) @@ -790,6 +966,18 @@ packnx2_k2_1: addi s8, s8, -1 bnez s8, packnx2_k2_1 +packnx2_k2_end_1: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + lwd s1, s2, 8(a2) + vmaqa.vx v10, t2, v4 + addi a2, a2, 16 + + vmaqa.vx v8, 
s1, v6 + vmaqa.vx v10, s2, v6 + packnx2_post_1: vsetvli zero, s7, e32, m2 // set vl = 8 vle32.v v4, (a7) // mult @@ -821,6 +1009,8 @@ packnx1_start_1: lw t1, 0(a2) srai s8, a4, 3 // k2 + addi s8, s8, -1 // k2_end + beqz s8, packnx1_k2_end_1 packnx1_k2_1: vle32.v v6, (s9) @@ -839,6 +1029,16 @@ packnx1_k2_1: addi s8, s8, -1 bnez s8, packnx1_k2_1 +packnx1_k2_end_1: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + lw s1, 4(a2) + addi a2, a2, 8 + + vmaqa.vx v8, s1, v6 + packnx1_post_1: vsetvli zero, s7, e32, m2 // set vl = 8 vle32.v v4, (a7) // mult diff --git a/source/c908_opt/int8/avgpool.c b/source/c908_opt/int8/avgpool.c index 47adf80c..4c07f755 100644 --- a/source/c908_opt/int8/avgpool.c +++ b/source/c908_opt/int8/avgpool.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c908.h" +#include "c908/c908.h" int shl_c908_avgpool2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_pool_params *params) @@ -47,6 +47,8 @@ int shl_c908_avgpool2d_init_int8(struct csinn_tensor *input, struct csinn_tensor if (shl_is_first_layer_input(input, sess)) { elempack = 1; } + } else if (sess->base_run_mode == CSINN_RM_LAYER) { + elempack = in_c % packn == 0 ? packn : 1; } // global avgpool2d diff --git a/source/c908_opt/int8/convolution.c b/source/c908_opt/int8/convolution.c index 8b7dfd60..004df969 100644 --- a/source/c908_opt/int8/convolution.c +++ b/source/c908_opt/int8/convolution.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_c908.h" +#include "c908/c908.h" int shl_c908_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, struct csinn_tensor *bias, @@ -30,8 +30,8 @@ int shl_c908_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *o int32_t kernel_w = kernel->dim[3]; int32_t stride_h = params->stride_height; int32_t stride_w = params->stride_width; - int32_t dalition_h = params->dilation_height; - int32_t dalition_w = params->dilation_width; + int32_t dilation_h = params->dilation_height; + int32_t dilation_w = params->dilation_width; struct csinn_callback *cb = params->base.cb; const int packn = csrr_vlenb() / sizeof(int8_t) / 2; @@ -48,18 +48,21 @@ int shl_c908_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *o if (shl_is_first_layer_input(input, sess)) { in_elempack = 1; } + } else if (sess->base_run_mode == CSINN_RM_LAYER) { + in_elempack = in_c % packn == 0 ? packn : 1; + out_elempack = out_c % packn == 0 ? 
packn : 1; } // packn if (in_elempack % packn == 0 && out_elempack % packn == 0) { - if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && - dalition_w == 1) { + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && + dilation_w == 1) { params->conv_extra.conv_mode = CSINN_GEMM; params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); shl_c908_conv1x1s1_gemm_reorder_kernel_packn_int8(kernel, params); cb->exec = shl_c908_conv1x1s1_gemm_packn_int8; // } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && - // dalition_h == 1 && dalition_w == 1) { + // dilation_h == 1 && dilation_w == 1) { // if (params->group > 1) { // params->conv_extra.conv_mode = CSINN_GEMM; // params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); @@ -84,8 +87,8 @@ int shl_c908_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *o if (in_elempack % packn != 0 && out_elempack % packn == 0) { params->conv_extra.conv_mode = CSINN_GEMM; params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); - if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && - dalition_w == 1) { + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && + dilation_w == 1) { shl_c908_conv1x1s1_gemm_reorder_kernel_pack1ton_int8(kernel, params); cb->exec = shl_c908_conv1x1s1_gemm_pack1ton_int8; } else { @@ -98,8 +101,8 @@ int shl_c908_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *o if (in_elempack % packn == 0 && out_elempack % packn != 0) { params->conv_extra.conv_mode = CSINN_GEMM; params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); - if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && - dalition_w == 1) { + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && + dilation_w == 1) { shl_c908_conv1x1s1_gemm_reorder_kernel_packnto1_int8(kernel, 
params); cb->exec = shl_c908_conv1x1s1_gemm_packnto1_int8; } else { @@ -112,8 +115,8 @@ int shl_c908_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *o if (in_elempack % packn != 0 && out_elempack % packn != 0) { params->conv_extra.conv_mode = CSINN_GEMM; params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); - if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && - dalition_w == 1) { + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && + dilation_w == 1) { shl_c908_conv1x1s1_gemm_reorder_kernel_int8(kernel, params); cb->exec = shl_c908_conv1x1s1_gemm_int8; } else { diff --git a/source/c908_opt/int8/convolution_1x1_int8.c b/source/c908_opt/int8/convolution_1x1_int8.c index 8a528afd..cb162166 100644 --- a/source/c908_opt/int8/convolution_1x1_int8.c +++ b/source/c908_opt/int8/convolution_1x1_int8.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c908.h" +#include "c908/c908.h" void shl_c908_conv1x1s1_gemm_reorder_kernel_int8(struct csinn_tensor *kernel, struct csinn_conv2d_params *params) @@ -41,72 +41,16 @@ int shl_c908_conv1x1s1_gemm_int8(struct csinn_tensor *input, struct csinn_tensor struct csinn_tensor *kernel, struct csinn_tensor *bias, struct csinn_conv2d_params *params) { - if (input->layout == CSINN_LAYOUT_NC1HWC0) { - shl_rvv_tensor_nc1xc0_to_ndarray_replace_int8(input); - } - int8_t *input_data = (int8_t *)input->data; - int8_t *output_data = (int8_t *)output->data; - int8_t *kernel_data = (int8_t *)params->conv_extra.kernel_tm->data; - // int8_t *kernel_data = (int8_t *)kernel->data; - int32_t *bias_data = (int32_t *)bias->data; - - int32_t group = params->group; - int32_t batch = input->dim[0]; // assert(batch == 1); - int32_t in_ch = input->dim[1]; - int32_t out_ch = kernel->dim[0]; - int32_t out_h = output->dim[2]; - int32_t out_w = output->dim[3]; - - int32_t m = out_ch / group; - int32_t k = in_ch / group; - int32_t n = out_h * out_w; 
- int32_t k4 = (k % 4 != 0) ? ((k / 4 + 1) * 4) : k; - - int8_t *pb_reorder = (int8_t *)shl_mem_alloc(k4 * n * sizeof(int8_t)); - int32_t *multiplier = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); - int32_t *shift = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); - +#ifdef SHL_USE_DOT_INT8 const int vlen = csrr_vlenb() * 8; - - int j = 0; - for (int i = 0; i < batch; i++) { - for (int g = 0; g < group; g++) { - int8_t *pa = kernel_data + g * m * k4; - int8_t *pb = pb_reorder; - int8_t *pc = output_data; - - if (kernel->quant_channel > 1) { - for (int c = 0; c < m; c++, j++) { - multiplier[c] = kernel->qinfo[j].multiplier; - shift[c] = kernel->qinfo[j].shift; - } - } else if (kernel->quant_channel == 1) { - for (int c = 0; c < m; c++) { - multiplier[c] = kernel->qinfo[0].multiplier; - shift[c] = kernel->qinfo[0].shift; - } - } - - if (vlen == 128) { - // pack - shl_c908_reorder_input_z8_int8_dot(input_data, pb, k, n, n); - // GEMM - shl_c908_gemm_8x8_int8_dot(pc, pa, pb, bias_data + g * m, m, k4, n, n, - output->qinfo->zero_point, multiplier, shift); - } else if (vlen >= 256) { - // pack - shl_c908_reorder_input_z16_int8_v256_dot(input_data, pb, k, n, n); - // GEMM - shl_c908_gemm_8x16_int8_v256_dot(pc, pa, pb, bias_data + g * m, m, k4, n, n, - output->qinfo->zero_point, multiplier, shift); - } - - input_data += k * n; - output_data += m * n; - } + if (vlen == 128) { + return shl_rvv_common_conv1x1_gemm_int8(input, output, kernel, bias, params, + shl_c908_reorder_input_z8_int8_dot, + shl_c908_gemm_8x8_int8_dot); + } else if (vlen >= 256) { + return shl_rvv_common_conv1x1_gemm_int8(input, output, kernel, bias, params, + shl_c908_reorder_input_z16_int8_v256_dot, + shl_c908_gemm_8x16_int8_v256_dot); } - shl_mem_free(pb_reorder); - shl_mem_free(multiplier); - shl_mem_free(shift); - return CSINN_TRUE; +#endif // SHL_USE_DOT_INT8 } diff --git a/source/c908_opt/int8/convolution_1x1_int8_pack1ton.c b/source/c908_opt/int8/convolution_1x1_int8_pack1ton.c index 
418ec51f..1d347974 100644 --- a/source/c908_opt/int8/convolution_1x1_int8_pack1ton.c +++ b/source/c908_opt/int8/convolution_1x1_int8_pack1ton.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c908.h" +#include "c908/c908.h" /************************************************************************************* * reorder kernel_data inplace, means the origin kernel_data be destoried. @@ -28,111 +28,17 @@ void shl_c908_conv1x1s1_gemm_reorder_kernel_pack1ton_int8(struct csinn_tensor *k shl_c908_conv_im2col_gemm_reorder_kernel_pack1ton_int8(kernel, params); } -static void reorder_input_pack1ton_align4_int8(const int8_t *src, int8_t *dst, int inc, int inh, - int inw) -{ - const int packn = csrr_vlenb() / sizeof(int8_t) / 2; - int vl = vsetvl_e8mf2(packn); - const int in_size = inh * inw; // per-channel size - - while (inc > 0) { - vl = vsetvl_e8mf2(inc); - int vl4 = ((vl - 1) & -4) + 4; - int8_t *in_ptr = (int8_t *)src; - for (int i = 0; i < inh; i++) { - for (int j = 0; j < inw; j++) { - vint8mf2_t _tmp = vlse8_v_i8mf2(in_ptr, in_size * sizeof(int8_t), vl); - in_ptr++; - vse8_v_i8mf2(dst, _tmp, vl); - dst += vl4; - } - } - src += in_size * vl; - inc -= vl; - } -} - int shl_c908_conv1x1s1_gemm_pack1ton_int8(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, struct csinn_tensor *bias, struct csinn_conv2d_params *params) { - if (input->layout == CSINN_LAYOUT_NC1HWC0) { - shl_rvv_tensor_nc1xc0_to_ndarray_replace_int8(input); - } - if (output->layout == CSINN_LAYOUT_NCHW) { - const int packn = csrr_vlenb() / sizeof(int8_t) / 2; - output->dim[1] /= packn; - output->dim[4] = packn; - output->dim_count = 5; - output->layout = CSINN_LAYOUT_NC1HWC0; - } - int8_t *input_data = (int8_t *)input->data; - int8_t *output_data = (int8_t *)output->data; - int8_t *kernel_data = (int8_t *)params->conv_extra.kernel_tm->data; - int32_t *bias_data = (int32_t *)bias->data; - - int32_t group = params->group; - int32_t batch = 
input->dim[0]; - int32_t in_c = input->dim[1]; - int32_t in_h = input->dim[2]; - int32_t in_w = input->dim[3]; - int32_t out_c = kernel->dim[0]; - int32_t out_h = output->dim[2]; - int32_t out_w = output->dim[3]; - - int32_t m = out_c / group; - int32_t k = in_c / group; - int32_t n = out_h * out_w; - -#ifdef SHL_USE_DOT_INT8 - int32_t k4 = ((k - 1) & -4) + 4; - int8_t *pb_reorder = (int8_t *)shl_mem_alloc(k4 * n * sizeof(int8_t)); - int8_t *input_ncxhwx = (int8_t *)shl_mem_alloc(k4 * n * sizeof(int8_t)); -#else - int8_t *pb_reorder = (int8_t *)shl_mem_alloc(k * n * sizeof(int8_t)); - int8_t *input_ncxhwx = (int8_t *)shl_mem_alloc(k * n * sizeof(int8_t)); -#endif // SHL_USE_DOT_INT8 - - int32_t *multiplier = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); - int32_t *shift = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); - - for (int i = 0; i < batch; i++) { - for (int g = 0, j = 0; g < group; g++) { - if (kernel->quant_channel > 1) { - for (int c = 0; c < m; c++, j++) { - multiplier[c] = kernel->qinfo[j].multiplier; - shift[c] = kernel->qinfo[j].shift; - } - } else if (kernel->quant_channel == 1) { - for (int c = 0; c < m; c++) { - multiplier[c] = kernel->qinfo[0].multiplier; - shift[c] = kernel->qinfo[0].shift; - } - } - int8_t *in_ptr = pb_reorder; - int8_t *out_ptr = output_data; - int32_t *bias_ptr = bias_data ? 
(bias_data + g * m) : NULL; - #ifdef SHL_USE_DOT_INT8 - int8_t *kernel_ptr = kernel_data + g * m * k4; - reorder_input_pack1ton_align4_int8(input_data, input_ncxhwx, k, out_h, out_w); - shl_rvv_reorder_input_z12_pack1ton_int8_dot(input_ncxhwx, in_ptr, k4, 1, n, n); - shl_c908_ncxhwx_gemm_12xpackn_int8_dot(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k4, n, - output->qinfo->zero_point, multiplier, shift); + return shl_rvv_common_conv1x1_gemm_pack1ton_int8(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_pack1ton_int8_dot, + shl_c908_ncxhwx_gemm_12xpackn_int8_dot); #else - int8_t *kernel_ptr = kernel_data + g * m * k; - shl_rvv_reorder_input_pack1ton_int8(input_data, input_ncxhwx, k, out_h, out_w); - shl_rvv_reorder_input_z4_pack1ton_int8(input_ncxhwx, in_ptr, k, 1, n, n); - shl_c908_ncxhwx_gemm_4xpack2n_int8(output_data, kernel_ptr, in_ptr, bias_ptr, m, k, n, - output->qinfo->zero_point, multiplier, shift); + return shl_rvv_common_conv1x1_gemm_pack1ton_int8(input, output, kernel, bias, params, + shl_rvv_reorder_input_z4_pack1ton_int8, + shl_c908_ncxhwx_gemm_4xpack2n_int8); #endif // SHL_USE_DOT_INT8 - input_data += k * n; - output_data += m * n; - } - } - shl_mem_free(multiplier); - shl_mem_free(shift); - shl_mem_free(pb_reorder); - shl_mem_free(input_ncxhwx); - return CSINN_TRUE; } diff --git a/source/c908_opt/int8/convolution_1x1_int8_packn.c b/source/c908_opt/int8/convolution_1x1_int8_packn.c index bcfa3fe3..29bb5d34 100644 --- a/source/c908_opt/int8/convolution_1x1_int8_packn.c +++ b/source/c908_opt/int8/convolution_1x1_int8_packn.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_c908.h" +#include "c908/c908.h" void shl_c908_conv1x1s1_gemm_reorder_kernel_packn_int8(struct csinn_tensor *kernel, struct csinn_conv2d_params *params) @@ -28,69 +28,13 @@ int shl_c908_conv1x1s1_gemm_packn_int8(struct csinn_tensor *input, struct csinn_ struct csinn_tensor *kernel, struct csinn_tensor *bias, struct csinn_conv2d_params *params) { - if (input->layout == CSINN_LAYOUT_NCHW) { - shl_rvv_tensor_ndarray_to_nc1xc0_replace_int8(input); - } - if (output->layout == CSINN_LAYOUT_NCHW) { - const int packn = csrr_vlenb() / sizeof(int8_t) / 2; - output->dim[1] /= packn; - output->dim[4] = packn; - output->dim_count = 5; - output->layout = CSINN_LAYOUT_NC1HWC0; - } - int8_t *input_data = (int8_t *)input->data; - int8_t *output_data = (int8_t *)output->data; - int8_t *kernel_data = (int8_t *)params->conv_extra.kernel_tm->data; - int32_t *bias_data = (int32_t *)bias->data; - - int32_t group = params->group; - int32_t batch = input->dim[0]; - int32_t in_ch = input->dim[1] * input->dim[4]; - int32_t out_ch = kernel->dim[0]; - int32_t out_h = output->dim[2]; - int32_t out_w = output->dim[3]; - - int32_t m = out_ch / group; - int32_t k = in_ch / group; - int32_t n = out_h * out_w; - - int8_t *pb_reorder = (int8_t *)shl_mem_alloc(k * n * sizeof(int8_t)); - int32_t *multiplier = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); - int32_t *shift = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); - - for (int i = 0; i < batch; i++) { - for (int g = 0, j = 0; g < group; g++) { - int8_t *kernel_ptr = kernel_data + g * m * k; - int8_t *in_ptr = pb_reorder; - int8_t *out_ptr = output_data; - int32_t *bias_ptr = bias_data + g * m; // bias_data != NULL with fusing zp to bias - - if (kernel->quant_channel > 1) { - for (int c = 0; c < m; c++, j++) { - multiplier[c] = kernel->qinfo[j].multiplier; - shift[c] = kernel->qinfo[j].shift; - } - } else if (kernel->quant_channel == 1) { - for (int c = 0; c < m; c++) { - multiplier[c] = kernel->qinfo[0].multiplier; - shift[c] = 
kernel->qinfo[0].shift; - } - } #ifdef SHL_USE_DOT_INT8 - shl_rvv_reorder_input_z12_packn_int8_dot(input_data, pb_reorder, k, n, n); - shl_c908_ncxhwx_gemm_12xpackn_int8_dot(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k, n, - output->qinfo->zero_point, multiplier, shift); + return shl_rvv_common_conv1x1_gemm_packn_int8(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_packn_int8_dot, + shl_c908_ncxhwx_gemm_12xpackn_int8_dot); #else - shl_rvv_reorder_input_z4_packn_int8(input_data, pb_reorder, k, n, n); - shl_c908_ncxhwx_gemm_4xpack2n_int8(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k, n, - output->qinfo->zero_point, multiplier, shift); + return shl_rvv_common_conv1x1_gemm_packn_int8(input, output, kernel, bias, params, + shl_rvv_reorder_input_z4_packn_int8, + shl_c908_ncxhwx_gemm_4xpack2n_int8); #endif // SHL_USE_DOT_INT8 - input_data += k * n; - output_data += m * n; - } - } - shl_mem_free(pb_reorder); - shl_mem_free(multiplier); - shl_mem_free(shift); - return CSINN_TRUE; } diff --git a/source/c908_opt/int8/convolution_1x1_int8_packnto1.c b/source/c908_opt/int8/convolution_1x1_int8_packnto1.c index 24509357..727ed8fd 100644 --- a/source/c908_opt/int8/convolution_1x1_int8_packnto1.c +++ b/source/c908_opt/int8/convolution_1x1_int8_packnto1.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_c908.h" +#include "c908/c908.h" void shl_c908_conv1x1s1_gemm_reorder_kernel_packnto1_int8(struct csinn_tensor *kernel, struct csinn_conv2d_params *params) @@ -28,69 +28,13 @@ int shl_c908_conv1x1s1_gemm_packnto1_int8(struct csinn_tensor *input, struct csi struct csinn_tensor *kernel, struct csinn_tensor *bias, struct csinn_conv2d_params *params) { - if (input->layout == CSINN_LAYOUT_NCHW) { - shl_rvv_tensor_ndarray_to_nc1xc0_replace_int8(input); - } - int8_t *input_data = (int8_t *)input->data; - int8_t *output_data = (int8_t *)output->data; - int8_t *kernel_data = (int8_t *)params->conv_extra.kernel_tm->data; - int32_t *bias_data = (int32_t *)bias->data; - - int32_t group = params->group; - int32_t batch = input->dim[0]; - int32_t in_ch = input->dim[1] * input->dim[4]; - int32_t out_ch = kernel->dim[0]; - int32_t out_h = output->dim[2]; - int32_t out_w = output->dim[3]; - - int32_t m = out_ch / group; - int32_t k = in_ch / group; - int32_t n = out_h * out_w; - - int8_t *pb_reorder = (int8_t *)shl_mem_alloc(k * n * sizeof(int8_t)); - int32_t *multiplier = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); - int32_t *shift = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); - - int8_t *output_ncxhwx = (int8_t *)shl_mem_alloc(m * n * sizeof(int8_t)); - - for (int i = 0; i < batch; i++) { - for (int g = 0, j = 0; g < group; g++) { - int8_t *kernel_ptr = kernel_data + g * m * k; - int8_t *in_ptr = pb_reorder; - int8_t *out_ptr = output_data; - int32_t *bias_ptr = bias_data + g * m; // bias_data != NULL with fusing zp to bias - - if (kernel->quant_channel > 1) { - for (int c = 0; c < m; c++, j++) { - multiplier[c] = kernel->qinfo[j].multiplier; - shift[c] = kernel->qinfo[j].shift; - } - } else if (kernel->quant_channel == 1) { - for (int c = 0; c < m; c++) { - multiplier[c] = kernel->qinfo[0].multiplier; - shift[c] = kernel->qinfo[0].shift; - } - } - #ifdef SHL_USE_DOT_INT8 - shl_rvv_reorder_input_z12_packn_int8_dot(input_data, pb_reorder, k, n, n); - 
shl_c908_ncxhwx_gemm_12xpackn_int8_dot(output_ncxhwx, kernel_ptr, in_ptr, bias_ptr, m, - k, n, output->qinfo->zero_point, multiplier, - shift); + return shl_rvv_common_conv1x1_gemm_packnto1_int8(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_packn_int8_dot, + shl_c908_ncxhwx_gemm_12xpackn_int8_dot); #else - shl_rvv_reorder_input_z4_packn_int8(input_data, pb_reorder, k, n, n); - shl_c908_ncxhwx_gemm_4xpack2n_int8(output_ncxhwx, kernel_ptr, in_ptr, bias_ptr, m, k, n, - output->qinfo->zero_point, multiplier, shift); + return shl_rvv_common_conv1x1_gemm_packnto1_int8(input, output, kernel, bias, params, + shl_rvv_reorder_input_z4_packn_int8, + shl_c908_ncxhwx_gemm_4xpack2n_int8); #endif // SHL_USE_DOT_INT8 - shl_rvv_reorder_input_packnto1_int8(output_ncxhwx, output_data, m, out_h, out_w); - - input_data += k * n; - output_data += m * n; - } - } - shl_mem_free(pb_reorder); - shl_mem_free(multiplier); - shl_mem_free(shift); - shl_mem_free(output_ncxhwx); - return CSINN_TRUE; } diff --git a/source/c908_opt/int8/convolution_3x3_int8.c b/source/c908_opt/int8/convolution_3x3_int8.c index e6111f5f..a01cc807 100644 --- a/source/c908_opt/int8/convolution_3x3_int8.c +++ b/source/c908_opt/int8/convolution_3x3_int8.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c908.h" +#include "c908/c908.h" /************************************************************* note: VLEN = 128 diff --git a/source/c908_opt/int8/convolution_3x3_int8_packn.c b/source/c908_opt/int8/convolution_3x3_int8_packn.c index 1c3e4f52..bf57ee17 100644 --- a/source/c908_opt/int8/convolution_3x3_int8_packn.c +++ b/source/c908_opt/int8/convolution_3x3_int8_packn.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ #ifdef NNN -#include "shl_c908.h" +#include "c908/c908.h" /************************************************************* * note: support flexible vlen diff --git a/source/c908_opt/int8/convolution_3x3_int8_packn_1.c b/source/c908_opt/int8/convolution_3x3_int8_packn_1.c index 58ece966..2d5e2ecf 100644 --- a/source/c908_opt/int8/convolution_3x3_int8_packn_1.c +++ b/source/c908_opt/int8/convolution_3x3_int8_packn_1.c @@ -17,7 +17,7 @@ */ // #ifdef NNN -#include "shl_c908.h" +#include "c908/c908.h" /************************************************************* * note: support flexible vlen diff --git a/source/c908_opt/int8/convolution_gemm_int8.c b/source/c908_opt/int8/convolution_gemm_int8.c index 8b8fb8f2..b9bcf954 100644 --- a/source/c908_opt/int8/convolution_gemm_int8.c +++ b/source/c908_opt/int8/convolution_gemm_int8.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c908.h" +#include "c908/c908.h" void shl_c908_conv_im2col_gemm_reorder_kernel_int8(struct csinn_tensor *kernel, struct csinn_conv2d_params *params) @@ -44,115 +44,16 @@ int shl_c908_conv_im2col_gemm_int8(struct csinn_tensor *input, struct csinn_tens struct csinn_tensor *kernel, struct csinn_tensor *bias, struct csinn_conv2d_params *params) { - if (input->layout == CSINN_LAYOUT_NC1HWC0) { - shl_rvv_tensor_nc1xc0_to_ndarray_replace_int8(input); - } - int8_t *input_data = (int8_t *)input->data; - int8_t *output_data = (int8_t *)output->data; - int8_t *kernel_data = (int8_t *)params->conv_extra.kernel_tm->data; - // int8_t *kernel_data = (int8_t *)kernel->data; - int32_t *bias_data = (int32_t *)bias->data; - - int32_t group = params->group; - int32_t batch = input->dim[0]; - int32_t in_ch = input->dim[1]; - int32_t in_height = input->dim[2]; - int32_t in_width = input->dim[3]; - int32_t out_ch = kernel->dim[0]; - int32_t out_height = output->dim[2]; - int32_t out_width = output->dim[3]; - int32_t ksize_h = kernel->dim[2]; - int32_t ksize_w = kernel->dim[3]; - int32_t stride_h = 
params->stride_height; - int32_t stride_w = params->stride_width; - int32_t pad_left = params->pad_left; - int32_t pad_top = params->pad_top; - int32_t dilation_h = params->dilation_height; - int32_t dilation_w = params->dilation_width; - - int32_t m = out_ch / group; - int32_t k = in_ch / group * ksize_h * ksize_w; - int32_t n = out_height * out_width; - int32_t k4 = (k % 4 != 0) ? ((k / 4 + 1) * 4) : k; - - int8_t *im2col_data = (int8_t *)shl_mem_alloc(k * n * sizeof(int8_t)); - int8_t *pb_reorder = (int8_t *)shl_mem_alloc(k4 * n * sizeof(int8_t)); - - int32_t *multiplier = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); - int32_t *shift = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); - +#ifdef SHL_USE_DOT_INT8 const int vlen = csrr_vlenb() * 8; - - int j = 0; - for (int i = 0; i < batch; i++) { - for (int g = 0; g < group; g++) { - // im2col - int8_t *data_col = im2col_data; - int8_t *channel_data = input_data; - for (int c = 0; c < in_ch / group; c++) { - for (int kh = 0; kh < ksize_h; kh++) { - for (int kw = 0; kw < ksize_w; kw++) { - int in_row = -pad_top + kh * dilation_h; - for (int oh = 0; oh < out_height; oh++) { - if (in_row >= in_height || in_row < 0) { - for (int ow = 0; ow < out_width; ow++) { - *data_col++ = input->qinfo->zero_point; - } - } else { - int in_col = -pad_left + kw * dilation_w; - for (int ow1 = 0; ow1 < out_width; ow1++) { - int col_idx = (c * out_height + oh) * out_width + ow1; - if (in_col < in_width && in_col >= 0) { - *data_col++ = channel_data[in_row * in_width + in_col]; - } else { - *data_col++ = input->qinfo->zero_point; - } - in_col += stride_w; - } - } - in_row += stride_h; - } - } - } - channel_data += in_height * in_width; - } - - int8_t *pa = kernel_data + g * m * k4; - int8_t *pb = pb_reorder; - int8_t *pc = output_data; - - if (kernel->quant_channel > 1) { - for (int c = 0; c < m; c++, j++) { - multiplier[c] = kernel->qinfo[j].multiplier; - shift[c] = kernel->qinfo[j].shift; - } - } else if (kernel->quant_channel == 1) { 
- for (int c = 0; c < m; c++) { - multiplier[c] = kernel->qinfo[0].multiplier; - shift[c] = kernel->qinfo[0].shift; - } - } - - if (vlen == 128) { - // pack - shl_c908_reorder_input_z8_int8_dot(im2col_data, pb, k, n, n); - // GEMM - shl_c908_gemm_8x8_int8_dot(pc, pa, pb, bias_data + g * m, m, k4, n, n, - output->qinfo->zero_point, multiplier, shift); - } else if (vlen >= 256) { - // pack - shl_c908_reorder_input_z16_int8_v256_dot(im2col_data, pb, k, n, n); - // GEMM - shl_c908_gemm_8x16_int8_v256_dot(pc, pa, pb, bias_data + g * m, m, k4, n, n, - output->qinfo->zero_point, multiplier, shift); - } - input_data += in_ch / group * in_height * in_width; - output_data += m * n; - } + if (vlen == 128) { + return shl_rvv_common_conv_gemm_int8(input, output, kernel, bias, params, + shl_c908_reorder_input_z8_int8_dot, + shl_c908_gemm_8x8_int8_dot); + } else if (vlen >= 256) { + return shl_rvv_common_conv_gemm_int8(input, output, kernel, bias, params, + shl_c908_reorder_input_z16_int8_v256_dot, + shl_c908_gemm_8x16_int8_v256_dot); } - shl_mem_free(pb_reorder); - shl_mem_free(im2col_data); - shl_mem_free(multiplier); - shl_mem_free(shift); - return CSINN_TRUE; +#endif // SHL_USE_DOT_INT8 } diff --git a/source/c908_opt/int8/convolution_gemm_int8_pack1ton.c b/source/c908_opt/int8/convolution_gemm_int8_pack1ton.c index 66053ba2..011f05c5 100644 --- a/source/c908_opt/int8/convolution_gemm_int8_pack1ton.c +++ b/source/c908_opt/int8/convolution_gemm_int8_pack1ton.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_c908.h" +#include "c908/c908.h" void shl_c908_conv_im2col_gemm_reorder_kernel_pack1ton_int8(struct csinn_tensor *kernel, struct csinn_conv2d_params *params) @@ -28,167 +28,13 @@ int shl_c908_conv_im2col_gemm_pack1ton_int8(struct csinn_tensor *input, struct c struct csinn_tensor *kernel, struct csinn_tensor *bias, struct csinn_conv2d_params *params) { - if (input->layout == CSINN_LAYOUT_NC1HWC0) { - shl_rvv_tensor_nc1xc0_to_ndarray_replace_int8(input); - } - if (output->layout == CSINN_LAYOUT_NCHW) { - const int packn = csrr_vlenb() / sizeof(int8_t) / 2; - output->dim[1] /= packn; - output->dim[4] = packn; - output->dim_count = 5; - output->layout = CSINN_LAYOUT_NC1HWC0; - } - int8_t *input_data = (int8_t *)input->data; - int8_t *output_data = (int8_t *)output->data; - int8_t *kernel_data = (int8_t *)params->conv_extra.kernel_tm->data; - int32_t *bias_data = (int32_t *)bias->data; - - int32_t group = params->group; - int32_t batch = input->dim[0]; - int32_t in_c = input->dim[1]; - int32_t in_h = input->dim[2]; - int32_t in_w = input->dim[3]; - int32_t out_c = kernel->dim[0]; - int32_t out_h = output->dim[2]; - int32_t out_w = output->dim[3]; - - int32_t ksize_h = kernel->dim[2]; - int32_t ksize_w = kernel->dim[3]; - int32_t stride_h = params->stride_height; - int32_t stride_w = params->stride_width; - int32_t dilation_h = params->dilation_height; - int32_t dilation_w = params->dilation_width; - - int32_t m = out_c / group; - int32_t in_cp = in_c / group; - int32_t maxk = ksize_h * ksize_w; - int32_t n = out_h * out_w; - - int32_t *multiplier = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); - int32_t *shift = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); - - for (int i = 0; i < batch; i++) { - for (int g = 0, j = 0; g < group; g++) { - // padding - int padded_in_h = in_h + params->pad_top + params->pad_down; - int padded_in_w = in_w + params->pad_left + params->pad_right; - int padded_in_hw = padded_in_h * padded_in_w; - int8_t *input_pad_buf = 
(int8_t *)shl_mem_alloc(in_cp * padded_in_hw * sizeof(int8_t)); - shl_rvv_pad_input_pack1ton_int8(input_data, input_pad_buf, in_cp, in_h, in_w, - padded_in_h, padded_in_w, params->pad_top, - params->pad_left, input->qinfo->zero_point); - - if (kernel->quant_channel > 1) { - for (int c = 0; c < m; c++, j++) { - multiplier[c] = kernel->qinfo[j].multiplier; - shift[c] = kernel->qinfo[j].shift; - } - } else if (kernel->quant_channel == 1) { - for (int c = 0; c < m; c++) { - multiplier[c] = kernel->qinfo[0].multiplier; - shift[c] = kernel->qinfo[0].shift; - } - } - int32_t *bias_ptr = bias_data + g * m; - - const int packn = csrr_vlenb() / sizeof(int8_t) / 2; - int vl = vsetvl_e8mf2(packn); #ifdef SHL_USE_DOT_INT8 - // im2col - int in_cp4 = ((in_cp - 1) & -4) + 4; - // [in_cp4/packn, maxk, out_h, out_w, packn] + [maxk, out_h, out_w, in_cp4%packn] - int8_t *im2col_buf = (int8_t *)shl_mem_alloc(in_cp4 * maxk * n * sizeof(int8_t)); - const int tailstep = (padded_in_w * stride_h - out_w * stride_w); - - const int8_t *img0 = input_pad_buf; - int8_t *dst_ptr = im2col_buf; - - int loop_c = in_cp; - while (loop_c > 0) { - vl = vsetvl_e8mf2(loop_c); - int vl4 = ((vl - 1) & -4) + 4; - for (int a = 0; a < ksize_h; a++) { - for (int b = 0; b < ksize_w; b++) { - const int8_t *img1 = - img0 + a * dilation_h * padded_in_w * vl + b * dilation_w * vl; - - for (int p = 0; p < out_h; p++) { - for (int q = 0; q < out_w; q++) { - vint8mf2_t _tmp = vle8_v_i8mf2(img1, vl); - img1 += stride_w * vl; - vse8_v_i8mf2(dst_ptr, _tmp, vl); - dst_ptr += vl4; // XXX: dst align 4 - } - img1 += tailstep * vl; - } - } - } - img0 += padded_in_hw * vl; - // dst_ptr += maxk * out_h * out_w * vl; - loop_c -= vl; - } - shl_mem_free(input_pad_buf); - // reorder(pack) - int8_t *reorder_buf = (int8_t *)shl_mem_alloc(in_cp4 * maxk * n * sizeof(int8_t)); - shl_rvv_reorder_input_z12_pack1ton_int8_dot(im2col_buf, reorder_buf, in_cp4, maxk, n, - n); - shl_mem_free(im2col_buf); - int8_t *ker_ptr = kernel_data + g * m * 
maxk * in_cp4; - // gemm - shl_c908_ncxhwx_gemm_12xpackn_int8_dot(output_data, ker_ptr, reorder_buf, bias_ptr, m, - in_cp4 * maxk, n, output->qinfo->zero_point, - multiplier, shift); + return shl_rvv_common_conv_gemm_pack1ton_int8(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_pack1ton_int8_dot, + shl_c908_ncxhwx_gemm_12xpackn_int8_dot); #else - // im2col - // [in_c/packn, maxk, out_h, out_w, packn] + [maxk, out_h, out_w, in_c%packn] - int8_t *im2col_buf = (int8_t *)shl_mem_alloc(in_cp * maxk * n * sizeof(int8_t)); - const int tailstep = (padded_in_w * stride_h - out_w * stride_w); - - const int8_t *img0 = input_pad_buf; - int8_t *dst_ptr = im2col_buf; - - int loop_c = in_cp; - while (loop_c > 0) { - vl = vsetvl_e8mf2(loop_c); - for (int a = 0; a < ksize_h; a++) { - for (int b = 0; b < ksize_w; b++) { - const int8_t *img1 = - img0 + a * dilation_h * padded_in_w * vl + b * dilation_w * vl; - - for (int p = 0; p < out_h; p++) { - for (int q = 0; q < out_w; q++) { - vint8mf2_t _tmp = vle8_v_i8mf2(img1, vl); - img1 += stride_w * vl; - vse8_v_i8mf2(dst_ptr, _tmp, vl); - dst_ptr += vl; - } - img1 += tailstep * vl; - } - } - } - img0 += padded_in_hw * vl; - loop_c -= vl; - } - shl_mem_free(input_pad_buf); - - // reorder(pack) - int8_t *reorder_buf = (int8_t *)shl_mem_alloc(in_cp * maxk * n * sizeof(int8_t)); - shl_rvv_reorder_input_z4_pack1ton_int8(im2col_buf, reorder_buf, in_cp, maxk, n, n); - shl_mem_free(im2col_buf); - int8_t *ker_ptr = kernel_data + g * m * maxk * in_cp; - // gemm - shl_c908_ncxhwx_gemm_4xpack2n_int8(output_data, ker_ptr, reorder_buf, bias_ptr, m, - in_cp * maxk, n, output->qinfo->zero_point, - multiplier, shift); + return shl_rvv_common_conv_gemm_pack1ton_int8(input, output, kernel, bias, params, + shl_rvv_reorder_input_z4_pack1ton_int8, + shl_c908_ncxhwx_gemm_4xpack2n_int8); #endif // SHL_USE_DOT_INT8 - - shl_mem_free(reorder_buf); - input_data += in_cp * in_h * in_w; - output_data += m * n; - } - } - - shl_mem_free(multiplier); - 
shl_mem_free(shift); - return CSINN_TRUE; } diff --git a/source/c908_opt/int8/convolution_gemm_int8_packn.c b/source/c908_opt/int8/convolution_gemm_int8_packn.c index 33c670d8..54c32b54 100644 --- a/source/c908_opt/int8/convolution_gemm_int8_packn.c +++ b/source/c908_opt/int8/convolution_gemm_int8_packn.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c908.h" +#include "c908/c908.h" void shl_c908_conv_im2col_gemm_reorder_kernel_packn_int8(struct csinn_tensor *kernel, struct csinn_conv2d_params *params) @@ -28,125 +28,13 @@ int shl_c908_conv_im2col_gemm_packn_int8(struct csinn_tensor *input, struct csin struct csinn_tensor *kernel, struct csinn_tensor *bias, struct csinn_conv2d_params *params) { - if (input->layout == CSINN_LAYOUT_NCHW) { - shl_rvv_tensor_ndarray_to_nc1xc0_replace_int8(input); - } - if (output->layout == CSINN_LAYOUT_NCHW) { - const int packn = csrr_vlenb() / sizeof(int8_t) / 2; - output->dim[1] /= packn; - output->dim[4] = packn; - output->dim_count = 5; - output->layout = CSINN_LAYOUT_NC1HWC0; - } - int8_t *input_data = (int8_t *)input->data; - int8_t *output_data = (int8_t *)output->data; - int8_t *kernel_data = (int8_t *)params->conv_extra.kernel_tm->data; - int32_t *bias_data = (int32_t *)bias->data; - - int32_t group = params->group; - int32_t batch = input->dim[0]; - int32_t in_c = input->dim[1] * input->dim[4]; - int32_t in_h = input->dim[2]; - int32_t in_w = input->dim[3]; - int32_t out_c = kernel->dim[0]; - int32_t out_h = output->dim[2]; - int32_t out_w = output->dim[3]; - - int32_t ksize_h = kernel->dim[2]; - int32_t ksize_w = kernel->dim[3]; - int32_t stride_h = params->stride_height; - int32_t stride_w = params->stride_width; - int32_t dilation_h = params->dilation_height; - int32_t dilation_w = params->dilation_width; - - int32_t m = out_c / group; - int32_t in_cp = in_c / group; - int32_t maxk = ksize_h * ksize_w; - int32_t n = out_h * out_w; - - int32_t *multiplier = (int32_t *)shl_mem_alloc(m * 
sizeof(int32_t)); - int32_t *shift = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); - - for (int i = 0; i < batch; i++) { - for (int g = 0, j = 0; g < group; g++) { - // paddding - int padded_in_h = in_h + params->pad_top + params->pad_down; - int padded_in_w = in_w + params->pad_left + params->pad_right; - int padded_in_hw = padded_in_w * padded_in_h; - int8_t *input_pad_buf = (int8_t *)shl_mem_alloc(in_cp * padded_in_hw * sizeof(int8_t)); - shl_rvv_pad_input_packn_int8(input_data, input_pad_buf, in_cp, in_h, in_w, padded_in_h, - padded_in_w, params->pad_top, params->pad_left, - input->qinfo->zero_point); - - // im2col - const int packn = csrr_vlenb() / sizeof(int8_t) / 2; - const int vl = vsetvl_e8mf2(packn); - - // [in_c/packn, maxk, out_h, out_w, packn] - int8_t *im2col_buf = (int8_t *)shl_mem_alloc(in_cp / packn * maxk * out_h * out_w * - packn * sizeof(int8_t)); - const int tailstep = (padded_in_w * stride_h - out_w * stride_w) * packn; - - for (int c = 0; c + packn - 1 < in_cp; c += packn) { - const int8_t *img0 = input_pad_buf + c * padded_in_hw; - int8_t *dst_ptr = im2col_buf + c * maxk * out_h * out_w; - - for (int a = 0; a < ksize_h; a++) { - for (int b = 0; b < ksize_w; b++) { - const int8_t *img1 = - img0 + a * dilation_h * padded_in_w * packn + b * dilation_w * packn; - - for (int p = 0; p < out_h; p++) { - for (int q = 0; q < out_w; q++) { - vint8mf2_t _tmp = vle8_v_i8mf2(img1, vl); - img1 += stride_w * packn; - vse8_v_i8mf2(dst_ptr, _tmp, vl); - dst_ptr += packn; - } - img1 += tailstep; - } - } - } - } - shl_mem_free(input_pad_buf); - - if (kernel->quant_channel > 1) { - for (int c = 0; c < m; c++, j++) { - multiplier[c] = kernel->qinfo[j].multiplier; - shift[c] = kernel->qinfo[j].shift; - } - } else if (kernel->quant_channel == 1) { - for (int c = 0; c < m; c++) { - multiplier[c] = kernel->qinfo[0].multiplier; - shift[c] = kernel->qinfo[0].shift; - } - } - - int8_t *ker_ptr = kernel_data + g * m * maxk * in_cp; - int32_t *bias_ptr = bias_data + g * 
m; // bias_data != NULL with fusing zp to bias - - // reorder(pack) - int8_t *reorder_buf = (int8_t *)shl_mem_alloc(in_cp * maxk * n * sizeof(int8_t)); #ifdef SHL_USE_DOT_INT8 - shl_rvv_reorder_input_z12_packn_int8_dot(im2col_buf, reorder_buf, in_cp * maxk, n, n); - shl_mem_free(im2col_buf); - shl_c908_ncxhwx_gemm_12xpackn_int8_dot(output_data, ker_ptr, reorder_buf, bias_ptr, m, - in_cp * maxk, n, output->qinfo->zero_point, - multiplier, shift); + return shl_rvv_common_conv_gemm_packn_int8(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_packn_int8_dot, + shl_c908_ncxhwx_gemm_12xpackn_int8_dot); #else - shl_rvv_reorder_input_z4_packn_int8(im2col_buf, reorder_buf, in_cp * maxk, n, n); - shl_mem_free(im2col_buf); - shl_c908_ncxhwx_gemm_4xpack2n_int8(output_data, ker_ptr, reorder_buf, bias_ptr, m, - in_cp * maxk, n, output->qinfo->zero_point, - multiplier, shift); + return shl_rvv_common_conv_gemm_packn_int8(input, output, kernel, bias, params, + shl_rvv_reorder_input_z4_packn_int8, + shl_c908_ncxhwx_gemm_4xpack2n_int8); #endif // SHL_USE_DOT_INT8 - - shl_mem_free(reorder_buf); - input_data += in_cp * in_h * in_w; - output_data += m * n; - } - } - shl_mem_free(multiplier); - shl_mem_free(shift); - return CSINN_TRUE; } diff --git a/source/c908_opt/int8/convolution_gemm_int8_packnto1.c b/source/c908_opt/int8/convolution_gemm_int8_packnto1.c index c5fe38c0..bd3ae6cc 100644 --- a/source/c908_opt/int8/convolution_gemm_int8_packnto1.c +++ b/source/c908_opt/int8/convolution_gemm_int8_packnto1.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_c908.h" +#include "c908/c908.h" void shl_c908_conv_im2col_gemm_reorder_kernel_packnto1_int8(struct csinn_tensor *kernel, struct csinn_conv2d_params *params) @@ -28,122 +28,13 @@ int shl_c908_conv_im2col_gemm_packnto1_int8(struct csinn_tensor *input, struct c struct csinn_tensor *kernel, struct csinn_tensor *bias, struct csinn_conv2d_params *params) { - if (input->layout == CSINN_LAYOUT_NCHW) { - shl_rvv_tensor_ndarray_to_nc1xc0_replace_int8(input); - } - int8_t *input_data = (int8_t *)input->data; - int8_t *output_data = (int8_t *)output->data; - int8_t *kernel_data = (int8_t *)params->conv_extra.kernel_tm->data; - int32_t *bias_data = (int32_t *)bias->data; - - int32_t group = params->group; - int32_t batch = input->dim[0]; - int32_t in_c = input->dim[1] * input->dim[4]; - int32_t in_h = input->dim[2]; - int32_t in_w = input->dim[3]; - int32_t out_c = kernel->dim[0]; - int32_t out_h = output->dim[2]; - int32_t out_w = output->dim[3]; - int32_t ksize_h = kernel->dim[2]; - int32_t ksize_w = kernel->dim[3]; - int32_t stride_h = params->stride_height; - int32_t stride_w = params->stride_width; - int32_t dilation_h = params->dilation_height; - int32_t dilation_w = params->dilation_width; - - int32_t m = out_c / group; - int32_t in_cp = in_c / group; - int32_t maxk = ksize_h * ksize_w; - int32_t n = out_h * out_w; - - int8_t *output_ncxhwx = (int8_t *)shl_mem_alloc(m * n * sizeof(int8_t)); - - int32_t *multiplier = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); - int32_t *shift = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); - - for (int i = 0; i < batch; i++) { - for (int g = 0, j = 0; g < group; g++) { - // paddding - int padded_in_h = in_h + params->pad_top + params->pad_down; - int padded_in_w = in_w + params->pad_left + params->pad_right; - int padded_in_hw = padded_in_h * padded_in_w; - int8_t *input_pad_buf = (int8_t *)shl_mem_alloc(in_cp * padded_in_hw * sizeof(int8_t)); - shl_rvv_pad_input_packn_int8(input_data, input_pad_buf, in_cp, in_h, 
in_w, padded_in_h, - padded_in_w, params->pad_top, params->pad_left, - input->qinfo->zero_point); - - // im2col - const int packn = csrr_vlenb() / sizeof(int8_t) / 2; - const int vl = vsetvl_e8mf2(packn); - - // [in_c/packn, maxk, out_h, out_w, packn] - int8_t *im2col_buf = (int8_t *)shl_mem_alloc(in_cp / packn * maxk * out_h * out_w * - packn * sizeof(int8_t)); - const int tailstep = (padded_in_w * stride_h - out_w * stride_w) * packn; - - for (int c = 0; c + packn - 1 < in_cp; c += packn) { - const int8_t *img0 = input_pad_buf + c * padded_in_hw; - int8_t *dst_ptr = im2col_buf + c * maxk * out_h * out_w; - - for (int a = 0; a < ksize_h; a++) { - for (int b = 0; b < ksize_w; b++) { - const int8_t *img1 = - img0 + a * dilation_h * padded_in_w * packn + b * dilation_w * packn; - - for (int p = 0; p < out_h; p++) { - for (int q = 0; q < out_w; q++) { - vint8mf2_t _tmp = vle8_v_i8mf2(img1, vl); - img1 += stride_w * packn; - vse8_v_i8mf2(dst_ptr, _tmp, vl); - dst_ptr += packn; - } - img1 += tailstep; - } - } - } - } - shl_mem_free(input_pad_buf); - - if (kernel->quant_channel > 1) { - for (int c = 0; c < m; c++, j++) { - multiplier[c] = kernel->qinfo[j].multiplier; - shift[c] = kernel->qinfo[j].shift; - } - } else if (kernel->quant_channel == 1) { - for (int c = 0; c < m; c++) { - multiplier[c] = kernel->qinfo[0].multiplier; - shift[c] = kernel->qinfo[0].shift; - } - } - - int8_t *ker_ptr = kernel_data + g * m * maxk * in_cp; - int32_t *bias_ptr = bias_data + g * m; // bias_data != NULL with fusing zp to bias - - int8_t *reorder_buf = (int8_t *)shl_mem_alloc(in_cp * maxk * n * sizeof(int8_t)); - #ifdef SHL_USE_DOT_INT8 - shl_rvv_reorder_input_z12_packn_int8_dot(im2col_buf, reorder_buf, in_cp * maxk, n, n); - shl_mem_free(im2col_buf); - shl_c908_ncxhwx_gemm_12xpackn_int8_dot(output_ncxhwx, ker_ptr, reorder_buf, bias_ptr, m, - in_cp * maxk, n, output->qinfo->zero_point, - multiplier, shift); + return shl_rvv_common_conv_gemm_packnto1_int8(input, output, kernel, bias, 
params, + shl_rvv_reorder_input_z12_packn_int8_dot, + shl_c908_ncxhwx_gemm_12xpackn_int8_dot); #else - shl_rvv_reorder_input_z4_packn_int8(im2col_buf, reorder_buf, in_cp * maxk, n, n); - shl_mem_free(im2col_buf); - shl_c908_ncxhwx_gemm_4xpack2n_int8(output_ncxhwx, ker_ptr, reorder_buf, bias_ptr, m, - in_cp * maxk, n, output->qinfo->zero_point, - multiplier, shift); + return shl_rvv_common_conv_gemm_packnto1_int8(input, output, kernel, bias, params, + shl_rvv_reorder_input_z4_packn_int8, + shl_c908_ncxhwx_gemm_4xpack2n_int8); #endif // SHL_USE_DOT_INT8 - - shl_rvv_reorder_input_packnto1_int8(output_ncxhwx, output_data, m, out_h, out_w); - shl_mem_free(reorder_buf); - - input_data += in_cp * in_h * in_w; - output_data += m * n; - } - } - shl_mem_free(multiplier); - shl_mem_free(shift); - shl_mem_free(output_ncxhwx); - return CSINN_TRUE; } diff --git a/source/c908_opt/int8/depthwise_convolution.c b/source/c908_opt/int8/depthwise_convolution.c index c135ba85..f4318cb8 100644 --- a/source/c908_opt/int8/depthwise_convolution.c +++ b/source/c908_opt/int8/depthwise_convolution.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c908.h" +#include "c908/c908.h" int shl_c908_depthwise_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, struct csinn_tensor *bias, @@ -46,6 +46,9 @@ int shl_c908_depthwise_conv2d_init_int8(struct csinn_tensor *input, struct csinn in_elempack = 1; out_elempack = 1; // dwconv2d out_channel pack is same as in_channel } + } else if (sess->base_run_mode == CSINN_RM_LAYER) { + in_elempack = in_c % packn == 0 ? packn : 1; + out_elempack = out_c % packn == 0 ? packn : 1; } // enable fuse zeropoint to bias diff --git a/source/c908_opt/int8/fullyconnected.c b/source/c908_opt/int8/fullyconnected.c index a5d0055d..79cfec1e 100644 --- a/source/c908_opt/int8/fullyconnected.c +++ b/source/c908_opt/int8/fullyconnected.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_c908.h" +#include "c908/c908.h" int shl_c908_fullyconnected_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *weights, struct csinn_tensor *bias, diff --git a/source/c908_opt/int8/gemm_int16_ncxhwx.S b/source/c908_opt/int8/gemm_int16_ncxhwx.S index 7366119f..5b3a139e 100644 --- a/source/c908_opt/int8/gemm_int16_ncxhwx.S +++ b/source/c908_opt/int8/gemm_int16_ncxhwx.S @@ -109,6 +109,8 @@ packnx12_start: srli t4, t3, 16 srai t6, a3, 1 // k2 + addi t6, t6, -1 // k2_end + beqz t6, packnx12_k2_end packnx12_k2: vle16.v v4, (a5) @@ -166,6 +168,53 @@ packnx12_k2: addi t6, t6, -1 bnez t6, packnx12_k2 +packnx12_k2_end: + vle16.v v4, (a5) + add a5, a5, t0 // kernel_ptr += packn * 2 + + vwmacc.vx v8, t1, v2 + vwmacc.vx v12, t3, v2 + lwd s1, s3, 8(a2) + vwmacc.vx v10, t2, v2 + srli s2, s1, 16 + vwmacc.vx v14, t4, v2 + srli s4, s3, 16 + vwmacc.vx v16, s1, v2 + vwmacc.vx v20, s3, v2 + lwd t1, t3, 16(a2) + addi a2, a2, 24 + vwmacc.vx v18, s2, v2 + srli t2, t1, 16 + vwmacc.vx v22, s4, v2 + srli t4, t3, 16 + vwmacc.vx v24, t1, v2 + vwmacc.vx v28, t3, v2 + lwd s1, s3, 0(a2) + vwmacc.vx v26, t2, v2 + srli s2, s1, 16 + vwmacc.vx v30, t4, v2 + srli s4, s3, 16 + + vwmacc.vx v8, s1, v4 + vwmacc.vx v12, s3, v4 + lwd t1, t3, 8(a2) + vwmacc.vx v10, s2, v4 + srli t2, t1, 16 + vwmacc.vx v14, s4, v4 + srli t4, t3, 16 + vwmacc.vx v16, t1, v4 + vwmacc.vx v20, t3, v4 + lwd s1, s3, 16(a2) + addi a2, a2, 24 + vwmacc.vx v18, t2, v4 + srli s2, s1, 16 + vwmacc.vx v22, t4, v4 + srli s4, s3, 16 + vwmacc.vx v24, s1, v4 + vwmacc.vx v28, s3, v4 + vwmacc.vx v26, s2, v4 + vwmacc.vx v30, s4, v4 + packnx12_end: vsetvli zero, zero, e32, m2 vse32.v v8, (a0) @@ -228,6 +277,8 @@ packnx8_start: srli t4, t3, 16 srai t6, a3, 1 // k2 + addi t6, t6, -1 // k2_end + beqz t6, packnx8_k2_end packnx8_k2: vle16.v v4, (a5) @@ -270,6 +321,38 @@ packnx8_k2: addi t6, t6, -1 bnez t6, packnx8_k2 +packnx8_k2_end: + vle16.v v4, (a5) + add a5, a5, t0 // kernel_ptr += packn * 2 + + 
vwmacc.vx v8, t1, v2 + vwmacc.vx v12, t3, v2 + lwd s1, s3, 8(a2) + vwmacc.vx v10, t2, v2 + srli s2, s1, 16 + vwmacc.vx v14, t4, v2 + srli s4, s3, 16 + vwmacc.vx v16, s1, v2 + vwmacc.vx v20, s3, v2 + lwd t1, t3, 16(a2) + vwmacc.vx v18, s2, v2 + srli t2, t1, 16 + vwmacc.vx v22, s4, v2 + srli t4, t3, 16 + + vwmacc.vx v8, t1, v4 + vwmacc.vx v12, t3, v4 + lwd s1, s3, 24(a2) + addi a2, a2, 32 + vwmacc.vx v10, t2, v4 + srli s2, s1, 16 + vwmacc.vx v14, t4, v4 + srli s4, s3, 16 + vwmacc.vx v16, s1, v4 + vwmacc.vx v20, s3, v4 + vwmacc.vx v18, s2, v4 + vwmacc.vx v22, s4, v4 + packnx8_end: vsetvli zero, zero, e32, m2 vse32.v v8, (a0) @@ -313,6 +396,8 @@ packnx4_start: srli t4, t3, 16 srai t6, a3, 1 // k2 + addi t6, t6, -1 // k2_end + beqz t6, packnx4_k2_end packnx4_k2: vle16.v v4, (a5) @@ -341,6 +426,24 @@ packnx4_k2: addi t6, t6, -1 bnez t6, packnx4_k2 +packnx4_k2_end: + vle16.v v4, (a5) + add a5, a5, t0 // kernel_ptr += packn * 2 + + vwmacc.vx v8, t1, v2 + lwd s1, s3, 8(a2) + vwmacc.vx v12, t3, v2 + srli s2, s1, 16 + vwmacc.vx v10, t2, v2 + srli s4, s3, 16 + vwmacc.vx v14, t4, v2 + addi a2, a2, 16 + + vwmacc.vx v8, s1, v4 + vwmacc.vx v12, s3, v4 + vwmacc.vx v10, s2, v4 + vwmacc.vx v14, s4, v4 + packnx4_end: vsetvli zero, zero, e32, m2 vse32.v v8, (a0) @@ -371,6 +474,8 @@ packnx2_start: lh t2, 2(a2) srai t6, a3, 1 // k2 + addi t6, t6, -1 // k2_end + beqz t6, packnx2_k2_end packnx2_k2: vle16.v v4, (a5) @@ -393,6 +498,19 @@ packnx2_k2: addi t6, t6, -1 bnez t6, packnx2_k2 +packnx2_k2_end: + vle16.v v4, (a5) + add a5, a5, t0 // kernel_ptr += packn * 2 + + vwmacc.vx v8, t1, v2 + lh s1, 4(a2) + vwmacc.vx v10, t2, v2 + lh s2, 6(a2) + addi a2, a2, 8 + + vwmacc.vx v8, s1, v4 + vwmacc.vx v10, s2, v4 + packnx2_end: vsetvli zero, zero, e32, m2 vse32.v v8, (a0) @@ -416,6 +534,8 @@ packnx1_start: lh t1, 0(a2) srai t6, a3, 1 // k2 + addi t6, t6, -1 // k2_end + beqz t6, packnx1_k2_end packnx1_k2: vle16.v v4, (a5) @@ -434,6 +554,16 @@ packnx1_k2: addi t6, t6, -1 bnez t6, packnx1_k2 
+packnx1_k2_end: + vle16.v v4, (a5) + add a5, a5, t0 // kernel_ptr += packn * 2 + + vwmacc.vx v8, t1, v2 + lh s1, 2(a2) + addi a2, a2, 4 + + vwmacc.vx v8, s1, v4 + packnx1_end: vsetvli zero, zero, e32, m2 vse32.v v8, (a0) diff --git a/source/c908_opt/int8/gemm_int16_packn.c b/source/c908_opt/int8/gemm_int16_packn.c index cf7f1d5f..ae4117c5 100644 --- a/source/c908_opt/int8/gemm_int16_packn.c +++ b/source/c908_opt/int8/gemm_int16_packn.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c908.h" +#include "c908/c908.h" void gemm_int16_ncxhwx_12xpackn(int32_t *output, const int16_t *kernel, const int16_t *input, int k, int n); diff --git a/source/c908_opt/int8/gemm_int8_dot.c b/source/c908_opt/int8/gemm_int8_dot.c index 8324e62f..10df3fcf 100644 --- a/source/c908_opt/int8/gemm_int8_dot.c +++ b/source/c908_opt/int8/gemm_int8_dot.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c908.h" +#include "c908/c908.h" /************************************************************* * note: VLEN = 128 diff --git a/source/c908_opt/int8/gemm_int8_dot_ncxhwx.S b/source/c908_opt/int8/gemm_int8_dot_ncxhwx.S index 3a158137..4a321948 100644 --- a/source/c908_opt/int8/gemm_int8_dot_ncxhwx.S +++ b/source/c908_opt/int8/gemm_int8_dot_ncxhwx.S @@ -133,6 +133,8 @@ packnx12_start: srai s0, a5, 3 // k8(k2) beqz s0, packnx12_k1 + addi s0, s0, -1 // k8(k2)_end + beqz s0, packnx12_k2_end packnx12_k2: vle32.v v6, (s9) @@ -185,10 +187,60 @@ packnx12_k2: addi s0, s0, -1 bnez s0, packnx12_k2 -packnx12_k1: +packnx12_k2_end: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + vmaqa.vx v10, t2, v4 + vmaqa.vx v12, t3, v4 + lwd s1, s2, 24(a2) + addi a2, a2, 32 + vmaqa.vx v14, t4, v4 + vmaqa.vx v16, t5, v4 + lwd s3, s4, 0(a2) + lwd s5, s6, 8(a2) + vmaqa.vx v18, t6, v4 + vmaqa.vx v20, s1, v4 + vmaqa.vx v22, s2, v4 + lwd t1, t2, 16(a2) + lwd t3, t4, 24(a2) + addi a2, a2, 32 + vmaqa.vx v24, s3, v4 + vmaqa.vx v26, s4, v4 + lwd t5, t6, 0(a2) + vmaqa.vx 
v28, s5, v4 + vmaqa.vx v30, s6, v4 + + vmaqa.vx v8, t1, v6 + vmaqa.vx v10, t2, v6 + lwd s1, s2, 8(a2) + lwd s3, s4, 16(a2) + vmaqa.vx v12, t3, v6 + vmaqa.vx v14, t4, v6 + lwd s5, s6, 24(a2) + addi a2, a2, 32 + vmaqa.vx v16, t5, v6 + vmaqa.vx v18, t6, v6 + vmaqa.vx v20, s1, v6 + vmaqa.vx v22, s2, v6 + vmaqa.vx v24, s3, v6 + vmaqa.vx v26, s4, v6 + vmaqa.vx v28, s5, v6 + vmaqa.vx v30, s6, v6 + andi s0, a5, 4 // k4(k1) beqz s0, packnx12_post + // pre-load kernel_data + vle32.v v4, (s9) + add s9, s9, t0 // +packn + // pre-load input_data + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + lwd t5, t6, 16(a2) + +packnx12_k1: vmaqa.vx v8, t1, v4 vmaqa.vx v10, t2, v4 vmaqa.vx v12, t3, v4 @@ -388,6 +440,8 @@ packnx8_start: srai s0, a5, 3 // k2 beqz s0, packnx8_k1 + addi s0, s0, -1 // k8(k2)_end + beqz s0, packnx8_k2_end packnx8_k2: vle32.v v6, (s9) @@ -427,10 +481,47 @@ packnx8_k2: addi s0, s0, -1 bnez s0, packnx8_k2 -packnx8_k1: +packnx8_k2_end: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + vmaqa.vx v10, t2, v4 + lwd s1, s2, 16(a2) + lwd s3, s4, 24(a2) + vmaqa.vx v12, t3, v4 + vmaqa.vx v14, t4, v4 + vmaqa.vx v16, s1, v4 + addi a2, a2, 32 + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + vmaqa.vx v18, s2, v4 + vmaqa.vx v20, s3, v4 + vmaqa.vx v22, s4, v4 + + vmaqa.vx v8, t1, v6 + vmaqa.vx v10, t2, v6 + lwd s1, s2, 16(a2) + lwd s3, s4, 24(a2) + vmaqa.vx v12, t3, v6 + vmaqa.vx v14, t4, v6 + vmaqa.vx v16, s1, v6 + addi a2, a2, 32 + vmaqa.vx v18, s2, v6 + vmaqa.vx v20, s3, v6 + vmaqa.vx v22, s4, v6 + andi s0, a5, 4 // k1 beqz s0, packnx8_post + // pre-load kernel_data + vle32.v v4, (s9) + add s9, s9, t0 // +packn + // pre-load input_data + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + +packnx8_k1: vmaqa.vx v8, t1, v4 vmaqa.vx v10, t2, v4 lwd s1, s2, 16(a2) @@ -495,6 +586,8 @@ packnx4_start: srai s0, a5, 3 // k2 beqz s0, packnx4_k1 + addi s0, s0, -1 // k8(k2)_end + beqz s0, packnx4_k2_end packnx4_k2: vle32.v v6, (s9) @@ -521,10 +614,34 @@ packnx4_k2: addi s0, s0, -1 bnez s0, 
packnx4_k2 -packnx4_k1: +packnx4_k2_end: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + vmaqa.vx v10, t2, v4 + lwd s1, s2, 16(a2) + lwd s3, s4, 24(a2) + vmaqa.vx v12, t3, v4 + vmaqa.vx v14, t4, v4 + addi a2, a2, 32 + + vmaqa.vx v8, s1, v6 + vmaqa.vx v10, s2, v6 + vmaqa.vx v12, s3, v6 + vmaqa.vx v14, s4, v6 + andi s0, a5, 4 // k1 beqz s0, packnx4_post + // pre-load kernel_data + vle32.v v4, (s9) + add s9, s9, t0 // +packn + // pre-load input_data + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + +packnx4_k1: vmaqa.vx v8, t1, v4 vmaqa.vx v10, t2, v4 vmaqa.vx v12, t3, v4 @@ -569,6 +686,8 @@ packnx2_start: srai s0, a5, 3 // k2 beqz s0, packnx2_k1 + addi s0, s0, -1 // k8(k2)_end + beqz s0, packnx2_k2_end packnx2_k2: vle32.v v6, (s9) @@ -589,10 +708,28 @@ packnx2_k2: addi s0, s0, -1 bnez s0, packnx2_k2 -packnx2_k1: +packnx2_k2_end: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + lwd s1, s2, 8(a2) + vmaqa.vx v10, t2, v4 + addi a2, a2, 16 + + vmaqa.vx v8, s1, v6 + vmaqa.vx v10, s2, v6 + andi s0, a5, 4 // k1 beqz s0, packnx2_post + // pre-load kernel_data + vle32.v v4, (s9) + add s9, s9, t0 // +packn + // pre-load input_data + lwd t1, t2, 0(a2) + +packnx2_k1: vmaqa.vx v8, t1, v4 vmaqa.vx v10, t2, v4 addi a2, a2, 8 @@ -628,6 +765,8 @@ packnx1_start: srai s0, a5, 3 // k2 beqz s0, packnx1_k1 + addi s0, s0, -1 // k8(k2)_end + beqz s0, packnx1_k2_end packnx1_k2: vle32.v v6, (s9) @@ -646,10 +785,26 @@ packnx1_k2: addi s0, s0, -1 bnez s0, packnx1_k2 -packnx1_k1: +packnx1_k2_end: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + lw s1, 4(a2) + addi a2, a2, 8 + + vmaqa.vx v8, s1, v6 + andi s0, a5, 4 // k1 beqz s0, packnx1_post + // pre-load kernel_data + vle32.v v4, (s9) + add s9, s9, t0 // +packn + // pre-load input_data + lw t1, 0(a2) + +packnx1_k1: vmaqa.vx v8, t1, v4 addi a2, a2, 4 @@ -778,6 +933,8 @@ packnx8_start_1: srai s0, a5, 3 // k2 beqz s0, packnx8_k1_1 + addi s0, s0, -1 // k2_end + beqz s0, packnx8_k2_end_1 
packnx8_k2_1: vle32.v v6, (s9) @@ -817,10 +974,47 @@ packnx8_k2_1: addi s0, s0, -1 bnez s0, packnx8_k2_1 -packnx8_k1_1: +packnx8_k2_end_1: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + vmaqa.vx v10, t2, v4 + lwd s1, s2, 16(a2) + lwd s3, s4, 24(a2) + vmaqa.vx v12, t3, v4 + vmaqa.vx v14, t4, v4 + vmaqa.vx v16, s1, v4 + addi a2, a2, 32 + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + vmaqa.vx v18, s2, v4 + vmaqa.vx v20, s3, v4 + vmaqa.vx v22, s4, v4 + + vmaqa.vx v8, t1, v6 + vmaqa.vx v10, t2, v6 + lwd s1, s2, 16(a2) + lwd s3, s4, 24(a2) + vmaqa.vx v12, t3, v6 + vmaqa.vx v14, t4, v6 + vmaqa.vx v16, s1, v6 + addi a2, a2, 32 + vmaqa.vx v18, s2, v6 + vmaqa.vx v20, s3, v6 + vmaqa.vx v22, s4, v6 + andi s0, a5, 4 // k1 beqz s0, packnx8_post_1 + // pre-load kernel_data + vle32.v v4, (s9) + add s9, s9, t0 // +packn + // pre-load input_data + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + +packnx8_k1_1: vmaqa.vx v8, t1, v4 vmaqa.vx v10, t2, v4 lwd s1, s2, 16(a2) @@ -890,6 +1084,8 @@ packnx4_start_1: srai s0, a5, 3 // k2 beqz s0, packnx4_k1_1 + addi s0, s0, -1 // k2_end + beqz s0, packnx4_k2_end_1 packnx4_k2_1: vle32.v v6, (s9) @@ -916,10 +1112,34 @@ packnx4_k2_1: addi s0, s0, -1 bnez s0, packnx4_k2_1 -packnx4_k1_1: +packnx4_k2_end_1: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + vmaqa.vx v10, t2, v4 + lwd s1, s2, 16(a2) + lwd s3, s4, 24(a2) + vmaqa.vx v12, t3, v4 + vmaqa.vx v14, t4, v4 + addi a2, a2, 32 + + vmaqa.vx v8, s1, v6 + vmaqa.vx v10, s2, v6 + vmaqa.vx v12, s3, v6 + vmaqa.vx v14, s4, v6 + andi s0, a5, 4 // k1 beqz s0, packnx4_post_1 + // pre-load kernel_data + vle32.v v4, (s9) + add s9, s9, t0 // +packn + // pre-load input_data + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + +packnx4_k1_1: vmaqa.vx v8, t1, v4 vmaqa.vx v10, t2, v4 vmaqa.vx v12, t3, v4 @@ -964,6 +1184,8 @@ packnx2_start_1: srai s0, a5, 3 // k2 beqz s0, packnx2_k1_1 + addi s0, s0, -1 // k2_end + beqz s0, packnx2_k2_end_1 packnx2_k2_1: vle32.v v6, (s9) @@ -984,10 +1206,28 @@ 
packnx2_k2_1: addi s0, s0, -1 bnez s0, packnx2_k2_1 -packnx2_k1_1: +packnx2_k2_end_1: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + lwd s1, s2, 8(a2) + vmaqa.vx v10, t2, v4 + addi a2, a2, 16 + + vmaqa.vx v8, s1, v6 + vmaqa.vx v10, s2, v6 + andi s0, a5, 4 // k1 beqz s0, packnx2_post_1 + // pre-load kernel_data + vle32.v v4, (s9) + add s9, s9, t0 // +packn + // pre-load input_data + lwd t1, t2, 0(a2) + +packnx2_k1_1: vmaqa.vx v8, t1, v4 vmaqa.vx v10, t2, v4 addi a2, a2, 8 @@ -1023,6 +1263,8 @@ packnx1_start_1: srai s0, a5, 3 // k2 beqz s0, packnx1_k1_1 + addi s0, s0, -1 // k2_end + beqz s0, packnx1_k2_end_1 packnx1_k2_1: vle32.v v6, (s9) @@ -1041,10 +1283,26 @@ packnx1_k2_1: addi s0, s0, -1 bnez s0, packnx1_k2_1 -packnx1_k1_1: +packnx1_k2_end_1: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + lw s1, 4(a2) + addi a2, a2, 8 + + vmaqa.vx v8, s1, v6 + andi s0, a5, 4 // k1 beqz s0, packnx1_post_1 + // pre-load kernel_data + vle32.v v4, (s9) + add s9, s9, t0 // +packn + // pre-load input_data + lw t1, 0(a2) + +packnx1_k1_1: vmaqa.vx v8, t1, v4 addi a2, a2, 4 diff --git a/source/c908_opt/int8/gemm_int8_dot_packn.c b/source/c908_opt/int8/gemm_int8_dot_packn.c index 33fb4dbf..c7658bb6 100644 --- a/source/c908_opt/int8/gemm_int8_dot_packn.c +++ b/source/c908_opt/int8/gemm_int8_dot_packn.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_c908.h" +#include "c908/c908.h" void gemm_int8_dot_ncxhwx_12xpackn(int8_t *output, const int8_t *kernel, const int8_t *input, const int32_t *bias, int m, int k, int n, int32_t out_zp, @@ -26,8 +26,8 @@ void gemm_int8_dot_ncxhwx_8xpackn(int8_t *output, const int8_t *kernel, const in int32_t *mult, int32_t *shift); void shl_c908_ncxhwx_gemm_12xpackn_int8_dot(int8_t *dst, const int8_t *sa, const int8_t *sb, - const int32_t *bias, int m, int k, int n, - int32_t out_zp, int32_t *mult, int32_t *shift) + int32_t *bias, int m, int k, int n, int32_t out_zp, + int32_t *mult, int32_t *shift) { const int packn = csrr_vlenb() / sizeof(int8_t) / 2; diff --git a/source/c908_opt/int8/gemm_int8_dot_v256.c b/source/c908_opt/int8/gemm_int8_dot_v256.c index c30a7b7c..b875769d 100644 --- a/source/c908_opt/int8/gemm_int8_dot_v256.c +++ b/source/c908_opt/int8/gemm_int8_dot_v256.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c908.h" +#include "c908/c908.h" /************************************************************* * note: VLEN = 256 diff --git a/source/c908_opt/int8/gemm_int8_ncxhwx.S b/source/c908_opt/int8/gemm_int8_ncxhwx.S index 3e6e36fd..99d12821 100644 --- a/source/c908_opt/int8/gemm_int8_ncxhwx.S +++ b/source/c908_opt/int8/gemm_int8_ncxhwx.S @@ -119,6 +119,8 @@ pack2nx4_start: srai s0, a5, 1 // k2 beqz s0, pack2nx4_k1 + addi s0, s0, -1 // k2_end + beqz s0, pack2nx4_k2_end pack2nx4_k2: vsetvli zero, a4, e8, m1 @@ -163,10 +165,53 @@ pack2nx4_k2: addi s0, s0, -1 bnez s0, pack2nx4_k2 -pack2nx4_k1: +pack2nx4_k2_end: + vsetvli zero, a4, e8, m1 + vle8.v v5, (s9) + add s9, s9, a4 // kernel_data += packn + + vwmul.vx v6, v4, t1 + lb s1, 4(a2) + vwmul.vx v8, v4, t2 + vwmul.vx v10, v4, t3 + lb s2, 5(a2) + vwmul.vx v12, v4, t4 + + vsetvli zero, a4, e16, m2 + vwmacc.vx v16, t0, v6 + lb s3, 6(a2) + vwmacc.vx v20, t0, v8 + vwmacc.vx v24, t0, v10 + lb s4, 7(a2) + vwmacc.vx v28, t0, v12 + addi a2, a2, 8 // input_data += 8 + + vsetvli zero, a4, e8, m1 + + 
vwmul.vx v6, v5, s1 + vwmul.vx v8, v5, s2 + vwmul.vx v10, v5, s3 + vwmul.vx v12, v5, s4 + + vsetvli zero, a4, e16, m2 + vwmacc.vx v16, t0, v6 + vwmacc.vx v20, t0, v8 + vwmacc.vx v24, t0, v10 + vwmacc.vx v28, t0, v12 + andi s0, a5, 1 // k1 beqz s0, pack2nx4_post + // pre-load kernel_data + vle8.v v4, (s9) + add s9, s9, a4 // +pack2n + // pre-load input_data + lb t1, 0(a2) + lb t2, 1(a2) + lb t3, 2(a2) + lb t4, 3(a2) + +pack2nx4_k1: vsetvli zero, a4, e8, m1 vwmul.vx v6, v4, t1 vwmul.vx v8, v4, t2 @@ -239,6 +284,8 @@ pack2nx2_start: srai s0, a5, 1 // k2 beqz s0, pack2nx2_k1 + addi s0, s0, -1 // k2_end + beqz s0, pack2nx2_k2_end pack2nx2_k2: vsetvli zero, a4, e8, m1 @@ -269,10 +316,39 @@ pack2nx2_k2: addi s0, s0, -1 bnez s0, pack2nx2_k2 -pack2nx2_k1: +pack2nx2_k2_end: + vsetvli zero, a4, e8, m1 + vle8.v v5, (s9) + add s9, s9, a4 // kernel_data += packn + + vwmul.vx v6, v4, t1 + lb s1, 2(a2) + vwmul.vx v8, v4, t2 + lb s2, 3(a2) + vsetvli zero, a4, e16, m2 + addi a2, a2, 4 + vwmacc.vx v16, t0, v6 + vwmacc.vx v20, t0, v8 + + vsetvli zero, a4, e8, m1 + + vwmul.vx v6, v5, s1 + vwmul.vx v8, v5, s2 + vsetvli zero, a4, e16, m2 + vwmacc.vx v16, t0, v6 + vwmacc.vx v20, t0, v8 + andi s0, a5, 1 // k1 beqz s0, pack2nx2_post + // pre-load kernel_data + vle8.v v4, (s9) + add s9, s9, a4 // +pack2n + // pre-load input_data + lb t1, 0(a2) + lb t2, 1(a2) + +pack2nx2_k1: vsetvli zero, a4, e8, m1 vwmul.vx v6, v4, t1 vwmul.vx v8, v4, t2 @@ -323,6 +399,8 @@ pack2nx1_start: srai s0, a5, 1 // k2 beqz s0, pack2nx1_k1 + addi s0, s0, -1 // k2_end + beqz s0, pack2nx1_k2_end pack2nx1_k2: vsetvli zero, a4, e8, m1 @@ -347,10 +425,33 @@ pack2nx1_k2: addi s0, s0, -1 bnez s0, pack2nx1_k2 -pack2nx1_k1: +pack2nx1_k2_end: + vsetvli zero, a4, e8, m1 + vle8.v v5, (s9) + add s9, s9, a4 // kernel_data += packn + + vwmul.vx v6, v4, t1 + lb s1, 1(a2) + vsetvli zero, a4, e16, m2 + addi a2, a2, 2 + vwmacc.vx v16, t0, v6 + + vsetvli zero, a4, e8, m1 + + vwmul.vx v6, v5, s1 + vsetvli zero, a4, e16, m2 + vwmacc.vx 
v16, t0, v6 + andi s0, a5, 1 // k1 beqz s0, pack2nx1_post + // pre-load kernel_data + vle8.v v4, (s9) + add s9, s9, a4 // +pack2n + // pre-load input_data + lb t1, 0(a2) + +pack2nx1_k1: vsetvli zero, a4, e8, m1 vwmul.vx v6, v4, t1 addi a2, a2, 1 // input_data += 1 @@ -486,6 +587,8 @@ packnx4_start: srai s0, a5, 1 // k2 beqz s0, packnx4_k1 + addi s0, s0, -1 // k2_end + beqz s0, packnx4_k2_end packnx4_k2: vsetvli zero, a4, e8, mf2 @@ -530,10 +633,53 @@ packnx4_k2: addi s0, s0, -1 bnez s0, packnx4_k2 -packnx4_k1: +packnx4_k2_end: + vsetvli zero, a4, e8, mf2 + vle8.v v5, (s9) + add s9, s9, a4 // kernel_data += packn + + vwmul.vx v6, v4, t1 + lb s1, 4(a2) + vwmul.vx v8, v4, t2 + vwmul.vx v10, v4, t3 + lb s2, 5(a2) + vwmul.vx v12, v4, t4 + + vsetvli zero, a4, e16, m1 + vwmacc.vx v16, t0, v6 + lb s3, 6(a2) + vwmacc.vx v20, t0, v8 + vwmacc.vx v24, t0, v10 + lb s4, 7(a2) + vwmacc.vx v28, t0, v12 + addi a2, a2, 8 // input_data += 8 + + vsetvli zero, a4, e8, mf2 + + vwmul.vx v6, v5, s1 + vwmul.vx v8, v5, s2 + vwmul.vx v10, v5, s3 + vwmul.vx v12, v5, s4 + + vsetvli zero, a4, e16, m1 + vwmacc.vx v16, t0, v6 + vwmacc.vx v20, t0, v8 + vwmacc.vx v24, t0, v10 + vwmacc.vx v28, t0, v12 + andi s0, a5, 1 // k1 beqz s0, packnx4_post + // pre-load kernel_data + vle8.v v4, (s9) + add s9, s9, a4 // +packn + // pre-load input_data + lb t1, 0(a2) + lb t2, 1(a2) + lb t3, 2(a2) + lb t4, 3(a2) + +packnx4_k1: vsetvli zero, a4, e8, mf2 vwmul.vx v6, v4, t1 vwmul.vx v8, v4, t2 @@ -589,6 +735,8 @@ packnx2_start: srai s0, a5, 1 // k2 beqz s0, packnx2_k1 + addi s0, s0, -1 // k2_end + beqz s0, packnx2_k2_end packnx2_k2: vsetvli zero, a4, e8, mf2 @@ -619,10 +767,39 @@ packnx2_k2: addi s0, s0, -1 bnez s0, packnx2_k2 -packnx2_k1: +packnx2_k2_end: + vsetvli zero, a4, e8, mf2 + vle8.v v5, (s9) + add s9, s9, a4 // kernel_data += packn + + vwmul.vx v6, v4, t1 + lb s1, 2(a2) + vwmul.vx v8, v4, t2 + lb s2, 3(a2) + vsetvli zero, a4, e16, m1 + addi a2, a2, 4 + vwmacc.vx v16, t0, v6 + vwmacc.vx v20, t0, v8 + + 
vsetvli zero, a4, e8, mf2 + + vwmul.vx v6, v5, s1 + vwmul.vx v8, v5, s2 + vsetvli zero, a4, e16, m1 + vwmacc.vx v16, t0, v6 + vwmacc.vx v20, t0, v8 + andi s0, a5, 1 // k1 beqz s0, packnx2_post + // pre-load kernel_data + vle8.v v4, (s9) + add s9, s9, a4 // +packn + // pre-load input_data + lb t1, 0(a2) + lb t2, 1(a2) + +packnx2_k1: vsetvli zero, a4, e8, mf2 vwmul.vx v6, v4, t1 vwmul.vx v8, v4, t2 @@ -662,6 +839,8 @@ packnx1_start: srai s0, a5, 1 // k2 beqz s0, packnx1_k1 + addi s0, s0, -1 // k2_end + beqz s0, packnx1_k2_end packnx1_k2: vsetvli zero, a4, e8, mf2 @@ -686,10 +865,33 @@ packnx1_k2: addi s0, s0, -1 bnez s0, packnx1_k2 -packnx1_k1: +packnx1_k2_end: + vsetvli zero, a4, e8, mf2 + vle8.v v5, (s9) + add s9, s9, a4 // kernel_data += packn + + vwmul.vx v6, v4, t1 + lb s1, 1(a2) + vsetvli zero, a4, e16, m1 + addi a2, a2, 2 + vwmacc.vx v16, t0, v6 + + vsetvli zero, a4, e8, mf2 + + vwmul.vx v6, v5, s1 + vsetvli zero, a4, e16, m1 + vwmacc.vx v16, t0, v6 + andi s0, a5, 1 // k1 beqz s0, packnx1_post + // pre-load kernel_data + vle8.v v4, (s9) + add s9, s9, a4 // +packn + // pre-load input_data + lb t1, 0(a2) + +packnx1_k1: vsetvli zero, a4, e8, mf2 vwmul.vx v6, v4, t1 addi a2, a2, 1 // input_data += 1 diff --git a/source/c908_opt/int8/gemm_int8_packn.c b/source/c908_opt/int8/gemm_int8_packn.c index 5182c73f..5d4b0bc8 100644 --- a/source/c908_opt/int8/gemm_int8_packn.c +++ b/source/c908_opt/int8/gemm_int8_packn.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_c908.h" +#include "c908/c908.h" void gemm_int8_ncxhwx_4xpack2n(int8_t *output, const int8_t *kernel, const int8_t *input, const int32_t *bias, int m, int k, int n, int32_t out_zp, @@ -26,7 +26,7 @@ void gemm_int8_ncxhwx_4xpackn(int8_t *output, const int8_t *kernel, const int8_t int32_t *mult, int32_t *shift); void shl_c908_ncxhwx_gemm_4xpack2n_int8(int8_t *dst, const int8_t *sa, const int8_t *sb, - const int32_t *bias, int m, int k, int n, int32_t out_zp, + int32_t *bias, int m, int k, int n, int32_t out_zp, int32_t *mult, int32_t *shift) { const int packn = csrr_vlenb() / sizeof(int8_t) / 2; diff --git a/source/c908_opt/int8/maxpool.c b/source/c908_opt/int8/maxpool.c index 34a52318..36a405a0 100644 --- a/source/c908_opt/int8/maxpool.c +++ b/source/c908_opt/int8/maxpool.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c908.h" +#include "c908/c908.h" int shl_c908_maxpool2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_pool_params *params) @@ -47,6 +47,8 @@ int shl_c908_maxpool2d_init_int8(struct csinn_tensor *input, struct csinn_tensor if (shl_is_first_layer_input(input, sess)) { elempack = 1; } + } else if (sess->base_run_mode == CSINN_RM_LAYER) { + elempack = in_c % packn == 0 ? packn : 1; } // global maxpool2d // TODO: remove diff --git a/source/c908_opt/reorder.c b/source/c908_opt/reorder.c index de211d30..9d9f840d 100644 --- a/source/c908_opt/reorder.c +++ b/source/c908_opt/reorder.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c908.h" +#include "c908/c908.h" /************************************************************************ * reorder kernel matrix diff --git a/source/c908_opt/setup.c b/source/c908_opt/setup.c index 4c3186f1..c4d27bf4 100644 --- a/source/c908_opt/setup.c +++ b/source/c908_opt/setup.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_c908.h" +#include "c908/c908.h" #define C908_OP_PATTERN_MAX 60 static struct shl_cb_table shl_c908_cb_table[C908_OP_PATTERN_MAX]; diff --git a/source/c920_opt/CMakeLists.txt b/source/c920_opt/CMakeLists.txt new file mode 100644 index 00000000..226658c0 --- /dev/null +++ b/source/c920_opt/CMakeLists.txt @@ -0,0 +1,47 @@ + +if(CONFIG_C920_SOURCE) + list(APPEND C920_SRCS source/c920_opt/capability.c) + list(APPEND C920_SRCS source/c920_opt/reorder.c) + list(APPEND C920_SRCS source/c920_opt/setup.c) + list(APPEND C920_SRCS source/c920_opt/utils.c) + list(APPEND C920_SRCS source/c920_opt/yolov5.c) + list(APPEND C920_SRCS source/c920_opt/yolox.c) + list(APPEND C920_SRCS source/c920_opt/shl_c920_f32_to_i8.S) + list(APPEND C920_SRCS source/c920_opt/shl_c920_f32_to_u8.S) + list(APPEND C920_SRCS source/c920_opt/shl_c920_i8_to_f32.S) + list(APPEND C920_SRCS source/c920_opt/shl_c920_u8_to_f32.S) +endif() + +if(CONFIG_C920_CONVOLUTION_FP32) + list(APPEND C920_SRCS source/c920_opt/fp32/convolution_1x1_fp32_packn.c) + list(APPEND C920_SRCS source/c920_opt/fp32/convolution_3x3_fp32_packn.c) + list(APPEND C920_SRCS source/c920_opt/fp32/convolution_gemm_fp32_packn.c) + list(APPEND C920_SRCS source/c920_opt/fp32/convolution.c) + list(APPEND C920_SRCS source/c920_opt/fp32/gemm_fp32_packn.c) +endif() + +if(CONFIG_C920_CONVOLUTION_FP16) + list(APPEND C920_SRCS source/c920_opt/fp16/convolution_1x1_fp16_packn.c) + list(APPEND C920_SRCS source/c920_opt/fp16/convolution_3x3_fp16_packn.c) + list(APPEND C920_SRCS source/c920_opt/fp16/convolution_gemm_fp16_packn.c) + list(APPEND C920_SRCS source/c920_opt/fp16/convolution.c) + list(APPEND C920_SRCS source/c920_opt/fp16/gemm_fp16_packn.c) +endif() + +if(CONFIG_C920_GEMM_FP32) + list(APPEND C920_SRCS source/c920_opt/fp32/gemm_fp32_packn.c) + list(APPEND C920_SRCS source/c920_opt/fp32/gemm_fp32_block.c) +endif() + +if(CONFIG_C920_GEMM_FP16) + list(APPEND C920_SRCS source/c920_opt/fp16/gemm_fp16_packn.c) + list(APPEND C920_SRCS 
source/c920_opt/fp16/gemm_fp16_block.c) +endif() + +if(CONFIG_C920_MATMUL_FP32) + list(APPEND C920_SRCS source/c920_opt/fp32/matmul_fp32.c) +endif() + +if(CONFIG_C920_MATMUL_FP16) + list(APPEND C920_SRCS source/c920_opt/fp16/matmul_fp16.c) +endif() diff --git a/source/c920_opt/Kconfig b/source/c920_opt/Kconfig new file mode 100644 index 00000000..c9f2abe7 --- /dev/null +++ b/source/c920_opt/Kconfig @@ -0,0 +1,52 @@ + +menu "C920 optimization" + +config C920_SOURCE + bool "SHL C920 optimization code" + default y + help + Select SHL build C920 optimization + +config C920_CONVOLUTION_FP32 + depends on C920_SOURCE + bool "Layer convolution fp32" + default y + help + Select SHL build v extension optimized convolution + +config C920_CONVOLUTION_FP16 + depends on C920_SOURCE + bool "Layer convolution fp16" + default y + help + Select SHL build v extension optimized convolution + +config C920_GEMM_FP32 + depends on C920_SOURCE + bool "Layer GEMM fp32" + default y + help + Select SHL build v extension optimized gemm + +config C920_GEMM_FP16 + depends on C920_SOURCE + bool "Layer GEMM fp16" + default y + help + Select SHL build v extension optimized gemm + +config C920_MATMUL_FP32 + depends on C920_SOURCE + bool "Layer matmul fp32" + default y + help + Select SHL build v extension optimized matmul + +config C920_MATMUL_FP16 + depends on C920_SOURCE + bool "Layer matmul fp16" + default y + help + Select SHL build v extension optimized matmul + +endmenu diff --git a/source/c920_opt/capability.c b/source/c920_opt/capability.c index 82e4877d..e73d8955 100644 --- a/source/c920_opt/capability.c +++ b/source/c920_opt/capability.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_c920_cap.h" +#include "c920/cap.h" static int common_all_support(struct csinn_tensor *input, struct csinn_params_base *base) { @@ -37,25 +37,24 @@ int shl_c920_conv2d_cap(struct csinn_tensor *input, struct csinn_tensor *output, int shl_c920_matmul_cap(struct csinn_tensor *mat0, struct csinn_tensor *mat1, struct csinn_tensor *output, struct csinn_matmul_params *params) { - const int dims_count = mat0->dim_count; int batches_a = 1; int batches_b = 1; /* compute the outer size */ - for (int i = 0; i < dims_count - 2; i++) { + for (int i = 0; i < mat0->dim_count - 2; i++) { batches_a *= mat0->dim[i]; + } + for (int i = 0; i < mat1->dim_count - 2; i++) { batches_b *= mat1->dim[i]; } if (mat0->dtype == CSINN_DTYPE_FLOAT32 && mat1->dtype == CSINN_DTYPE_FLOAT32 || mat0->dtype == CSINN_DTYPE_FLOAT16 && (mat1->dtype == CSINN_DTYPE_FLOAT16 || mat1->dtype == CSINN_DTYPE_INT8)) { - if (batches_a == batches_b) { - if (!params->trans_a && !params->trans_b) { + if (!params->trans_a && !params->trans_b) { + if (batches_a == batches_b) { return CSINN_OPT_INTRINSIC; - } - } else if (batches_a > 1 && batches_b == 1) { - if (!params->trans_a && !params->trans_b) { + } else if (batches_a > 1 && batches_b == 1) { return CSINN_OPT_INTRINSIC; } } diff --git a/source/c920_opt/convolution.c b/source/c920_opt/convolution.c deleted file mode 100644 index 9cb9c6a6..00000000 --- a/source/c920_opt/convolution.c +++ /dev/null @@ -1,281 +0,0 @@ -/* - * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "shl_c920.h" - -int shl_c920_conv2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_tensor *kernel, struct csinn_tensor *bias, - struct csinn_conv2d_params *params) -{ - int32_t out_c = kernel->dim[0] / params->group; - int32_t in_c = kernel->dim[1]; - int32_t in_h = input->dim[2]; - int32_t in_w = input->dim[3]; - int32_t kernel_h = kernel->dim[2]; - int32_t kernel_w = kernel->dim[3]; - int32_t stride_h = params->stride_height; - int32_t stride_w = params->stride_width; - int32_t dalition_h = params->dilation_height; - int32_t dalition_w = params->dilation_width; - struct csinn_callback *cb = params->base.cb; - - const int packn = csrr_vlenb() / sizeof(float); - int in_elempack = 1; - int out_elempack = 1; - struct csinn_session *sess = params->base.sess; - if (sess->base_run_mode == CSINN_RM_CPU_GRAPH) { - struct shl_c920_option *option = shl_c920_get_graph_option(sess); - if (option && option->base.use_packn_layout) { - in_elempack = in_c % packn == 0 ? packn : 1; - out_elempack = out_c % packn == 0 ? 
packn : 1; - } - /* first layer do not convert input layout */ - if (shl_is_first_layer_input(input, sess)) { - in_elempack = 1; - } - } - - bool binary_model_op_init = shl_c920_get_binary_model_op_init(sess); - - // packn - if (in_elempack % packn == 0 && out_elempack % packn == 0) { - if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && - dalition_w == 1) { - params->conv_extra.conv_mode = CSINN_GEMM; - if (!binary_model_op_init) { - shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_fp32(kernel, params); - } - cb->exec = shl_c920_conv1x1s1_gemm_packn_fp32; - } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && - dalition_h == 1 && dalition_w == 1) { - if (params->group > 1) { - params->conv_extra.conv_mode = CSINN_GEMM; - if (!binary_model_op_init) { - shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp32(kernel, params); - } - cb->exec = shl_rvv_conv_im2col_gemm_packn_fp32; - return CSINN_TRUE; - } else { - params->conv_extra.conv_mode = CSINN_WINOGRAD; - - // TODO: params->conv_extra.kernel_tm in binary model - struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); - if ((in_h < 13) && (in_w < 13)) { - shl_rvv_wg_b4f3s1_trans_kernel_packn_fp32(kernel, t_kernel); - cb->exec = shl_c920_wg_b4f3s1_packn_fp32; - } else { - shl_rvv_wg_b6f3s1_trans_kernel_packn_fp32(kernel, t_kernel); - cb->exec = shl_c920_wg_b6f3s1_packn_fp32; - } - params->conv_extra.kernel_tm = t_kernel; - } - } else { - params->conv_extra.conv_mode = CSINN_GEMM; - if (!binary_model_op_init) { - shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp32(kernel, params); - } - cb->exec = shl_c920_conv_im2col_gemm_packn_fp32; - } - } - - // pack1ton - if (in_elempack % packn != 0 && out_elempack % packn == 0) { - params->conv_extra.conv_mode = CSINN_GEMM; - if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && - dalition_w == 1) { - if (!binary_model_op_init) { - 
shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_fp32(kernel, params); - } - cb->exec = shl_rvv_conv1x1s1_gemm_pack1ton_fp32; - } else { - if (!binary_model_op_init) { - shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp32(kernel, params); - } - cb->exec = shl_rvv_conv_im2col_gemm_pack1ton_fp32; - } - } - - // packnto1 - if (in_elempack % packn == 0 && out_elempack % packn != 0) { - params->conv_extra.conv_mode = CSINN_GEMM; - if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && - dalition_w == 1) { - if (!binary_model_op_init) { - shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_fp32(kernel, params); - } - cb->exec = shl_rvv_conv1x1s1_gemm_packnto1_fp32; - } else { - if (!binary_model_op_init) { - shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp32(kernel, params); - } - cb->exec = shl_rvv_conv_im2col_gemm_packnto1_fp32; - } - } - - // pack1 - if (in_elempack % packn != 0 && out_elempack % packn != 0) { - params->conv_extra.conv_mode = CSINN_GEMM; - if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && - dalition_w == 1) { - if (!binary_model_op_init) { - shl_rvv_conv1x1s1_gemm_reorder_kernel_fp32(kernel, params); - } - cb->exec = shl_rvv_conv1x1s1_gemm_fp32; - } else { - if (!binary_model_op_init) { - shl_rvv_conv_im2col_gemm_reorder_kernel_fp32(kernel, params); - } - cb->exec = shl_rvv_conv_im2col_gemm_fp32; - } - } - return CSINN_TRUE; -} - -int shl_c920_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_tensor *kernel, struct csinn_tensor *bias, - struct csinn_conv2d_params *params) -{ - int32_t out_c = kernel->dim[0] / params->group; - int32_t in_c = kernel->dim[1]; - int32_t in_h = input->dim[2]; - int32_t in_w = input->dim[3]; - int32_t kernel_h = kernel->dim[2]; - int32_t kernel_w = kernel->dim[3]; - int32_t stride_h = params->stride_height; - int32_t stride_w = params->stride_width; - int32_t dalition_h = params->dilation_height; - int32_t 
dalition_w = params->dilation_width; - struct csinn_callback *cb = params->base.cb; - - const int packn = csrr_vlenb() / sizeof(__fp16); - int in_elempack = 1; - int out_elempack = 1; - struct csinn_session *sess = params->base.sess; - if (sess->base_run_mode == CSINN_RM_CPU_GRAPH) { - struct shl_c920_option *option = shl_c920_get_graph_option(sess); - if (option && option->base.use_packn_layout) { - in_elempack = in_c % packn == 0 ? packn : 1; - out_elempack = out_c % packn == 0 ? packn : 1; - } - /* first layer do not convert input layout */ - if (shl_is_first_layer_input(input, sess)) { - in_elempack = 1; - } - } - - bool binary_model_op_init = shl_c920_get_binary_model_op_init(sess); - - // packn - if (in_elempack % packn == 0 && out_elempack % packn == 0) { - if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && - dalition_w == 1) { - params->conv_extra.conv_mode = CSINN_GEMM; - if (!binary_model_op_init) { - shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_fp16(kernel, params); - } - cb->exec = shl_c920_conv1x1s1_gemm_packn_fp16; - } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && - dalition_h == 1 && dalition_w == 1) { - if (params->group > 1) { - params->conv_extra.conv_mode = CSINN_GEMM; - if (!binary_model_op_init) { - shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp16(kernel, params); - } - cb->exec = shl_rvv_conv_im2col_gemm_packn_fp16; - return CSINN_TRUE; - } else { - params->conv_extra.conv_mode = CSINN_WINOGRAD; - - // TODO: params->conv_extra.kernel_tm in binary model - struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); - if ((in_h < 13) && (in_w < 13)) { - shl_rvv_wg_b4f3s1_trans_kernel_packn_fp16(kernel, t_kernel); - cb->exec = shl_c920_wg_b4f3s1_packn_fp16; - } else { - shl_rvv_wg_b6f3s1_trans_kernel_packn_fp16(kernel, t_kernel); - cb->exec = shl_c920_wg_b6f3s1_packn_fp16; - } - params->conv_extra.kernel_tm = t_kernel; - } - } else { - params->conv_extra.conv_mode = CSINN_GEMM; 
- if (!binary_model_op_init) { - shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp16(kernel, params); - } - cb->exec = shl_c920_conv_im2col_gemm_packn_fp16; - } - } - - // pack1ton - if (in_elempack % packn != 0 && out_elempack % packn == 0) { - params->conv_extra.conv_mode = CSINN_GEMM; - if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && - dalition_w == 1) { - if (!binary_model_op_init) { - shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_fp16(kernel, params); - } - cb->exec = shl_rvv_conv1x1s1_gemm_pack1ton_fp16; - } else { - // if (!binary_model_op_init) { - // shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp16(kernel, params); - // } - // cb->exec = shl_rvv_conv_im2col_gemm_pack1ton_fp16; - - /* xxx: pack1 for first layer */ - if (!binary_model_op_init) { - shl_rvv_conv_im2col_gemm_reorder_kernel_fp16(kernel, params); - } - cb->exec = shl_rvv_conv_im2col_gemm_fp16; - } - } - - // packnto1 - if (in_elempack % packn == 0 && out_elempack % packn != 0) { - params->conv_extra.conv_mode = CSINN_GEMM; - if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && - dalition_w == 1) { - if (!binary_model_op_init) { - shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_fp16(kernel, params); - } - cb->exec = shl_rvv_conv1x1s1_gemm_packnto1_fp16; - } else { - if (!binary_model_op_init) { - shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp16(kernel, params); - } - cb->exec = shl_rvv_conv_im2col_gemm_packnto1_fp16; - } - } - - // pack1 - if (in_elempack % packn != 0 && out_elempack % packn != 0) { - params->conv_extra.conv_mode = CSINN_GEMM; - if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && - dalition_w == 1) { - if (!binary_model_op_init) { - shl_rvv_conv1x1s1_gemm_reorder_kernel_fp16(kernel, params); - } - cb->exec = shl_rvv_conv1x1s1_gemm_fp16; - } else { - if (!binary_model_op_init) { - shl_rvv_conv_im2col_gemm_reorder_kernel_fp16(kernel, params); - } - 
cb->exec = shl_rvv_conv_im2col_gemm_fp16; - } - } - return CSINN_TRUE; -} \ No newline at end of file diff --git a/source/c920_opt/convolution_1x1_fp16_packn.c b/source/c920_opt/convolution_1x1_fp16_packn.c deleted file mode 100644 index 855caeb8..00000000 --- a/source/c920_opt/convolution_1x1_fp16_packn.c +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "shl_c920.h" - -int shl_c920_conv1x1s1_gemm_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_tensor *kernel, struct csinn_tensor *bias, - struct csinn_conv2d_params *params) -{ - if (input->layout == CSINN_LAYOUT_NCHW) { - shl_rvv_tensor_ndarray_to_nc1xc0_replace_fp16(input); - } - if (output->layout == CSINN_LAYOUT_NCHW) { - const int packn = csrr_vlenb() / sizeof(__fp16); - output->dim[1] /= packn; - output->dim[4] = packn; - output->dim_count = 5; - output->layout = CSINN_LAYOUT_NC1HWC0; - } - __fp16 *input_data = (__fp16 *)input->data; - __fp16 *output_data = (__fp16 *)output->data; - __fp16 *kernel_data = (__fp16 *)kernel->data; - __fp16 *bias_data = (__fp16 *)bias->data; - - int32_t group = params->group; - int32_t batch = input->dim[0]; // assert(batch == 1); - int32_t in_ch = input->dim[1] * input->dim[4]; - int32_t out_ch = kernel->dim[0]; - int32_t out_h = output->dim[2]; - int32_t out_w = output->dim[3]; - - int32_t m = out_ch / group; - int32_t k = in_ch / group; - int32_t n = out_h * out_w; - - __fp16 *pb_reorder = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); - - for (int i = 0; i < batch; i++) { - for (int g = 0; g < group; g++) { - __fp16 *kernel_ptr = kernel_data + g * m * k; - __fp16 *in_ptr = pb_reorder; - __fp16 *out_ptr = output_data; - __fp16 *bias_ptr = bias_data ? 
(bias_data + g * m) : NULL; - - // pack - shl_rvv_reorder_input_z8_packn_fp16(input_data, in_ptr, k, n, n); - // GEMM - shl_c920_ncxhwx_gemm_8xpack2n_fp16(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k, n, n); - - input_data += k * n; - output_data += m * n; - } - } - shl_mem_free(pb_reorder); - // requantize - shl_rvv_sidcso_op_requantize_fp16(input, output, kernel); - return CSINN_TRUE; -} diff --git a/source/c920_opt/convolution_1x1_fp32_packn.c b/source/c920_opt/convolution_1x1_fp32_packn.c deleted file mode 100644 index 13701f2b..00000000 --- a/source/c920_opt/convolution_1x1_fp32_packn.c +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "shl_c920.h" - -int shl_c920_conv1x1s1_gemm_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_tensor *kernel, struct csinn_tensor *bias, - struct csinn_conv2d_params *params) -{ - if (input->layout == CSINN_LAYOUT_NCHW) { - shl_rvv_tensor_ndarray_to_nc1xc0_replace_fp32(input); - } - if (output->layout == CSINN_LAYOUT_NCHW) { - const int packn = csrr_vlenb() / sizeof(float); - output->dim[1] /= packn; - output->dim[4] = packn; - output->dim_count = 5; - output->layout = CSINN_LAYOUT_NC1HWC0; - } - float *input_data = (float *)input->data; - float *output_data = (float *)output->data; - float *kernel_data = (float *)kernel->data; - float *bias_data = (float *)bias->data; - - int32_t group = params->group; - int32_t batch = input->dim[0]; // assert(batch == 1); - int32_t in_ch = input->dim[1] * input->dim[4]; - int32_t out_ch = kernel->dim[0]; - int32_t out_h = output->dim[2]; - int32_t out_w = output->dim[3]; - - int32_t m = out_ch / group; - int32_t k = in_ch / group; - int32_t n = out_h * out_w; - - float *pb_reorder = (float *)shl_mem_alloc(k * n * sizeof(float)); - - for (int i = 0; i < batch; i++) { - for (int g = 0; g < group; g++) { - float *kernel_ptr = kernel_data + g * m * k; - float *in_ptr = pb_reorder; - float *out_ptr = output_data; - float *bias_ptr = bias_data ? (bias_data + g * m) : NULL; - - // pack - shl_rvv_reorder_input_z8_packn_fp32(input_data, in_ptr, k, n, n); - // GEMM - shl_c920_ncxhwx_gemm_8xpack2n_fp32(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k, n, n); - - input_data += k * n; - output_data += m * n; - } - } - shl_mem_free(pb_reorder); - return CSINN_TRUE; -} diff --git a/source/c920_opt/convolution_gemm_fp16_packn.c b/source/c920_opt/convolution_gemm_fp16_packn.c deleted file mode 100644 index ce00f959..00000000 --- a/source/c920_opt/convolution_gemm_fp16_packn.c +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. 
- * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "shl_c920.h" - -int shl_c920_conv_im2col_gemm_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_tensor *kernel, struct csinn_tensor *bias, - struct csinn_conv2d_params *params) -{ - if (input->layout == CSINN_LAYOUT_NCHW) { - shl_rvv_tensor_ndarray_to_nc1xc0_replace_fp16(input); - } - if (output->layout == CSINN_LAYOUT_NCHW) { - const int packn = csrr_vlenb() / sizeof(__fp16); - output->dim[1] /= packn; - output->dim[4] = packn; - output->dim_count = 5; - output->layout = CSINN_LAYOUT_NC1HWC0; - } - __fp16 *input_data = (__fp16 *)input->data; - __fp16 *output_data = (__fp16 *)output->data; - __fp16 *kernel_data = (__fp16 *)kernel->data; - __fp16 *bias_data = (__fp16 *)bias->data; - - int32_t group = params->group; - int32_t batch = input->dim[0]; - int32_t in_c = input->dim[1] * input->dim[4]; - int32_t in_h = input->dim[2]; - int32_t in_w = input->dim[3]; - int32_t out_c = kernel->dim[0]; - int32_t out_h = output->dim[2]; - int32_t out_w = output->dim[3]; - - int32_t ksize_h = kernel->dim[2]; - int32_t ksize_w = kernel->dim[3]; - int32_t stride_h = params->stride_height; - int32_t stride_w = params->stride_width; - int32_t dilation_h = params->dilation_height; - int32_t dilation_w = params->dilation_width; - - int32_t m = out_c / group; - int32_t in_cp = in_c / group; - int32_t maxk = ksize_h * ksize_w; - int32_t n = out_h * 
out_w; - - for (int i = 0; i < batch; i++) { - for (int g = 0; g < group; g++) { - // padding - int padded_in_h = in_h + params->pad_top + params->pad_down; - int padded_in_w = in_w + params->pad_left + params->pad_right; - int padded_in_hw = padded_in_h * padded_in_w; - __fp16 *input_pad_buf = (__fp16 *)shl_mem_alloc(in_cp * padded_in_hw * sizeof(__fp16)); - shl_rvv_pad_input_packn_fp16(input_data, input_pad_buf, in_cp, in_h, in_w, padded_in_h, - padded_in_w, params->pad_top, params->pad_left); - - // im2col - const int packn = csrr_vlenb() / sizeof(__fp16); - const int vl = vsetvl_e16m1(packn); - - __fp16 *im2col_buf = (__fp16 *)shl_mem_alloc(in_cp / packn * maxk * out_h * out_w * - packn * sizeof(__fp16)); - const int tailstep = (padded_in_w * stride_h - out_w * stride_w) * packn; - - for (int c = 0; c + packn - 1 < in_cp; c += packn) { - const __fp16 *img0 = input_pad_buf + c * padded_in_hw; - __fp16 *dst_ptr = im2col_buf + c * maxk * out_h * out_w; - - for (int a = 0; a < ksize_h; a++) { - for (int b = 0; b < ksize_w; b++) { - const __fp16 *img1 = - img0 + a * dilation_h * padded_in_w * packn + b * dilation_w * packn; - - for (int p = 0; p < out_h; p++) { - for (int q = 0; q < out_w; q++) { - vfloat16m1_t _tmp = vle16_v_f16m1(img1, vl); - img1 += stride_w * packn; - vse16_v_f16m1(dst_ptr, _tmp, vl); - dst_ptr += packn; - } - img1 += tailstep; - } - } - } - } - shl_mem_free(input_pad_buf); - - // reorder(pack) - __fp16 *reorder_buf = (__fp16 *)shl_mem_alloc(in_cp * maxk * n * sizeof(__fp16)); - shl_rvv_reorder_input_z8_packn_fp16(im2col_buf, reorder_buf, in_cp * maxk, n, n); - shl_mem_free(im2col_buf); - - // gemm - __fp16 *ker_ptr = kernel_data + g * m * maxk * in_cp; - __fp16 *bias_ptr = bias_data ? 
(bias_data + g * m) : NULL; - shl_c920_ncxhwx_gemm_8xpack2n_fp16(output_data, ker_ptr, reorder_buf, bias_ptr, m, - in_cp * maxk, n, n); - shl_mem_free(reorder_buf); - - input_data += in_cp * in_h * in_w; - output_data += m * n; - } - } - // requantize - shl_rvv_sidcso_op_requantize_fp16(input, output, kernel); - return CSINN_TRUE; -} diff --git a/source/c920_opt/convolution_gemm_fp32_packn.c b/source/c920_opt/convolution_gemm_fp32_packn.c deleted file mode 100644 index e83500d3..00000000 --- a/source/c920_opt/convolution_gemm_fp32_packn.c +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "shl_c920.h" - -int shl_c920_conv_im2col_gemm_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_tensor *kernel, struct csinn_tensor *bias, - struct csinn_conv2d_params *params) -{ - if (input->layout == CSINN_LAYOUT_NCHW) { - shl_rvv_tensor_ndarray_to_nc1xc0_replace_fp32(input); - } - if (output->layout == CSINN_LAYOUT_NCHW) { - const int packn = csrr_vlenb() / sizeof(float); - output->dim[1] /= packn; - output->dim[4] = packn; - output->dim_count = 5; - output->layout = CSINN_LAYOUT_NC1HWC0; - } - float *input_data = (float *)input->data; - float *output_data = (float *)output->data; - float *kernel_data = (float *)kernel->data; - float *bias_data = (float *)bias->data; - - int32_t group = params->group; - int32_t batch = input->dim[0]; - int32_t in_c = input->dim[1] * input->dim[4]; - int32_t in_h = input->dim[2]; - int32_t in_w = input->dim[3]; - int32_t out_c = kernel->dim[0]; - int32_t out_h = output->dim[2]; - int32_t out_w = output->dim[3]; - - int32_t ksize_h = kernel->dim[2]; - int32_t ksize_w = kernel->dim[3]; - int32_t stride_h = params->stride_height; - int32_t stride_w = params->stride_width; - int32_t dilation_h = params->dilation_height; - int32_t dilation_w = params->dilation_width; - - int32_t m = out_c / group; - int32_t in_cp = in_c / group; - int32_t maxk = ksize_h * ksize_w; - int32_t n = out_h * out_w; - - for (int i = 0; i < batch; i++) { - for (int g = 0; g < group; g++) { - // padding - int padded_in_h = in_h + params->pad_top + params->pad_down; - int padded_in_w = in_w + params->pad_left + params->pad_right; - int padded_in_hw = padded_in_h * padded_in_w; - float *input_pad_buf = (float *)shl_mem_alloc(in_cp * padded_in_hw * sizeof(float)); - shl_rvv_pad_input_packn_fp32(input_data, input_pad_buf, in_cp, in_h, in_w, padded_in_h, - padded_in_w, params->pad_top, params->pad_left); - - // im2col - const int packn = csrr_vlenb() / sizeof(float); - const int vl = vsetvl_e32m1(packn); - - // 
[in_c/packn, maxk, out_h, out_w, packn] - float *im2col_buf = (float *)shl_mem_alloc(in_cp / packn * maxk * out_h * out_w * - packn * sizeof(float)); - const int tailstep = (padded_in_w * stride_h - out_w * stride_w) * packn; - - for (int c = 0; c + packn - 1 < in_cp; c += packn) { - const float *img0 = input_pad_buf + c * padded_in_hw; - float *dst_ptr = im2col_buf + c * maxk * out_h * out_w; - - for (int a = 0; a < ksize_h; a++) { - for (int b = 0; b < ksize_w; b++) { - const float *img1 = - img0 + a * dilation_h * padded_in_w * packn + b * dilation_w * packn; - - for (int p = 0; p < out_h; p++) { - for (int q = 0; q < out_w; q++) { - vfloat32m1_t _tmp = vle32_v_f32m1(img1, vl); - img1 += stride_w * packn; - vse32_v_f32m1(dst_ptr, _tmp, vl); - dst_ptr += packn; - } - img1 += tailstep; - } - } - } - } - shl_mem_free(input_pad_buf); - - // reorder(pack) - float *reorder_buf = (float *)shl_mem_alloc(in_cp * maxk * n * sizeof(float)); - shl_rvv_reorder_input_z8_packn_fp32(im2col_buf, reorder_buf, in_cp * maxk, n, n); - shl_mem_free(im2col_buf); - - // gemm - float *ker_ptr = kernel_data + g * m * maxk * in_cp; - float *bias_ptr = bias_data ? (bias_data + g * m) : NULL; - shl_c920_ncxhwx_gemm_8xpack2n_fp32(output_data, ker_ptr, reorder_buf, bias_ptr, m, - in_cp * maxk, n, n); - shl_mem_free(reorder_buf); - - input_data += in_cp * in_h * in_w; - output_data += m * n; - } - } - return CSINN_TRUE; -} diff --git a/source/c920_opt/fp16/convolution.c b/source/c920_opt/fp16/convolution.c new file mode 100644 index 00000000..58d57d86 --- /dev/null +++ b/source/c920_opt/fp16/convolution.c @@ -0,0 +1,196 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "c920/c920.h" + +int shl_c920_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int32_t out_c = kernel->dim[0] / params->group; + int32_t in_c = kernel->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t kernel_h = kernel->dim[2]; + int32_t kernel_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + int32_t dilation_h = params->dilation_height; + int32_t dilation_w = params->dilation_width; + struct csinn_callback *cb = params->base.cb; + + const int packn = csrr_vlenb() / sizeof(__fp16); + int in_elempack = 1; + int out_elempack = 1; + struct csinn_session *sess = params->base.sess; + if (sess->base_run_mode == CSINN_RM_CPU_GRAPH) { + struct shl_c920_option *option = shl_c920_get_graph_option(sess); + if (option && option->base.use_packn_layout) { + in_elempack = in_c % packn == 0 ? packn : 1; + out_elempack = out_c % packn == 0 ? packn : 1; + } + /* first layer do not convert input layout */ + if (shl_is_first_layer_input(input, sess)) { + in_elempack = 1; + } + } else if (sess->base_run_mode == CSINN_RM_LAYER) { + in_elempack = in_c % packn == 0 ? packn : 1; + out_elempack = out_c % packn == 0 ? 
packn : 1; + } + + bool binary_model_op_init = shl_c920_get_binary_model_op_init(sess); + + // packn + if (in_elempack % packn == 0 && out_elempack % packn == 0) { + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && + dilation_w == 1) { + params->conv_extra.conv_mode = CSINN_GEMM; + if (!binary_model_op_init) { + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_fp16_w_int8(kernel, params); + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_fp16(kernel, params); + } + } + cb->exec = shl_c920_conv1x1s1_gemm_packn_fp16; + } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && + dilation_h == 1 && dilation_w == 1) { + if (params->group > 1 || (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8)) { + params->conv_extra.conv_mode = CSINN_GEMM; + if (!binary_model_op_init) { + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp16_w_int8(kernel, params); + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp16(kernel, params); + } + } + cb->exec = shl_rvv_conv_im2col_gemm_packn_fp16; + return CSINN_TRUE; + } else { + params->conv_extra.conv_mode = CSINN_WINOGRAD; + + // TODO: params->conv_extra.kernel_tm in binary model + struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); + if ((in_h < 13) && (in_w < 13)) { + shl_rvv_wg_b4f3s1_trans_kernel_packn_fp16(kernel, t_kernel); + cb->exec = shl_c920_wg_b4f3s1_packn_fp16; + } else { + shl_rvv_wg_b6f3s1_trans_kernel_packn_fp16(kernel, t_kernel); + cb->exec = shl_c920_wg_b6f3s1_packn_fp16; + } + params->conv_extra.kernel_tm = t_kernel; + } + } else { + params->conv_extra.conv_mode = CSINN_GEMM; + if (!binary_model_op_init) { + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + 
shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp16_w_int8(kernel, params); + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp16(kernel, params); + } + } + cb->exec = shl_c920_conv_im2col_gemm_packn_fp16; + } + } + + // pack1ton + if (in_elempack % packn != 0 && out_elempack % packn == 0) { + params->conv_extra.conv_mode = CSINN_GEMM; + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && + dilation_w == 1) { + if (!binary_model_op_init) { + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_fp16_w_int8(kernel, params); + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_fp16(kernel, params); + } + } + cb->exec = shl_rvv_conv1x1s1_gemm_pack1ton_fp16; + } else { + // if (!binary_model_op_init) { + // if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + // shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp16_w_int8(kernel, params); + // } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + // shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp16(kernel, params); + // } + // } + // cb->exec = shl_rvv_conv_im2col_gemm_pack1ton_fp16; + + /* xxx: pack1 for first layer */ + if (!binary_model_op_init) { + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_rvv_conv_im2col_gemm_reorder_kernel_fp16_w_int8(kernel, params); + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + shl_rvv_conv_im2col_gemm_reorder_kernel_fp16(kernel, params); + } + } + cb->exec = shl_rvv_conv_im2col_gemm_fp16; + } + } + + // packnto1 + if (in_elempack % packn == 0 && out_elempack % packn != 0) { + params->conv_extra.conv_mode = CSINN_GEMM; + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && + dilation_w == 1) { + if (!binary_model_op_init) { + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + 
shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_fp16_w_int8(kernel, params); + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_fp16(kernel, params); + } + } + cb->exec = shl_rvv_conv1x1s1_gemm_packnto1_fp16; + } else { + if (!binary_model_op_init) { + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp16_w_int8(kernel, params); + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp16(kernel, params); + } + } + cb->exec = shl_rvv_conv_im2col_gemm_packnto1_fp16; + } + } + + // pack1 + if (in_elempack % packn != 0 && out_elempack % packn != 0) { + params->conv_extra.conv_mode = CSINN_GEMM; + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && + dilation_w == 1) { + if (!binary_model_op_init) { + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_fp16_w_int8(kernel, params); + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_fp16(kernel, params); + } + } + cb->exec = shl_rvv_conv1x1s1_gemm_fp16; + } else { + if (!binary_model_op_init) { + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_rvv_conv_im2col_gemm_reorder_kernel_fp16_w_int8(kernel, params); + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + shl_rvv_conv_im2col_gemm_reorder_kernel_fp16(kernel, params); + } + } + cb->exec = shl_rvv_conv_im2col_gemm_fp16; + } + } + return CSINN_TRUE; +} diff --git a/source/c920_opt/fp16/convolution_1x1_fp16_packn.c b/source/c920_opt/fp16/convolution_1x1_fp16_packn.c new file mode 100644 index 00000000..bc20153f --- /dev/null +++ b/source/c920_opt/fp16/convolution_1x1_fp16_packn.c @@ -0,0 +1,28 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "c920/c920.h" + +int shl_c920_conv1x1s1_gemm_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + return shl_rvv_common_conv1x1_gemm_packn_fp16(input, output, kernel, bias, params, + shl_rvv_reorder_input_z8_packn_fp16, + shl_c920_ncxhwx_gemm_8xpack2n_fp16); +} diff --git a/source/c920_opt/convolution_3x3_fp16_packn.c b/source/c920_opt/fp16/convolution_3x3_fp16_packn.c similarity index 99% rename from source/c920_opt/convolution_3x3_fp16_packn.c rename to source/c920_opt/fp16/convolution_3x3_fp16_packn.c index 7b0f1773..38f9d9aa 100644 --- a/source/c920_opt/convolution_3x3_fp16_packn.c +++ b/source/c920_opt/fp16/convolution_3x3_fp16_packn.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c920.h" +#include "c920/c920.h" /************************************************************* note: VLEN = 128/256 ... diff --git a/source/c920_opt/fp16/convolution_gemm_fp16_packn.c b/source/c920_opt/fp16/convolution_gemm_fp16_packn.c new file mode 100644 index 00000000..051e22d2 --- /dev/null +++ b/source/c920_opt/fp16/convolution_gemm_fp16_packn.c @@ -0,0 +1,28 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "c920/c920.h" + +int shl_c920_conv_im2col_gemm_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + return shl_rvv_common_conv_gemm_packn_fp16(input, output, kernel, bias, params, + shl_rvv_reorder_input_z8_packn_fp16, + shl_c920_ncxhwx_gemm_8xpack2n_fp16); +} diff --git a/source/c920_opt/gemm_fp16_block.c b/source/c920_opt/fp16/gemm_fp16_block.c similarity index 96% rename from source/c920_opt/gemm_fp16_block.c rename to source/c920_opt/fp16/gemm_fp16_block.c index f9befcc8..73cb8301 100644 --- a/source/c920_opt/gemm_fp16_block.c +++ b/source/c920_opt/fp16/gemm_fp16_block.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_c920.h" +#include "c920/c920.h" /************************************************************* * packn = vlenb / sizeof(__fp16) @@ -165,14 +165,6 @@ static inline void gemm_8xpack2n_fp16(__fp16 *dst, const __fp16 *sa, const __fp1 vfloat16m1_t _acc50; vfloat16m1_t _acc60; vfloat16m1_t _acc70; - vfloat16m1_t _acc01; - vfloat16m1_t _acc11; - vfloat16m1_t _acc21; - vfloat16m1_t _acc31; - vfloat16m1_t _acc41; - vfloat16m1_t _acc51; - vfloat16m1_t _acc61; - vfloat16m1_t _acc71; if (k_idx == 0) { const __fp16 *b_ptr = bias_data + i; _acc00 = vfmv_v_f_f16m1(b_ptr[0], vl); @@ -306,10 +298,6 @@ static inline void gemm_8xpack2n_fp16(__fp16 *dst, const __fp16 *sa, const __fp1 vfloat16m1_t _acc10; vfloat16m1_t _acc20; vfloat16m1_t _acc30; - vfloat16m1_t _acc01; - vfloat16m1_t _acc11; - vfloat16m1_t _acc21; - vfloat16m1_t _acc31; if (k_idx == 0) { const __fp16 *b_ptr = bias_data + i; _acc00 = vfmv_v_f_f16m1(b_ptr[0], vl); @@ -399,8 +387,6 @@ static inline void gemm_8xpack2n_fp16(__fp16 *dst, const __fp16 *sa, const __fp1 vfloat16m1_t _acc00; vfloat16m1_t _acc10; - vfloat16m1_t _acc01; - vfloat16m1_t _acc11; if (k_idx == 0) { const __fp16 *b_ptr = bias_data + i; _acc00 = vfmv_v_f_f16m1(b_ptr[0], vl); @@ -468,7 +454,6 @@ static inline void gemm_8xpack2n_fp16(__fp16 *dst, const __fp16 *sa, const __fp1 __fp16 *out0_ptr = output_data + i * ldc + j; vfloat16m1_t _acc00; - vfloat16m1_t _acc01; if (k_idx == 0) { const __fp16 *b_ptr = bias_data + i; _acc00 = vfmv_v_f_f16m1(b_ptr[0], vl); @@ -493,8 +478,8 @@ static inline void gemm_8xpack2n_fp16(__fp16 *dst, const __fp16 *sa, const __fp1 /************************************************************* * packn = vlenb / sizeof(__fp16) - * m_blk: M_BLK, M_BLK/2, M_BLK/4, ..., 8 - * n_blk: N_BLK, N_BLK/2, N_BLK/4, ..., pack2n + * m_blk: M_BLK, M_tail + * n_blk: N_BLK, N_tail * k_blk: K_BLK, K_tail * * dst - output: [m, n] @@ -524,20 +509,14 @@ void shl_c920_gemm_block_8xpack2n_fp16(__fp16 *dst, const __fp16 *sa, const __fp 
int m_block = M_BLK; int m_idx = 0; while (m_idx < m) { - while (!(m_idx + m_block - 1 < m)) { - m_block /= 2; - } - if (m_block < MIN_M_BLK) { + if (m - m_idx < m_block) { m_block = m - m_idx; } int n_block = N_BLK; int n_idx = 0; while (n_idx < n) { - while (!(n_idx + n_block - 1 < n)) { - n_block /= 2; - } - if (n_block < MIN_N_BLK) { + if (n - n_idx < n_block) { n_block = n - n_idx; } diff --git a/source/c920_opt/gemm_fp16_packn.c b/source/c920_opt/fp16/gemm_fp16_packn.c similarity index 99% rename from source/c920_opt/gemm_fp16_packn.c rename to source/c920_opt/fp16/gemm_fp16_packn.c index 951500f6..f1dee96a 100644 --- a/source/c920_opt/gemm_fp16_packn.c +++ b/source/c920_opt/fp16/gemm_fp16_packn.c @@ -16,15 +16,16 @@ * limitations under the License. */ -#include "shl_c920.h" +#include "c920/c920.h" /************************************************************** * dst - output: [m/packn, n, packn] * sa - kernel: [m/pack2n, k, pack2n] [m/packn, k, packn] * sb - input: [n/8, k, 8] **************************************************************/ +// XXX: unsupported fuse relu void shl_c920_ncxhwx_gemm_8xpack2n_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, - __fp16 *bias, int m, int k, int n, int ldc) + __fp16 *bias, int m, int k, int n, bool fuse_relu) { __fp16 *kernel_data = (__fp16 *)sa; __fp16 *input_data = (__fp16 *)sb; diff --git a/source/c920_opt/matmul_fp16.c b/source/c920_opt/fp16/matmul_fp16.c similarity index 80% rename from source/c920_opt/matmul_fp16.c rename to source/c920_opt/fp16/matmul_fp16.c index 04e9c539..6cc35978 100644 --- a/source/c920_opt/matmul_fp16.c +++ b/source/c920_opt/fp16/matmul_fp16.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_c920.h" +#include "c920/c920.h" /************************************************************* Matmul fp16 performance on C920@1.848GHz @@ -37,6 +37,13 @@ int shl_c920_matmul_fp16(struct csinn_tensor *mat0, struct csinn_tensor *mat1, struct csinn_tensor *output, struct csinn_matmul_params *params) { + if (mat0->layout >= CSINN_LAYOUT_NC1C0 && mat0->layout <= CSINN_LAYOUT_NC1DHWC0) { + shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp16(mat0); + } + if (mat1->layout >= CSINN_LAYOUT_NC1C0 && mat1->layout <= CSINN_LAYOUT_NC1DHWC0) { + shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp16(mat1); + } + __fp16 *mat0_data = (__fp16 *)mat0->data; __fp16 *mat1_data = (__fp16 *)mat1->data; __fp16 *output_data = (__fp16 *)output->data; @@ -49,7 +56,6 @@ int shl_c920_matmul_fp16(struct csinn_tensor *mat0, struct csinn_tensor *mat1, for (int i = 0; i < dims_count - 2; i++) { batches_a *= mat0->dim[i]; } - for (int i = 0; i < mat1->dim_count - 2; i++) { batches_b *= mat1->dim[i]; } @@ -58,8 +64,8 @@ int shl_c920_matmul_fp16(struct csinn_tensor *mat0, struct csinn_tensor *mat1, const int dim_k = mat0->dim[dims_count - (params->trans_a ? 2 : 1)]; const int dim_n = mat1->dim[mat1->dim_count - (params->trans_b ? 
2 : 1)]; - if (batches_a == batches_b) { - if (!params->trans_a && !params->trans_b) { + if (!params->trans_a && !params->trans_b) { + if (batches_a == batches_b) { __fp16 *in0 = (__fp16 *)shl_mem_alloc(dim_m * dim_k * sizeof(__fp16)); __fp16 *in1; if (!(mat1->is_const)) { @@ -89,11 +95,7 @@ int shl_c920_matmul_fp16(struct csinn_tensor *mat0, struct csinn_tensor *mat1, } // requantize shl_rvv_sidcso_op_requantize_fp16(mat0, output, mat1); - } else { - shl_ref_matmul_quant(mat0, mat1, output, params); - } - } else if (batches_a > 1 && batches_b == 1) { - if (!params->trans_a && !params->trans_b) { + } else if (batches_a > 1 && batches_b == 1) { __fp16 *in0 = (__fp16 *)shl_mem_alloc(dim_m * dim_k * sizeof(__fp16)); __fp16 *in1; if (!(mat1->is_const)) { @@ -125,8 +127,7 @@ int shl_c920_matmul_fp16(struct csinn_tensor *mat0, struct csinn_tensor *mat1, return CSINN_FALSE; } } else { - shl_debug_error("matmul unsupported this broadcast\n"); - return CSINN_FALSE; + return shl_ref_matmul_quant(mat0, mat1, output, params); } return CSINN_TRUE; @@ -135,6 +136,10 @@ int shl_c920_matmul_fp16(struct csinn_tensor *mat0, struct csinn_tensor *mat1, int shl_c920_matmul_fp16_w_int8(struct csinn_tensor *mat0, struct csinn_tensor *mat1, struct csinn_tensor *output, struct csinn_matmul_params *params) { + if (mat0->layout >= CSINN_LAYOUT_NC1C0 && mat0->layout <= CSINN_LAYOUT_NC1DHWC0) { + shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp16(mat0); + } + __fp16 *mat0_data = (__fp16 *)mat0->data; int8_t *mat1_data = (int8_t *)mat1->data; __fp16 *output_data = (__fp16 *)output->data; @@ -146,12 +151,14 @@ int shl_c920_matmul_fp16_w_int8(struct csinn_tensor *mat0, struct csinn_tensor * /* compute the outer size */ for (int i = 0; i < dims_count - 2; i++) { batches_a *= mat0->dim[i]; + } + for (int i = 0; i < mat1->dim_count - 2; i++) { batches_b *= mat1->dim[i]; } const int dim_m = mat0->dim[dims_count - (params->trans_a ? 1 : 2)]; const int dim_k = mat0->dim[dims_count - (params->trans_a ? 
2 : 1)]; - const int dim_n = mat1->dim[dims_count - (params->trans_b ? 2 : 1)]; + const int dim_n = mat1->dim[mat1->dim_count - (params->trans_b ? 2 : 1)]; int32_t zp = mat1->qinfo->zero_point; float scale = mat1->qinfo->scale; @@ -159,8 +166,8 @@ int shl_c920_matmul_fp16_w_int8(struct csinn_tensor *mat0, struct csinn_tensor * int api = params->base.api; int size1 = csinn_tensor_size(mat1); - if (batches_a == batches_b) { - if (!params->trans_a && !params->trans_b) { + if (!params->trans_a && !params->trans_b) { + if (batches_a == batches_b) { __fp16 *in0 = (__fp16 *)shl_mem_alloc(dim_m * dim_k * sizeof(__fp16)); __fp16 *in1 = (__fp16 *)shl_mem_alloc(size1 * sizeof(__fp16)); shl_rvv_dequantize_i8_to_f16(mat1_data, in1, size1, zp, scale); @@ -178,11 +185,7 @@ int shl_c920_matmul_fp16_w_int8(struct csinn_tensor *mat0, struct csinn_tensor * } shl_mem_free(in0); shl_mem_free(in1); - } else { - shl_ref_matmul_quant(mat0, mat1, output, params); - } - } else if (batches_a > 1 && batches_b == 1) { - if (!params->trans_a && !params->trans_b) { + } else if (batches_a > 1 && batches_b == 1) { __fp16 *in0 = (__fp16 *)shl_mem_alloc(dim_m * dim_k * sizeof(__fp16)); __fp16 *in1 = (__fp16 *)shl_mem_alloc(size1 * sizeof(__fp16)); shl_rvv_dequantize_i8_to_f16(mat1_data, in1, size1, zp, scale); @@ -204,8 +207,7 @@ int shl_c920_matmul_fp16_w_int8(struct csinn_tensor *mat0, struct csinn_tensor * return CSINN_FALSE; } } else { - shl_debug_error("matmul unsupported this broadcast\n"); - return CSINN_FALSE; + return shl_ref_matmul_quant(mat0, mat1, output, params); } return CSINN_TRUE; @@ -215,22 +217,24 @@ int shl_c920_matmul_init_fp16(struct csinn_tensor *mat0, struct csinn_tensor *ma struct csinn_tensor *output, struct csinn_matmul_params *params) { struct csinn_callback *cb = params->base.cb; - if (mat0->dtype == CSINN_DTYPE_FLOAT16) { - if (mat1->is_const && mat1->dtype == CSINN_DTYPE_INT8) { - shl_rvv_matmul_reorder_weight_fp16_w_int8(mat1, MATMUL_K_BLK, MATMUL_N_BLK); - cb->exec = 
shl_c920_matmul_fp16_w_int8; - } else if (mat1->dtype == CSINN_DTYPE_FLOAT16) { - if (mat1->is_const) { - shl_rvv_matmul_reorder_weight_fp16(mat1, MATMUL_K_BLK, MATMUL_N_BLK); + if (!params->trans_a && !params->trans_b) { + if (mat0->dtype == CSINN_DTYPE_FLOAT16) { + if (mat1->is_const && mat1->dtype == CSINN_DTYPE_INT8) { + shl_rvv_matmul_reorder_weight_fp16_w_int8(mat1, MATMUL_K_BLK, MATMUL_N_BLK); + cb->exec = shl_c920_matmul_fp16_w_int8; + } else if (mat1->dtype == CSINN_DTYPE_FLOAT16) { + if (mat1->is_const) { + shl_rvv_matmul_reorder_weight_fp16(mat1, MATMUL_K_BLK, MATMUL_N_BLK); + } + cb->exec = shl_c920_matmul_fp16; } - cb->exec = shl_c920_matmul_fp16; - } else { - shl_debug_error("mat1 unsupported dtype: %d\n", mat1->dtype); - return CSINN_FALSE; } - } else { - shl_debug_error("mat0 unsupported dtype: %d\n", mat0->dtype); - return CSINN_FALSE; + } + if (cb->exec == NULL) { + shl_debug_warning( + "matmul is not optimized to achieve under this condition, call reference func " + "replaced.\n"); + cb->exec = shl_ref_matmul_quant; } return CSINN_TRUE; } diff --git a/source/c920_opt/fp32/convolution.c b/source/c920_opt/fp32/convolution.c new file mode 100644 index 00000000..6f06f4c0 --- /dev/null +++ b/source/c920_opt/fp32/convolution.c @@ -0,0 +1,150 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "c920/c920.h" + +int shl_c920_conv2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int32_t out_c = kernel->dim[0] / params->group; + int32_t in_c = kernel->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t kernel_h = kernel->dim[2]; + int32_t kernel_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + int32_t dilation_h = params->dilation_height; + int32_t dilation_w = params->dilation_width; + struct csinn_callback *cb = params->base.cb; + + const int packn = csrr_vlenb() / sizeof(float); + int in_elempack = 1; + int out_elempack = 1; + struct csinn_session *sess = params->base.sess; + if (sess->base_run_mode == CSINN_RM_CPU_GRAPH) { + struct shl_c920_option *option = shl_c920_get_graph_option(sess); + if (option && option->base.use_packn_layout) { + in_elempack = in_c % packn == 0 ? packn : 1; + out_elempack = out_c % packn == 0 ? packn : 1; + } + /* first layer do not convert input layout */ + if (shl_is_first_layer_input(input, sess)) { + in_elempack = 1; + } + } else if (sess->base_run_mode == CSINN_RM_LAYER) { + in_elempack = in_c % packn == 0 ? packn : 1; + out_elempack = out_c % packn == 0 ? 
packn : 1; + } + + bool binary_model_op_init = shl_c920_get_binary_model_op_init(sess); + + // packn + if (in_elempack % packn == 0 && out_elempack % packn == 0) { + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && + dilation_w == 1) { + params->conv_extra.conv_mode = CSINN_GEMM; + if (!binary_model_op_init) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_fp32(kernel, params); + } + cb->exec = shl_c920_conv1x1s1_gemm_packn_fp32; + } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && + dilation_h == 1 && dilation_w == 1) { + if (params->group > 1) { + params->conv_extra.conv_mode = CSINN_GEMM; + if (!binary_model_op_init) { + shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp32(kernel, params); + } + cb->exec = shl_c920_conv_im2col_gemm_packn_fp32; + return CSINN_TRUE; + } else { + params->conv_extra.conv_mode = CSINN_WINOGRAD; + + // TODO: params->conv_extra.kernel_tm in binary model + struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); + if ((in_h < 13) && (in_w < 13)) { + shl_rvv_wg_b4f3s1_trans_kernel_packn_fp32(kernel, t_kernel); + cb->exec = shl_c920_wg_b4f3s1_packn_fp32; + } else { + shl_rvv_wg_b6f3s1_trans_kernel_packn_fp32(kernel, t_kernel); + cb->exec = shl_c920_wg_b6f3s1_packn_fp32; + } + params->conv_extra.kernel_tm = t_kernel; + } + } else { + params->conv_extra.conv_mode = CSINN_GEMM; + if (!binary_model_op_init) { + shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp32(kernel, params); + } + cb->exec = shl_c920_conv_im2col_gemm_packn_fp32; + } + } + + // pack1ton + if (in_elempack % packn != 0 && out_elempack % packn == 0) { + params->conv_extra.conv_mode = CSINN_GEMM; + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && + dilation_w == 1) { + if (!binary_model_op_init) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_fp32(kernel, params); + } + cb->exec = shl_rvv_conv1x1s1_gemm_pack1ton_fp32; + } else { + if (!binary_model_op_init) { + 
shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp32(kernel, params); + } + cb->exec = shl_rvv_conv_im2col_gemm_pack1ton_fp32; + } + } + + // packnto1 + if (in_elempack % packn == 0 && out_elempack % packn != 0) { + params->conv_extra.conv_mode = CSINN_GEMM; + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && + dilation_w == 1) { + if (!binary_model_op_init) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_fp32(kernel, params); + } + cb->exec = shl_rvv_conv1x1s1_gemm_packnto1_fp32; + } else { + if (!binary_model_op_init) { + shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp32(kernel, params); + } + cb->exec = shl_rvv_conv_im2col_gemm_packnto1_fp32; + } + } + + // pack1 + if (in_elempack % packn != 0 && out_elempack % packn != 0) { + params->conv_extra.conv_mode = CSINN_GEMM; + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && + dilation_w == 1) { + if (!binary_model_op_init) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_fp32(kernel, params); + } + cb->exec = shl_rvv_conv1x1s1_gemm_fp32; + } else { + if (!binary_model_op_init) { + shl_rvv_conv_im2col_gemm_reorder_kernel_fp32(kernel, params); + } + cb->exec = shl_rvv_conv_im2col_gemm_fp32; + } + } + return CSINN_TRUE; +} diff --git a/source/c920_opt/fp32/convolution_1x1_fp32_packn.c b/source/c920_opt/fp32/convolution_1x1_fp32_packn.c new file mode 100644 index 00000000..abd3a545 --- /dev/null +++ b/source/c920_opt/fp32/convolution_1x1_fp32_packn.c @@ -0,0 +1,28 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "c920/c920.h" + +int shl_c920_conv1x1s1_gemm_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + return shl_rvv_common_conv1x1_gemm_packn_fp32(input, output, kernel, bias, params, + shl_rvv_reorder_input_z8_packn_fp32, + shl_c920_ncxhwx_gemm_8xpack2n_fp32); +} diff --git a/source/c920_opt/convolution_3x3_fp32_packn.c b/source/c920_opt/fp32/convolution_3x3_fp32_packn.c similarity index 99% rename from source/c920_opt/convolution_3x3_fp32_packn.c rename to source/c920_opt/fp32/convolution_3x3_fp32_packn.c index ffce7514..7219f8e1 100644 --- a/source/c920_opt/convolution_3x3_fp32_packn.c +++ b/source/c920_opt/fp32/convolution_3x3_fp32_packn.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c920.h" +#include "c920/c920.h" /************************************************************* note: VLEN = 128/256 ... diff --git a/source/c920_opt/fp32/convolution_gemm_fp32_packn.c b/source/c920_opt/fp32/convolution_gemm_fp32_packn.c new file mode 100644 index 00000000..75742542 --- /dev/null +++ b/source/c920_opt/fp32/convolution_gemm_fp32_packn.c @@ -0,0 +1,28 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "c920/c920.h" + +int shl_c920_conv_im2col_gemm_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + return shl_rvv_common_conv_gemm_packn_fp32(input, output, kernel, bias, params, + shl_rvv_reorder_input_z8_packn_fp32, + shl_c920_ncxhwx_gemm_8xpack2n_fp32); +} diff --git a/source/c920_opt/gemm_fp32_block.c b/source/c920_opt/fp32/gemm_fp32_block.c similarity index 98% rename from source/c920_opt/gemm_fp32_block.c rename to source/c920_opt/fp32/gemm_fp32_block.c index fdb78e9c..70a09e21 100644 --- a/source/c920_opt/gemm_fp32_block.c +++ b/source/c920_opt/fp32/gemm_fp32_block.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_c920.h" +#include "c920/c920.h" /************************************************************* * packn = vlenb / sizeof(float) @@ -478,8 +478,8 @@ static inline void gemm_8xpack2n_fp32(float *dst, const float *sa, const float * /************************************************************* * packn = vlenb / sizeof(float) - * m_blk: M_BLK, M_BLK/2, M_BLK/4, ..., 8 - * n_blk: N_BLK, N_BLK/2, N_BLK/4, ..., pack2n + * m_blk: M_BLK, M_tail + * n_blk: N_BLK, N_tail * k_blk: K_BLK, K_tail * * dst - output: [m, n] @@ -509,20 +509,14 @@ void shl_c920_gemm_block_8xpack2n_fp32(float *dst, const float *sa, const float int m_block = M_BLK; int m_idx = 0; while (m_idx < m) { - while (!(m_idx + m_block - 1 < m)) { - m_block /= 2; - } - if (m_block < MIN_M_BLK) { + if (m - m_idx < m_block) { m_block = m - m_idx; } int n_block = N_BLK; int n_idx = 0; while (n_idx < n) { - while (!(n_idx + n_block - 1 < n)) { - n_block /= 2; - } - if (n_block < MIN_N_BLK) { + if (n - n_idx < n_block) { n_block = n - n_idx; } diff --git a/source/c920_opt/gemm_fp32_packn.c b/source/c920_opt/fp32/gemm_fp32_packn.c similarity index 99% rename from source/c920_opt/gemm_fp32_packn.c rename to source/c920_opt/fp32/gemm_fp32_packn.c index 33de5652..4a6dc7d9 100644 --- a/source/c920_opt/gemm_fp32_packn.c +++ b/source/c920_opt/fp32/gemm_fp32_packn.c @@ -16,15 +16,16 @@ * limitations under the License. 
*/ -#include "shl_c920.h" +#include "c920/c920.h" /************************************************************** * dst - output: [m/packn, n, packn] * sa - kernel: [m/pack2n, k, pack2n] [m/packn, k, packn] * sb - input: [n/8, k, 8] **************************************************************/ +// XXX: unsupported fuse relu void shl_c920_ncxhwx_gemm_8xpack2n_fp32(float *dst, const float *sa, const float *sb, float *bias, - int m, int k, int n, int ldc) + int m, int k, int n, bool fuse_relu) { float *kernel_data = (float *)sa; float *input_data = (float *)sb; diff --git a/source/c920_opt/matmul_fp32.c b/source/c920_opt/fp32/matmul_fp32.c similarity index 81% rename from source/c920_opt/matmul_fp32.c rename to source/c920_opt/fp32/matmul_fp32.c index c44eb06b..417db344 100644 --- a/source/c920_opt/matmul_fp32.c +++ b/source/c920_opt/fp32/matmul_fp32.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c920.h" +#include "c920/c920.h" /************************************************************* Matmul fp32 performance on C920@1.848GHz @@ -37,6 +37,13 @@ int shl_c920_matmul_fp32(struct csinn_tensor *mat0, struct csinn_tensor *mat1, struct csinn_tensor *output, struct csinn_matmul_params *params) { + if (mat0->layout >= CSINN_LAYOUT_NC1C0 && mat0->layout <= CSINN_LAYOUT_NC1DHWC0) { + shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp32(mat0); + } + if (mat1->layout >= CSINN_LAYOUT_NC1C0 && mat1->layout <= CSINN_LAYOUT_NC1DHWC0) { + shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp32(mat1); + } + float *mat0_data = (float *)mat0->data; float *mat1_data = (float *)mat1->data; float *output_data = (float *)output->data; @@ -49,8 +56,6 @@ int shl_c920_matmul_fp32(struct csinn_tensor *mat0, struct csinn_tensor *mat1, for (int i = 0; i < dims_count - 2; i++) { batches_a *= mat0->dim[i]; } - - // /* compute the outer size */ for (int i = 0; i < mat1->dim_count - 2; i++) { batches_b *= mat1->dim[i]; } @@ -59,8 +64,8 @@ int shl_c920_matmul_fp32(struct csinn_tensor *mat0, 
struct csinn_tensor *mat1, const int dim_k = mat0->dim[dims_count - (params->trans_a ? 2 : 1)]; const int dim_n = mat1->dim[mat1->dim_count - (params->trans_b ? 2 : 1)]; - if (batches_a == batches_b) { - if (!params->trans_a && !params->trans_b) { + if (!params->trans_a && !params->trans_b) { + if (batches_a == batches_b) { float *in0 = (float *)shl_mem_alloc(dim_m * dim_k * sizeof(float)); float *in1; if (!(mat1->is_const)) { @@ -88,11 +93,7 @@ int shl_c920_matmul_fp32(struct csinn_tensor *mat0, struct csinn_tensor *mat1, if (!(mat1->is_const)) { shl_mem_free(in1); } - } else { - shl_ref_matmul_quant(mat0, mat1, output, params); - } - } else if (batches_a > 1 && batches_b == 1) { - if (!params->trans_a && !params->trans_b) { + } else if (batches_a > 1 && batches_b == 1) { float *in0 = (float *)shl_mem_alloc(dim_m * dim_k * sizeof(float)); float *in1; if (!(mat1->is_const)) { @@ -122,8 +123,7 @@ int shl_c920_matmul_fp32(struct csinn_tensor *mat0, struct csinn_tensor *mat1, return CSINN_FALSE; } } else { - shl_debug_error("matmul unsupported this broadcast\n"); - return CSINN_FALSE; + return shl_ref_matmul_quant(mat0, mat1, output, params); } return CSINN_TRUE; @@ -133,19 +133,21 @@ int shl_c920_matmul_init_fp32(struct csinn_tensor *mat0, struct csinn_tensor *ma struct csinn_tensor *output, struct csinn_matmul_params *params) { struct csinn_callback *cb = params->base.cb; - if (mat0->dtype == CSINN_DTYPE_FLOAT32) { - if (mat1->dtype == CSINN_DTYPE_FLOAT32) { - if (mat1->is_const) { - shl_rvv_matmul_reorder_weight_fp32(mat1, MATMUL_K_BLK, MATMUL_N_BLK); + if (!params->trans_a && !params->trans_b) { + if (mat0->dtype == CSINN_DTYPE_FLOAT32) { + if (mat1->dtype == CSINN_DTYPE_FLOAT32) { + if (mat1->is_const) { + shl_rvv_matmul_reorder_weight_fp32(mat1, MATMUL_K_BLK, MATMUL_N_BLK); + } + cb->exec = shl_c920_matmul_fp32; } - cb->exec = shl_c920_matmul_fp32; - } else { - shl_debug_error("mat1 unsupported dtype: %d\n", mat1->dtype); - return CSINN_FALSE; } - } else { - 
shl_debug_error("mat0 unsupported dtype: %d\n", mat0->dtype); - return CSINN_FALSE; + } + if (cb->exec == NULL) { + shl_debug_warning( + "matmul is not optimized to achieve under this condition, call reference func " + "replaced.\n"); + cb->exec = shl_ref_matmul_quant; } return CSINN_TRUE; } diff --git a/source/c920_opt/reorder.c b/source/c920_opt/reorder.c index ee3a4904..853738bb 100644 --- a/source/c920_opt/reorder.c +++ b/source/c920_opt/reorder.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c920.h" +#include "c920/c920.h" /************************************************************* * src: [M_BLOCK, K_BLOCK] @@ -109,7 +109,7 @@ static inline void reorder_kernel_8xk_fp32(float *src, float *dst, int M_BLOCK, /************************************************************* * src: [m, k] * dst: [m/m_blk, k/k_blk, m_blk/8, 8, k_blk] - * m_blk: M_BLK, M_BLK/2, M_BLK/4, ..., 8 + * m_blk: M_BLK, M_tail * k_blk: K_BLK, K_tail ************************************************************/ void shl_c920_reorder_kernel_block_8xk_fp32(float *src, float *dst, int m, int k, const int M_BLK, @@ -120,12 +120,10 @@ void shl_c920_reorder_kernel_block_8xk_fp32(float *src, float *dst, int m, int k int m_block = M_BLK; int m_idx = 0; while (m_idx < m) { - while (!(m_idx + m_block - 1 < m)) { - m_block /= 2; - } - if (m_block < MIN_M_BLK) { + if (m - m_idx < m_block) { m_block = m - m_idx; } + int k_block = K_BLK; int k_idx = 0; while (k_idx < k) { @@ -232,7 +230,7 @@ static inline void reorder_kernel_8xk_fp16(__fp16 *src, __fp16 *dst, int M_BLOCK /************************************************************* * src: [m, k] * dst: [m/m_blk, k/k_blk, m_blk/8, 8, k_blk] - * m_blk: M_BLK, M_BLK/2, M_BLK/4, ..., 8 + * m_blk: M_BLK, M_tail * k_blk: K_BLK, K_tail ************************************************************/ void shl_c920_reorder_kernel_block_8xk_fp16(__fp16 *src, __fp16 *dst, int m, int k, const int M_BLK, @@ -243,12 +241,10 @@ void 
shl_c920_reorder_kernel_block_8xk_fp16(__fp16 *src, __fp16 *dst, int m, int int m_block = M_BLK; int m_idx = 0; while (m_idx < m) { - while (!(m_idx + m_block - 1 < m)) { - m_block /= 2; - } - if (m_block < MIN_M_BLK) { + if (m - m_idx < m_block) { m_block = m - m_idx; } + int k_block = K_BLK; int k_idx = 0; while (k_idx < k) { diff --git a/source/c920_opt/setup.c b/source/c920_opt/setup.c index c6d358b3..c33897d1 100644 --- a/source/c920_opt/setup.c +++ b/source/c920_opt/setup.c @@ -16,11 +16,11 @@ * limitations under the License. */ -#include "shl_c920.h" -#include "shl_c920_cap.h" +#include "c920/c920.h" +#include "c920/cap.h" -#define c920_OP_PATTERN_MAX 40 -static struct shl_cb_table shl_c920_cb_table[c920_OP_PATTERN_MAX]; +#define C920_OP_PATTERN_MAX 40 +static struct shl_cb_table shl_c920_cb_table[C920_OP_PATTERN_MAX]; void shl_c920_reg_op(enum csinn_dtype_enum dtype, enum csinn_op_enum op_name, void *init, void *exec, void *est, void *cap) @@ -38,7 +38,7 @@ struct csinn_callback *shl_cb_map_rvv(int op, int dtype); struct csinn_callback *shl_cb_map_c920(int op, int dtype) { struct csinn_callback *cb = NULL; - for (int i = 0; i < c920_OP_PATTERN_MAX; i++) { + for (int i = 0; i < C920_OP_PATTERN_MAX; i++) { if (shl_c920_cb_table[i].shl_cb_key == (op * CSINN_DTYPE_SIZE + dtype)) { cb = &(shl_c920_cb_table[i].shl_cb_value); break; @@ -379,20 +379,26 @@ void *shl_c920_runtime_callback(int api) void shl_target_init_c920() { +#ifndef CONFIG_C920_CONVOLUTION_FP32_DISABLED shl_c920_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CONV2D, shl_c920_conv2d_init_fp32, NULL, shl_gref_conv2d, shl_c920_conv2d_cap); shl_c920_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_GROUP_CONV2D, shl_c920_conv2d_init_fp32, NULL, shl_gref_group_conv2d, shl_c920_conv2d_cap); +#endif +#ifndef CONFIG_C920_CONVOLUTION_FP16_DISABLED shl_c920_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CONV2D, shl_c920_conv2d_init_fp16, NULL, shl_gref_conv2d, shl_c920_conv2d_cap); shl_c920_reg_op(CSINN_DTYPE_FLOAT16, 
CSINN_OP_GROUP_CONV2D, shl_c920_conv2d_init_fp16, NULL, shl_gref_group_conv2d, shl_c920_conv2d_cap); - +#endif +#ifndef CONFIG_C920_MATMUL_FP32_DISABLED shl_c920_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_MATMUL, shl_c920_matmul_init_fp32, NULL, shl_gref_matmul, shl_c920_matmul_cap); +#endif +#ifndef CONFIG_C920_MATMUL_FP16_DISABLED shl_c920_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_MATMUL, shl_c920_matmul_init_fp16, NULL, shl_gref_matmul, shl_c920_matmul_cap); - +#endif shl_register_op_callback(CSINN_C920, shl_cb_map_c920); shl_register_runtime_callback(CSINN_C920, shl_c920_runtime_callback); } diff --git a/source/c920_opt/utils.c b/source/c920_opt/utils.c index f324bf0f..a89481d1 100644 --- a/source/c920_opt/utils.c +++ b/source/c920_opt/utils.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c920.h" +#include "c920/c920.h" // Only dtype conversion is supported. Layout conversion is not supported. void *shl_c920_f32_to_input_dtype(uint32_t index, float *data, struct csinn_session *sess) diff --git a/source/c920_opt/yolov5.c b/source/c920_opt/yolov5.c index 2295bc58..a8fb8682 100644 --- a/source/c920_opt/yolov5.c +++ b/source/c920_opt/yolov5.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_c920.h" +#include "c920/c920.h" static void qsort_desc_fp32(int32_t *box_idx, float *scores, int left, int right) { diff --git a/source/c920_opt/yolox.c b/source/c920_opt/yolox.c new file mode 100644 index 00000000..5aebf013 --- /dev/null +++ b/source/c920_opt/yolox.c @@ -0,0 +1,97 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "c920/c920.h" + +int shl_c920_yolox_preprocess(struct csinn_tensor *input, struct csinn_tensor *output) +{ + uint8_t *input_data = (uint8_t *)input->data; + uint8_t *output_data = (uint8_t *)output->data; + + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int out_c = output->dim[1]; + int out_h = output->dim[2]; + int out_w = output->dim[3]; + + int stride = 2 * sizeof(uint8_t); + + uint8_t *out_ptr = output_data; + for (int c = 0; c < in_c; c++) { + for (int h = 0; h < in_h; h += 2) { + uint8_t *in_ptr = input_data + (c * in_h + h) * in_w; + int w = 0; + while (w < out_w) { + int vl = vsetvl_e8m8(out_w - w); + vuint8m8_t _u8 = vlse8_v_u8m8(in_ptr, stride, vl); + vse8_v_u8m8(out_ptr, _u8, vl); + w += vl; + in_ptr += vl * 2; + out_ptr += vl; + } + } + } + + for (int c = 0; c < in_c; c++) { + for (int h = 1; h < in_h; h += 2) { + uint8_t *in_ptr = input_data + (c * in_h + h) * in_w; + int w = 0; + while (w < out_w) { + int vl = vsetvl_e8m8(out_w - w); + vuint8m8_t _u8 = vlse8_v_u8m8(in_ptr, stride, vl); + vse8_v_u8m8(out_ptr, _u8, vl); + w += vl; + in_ptr += vl * 2; + out_ptr += vl; + } + } + } + + for (int c = 0; c < in_c; c++) { + for (int h = 0; h < in_h; h += 2) { + uint8_t *in_ptr = input_data + (c * in_h + h) * in_w + 1; + int w = 0; + while (w < out_w) { + int vl = vsetvl_e8m8(out_w - w); + vuint8m8_t _u8 = vlse8_v_u8m8(in_ptr, stride, vl); + vse8_v_u8m8(out_ptr, _u8, vl); + w += vl; + in_ptr += vl * 2; + out_ptr += vl; + } + } + } + + for (int c = 0; c < in_c; c++) { + for (int h = 1; h < in_h; h 
+= 2) { + uint8_t *in_ptr = input_data + (c * in_h + h) * in_w + 1; + int w = 0; + while (w < out_w) { + int vl = vsetvl_e8m8(out_w - w); + vuint8m8_t _u8 = vlse8_v_u8m8(in_ptr, stride, vl); + vse8_v_u8m8(out_ptr, _u8, vl); + w += vl; + in_ptr += vl * 2; + out_ptr += vl; + } + } + } + + return CSINN_TRUE; +} diff --git a/source/c920v2_opt/CMakeLists.txt b/source/c920v2_opt/CMakeLists.txt new file mode 100644 index 00000000..8e027ab0 --- /dev/null +++ b/source/c920v2_opt/CMakeLists.txt @@ -0,0 +1,50 @@ + +if(CONFIG_C920V2_SOURCE) + list(APPEND C920V2_SRCS source/c920v2_opt/utils.c) + list(APPEND C920V2_SRCS source/c920v2_opt/setup.c) + list(APPEND C920V2_SRCS source/c920v2_opt/capability.c) +endif() + +if(CONFIG_C920V2_CONVOLUTION_FP32) + list(APPEND C920V2_SRCS source/c920v2_opt/fp32/convolution_1x1_fp32_pack1ton.c) + list(APPEND C920V2_SRCS source/c920v2_opt/fp32/convolution_1x1_fp32_packn.c) + list(APPEND C920V2_SRCS source/c920v2_opt/fp32/convolution_1x1_fp32_packnto1.c) + list(APPEND C920V2_SRCS source/c920v2_opt/fp32/convolution_gemm_fp32_pack1ton.c) + list(APPEND C920V2_SRCS source/c920v2_opt/fp32/convolution_gemm_fp32_packn.c) + list(APPEND C920V2_SRCS source/c920v2_opt/fp32/convolution_gemm_fp32_packnto1.c) + list(APPEND C920V2_SRCS source/c920v2_opt/fp32/convolution.c) +endif() + +if(CONFIG_C920V2_CONVOLUTION_FP16) + list(APPEND C920V2_SRCS source/c920v2_opt/fp16/convolution_1x1_fp16_pack1ton.c) + list(APPEND C920V2_SRCS source/c920v2_opt/fp16/convolution_1x1_fp16_packn.c) + list(APPEND C920V2_SRCS source/c920v2_opt/fp16/convolution_1x1_fp16_packnto1.c) + list(APPEND C920V2_SRCS source/c920v2_opt/fp16/convolution_gemm_fp16_pack1ton.c) + list(APPEND C920V2_SRCS source/c920v2_opt/fp16/convolution_gemm_fp16_packn.c) + list(APPEND C920V2_SRCS source/c920v2_opt/fp16/convolution_gemm_fp16_packnto1.c) + list(APPEND C920V2_SRCS source/c920v2_opt/fp16/convolution.c) +endif() + +if(CONFIG_C920V2_CONVOLUTION_INT8) + list(APPEND C920V2_SRCS 
source/c920v2_opt/int8/convolution_1x1_int8_pack1ton.c) + list(APPEND C920V2_SRCS source/c920v2_opt/int8/convolution_1x1_int8_packn.c) + list(APPEND C920V2_SRCS source/c920v2_opt/int8/convolution_1x1_int8_packnto1.c) + list(APPEND C920V2_SRCS source/c920v2_opt/int8/convolution.c) +endif() + +if(CONFIG_C920V2_GEMM_FP32) + list(APPEND C920V2_SRCS source/c920v2_opt/fp32/gemm_fp32_packn.c) + list(APPEND C920V2_SRCS source/c920v2_opt/fp32/gemm_fp32_ncxhwx.S) +endif() + +if(CONFIG_C920V2_GEMM_FP16) + list(APPEND C920V2_SRCS source/c920v2_opt/fp16/gemm_fp16_packn.c) + list(APPEND C920V2_SRCS source/c920v2_opt/fp16/gemm_fp16_ncxhwx.S) +endif() + +if(CONFIG_C920V2_GEMM_INT8) + list(APPEND C920V2_SRCS source/c920v2_opt/int8/gemm_int8_packn.c) + list(APPEND C920V2_SRCS source/c920v2_opt/int8/gemm_int8_ncxhwx.S) + list(APPEND C920V2_SRCS source/c920v2_opt/int8/gemm_int8_dot_packn.c) + list(APPEND C920V2_SRCS source/c920v2_opt/int8/gemm_int8_dot_ncxhwx.S) +endif() diff --git a/source/c920v2_opt/Kconfig b/source/c920v2_opt/Kconfig new file mode 100644 index 00000000..2adc4d75 --- /dev/null +++ b/source/c920v2_opt/Kconfig @@ -0,0 +1,52 @@ + +menu "C920V2 optimization" + +config C920V2_SOURCE + bool "SHL C920V2 optimization code" + default y + help + Select SHL build C920V2 optimization + +config C920V2_CONVOLUTION_FP32 + depends on C920V2_SOURCE + bool "Layer convolution fp32" + default y + help + Select SHL build v extension optimized convolution + +config C920V2_CONVOLUTION_FP16 + depends on C920V2_SOURCE + bool "Layer convolution fp16" + default y + help + Select SHL build v extension optimized convolution + +config C920V2_CONVOLUTION_INT8 + depends on C920V2_SOURCE + bool "Layer convolution int8" + default y + help + Select SHL build v extension optimized convolution + +config C920V2_GEMM_FP32 + depends on C920V2_SOURCE + bool "Layer GEMM fp32" + default y + help + Select SHL build v extension optimized gemm + +config C920V2_GEMM_FP16 + depends on C920V2_SOURCE + bool "Layer 
GEMM fp16" + default y + help + Select SHL build v extension optimized gemm + +config C920V2_GEMM_INT8 + depends on C920V2_SOURCE + bool "Layer GEMM int8" + default y + help + Select SHL build v extension optimized gemm + +endmenu diff --git a/source/c920v2_opt/capability.c b/source/c920v2_opt/capability.c new file mode 100644 index 00000000..7fba9e20 --- /dev/null +++ b/source/c920v2_opt/capability.c @@ -0,0 +1,36 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "c920v2/cap.h" + +static int common_all_support(struct csinn_tensor *input, struct csinn_params_base *base) +{ + if ((input->dtype != CSINN_DTYPE_FLOAT16) && (input->dtype != CSINN_DTYPE_FLOAT32) && + (input->dtype != CSINN_DTYPE_INT8)) { + return CSINN_OPT_UNSUPPORTED; + } + + return CSINN_OPT_ASM; +} + +int shl_c920v2_conv2d_cap(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + return common_all_support(input, &(params->base)); +} diff --git a/source/c920v2_opt/fp16/convolution.c b/source/c920v2_opt/fp16/convolution.c new file mode 100644 index 00000000..8b6e9245 --- /dev/null +++ b/source/c920v2_opt/fp16/convolution.c @@ -0,0 +1,186 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "c920v2/c920v2.h" + +int shl_c920v2_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int32_t out_c = kernel->dim[0] / params->group; + int32_t in_c = kernel->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t kernel_h = kernel->dim[2]; + int32_t kernel_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + int32_t dilation_h = params->dilation_height; + int32_t dilation_w = params->dilation_width; + struct csinn_callback *cb = params->base.cb; + + const int packn = csrr_vlenb() / sizeof(__fp16); + int in_elempack = 1; + int out_elempack = 1; + struct csinn_session *sess = params->base.sess; + if (sess->base_run_mode == CSINN_RM_CPU_GRAPH) { + struct shl_c920v2_option *option = shl_c920v2_get_graph_option(sess); + if (option && option->base.use_packn_layout) { + in_elempack = in_c % packn == 0 ? packn : 1; + out_elempack = out_c % packn == 0 ? packn : 1; + } + /* first layer do not convert input layout */ + if (shl_is_first_layer_input(input, sess)) { + in_elempack = 1; + } + } else if (sess->base_run_mode == CSINN_RM_LAYER) { + in_elempack = in_c % packn == 0 ? packn : 1; + out_elempack = out_c % packn == 0 ? 
packn : 1; + } + + bool binary_model_op_init = shl_c920v2_get_binary_model_op_init(sess); + + // packn + if (in_elempack % packn == 0 && out_elempack % packn == 0) { + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && + dilation_w == 1) { + params->conv_extra.conv_mode = CSINN_GEMM; + if (!binary_model_op_init) { + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_fp16_w_int8(kernel, params); + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_fp16(kernel, params); + } + } + cb->exec = shl_c920v2_conv1x1s1_gemm_packn_fp16; + } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && + dilation_h == 1 && dilation_w == 1) { + if (params->group > 1 || (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8)) { + params->conv_extra.conv_mode = CSINN_GEMM; + if (!binary_model_op_init) { + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp16_w_int8(kernel, params); + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp16(kernel, params); + } + } + cb->exec = shl_c920v2_conv_im2col_gemm_packn_fp16; + return CSINN_TRUE; + } else { + params->conv_extra.conv_mode = CSINN_WINOGRAD; + + // TODO: params->conv_extra.kernel_tm in binary model + struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); + if ((in_h < 13) && (in_w < 13)) { + shl_rvv_wg_b4f3s1_trans_kernel_packn_fp16(kernel, t_kernel); + cb->exec = shl_rvv_wg_b4f3s1_packn_fp16; + } else { + shl_rvv_wg_b6f3s1_trans_kernel_packn_fp16(kernel, t_kernel); + cb->exec = shl_rvv_wg_b6f3s1_packn_fp16; + } + params->conv_extra.kernel_tm = t_kernel; + } + } else { + params->conv_extra.conv_mode = CSINN_GEMM; + if (!binary_model_op_init) { + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + 
shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp16_w_int8(kernel, params); + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp16(kernel, params); + } + } + cb->exec = shl_c920v2_conv_im2col_gemm_packn_fp16; + } + } + + // pack1ton + if (in_elempack % packn != 0 && out_elempack % packn == 0) { + params->conv_extra.conv_mode = CSINN_GEMM; + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && + dilation_w == 1) { + if (!binary_model_op_init) { + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_fp16_w_int8(kernel, params); + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_fp16(kernel, params); + } + } + cb->exec = shl_c920v2_conv1x1s1_gemm_pack1ton_fp16; + } else { + if (!binary_model_op_init) { + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp16_w_int8(kernel, params); + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp16(kernel, params); + } + } + cb->exec = shl_c920v2_conv_im2col_gemm_pack1ton_fp16; + } + } + + // packnto1 + if (in_elempack % packn == 0 && out_elempack % packn != 0) { + params->conv_extra.conv_mode = CSINN_GEMM; + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && + dilation_w == 1) { + if (!binary_model_op_init) { + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_fp16_w_int8(kernel, params); + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_fp16(kernel, params); + } + } + cb->exec = shl_c920v2_conv1x1s1_gemm_packnto1_fp16; + } else { + if (!binary_model_op_init) { + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + 
shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp16_w_int8(kernel, params); + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp16(kernel, params); + } + } + cb->exec = shl_c920v2_conv_im2col_gemm_packnto1_fp16; + } + } + + // pack1 + if (in_elempack % packn != 0 && out_elempack % packn != 0) { + params->conv_extra.conv_mode = CSINN_GEMM; + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && + dilation_w == 1) { + if (!binary_model_op_init) { + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_fp16_w_int8(kernel, params); + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_fp16(kernel, params); + } + } + cb->exec = shl_rvv_conv1x1s1_gemm_fp16; + } else { + if (!binary_model_op_init) { + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_rvv_conv_im2col_gemm_reorder_kernel_fp16_w_int8(kernel, params); + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + shl_rvv_conv_im2col_gemm_reorder_kernel_fp16(kernel, params); + } + } + cb->exec = shl_rvv_conv_im2col_gemm_fp16; + } + } + return CSINN_TRUE; +} diff --git a/source/c920v2_opt/fp16/convolution_1x1_fp16_pack1ton.c b/source/c920v2_opt/fp16/convolution_1x1_fp16_pack1ton.c new file mode 100644 index 00000000..dd65a66b --- /dev/null +++ b/source/c920v2_opt/fp16/convolution_1x1_fp16_pack1ton.c @@ -0,0 +1,28 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "c920v2/c920v2.h" + +int shl_c920v2_conv1x1s1_gemm_pack1ton_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + return shl_rvv_common_conv1x1_gemm_pack1ton_fp16(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_pack1ton_fp16, + shl_c920v2_ncxhwx_gemm_12xpack2n_fp16); +} diff --git a/source/c920v2_opt/fp16/convolution_1x1_fp16_packn.c b/source/c920v2_opt/fp16/convolution_1x1_fp16_packn.c new file mode 100644 index 00000000..f7f302db --- /dev/null +++ b/source/c920v2_opt/fp16/convolution_1x1_fp16_packn.c @@ -0,0 +1,28 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "c920v2/c920v2.h" + +int shl_c920v2_conv1x1s1_gemm_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + return shl_rvv_common_conv1x1_gemm_packn_fp16(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_packn_fp16, + shl_c920v2_ncxhwx_gemm_12xpack2n_fp16); +} diff --git a/source/c920v2_opt/fp16/convolution_1x1_fp16_packnto1.c b/source/c920v2_opt/fp16/convolution_1x1_fp16_packnto1.c new file mode 100644 index 00000000..fedbbca4 --- /dev/null +++ b/source/c920v2_opt/fp16/convolution_1x1_fp16_packnto1.c @@ -0,0 +1,28 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "c920v2/c920v2.h" + +int shl_c920v2_conv1x1s1_gemm_packnto1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + return shl_rvv_common_conv1x1_gemm_packnto1_fp16(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_packn_fp16, + shl_c920v2_ncxhwx_gemm_12xpack2n_fp16); +} diff --git a/source/c920v2_opt/fp16/convolution_gemm_fp16_pack1ton.c b/source/c920v2_opt/fp16/convolution_gemm_fp16_pack1ton.c new file mode 100644 index 00000000..e5a24e7d --- /dev/null +++ b/source/c920v2_opt/fp16/convolution_gemm_fp16_pack1ton.c @@ -0,0 +1,30 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "c920v2/c920v2.h" + +int shl_c920v2_conv_im2col_gemm_pack1ton_fp16(struct csinn_tensor *input, + struct csinn_tensor *output, + struct csinn_tensor *kernel, + struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + return shl_rvv_common_conv_gemm_pack1ton_fp16(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_pack1ton_fp16, + shl_c920v2_ncxhwx_gemm_12xpack2n_fp16); +} diff --git a/source/c920v2_opt/fp16/convolution_gemm_fp16_packn.c b/source/c920v2_opt/fp16/convolution_gemm_fp16_packn.c new file mode 100644 index 00000000..36327d29 --- /dev/null +++ b/source/c920v2_opt/fp16/convolution_gemm_fp16_packn.c @@ -0,0 +1,28 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "c920v2/c920v2.h" + +int shl_c920v2_conv_im2col_gemm_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + return shl_rvv_common_conv_gemm_packn_fp16(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_packn_fp16, + shl_c920v2_ncxhwx_gemm_12xpack2n_fp16); +} diff --git a/source/c920v2_opt/fp16/convolution_gemm_fp16_packnto1.c b/source/c920v2_opt/fp16/convolution_gemm_fp16_packnto1.c new file mode 100644 index 00000000..7f2abe73 --- /dev/null +++ b/source/c920v2_opt/fp16/convolution_gemm_fp16_packnto1.c @@ -0,0 +1,30 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "c920v2/c920v2.h" + +int shl_c920v2_conv_im2col_gemm_packnto1_fp16(struct csinn_tensor *input, + struct csinn_tensor *output, + struct csinn_tensor *kernel, + struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + return shl_rvv_common_conv_gemm_packnto1_fp16(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_packn_fp16, + shl_c920v2_ncxhwx_gemm_12xpack2n_fp16); +} diff --git a/source/c920v2_opt/fp16/gemm_fp16_ncxhwx.S b/source/c920v2_opt/fp16/gemm_fp16_ncxhwx.S new file mode 100644 index 00000000..67599707 --- /dev/null +++ b/source/c920v2_opt/fp16/gemm_fp16_ncxhwx.S @@ -0,0 +1,21 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + .file "gemm_fp16_ncxhwx.S" + +#include "../../c908_opt/fp16/gemm_fp16_ncxhwx.S" diff --git a/source/c920v2_opt/fp16/gemm_fp16_packn.c b/source/c920v2_opt/fp16/gemm_fp16_packn.c new file mode 100644 index 00000000..fd860ec1 --- /dev/null +++ b/source/c920v2_opt/fp16/gemm_fp16_packn.c @@ -0,0 +1,52 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "c920v2/c920v2.h" + +void gemm_fp16_ncxhwx_12xpack2n(__fp16 *output, const __fp16 *kernel, const __fp16 *input, + const __fp16 *bias, int m, int k, int n, bool fuse_relu); +void gemm_fp16_ncxhwx_12xpackn(__fp16 *output, const __fp16 *kernel, const __fp16 *input, + const __fp16 *bias, int m, int k, int n, bool fuse_relu); + +void shl_c920v2_ncxhwx_gemm_12xpack2n_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, + __fp16 *bias, int m, int k, int n, bool fuse_relu) +{ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int pack2n = packn * 2; + + int oc = 0; + for (; oc + pack2n - 1 < m; oc += pack2n) { + gemm_fp16_ncxhwx_12xpack2n(dst, sa, sb, bias, packn, k, n, fuse_relu); + sa += pack2n * k; + dst += pack2n * n; + if (bias) { + bias += pack2n; + } + } + for (; oc + packn - 1 < m; oc += packn) { + gemm_fp16_ncxhwx_12xpackn(dst, sa, sb, bias, packn, k, n, fuse_relu); + sa += packn * k; + dst += packn * n; + if (bias) { + bias += packn; + } + } + if (oc < m) { + gemm_fp16_ncxhwx_12xpackn(dst, sa, sb, bias, m - oc, k, n, fuse_relu); + } +} \ No newline at end of file diff --git a/source/c920v2_opt/fp32/convolution.c b/source/c920v2_opt/fp32/convolution.c new file mode 100644 index 00000000..8f8ae57b --- /dev/null +++ b/source/c920v2_opt/fp32/convolution.c @@ -0,0 +1,149 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "c920v2/c920v2.h" + +int shl_c920v2_conv2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int32_t out_c = kernel->dim[0] / params->group; + int32_t in_c = kernel->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t kernel_h = kernel->dim[2]; + int32_t kernel_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + int32_t dilation_h = params->dilation_height; + int32_t dilation_w = params->dilation_width; + struct csinn_callback *cb = params->base.cb; + + const int packn = csrr_vlenb() / sizeof(float); + int in_elempack = 1; + int out_elempack = 1; + struct csinn_session *sess = params->base.sess; + if (sess->base_run_mode == CSINN_RM_CPU_GRAPH) { + struct shl_c920v2_option *option = shl_c920v2_get_graph_option(sess); + if (option && option->base.use_packn_layout) { + in_elempack = in_c % packn == 0 ? packn : 1; + out_elempack = out_c % packn == 0 ? packn : 1; + } + /* first layer do not convert input layout */ + if (shl_is_first_layer_input(input, sess)) { + in_elempack = 1; + } + } else if (sess->base_run_mode == CSINN_RM_LAYER) { + in_elempack = in_c % packn == 0 ? packn : 1; + out_elempack = out_c % packn == 0 ? 
packn : 1; + } + + bool binary_model_op_init = shl_c920v2_get_binary_model_op_init(sess); + + // packn + if (in_elempack % packn == 0 && out_elempack % packn == 0) { + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && + dilation_w == 1) { + params->conv_extra.conv_mode = CSINN_GEMM; + if (!binary_model_op_init) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_fp32(kernel, params); + } + cb->exec = shl_c920v2_conv1x1s1_gemm_packn_fp32; + } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && + dilation_h == 1 && dilation_w == 1) { + if (params->group > 1) { + params->conv_extra.conv_mode = CSINN_GEMM; + if (!binary_model_op_init) { + shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp32(kernel, params); + } + cb->exec = shl_c920v2_conv_im2col_gemm_packn_fp32; + return CSINN_TRUE; + } else { + params->conv_extra.conv_mode = CSINN_WINOGRAD; + + // TODO: params->conv_extra.kernel_tm in binary model + struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); + if ((in_h < 13) && (in_w < 13)) { + shl_rvv_wg_b4f3s1_trans_kernel_packn_fp32(kernel, t_kernel); + cb->exec = shl_rvv_wg_b4f3s1_packn_fp32; + } else { + shl_rvv_wg_b6f3s1_trans_kernel_packn_fp32(kernel, t_kernel); + cb->exec = shl_rvv_wg_b6f3s1_packn_fp32; + } + params->conv_extra.kernel_tm = t_kernel; + } + } else { + params->conv_extra.conv_mode = CSINN_GEMM; + if (!binary_model_op_init) { + shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp32(kernel, params); + } + cb->exec = shl_c920v2_conv_im2col_gemm_packn_fp32; + } + } + + // pack1ton + if (in_elempack % packn != 0 && out_elempack % packn == 0) { + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && + dilation_w == 1) { + if (!binary_model_op_init) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_fp32(kernel, params); + } + cb->exec = shl_c920v2_conv1x1s1_gemm_pack1ton_fp32; + } else { + if (!binary_model_op_init) { + 
shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp32(kernel, params); + } + cb->exec = shl_c920v2_conv_im2col_gemm_pack1ton_fp32; + } + } + + // packnto1 + if (in_elempack % packn == 0 && out_elempack % packn != 0) { + params->conv_extra.conv_mode = CSINN_GEMM; + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && + dilation_w == 1) { + if (!binary_model_op_init) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_fp32(kernel, params); + } + cb->exec = shl_c920v2_conv1x1s1_gemm_packnto1_fp32; + } else { + if (!binary_model_op_init) { + shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp32(kernel, params); + } + cb->exec = shl_c920v2_conv_im2col_gemm_packnto1_fp32; + } + } + + // pack1 + if (in_elempack % packn != 0 && out_elempack % packn != 0) { + params->conv_extra.conv_mode = CSINN_GEMM; + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && + dilation_w == 1) { + if (!binary_model_op_init) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_fp32(kernel, params); + } + cb->exec = shl_rvv_conv1x1s1_gemm_fp32; + } else { + if (!binary_model_op_init) { + shl_rvv_conv_im2col_gemm_reorder_kernel_fp32(kernel, params); + } + cb->exec = shl_rvv_conv_im2col_gemm_fp32; + } + } + return CSINN_TRUE; +} diff --git a/source/c920v2_opt/fp32/convolution_1x1_fp32_pack1ton.c b/source/c920v2_opt/fp32/convolution_1x1_fp32_pack1ton.c new file mode 100644 index 00000000..30cf0e05 --- /dev/null +++ b/source/c920v2_opt/fp32/convolution_1x1_fp32_pack1ton.c @@ -0,0 +1,28 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "c920v2/c920v2.h" + +int shl_c920v2_conv1x1s1_gemm_pack1ton_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + return shl_rvv_common_conv1x1_gemm_pack1ton_fp32(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_pack1ton_fp32, + shl_c920v2_ncxhwx_gemm_12xpack2n_fp32); +} diff --git a/source/c920v2_opt/fp32/convolution_1x1_fp32_packn.c b/source/c920v2_opt/fp32/convolution_1x1_fp32_packn.c new file mode 100644 index 00000000..05603e7f --- /dev/null +++ b/source/c920v2_opt/fp32/convolution_1x1_fp32_packn.c @@ -0,0 +1,28 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "c920v2/c920v2.h" + +int shl_c920v2_conv1x1s1_gemm_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + return shl_rvv_common_conv1x1_gemm_packn_fp32(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_packn_fp32, + shl_c920v2_ncxhwx_gemm_12xpack2n_fp32); +} diff --git a/source/c920v2_opt/fp32/convolution_1x1_fp32_packnto1.c b/source/c920v2_opt/fp32/convolution_1x1_fp32_packnto1.c new file mode 100644 index 00000000..2354fd40 --- /dev/null +++ b/source/c920v2_opt/fp32/convolution_1x1_fp32_packnto1.c @@ -0,0 +1,28 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "c920v2/c920v2.h" + +int shl_c920v2_conv1x1s1_gemm_packnto1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + return shl_rvv_common_conv1x1_gemm_packnto1_fp32(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_packn_fp32, + shl_c920v2_ncxhwx_gemm_12xpack2n_fp32); +} diff --git a/source/c920v2_opt/fp32/convolution_gemm_fp32_pack1ton.c b/source/c920v2_opt/fp32/convolution_gemm_fp32_pack1ton.c new file mode 100644 index 00000000..ceb1a0e2 --- /dev/null +++ b/source/c920v2_opt/fp32/convolution_gemm_fp32_pack1ton.c @@ -0,0 +1,30 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "c920v2/c920v2.h" + +int shl_c920v2_conv_im2col_gemm_pack1ton_fp32(struct csinn_tensor *input, + struct csinn_tensor *output, + struct csinn_tensor *kernel, + struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + return shl_rvv_common_conv_gemm_pack1ton_fp32(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_pack1ton_fp32, + shl_c920v2_ncxhwx_gemm_12xpack2n_fp32); +} diff --git a/source/c920v2_opt/fp32/convolution_gemm_fp32_packn.c b/source/c920v2_opt/fp32/convolution_gemm_fp32_packn.c new file mode 100644 index 00000000..4a4a4991 --- /dev/null +++ b/source/c920v2_opt/fp32/convolution_gemm_fp32_packn.c @@ -0,0 +1,28 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "c920v2/c920v2.h" + +int shl_c920v2_conv_im2col_gemm_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + return shl_rvv_common_conv_gemm_packn_fp32(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_packn_fp32, + shl_c920v2_ncxhwx_gemm_12xpack2n_fp32); +} diff --git a/source/c920v2_opt/fp32/convolution_gemm_fp32_packnto1.c b/source/c920v2_opt/fp32/convolution_gemm_fp32_packnto1.c new file mode 100644 index 00000000..dd43093d --- /dev/null +++ b/source/c920v2_opt/fp32/convolution_gemm_fp32_packnto1.c @@ -0,0 +1,30 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "c920v2/c920v2.h" + +int shl_c920v2_conv_im2col_gemm_packnto1_fp32(struct csinn_tensor *input, + struct csinn_tensor *output, + struct csinn_tensor *kernel, + struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + return shl_rvv_common_conv_gemm_packnto1_fp32(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_packn_fp32, + shl_c920v2_ncxhwx_gemm_12xpack2n_fp32); +} diff --git a/source/c920v2_opt/fp32/gemm_fp32_ncxhwx.S b/source/c920v2_opt/fp32/gemm_fp32_ncxhwx.S new file mode 100644 index 00000000..5cfe30f1 --- /dev/null +++ b/source/c920v2_opt/fp32/gemm_fp32_ncxhwx.S @@ -0,0 +1,21 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + .file "gemm_fp32_ncxhwx.S" + +#include "../../c908_opt/fp32/gemm_fp32_ncxhwx.S" diff --git a/source/c920v2_opt/fp32/gemm_fp32_packn.c b/source/c920v2_opt/fp32/gemm_fp32_packn.c new file mode 100644 index 00000000..d8677a33 --- /dev/null +++ b/source/c920v2_opt/fp32/gemm_fp32_packn.c @@ -0,0 +1,52 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
/* Assembly GEMM micro-kernels (see gemm_fp32_ncxhwx.S). */
void gemm_fp32_ncxhwx_12xpack2n(float *output, const float *kernel, const float *input,
                                const float *bias, int m, int k, int n, bool fuse_relu);
void gemm_fp32_ncxhwx_12xpackn(float *output, const float *kernel, const float *input,
                               const float *bias, int m, int k, int n, bool fuse_relu);

/* fp32 GEMM dispatcher: tiles the m (output-channel) dimension over the
 * vector length. Wide pack2n tiles first, then single-packn tiles, then one
 * tail call for the remaining m % packn rows.
 * NOTE(review): the pack2n call passes `packn` as its rows argument —
 * presumably the asm kernel derives the doubled width itself; preserved. */
void shl_c920v2_ncxhwx_gemm_12xpack2n_fp32(float *dst, const float *sa, const float *sb,
                                           float *bias, int m, int k, int n, bool fuse_relu)
{
    const int packn = csrr_vlenb() / sizeof(float);
    const int pack2n = packn * 2;

    int done = 0;
    /* Two vector registers of output channels per call. */
    while (m - done >= pack2n) {
        gemm_fp32_ncxhwx_12xpack2n(dst, sa, sb, bias, packn, k, n, fuse_relu);
        sa += pack2n * k;
        dst += pack2n * n;
        bias = bias ? bias + pack2n : bias; /* NULL bias stays NULL */
        done += pack2n;
    }
    /* One vector register of output channels per call. */
    while (m - done >= packn) {
        gemm_fp32_ncxhwx_12xpackn(dst, sa, sb, bias, packn, k, n, fuse_relu);
        sa += packn * k;
        dst += packn * n;
        bias = bias ? bias + packn : bias;
        done += packn;
    }
    /* Ragged tail: fewer than packn channels left. */
    if (done < m) {
        gemm_fp32_ncxhwx_12xpackn(dst, sa, sb, bias, m - done, k, n, fuse_relu);
    }
}
/**
 * Initialize (register) the int8 conv2d implementation for c920v2.
 *
 * Three stages, order-dependent:
 *  1. pick a layout (packn / pack1ton / packnto1 / pack1) and install the
 *     matching exec callback, reordering weights unless they come
 *     pre-reordered from a binary model;
 *  2. fold the per-channel requantization scales into multiplier/shift pairs;
 *  3. fuse (or for Winograd, un-fuse) the input zero-point into the bias.
 * Stages 2 and 3 key off conv_extra.conv_mode set in stage 1.
 *
 * @param input   activation tensor; qinfo->scale/zero_point are read
 * @param output  output tensor; qinfo->scale is read for requantization
 * @param kernel  weight tensor; dim[0] = group * out_c, dim[1] = in_c;
 *                per-channel qinfo entries are updated in place
 * @param bias    bias tensor; data may be allocated and rewritten here
 * @param params  conv params; conv_extra and base.cb are updated
 * @return CSINN_TRUE
 */
int shl_c920v2_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output,
                                struct csinn_tensor *kernel, struct csinn_tensor *bias,
                                struct csinn_conv2d_params *params)
{
    int32_t out_c = kernel->dim[0] / params->group;
    int32_t in_c = kernel->dim[1];
    int32_t in_h = input->dim[2];  // NOTE(review): unused below (the fp32 init
    int32_t in_w = input->dim[3];  // uses them to pick a Winograd tile; int8 doesn't)
    int32_t kernel_h = kernel->dim[2];
    int32_t kernel_w = kernel->dim[3];
    int32_t stride_h = params->stride_height;
    int32_t stride_w = params->stride_width;
    int32_t dilation_h = params->dilation_height;
    int32_t dilation_w = params->dilation_width;
    struct csinn_callback *cb = params->base.cb;

    // int8 pack width: half the vector-register byte count
    const int packn = csrr_vlenb() / sizeof(int8_t) / 2;
    int in_elempack = 1;
    int out_elempack = 1;
    struct csinn_session *sess = params->base.sess;
    if (sess->base_run_mode == CSINN_RM_CPU_GRAPH) {
        struct shl_c920v2_option *option = shl_c920v2_get_graph_option(sess);
        if (option && option->base.use_packn_layout) {
            in_elempack = in_c % packn == 0 ? packn : 1;
            out_elempack = out_c % packn == 0 ? packn : 1;
        }
        /* first layer do not convert input layout */
        if (shl_is_first_layer_input(input, sess)) {
            in_elempack = 1;
        }
    } else if (sess->base_run_mode == CSINN_RM_LAYER) {
        in_elempack = in_c % packn == 0 ? packn : 1;
        out_elempack = out_c % packn == 0 ? packn : 1;
    }

    // binary models ship pre-reordered weights: skip the reorder passes below
    bool binary_model_op_init = shl_c920v2_get_binary_model_op_init(sess);

    // packn
    if (in_elempack % packn == 0 && out_elempack % packn == 0) {
        if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 &&
            dilation_w == 1) {
            params->conv_extra.conv_mode = CSINN_GEMM;
            params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL);
            if (!binary_model_op_init) {
                shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_int8(kernel, params);
            }
            cb->exec = shl_c920v2_conv1x1s1_gemm_packn_int8;
        } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 &&
                   dilation_h == 1 && dilation_w == 1) {
            if (params->group > 1) {
                // Winograd path below does not handle grouped conv
                params->conv_extra.conv_mode = CSINN_GEMM;
                params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL);
                if (!binary_model_op_init) {
                    shl_rvv_conv_im2col_gemm_reorder_kernel_packn_int8(kernel, params);
                }
                cb->exec = shl_rvv_conv_im2col_gemm_packn_int8;
                // NOTE(review): early return skips the scale-folding and
                // zp-fusion stages below — presumably the generic packn exec
                // handles them itself; confirm against the rvv implementation
                return CSINN_TRUE;
            } else {
                params->conv_extra.conv_mode = CSINN_WINOGRAD;
                // kernel transform runs unconditionally (no
                // binary_model_op_init guard on this path)
                struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL);
                shl_rvv_wg_b4f3s1_trans_kernel_packn_int8(kernel, t_kernel);
                cb->exec = shl_rvv_wg_b4f3s1_packn_int8;
                params->conv_extra.kernel_tm = t_kernel;
            }
        } else {
            params->conv_extra.conv_mode = CSINN_GEMM;
            params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL);
            if (!binary_model_op_init) {
                shl_rvv_conv_im2col_gemm_reorder_kernel_packn_int8(kernel, params);
            }
            cb->exec = shl_rvv_conv_im2col_gemm_packn_int8;
        }
    }

    // pack1ton
    if (in_elempack % packn != 0 && out_elempack % packn == 0) {
        params->conv_extra.conv_mode = CSINN_GEMM;
        params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL);
        if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 &&
            dilation_w == 1) {
            if (!binary_model_op_init) {
                shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_int8(kernel, params);
            }
            cb->exec = shl_c920v2_conv1x1s1_gemm_pack1ton_int8;
        } else {
            if (!binary_model_op_init) {
                shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_int8(kernel, params);
            }
            cb->exec = shl_rvv_conv_im2col_gemm_pack1ton_int8;
        }
    }

    // packnto1
    if (in_elempack % packn == 0 && out_elempack % packn != 0) {
        params->conv_extra.conv_mode = CSINN_GEMM;
        params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL);
        if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 &&
            dilation_w == 1) {
            if (!binary_model_op_init) {
                shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_int8(kernel, params);
            }
            cb->exec = shl_c920v2_conv1x1s1_gemm_packnto1_int8;
        } else {
            if (!binary_model_op_init) {
                shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_int8(kernel, params);
            }
            cb->exec = shl_rvv_conv_im2col_gemm_packnto1_int8;
        }
    }

    // pack1
    if (in_elempack % packn != 0 && out_elempack % packn != 0) {
        params->conv_extra.conv_mode = CSINN_GEMM;
        params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL);
        if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 &&
            dilation_w == 1) {
            if (!binary_model_op_init) {
                shl_rvv_conv1x1s1_gemm_reorder_kernel_int8(kernel, params);
            }
            cb->exec = shl_rvv_conv1x1s1_gemm_int8;
        } else {
            if (!binary_model_op_init) {
                shl_rvv_conv_im2col_gemm_reorder_kernel_int8(kernel, params);
            }
            cb->exec = shl_rvv_conv_im2col_gemm_int8;
        }
    }

    // support channel quantization
    // Fold real_scale = s_in * s_w[i] / s_out into a fixed-point
    // multiplier/shift pair per output channel.
    for (int i = 0; i < kernel->quant_channel; i++) {
        float real_scale = input->qinfo->scale * kernel->qinfo[i].scale / output->qinfo->scale;
        // trick for winograd b4f3
        // (576 = 24^2 compensates the b4f3 transform's amplification)
        if (params->conv_extra.conv_mode == CSINN_WINOGRAD) {
            real_scale = real_scale / 576.0f;
        }
        shl_quantize_multiplier(real_scale, &(kernel->qinfo[i].multiplier),
                                &(kernel->qinfo[i].shift));
    }

    // enable fuse zeropoint to bias for gemm
    // bias[oc] -= zp_in * sum(weights[oc]) so the GEMM can treat the input
    // as if it were zero-point-free.
    if (params->conv_extra.conv_mode == CSINN_GEMM) {
        if (!params->conv_extra.fuse_zp2bias) {
            params->conv_extra.fuse_zp2bias = true;
            int32_t *bias_data = (int32_t *)bias->data;
            int8_t *kernel_data = (int8_t *)kernel->data;
            int32_t input_zp = input->qinfo->zero_point;

            if (bias_data == NULL) {
                // XXX: memory leak
                // (buffer is never freed; ownership presumably passes to the
                // bias tensor — confirm shl_mem_alloc zero-initializes, since
                // the -= below reads the fresh buffer)
                bias_data = (int32_t *)shl_mem_alloc(out_c * params->group * sizeof(int32_t));
                bias->data = bias_data;
            }
            int kernel_inner = in_c * kernel_h * kernel_w;
            for (int oc = 0; oc < out_c * params->group; oc++) {
                int32_t tmp = 0;
                for (int j = 0; j < kernel_inner; j++) {
                    tmp += kernel_data[oc * kernel_inner + j] * input_zp;
                }
                bias_data[oc] -= tmp;
            }
        }
    }

    // recover fuse zeropoint to bias for winograd
    // Undo a fusion applied earlier (e.g. by a previous init at another
    // layout) — Winograd kernels expect the raw bias.
    if (params->conv_extra.conv_mode == CSINN_WINOGRAD) {
        if (params->conv_extra.fuse_zp2bias) {
            int32_t *bias_data = (int32_t *)bias->data;
            int8_t *kernel_data = (int8_t *)kernel->data;
            int32_t input_zp = input->qinfo->zero_point;

            int kernel_inner = in_c * kernel_h * kernel_w;
            for (int oc = 0; oc < out_c * params->group; oc++) {
                int32_t tmp = 0;
                for (int j = 0; j < kernel_inner; j++) {
                    tmp += kernel_data[oc * kernel_inner + j] * input_zp;
                }
                bias_data[oc] += tmp;
            }
        }
    }

    return CSINN_TRUE;
}
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "c920v2/c920v2.h" + +int shl_c920v2_conv1x1s1_gemm_pack1ton_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ +#ifdef SHL_USE_DOT_INT8 + return shl_rvv_common_conv1x1_gemm_pack1ton_int8(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_pack1ton_int8_dot, + shl_c920v2_ncxhwx_gemm_12xpackn_int8_dot); +#else + return shl_rvv_common_conv1x1_gemm_pack1ton_int8(input, output, kernel, bias, params, + shl_rvv_reorder_input_z4_pack1ton_int8, + shl_c920v2_ncxhwx_gemm_4xpack2n_int8); +#endif // SHL_USE_DOT_INT8 +} diff --git a/source/c920v2_opt/int8/convolution_1x1_int8_packn.c b/source/c920v2_opt/int8/convolution_1x1_int8_packn.c new file mode 100644 index 00000000..b335d709 --- /dev/null +++ b/source/c920v2_opt/int8/convolution_1x1_int8_packn.c @@ -0,0 +1,34 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "c920v2/c920v2.h" + +int shl_c920v2_conv1x1s1_gemm_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ +#ifdef SHL_USE_DOT_INT8 + return shl_rvv_common_conv1x1_gemm_packn_int8(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_packn_int8_dot, + shl_c920v2_ncxhwx_gemm_12xpackn_int8_dot); +#else + return shl_rvv_common_conv1x1_gemm_packn_int8(input, output, kernel, bias, params, + shl_rvv_reorder_input_z4_packn_int8, + shl_c920v2_ncxhwx_gemm_4xpack2n_int8); +#endif // SHL_USE_DOT_INT8 +} diff --git a/source/c920v2_opt/int8/convolution_1x1_int8_packnto1.c b/source/c920v2_opt/int8/convolution_1x1_int8_packnto1.c new file mode 100644 index 00000000..0ed6e4f3 --- /dev/null +++ b/source/c920v2_opt/int8/convolution_1x1_int8_packnto1.c @@ -0,0 +1,34 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "c920v2/c920v2.h" + +int shl_c920v2_conv1x1s1_gemm_packnto1_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ +#ifdef SHL_USE_DOT_INT8 + return shl_rvv_common_conv1x1_gemm_packnto1_int8(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_packn_int8_dot, + shl_c920v2_ncxhwx_gemm_12xpackn_int8_dot); +#else + return shl_rvv_common_conv1x1_gemm_packnto1_int8(input, output, kernel, bias, params, + shl_rvv_reorder_input_z4_packn_int8, + shl_c920v2_ncxhwx_gemm_4xpack2n_int8); +#endif // SHL_USE_DOT_INT8 +} diff --git a/source/c920v2_opt/int8/gemm_int8_dot_ncxhwx.S b/source/c920v2_opt/int8/gemm_int8_dot_ncxhwx.S new file mode 100644 index 00000000..959d394b --- /dev/null +++ b/source/c920v2_opt/int8/gemm_int8_dot_ncxhwx.S @@ -0,0 +1,21 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + .file "gemm_int8_dot_ncxhwx.S" + +#include "../../c908_opt/int8/gemm_int8_dot_ncxhwx.S" diff --git a/source/c920v2_opt/int8/gemm_int8_dot_packn.c b/source/c920v2_opt/int8/gemm_int8_dot_packn.c new file mode 100644 index 00000000..3f3161c8 --- /dev/null +++ b/source/c920v2_opt/int8/gemm_int8_dot_packn.c @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "c920v2/c920v2.h" + +void gemm_int8_dot_ncxhwx_12xpackn(int8_t *output, const int8_t *kernel, const int8_t *input, + const int32_t *bias, int m, int k, int n, int32_t out_zp, + int32_t *mult, int32_t *shift); +void gemm_int8_dot_ncxhwx_8xpackn(int8_t *output, const int8_t *kernel, const int8_t *input, + const int32_t *bias, int m, int k, int n, int32_t out_zp, + int32_t *mult, int32_t *shift); + +void shl_c920v2_ncxhwx_gemm_12xpackn_int8_dot(int8_t *dst, const int8_t *sa, const int8_t *sb, + int32_t *bias, int m, int k, int n, int32_t out_zp, + int32_t *mult, int32_t *shift) +{ + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + + int oc = 0; + for (; oc + packn - 1 < m; oc += packn) { + gemm_int8_dot_ncxhwx_12xpackn(dst, sa, sb, bias, packn, k, n, out_zp, mult + oc, + shift + oc); + sa += packn * k; + dst += packn * n; + // please use fuse_zp2bias option in hhb, thus bias_data wont be NULL + bias += packn; + } + if (oc < m) { + gemm_int8_dot_ncxhwx_12xpackn(dst, sa, sb, bias, m - oc, k, n, out_zp, mult + oc, + shift + oc); + } +} diff --git a/source/c920v2_opt/int8/gemm_int8_ncxhwx.S b/source/c920v2_opt/int8/gemm_int8_ncxhwx.S new file mode 100644 index 00000000..d62e1506 --- /dev/null +++ b/source/c920v2_opt/int8/gemm_int8_ncxhwx.S @@ -0,0 +1,21 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + .file "gemm_int8_ncxhwx.S" + +#include "../../c908_opt/int8/gemm_int8_ncxhwx.S" diff --git a/source/c920v2_opt/int8/gemm_int8_packn.c b/source/c920v2_opt/int8/gemm_int8_packn.c new file mode 100644 index 00000000..85f25174 --- /dev/null +++ b/source/c920v2_opt/int8/gemm_int8_packn.c @@ -0,0 +1,52 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "c920v2/c920v2.h" + +void gemm_int8_ncxhwx_4xpack2n(int8_t *output, const int8_t *kernel, const int8_t *input, + const int32_t *bias, int m, int k, int n, int32_t out_zp, + int32_t *mult, int32_t *shift); +void gemm_int8_ncxhwx_4xpackn(int8_t *output, const int8_t *kernel, const int8_t *input, + const int32_t *bias, int m, int k, int n, int32_t out_zp, + int32_t *mult, int32_t *shift); + +void shl_c920v2_ncxhwx_gemm_4xpack2n_int8(int8_t *dst, const int8_t *sa, const int8_t *sb, + int32_t *bias, int m, int k, int n, int32_t out_zp, + int32_t *mult, int32_t *shift) +{ + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int pack2n = packn * 2; + + int oc = 0; + for (; oc + pack2n - 1 < m; oc += pack2n) { + gemm_int8_ncxhwx_4xpack2n(dst, sa, sb, bias, pack2n, k, n, out_zp, mult + oc, shift + oc); + sa += pack2n * k; + dst += pack2n * n; + // please use fuse_zp2bias option in hhb, thus bias_data wont be NULL + bias += pack2n; + } + for (; oc + packn - 1 < m; oc += packn) { + gemm_int8_ncxhwx_4xpackn(dst, sa, sb, bias, packn, k, n, out_zp, mult + oc, shift + oc); + sa += packn * k; + dst += packn * n; + bias += packn; + } + if (oc < m) { + gemm_int8_ncxhwx_4xpackn(dst, sa, sb, bias, m - oc, k, n, out_zp, mult + oc, shift + oc); + } +} diff --git a/source/c920v2_opt/setup.c b/source/c920v2_opt/setup.c new file mode 100644 index 00000000..6f65bf78 --- /dev/null +++ b/source/c920v2_opt/setup.c @@ -0,0 +1,402 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "c920v2/c920v2.h" +#include "c920v2/cap.h" + +#define C920V2_OP_PATTERN_MAX 40 +static struct shl_cb_table shl_c920v2_cb_table[C920V2_OP_PATTERN_MAX]; + +void shl_c920v2_reg_op(enum csinn_dtype_enum dtype, enum csinn_op_enum op_name, void *init, + void *exec, void *est, void *cap) +{ + static int i = 0; + shl_c920v2_cb_table[i].shl_cb_key = op_name * CSINN_DTYPE_SIZE + dtype; + shl_c920v2_cb_table[i].shl_cb_value.init = init; + shl_c920v2_cb_table[i].shl_cb_value.exec = exec; + shl_c920v2_cb_table[i].shl_cb_value.est = est; + shl_c920v2_cb_table[i].shl_cb_value.caps = cap; + i++; +} + +struct csinn_callback *shl_cb_map_rvv(int op, int dtype); +struct csinn_callback *shl_cb_map_c920v2(int op, int dtype) +{ + struct csinn_callback *cb = NULL; + for (int i = 0; i < C920V2_OP_PATTERN_MAX; i++) { + if (shl_c920v2_cb_table[i].shl_cb_key == (op * CSINN_DTYPE_SIZE + dtype)) { + cb = &(shl_c920v2_cb_table[i].shl_cb_value); + break; + } + } + if ((cb == NULL) || (cb->est == NULL && (cb->init == NULL || cb->exec == NULL))) { + cb = shl_cb_map_rvv(op, dtype); + } + return cb; +} + +int shl_c920v2_set_packn_layout(struct csinn_session *sess, bool packn_layout) +{ + struct shl_gref_target_data *gref_td = sess->td; + struct shl_c920v2_option *c920v2_option = gref_td->cpu_option; + c920v2_option->base.use_packn_layout = packn_layout; + return CSINN_TRUE; +} + +struct shl_c920v2_option *shl_c920v2_get_graph_option(struct csinn_session *sess) +{ + struct shl_gref_target_data *gref_td = sess->td; + if (gref_td) { + return (struct shl_c920v2_option 
*)(gref_td->cpu_option); + } else { + return NULL; + } +} + +void shl_c920v2_session_init(struct csinn_session *sess) +{ + struct shl_c920v2_option *c920v2_option = shl_mem_alloc(sizeof(struct shl_c920v2_option)); + struct shl_ref_graph *graph = shl_mem_alloc(sizeof(struct shl_ref_graph)); + struct shl_gref_target_data *target_data = shl_mem_alloc(sizeof(struct shl_gref_target_data)); + target_data->graph = graph; + c920v2_option->base.use_packn_layout = 1; // c920 set use_packn_layout true default + target_data->cpu_option = c920v2_option; + sess->td = target_data; + shl_c920v2_set_binary_model_op_init(sess, false); + sess->base_layout = CSINN_LAYOUT_NCHW; +} + +void shl_c920v2_session_deinit(struct csinn_session *sess) +{ + struct shl_ref_graph *graph = shl_gref_get_graph(sess); + shl_mem_free(graph->input); + shl_mem_free(graph->output); + struct shl_c920v2_option *c920v2_option = shl_c920v2_get_graph_option(sess); + if (c920v2_option) { + shl_mem_free(c920v2_option); + } +} + +static int pre_init(struct shl_node *node) +{ + /* base has same address with params */ + struct csinn_params_base *params = node->data; + + int (*func)(); + + int org_rm = params->sess->base_run_mode; + params->sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_callback *cb = shl_gref_best_callback(node); + + params->sess->base_run_mode = org_rm; + + return CSINN_TRUE; +} + +static int init_op(struct shl_node *node) +{ + /* base has same address with params */ + struct csinn_params_base *params = node->data; + struct csinn_callback *cb = params->cb; + + if (cb->init != NULL) { + if (shl_gref_call_layer_func(cb->init, node) != CSINN_TRUE) { + return CSINN_FALSE; + } + } + + return CSINN_TRUE; +} + +static void sess_op_init(struct csinn_session *sess) +{ + struct shl_ref_graph *graph = shl_gref_get_graph(sess); + + // pre init, find best callback + for (int i = 0; i < graph->layer_index; i++) { + struct shl_node *n = graph->layer[i]; + if (n->type >= 0 && n->type < CSINN_OP_SIZE) { + 
pre_init(n); + } else { + shl_debug_error("Unknown layer\n"); + return; + } + } + + // different layout + bool use_packn = true; + for (int i = 0; i < graph->layer_index; i++) { + struct csinn_params_base *curr_params = graph->layer[i]->data; + if (curr_params->api == CSINN_TVMGEN) { + use_packn = false; + break; + } + } + shl_c920v2_set_packn_layout(sess, use_packn); + + // call init + for (int i = 0; i < graph->layer_index; i++) { + struct shl_node *n = graph->layer[i]; + if (n->type >= 0 && n->type < CSINN_OP_SIZE) { + init_op(n); + } else { + shl_debug_error("Unknown layer\n"); + return; + } + } +} + +void shl_c920v2_session_setup(struct csinn_session *sess) +{ + struct shl_ref_graph *graph = shl_gref_get_graph(sess); + struct shl_node *n; + FILE *b; + char *path; + int bm_offset = 8192; + struct shl_binary_model_section_info *sinfo; + bool save_binary_model = false; + + if (sess->model.save_mode == CSINN_SAVE_AND_RUN || sess->model.save_mode == CSINN_SAVE_ONLY) { + if (sess->base_dtype == CSINN_DTYPE_FLOAT16 || sess->base_dtype == CSINN_DTYPE_FLOAT32) { + save_binary_model = true; + } else { + shl_debug_warning("Unsupport to save this dtype binary model yet\n"); + } + } + + struct shl_ref_graph *ggraph = graph; + + sess_op_init(sess); + + for (int i = 0; i < ggraph->layer_index; i++) { + n = ggraph->layer[i]; + for (int j = 0; j < n->in_num; j++) { + if (n->in[j]->ref_count_init > 0) { + n->in[j]->ref_count_init++; + } + } + if (n->type != CSINN_SUBGRAPH) { + for (int k = 0; k < n->out_num; k++) { + n->out[k]->ref_count_init++; + } + } + } + + for (int i = 0; i < ggraph->output_num; i++) { + ggraph->output[i]->ref_count_init++; + } + + if (save_binary_model) { + if (sess->model.bm_path == NULL) { + path = "shl.hhb.bm"; + } else { + path = sess->model.bm_path; + } + b = fopen(path, "wb"); + shl_dump_bm_header(b); + + /* TODO: start from more */ + bm_offset = 8192; + fseek(b, bm_offset, SEEK_SET); + sinfo = shl_mem_alloc(sizeof(struct 
shl_binary_model_section_info)); + + /* only dump top(global) graph, unsupport subgraph */ + fseek(b, bm_offset, SEEK_SET); + int ggraph_size = shl_dump_bm_graph_struct_section(b, ggraph); + sinfo->sections[0].graph_offset = bm_offset / 4096; + sinfo->sections[0].graph_size = ggraph_size; + bm_offset = shl_gref_size_align(bm_offset + ggraph_size, 4096); + + fseek(b, bm_offset, SEEK_SET); + int info_size = shl_dump_bm_graph_info_section(b, sess); + sinfo->sections[0].info_offset = bm_offset / 4096; + sinfo->sections[0].info_size = info_size; + bm_offset = shl_gref_size_align(bm_offset + info_size, 4096); + + /* save section info */ + sinfo->section_num = 2; + fseek(b, 4096, SEEK_SET); + shl_dump_bm_section_info(b, sinfo); + fclose(b); + } +} + +/* use tensor name to match same */ +static void merge_output(struct shl_ref_graph *graph, struct csinn_session *sess) +{ + /* match graph output */ + for (int i = 0; i < graph->output_num; i++) { + struct shl_node *gnode = graph->output[i]; + char *sname = gnode->name; + for (int j = 0; j < graph->layer_index; j++) { + struct shl_node *node = graph->layer[j]; + for (int m = 0; m < node->out_num; m++) { + if (strcmp(node->name, sname) == 0) { + /* TODO: free graph output node */ + graph->output[i] = node->out[m]; + break; + } + } + } + } + + for (int i = 0; i < sess->input_num; i++) { + /* TODO: free sess output node */ + sess->input[i] = graph->input[i]->data; + } + + for (int i = 0; i < sess->output_num; i++) { + /* TODO: free sess output node */ + sess->output[i] = graph->output[i]->data; + } +} + +static void graph_match_session(struct shl_ref_graph *graph, struct csinn_session *sess) +{ + struct shl_gref_target_data *td = sess->td; + td->graph = graph; + + for (int i = 0; i < graph->layer_index; i++) { + struct shl_node *n = graph->layer[i]; + /* fix op callback, skip subgraph */ + if (n->type < CSINN_OP_SIZE) { + struct csinn_params_base *base = n->data; + base->sess = sess; + struct csinn_tensor *input = 
n->in[0]->data; + + int org_rm = base->sess->base_run_mode; + base->sess->base_run_mode = CSINN_RM_LAYER; + shl_op_callback_map(base, n->type, input->dtype); + base->sess->base_run_mode = org_rm; + } + } +} + +int shl_c920v2_load_binary_model(struct csinn_session *sess) +{ + char *bm_base = sess->model.bm_addr; + struct shl_binary_model_section_info *sinfo = + (struct shl_binary_model_section_info *)(bm_base + 4096); + struct shl_ref_graph *ggraph = shl_mem_alloc(sizeof(struct shl_ref_graph)); + shl_bm_graph_struct_load( + ggraph, (struct shl_ref_graph *)(bm_base + sinfo->sections[0].graph_offset * 4096)); + graph_match_session(ggraph, sess); + merge_output(ggraph, sess); + shl_c920v2_set_binary_model_op_init(sess, true); + sess_op_init(sess); + + return CSINN_TRUE; +} + +int shl_c920v2_get_output(int index, struct csinn_tensor *output, struct csinn_session *sess) +{ + struct csinn_tensor *sess_output = sess->output[index]; + struct shl_c920v2_option *option = shl_c920v2_get_graph_option(sess); + if (option && option->base.use_packn_layout) { + if (output->layout == CSINN_LAYOUT_NC1DHWC0 || output->layout == CSINN_LAYOUT_NC1HWC0 || + output->layout == CSINN_LAYOUT_NC1WC0 || output->layout == CSINN_LAYOUT_NC1C0) { + if (output->dtype == CSINN_DTYPE_FLOAT32) { + shl_rvv_tensor_nc1xc0_to_ndarray_inplace_fp32(output); + } else if (output->dtype == CSINN_DTYPE_FLOAT16) { + shl_rvv_tensor_nc1xc0_to_ndarray_inplace_fp16(output); + } else if (output->dtype == CSINN_DTYPE_INT8) { + shl_rvv_tensor_nc1xc0_to_ndarray_inplace_int8(output); + } else { + shl_debug_error("c920 get output unsupported dtype: %d\n", output->dtype); + return CSINN_UNSUPPORT_DTYPE; + } + + /* TODO: unset sess_output, alloc another data space and copy to output */ + sess_output->dim[1] = + sess_output->dim[1] * sess_output->dim[sess_output->dim_count - 1]; + sess_output->dim[sess_output->dim_count - 1] = 0; + sess_output->dim_count = sess_output->dim_count - 1; + if (sess_output->layout == 
CSINN_LAYOUT_NC1DHWC0) { + sess_output->layout = CSINN_LAYOUT_NCDHW; + } else if (sess_output->layout == CSINN_LAYOUT_NC1HWC0) { + sess_output->layout = CSINN_LAYOUT_NCHW; + } else if (sess_output->layout == CSINN_LAYOUT_NC1WC0) { + sess_output->layout = CSINN_LAYOUT_NCW; + } else if (sess_output->layout == CSINN_LAYOUT_NC1C0) { + sess_output->layout = CSINN_LAYOUT_NC; + } + } + } + + return CSINN_TRUE; +} + +void *shl_c920v2_runtime_callback(int api) +{ + switch (api) { + case CSINN_SESSION_INIT: + return shl_c920v2_session_init; + break; + case CSINN_SESSION_DEINIT: + return shl_c920v2_session_deinit; + break; + case CSINN_SESSION_SETUP: + return shl_c920v2_session_setup; + break; + case CSINN_LOAD_BG: + return shl_c920v2_load_binary_model; + break; + case CSINN_GET_OUTPUT: + return shl_c920v2_get_output; + break; + case CSINN_SESSION_RUN: + case CSINN_UPDATE_INPUT: + case CSINN_UPDATE_OUTPUT: + case CSINN_SET_INPUT_NUMBER: + case CSINN_SET_OUTPUT_NUMBER: + case CSINN_SET_INPUT: + case CSINN_SET_OUTPUT: + case CSINN_GET_INPUT: + case CSINN_TENSOR_ENTRY: + return shl_gref_runtime_callback(api); + break; + default: + shl_debug_info("%s: Cannot find callback\n", __func__); + break; + } + return NULL; +} + +void shl_target_init_c920v2() +{ +#ifndef CONFIG_C920V2_CONVOLUTION_FP32_DISABLED + shl_c920v2_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CONV2D, shl_c920v2_conv2d_init_fp32, NULL, + shl_gref_conv2d, shl_c920v2_conv2d_cap); + shl_c920v2_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_GROUP_CONV2D, shl_c920v2_conv2d_init_fp32, NULL, + shl_gref_group_conv2d, shl_c920v2_conv2d_cap); +#endif +#ifndef CONFIG_C920V2_CONVOLUTION_FP16_DISABLED + shl_c920v2_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CONV2D, shl_c920v2_conv2d_init_fp16, NULL, + shl_gref_conv2d, shl_c920v2_conv2d_cap); + shl_c920v2_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_GROUP_CONV2D, shl_c920v2_conv2d_init_fp16, NULL, + shl_gref_group_conv2d, shl_c920v2_conv2d_cap); +#endif +#ifndef CONFIG_C920V2_CONVOLUTION_INT8_DISABLED + 
shl_c920v2_reg_op(CSINN_DTYPE_INT8, CSINN_OP_CONV2D, shl_c920v2_conv2d_init_int8, NULL, + shl_gref_conv2d, shl_c920v2_conv2d_cap); + shl_c920v2_reg_op(CSINN_DTYPE_INT8, CSINN_OP_GROUP_CONV2D, shl_c920v2_conv2d_init_int8, NULL, + shl_gref_group_conv2d, shl_c920v2_conv2d_cap); +#endif + shl_register_op_callback(CSINN_C920V2, shl_cb_map_c920v2); + shl_register_runtime_callback(CSINN_C920V2, shl_c920v2_runtime_callback); +} diff --git a/source/c920v2_opt/utils.c b/source/c920v2_opt/utils.c new file mode 100644 index 00000000..54cfa20a --- /dev/null +++ b/source/c920v2_opt/utils.c @@ -0,0 +1,35 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "c920v2/c920v2.h" + +bool shl_c920v2_get_binary_model_op_init(struct csinn_session *sess) +{ + struct shl_c920v2_option *option = shl_c920v2_get_graph_option(sess); + if (option && option->base.binary_model_op_init) { + return true; + } else { + return false; + } +} + +void shl_c920v2_set_binary_model_op_init(struct csinn_session *sess, bool value) +{ + struct shl_c920v2_option *option = shl_c920v2_get_graph_option(sess); + option->base.binary_model_op_init = value; +} diff --git a/source/e907_opt/concat.c b/source/e907_opt/concat.c index 91c11341..94bbbbb5 100644 --- a/source/e907_opt/concat.c +++ b/source/e907_opt/concat.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_e907.h" +#include "e907/e907.h" int shl_e907_concat_int8(struct csinn_tensor **input, struct csinn_tensor *output, struct csinn_concat_params *params) diff --git a/source/e907_opt/convolution.c b/source/e907_opt/convolution.c index a1ec792d..c398d680 100644 --- a/source/e907_opt/convolution.c +++ b/source/e907_opt/convolution.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_e907.h" +#include "e907/e907.h" int shl_e907_conv2d_int8(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, struct csinn_tensor *bias, diff --git a/source/e907_opt/fullyconnected.c b/source/e907_opt/fullyconnected.c index 7b795a38..1e39322d 100644 --- a/source/e907_opt/fullyconnected.c +++ b/source/e907_opt/fullyconnected.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_e907.h" +#include "e907/e907.h" int shl_e907_fullyconnected_init(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *weights, struct csinn_tensor *bias, diff --git a/source/e907_opt/fullyconnected_int8.c b/source/e907_opt/fullyconnected_int8.c index 0c65a1dd..99c43e40 100644 --- a/source/e907_opt/fullyconnected_int8.c +++ b/source/e907_opt/fullyconnected_int8.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_e907.h" +#include "e907/e907.h" static void shl_e907_fullyconnectd_int8_internel(const int8_t *input, int32_t *output, int8_t *weight, const int32_t *bias, int in_nodes, diff --git a/source/e907_opt/mul.c b/source/e907_opt/mul.c index b41221f1..bc0a7914 100644 --- a/source/e907_opt/mul.c +++ b/source/e907_opt/mul.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_e907.h" +#include "e907/e907.h" /************************************************************************************ * s3*(q3-z3) = s1*(q1-z1) * s2*(q2-z2) diff --git a/source/e907_opt/relu.c b/source/e907_opt/relu.c index 4e45a08e..9149c86b 100644 --- a/source/e907_opt/relu.c +++ b/source/e907_opt/relu.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_e907.h" +#include "e907/e907.h" /************************************************************************************ * s2(q2 - z2) = relu{ s1(q1 - z1) } diff --git a/source/e907_opt/setup.c b/source/e907_opt/setup.c index fe248189..0af29d74 100644 --- a/source/e907_opt/setup.c +++ b/source/e907_opt/setup.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_e907.h" +#include "e907/e907.h" static struct shl_cb_op_list shl_e907_cb_op_list; diff --git a/source/e907_opt/softmax.c b/source/e907_opt/softmax.c index 2e4002e2..e0c43241 100644 --- a/source/e907_opt/softmax.c +++ b/source/e907_opt/softmax.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_e907.h" +#include "e907/e907.h" static inline float fast_exp(float y) { diff --git a/source/e907_opt/sum.c b/source/e907_opt/sum.c index 1650f4a3..4cfaf6a5 100644 --- a/source/e907_opt/sum.c +++ b/source/e907_opt/sum.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_e907.h" +#include "e907/e907.h" int shl_e907_sum_int8(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_reduce_params *params) diff --git a/source/e907_opt/utils.c b/source/e907_opt/utils.c index 947082d7..97ea4778 100644 --- a/source/e907_opt/utils.c +++ b/source/e907_opt/utils.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_e907.h" +#include "e907/e907.h" int shl_rvp_get_xlenb() { diff --git a/source/graph_ref/avgpool3d.c b/source/graph_ref/avgpool3d.c index 76451709..b0aedcc8 100644 --- a/source/graph_ref/avgpool3d.c +++ b/source/graph_ref/avgpool3d.c @@ -41,19 +41,27 @@ int shl_gref_global_avgpool3d(struct csinn_tensor *input, struct csinn_tensor *o int shl_gref_global_avgpool3d_infer_shape(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_pool_params *params) { - int d, h, w; - if (output->layout == CSINN_LAYOUT_NCDHW) { + int n, c, d, h, w; + shl_tensor_try_nc1xc0_to_ndarray_shape(input); + if (input->layout == CSINN_LAYOUT_NCDHW) { + n = 0; + c = 1; d = 2; h = 3; w = 4; - } else if (output->layout == CSINN_LAYOUT_NDHWC) { + } else if (input->layout == CSINN_LAYOUT_NDHWC) { + n = 0; d = 1; h = 2; w = 3; + c = 4; } else { + shl_debug_error("%s: Invalid input tensor layout!\n", __func__); return CSINN_UNSUPPORT_LAYOUT; } - + output->dim_count = 5; + output->dim[n] = input->dim[n]; + output->dim[c] = input->dim[c]; output->dim[d] = 1; output->dim[h] = 1; output->dim[w] = 1; diff --git a/source/graph_ref/batch_to_space.c b/source/graph_ref/batch_to_space.c index 56b7c6ac..51d9bd77 100644 --- a/source/graph_ref/batch_to_space.c +++ b/source/graph_ref/batch_to_space.c @@ -30,15 +30,17 @@ int shl_gref_batch_to_space_infer_shape(struct csinn_tensor *input, struct csinn struct csinn_batch_to_space_params *params) { int h, w, c; - if (output->layout == CSINN_LAYOUT_NCHW) { + shl_tensor_try_nc1xc0_to_ndarray_shape(input); + if (input->layout == CSINN_LAYOUT_NCHW) { c = 1; h = 2; w = 3; - } else if (output->layout == CSINN_LAYOUT_NHWC) { + } else if (input->layout == CSINN_LAYOUT_NHWC) { h = 1; w = 2; c = 3; } else { + shl_debug_error("%s: Invalid input tensor layout!\n", __func__); return CSINN_UNSUPPORT_LAYOUT; } diff --git a/source/graph_ref/batch_to_space_nd.c b/source/graph_ref/batch_to_space_nd.c index d5ee89ba..60801c47 100644 --- 
a/source/graph_ref/batch_to_space_nd.c +++ b/source/graph_ref/batch_to_space_nd.c @@ -29,6 +29,7 @@ int shl_gref_batch_to_space_nd(struct csinn_tensor *input, struct csinn_tensor * int shl_gref_batch_to_space_nd_infer_shape(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_batch_to_space_nd_params *params) { + shl_tensor_try_nc1xc0_to_ndarray_shape(input); int32_t block_size = 1; for (int i = 0; i < params->spatial_dim_cnt; i++) { block_size *= params->block_shape[i]; diff --git a/source/graph_ref/concat.c b/source/graph_ref/concat.c index a80701bf..230feafa 100644 --- a/source/graph_ref/concat.c +++ b/source/graph_ref/concat.c @@ -46,9 +46,11 @@ int shl_gref_concat(struct csinn_tensor **input, struct csinn_tensor *output, int shl_gref_concat_infer_shape(struct csinn_tensor **input, struct csinn_tensor *output, struct csinn_concat_params *params) { - for (int i = 1; i < params->inputs_count; i++) { + for (int i = 0; i < params->inputs_count; i++) { + shl_tensor_try_nc1xc0_to_ndarray_shape(input[i]); if (input[i]->dim_count != input[0]->dim_count) { shl_debug_error("all inputs must have same shape size!\n"); + return CSINN_FALSE; } } output->dim_count = input[0]->dim_count; diff --git a/source/graph_ref/convolution.c b/source/graph_ref/convolution.c index 56ee22df..df361f14 100644 --- a/source/graph_ref/convolution.c +++ b/source/graph_ref/convolution.c @@ -30,16 +30,20 @@ int shl_gref_conv2d_infer_shape(struct csinn_tensor *input, struct csinn_tensor struct csinn_tensor *kernel, struct csinn_tensor *bias, struct csinn_conv2d_params *params) { - int c, h, w; - if (output->layout == CSINN_LAYOUT_NCHW) { + int c, h, w, kernel_oc; + shl_tensor_try_nc1xc0_to_ndarray_shape(input); + if (input->layout == CSINN_LAYOUT_NCHW) { c = 1; h = 2; w = 3; - } else if (output->layout == CSINN_LAYOUT_NHWC) { + kernel_oc = kernel->dim[0]; + } else if (input->layout == CSINN_LAYOUT_NHWC) { h = 1; w = 2; c = 3; + kernel_oc = params->group == input->dim[c] ? 
kernel->dim[3] : kernel->dim[0]; } else { + shl_debug_error("%s: Invalid input tensor layout!\n", __func__); return CSINN_UNSUPPORT_LAYOUT; } @@ -54,9 +58,10 @@ int shl_gref_conv2d_infer_shape(struct csinn_tensor *input, struct csinn_tensor int32_t dilation_h = params->dilation_height; int32_t dilation_w = params->dilation_width; - output->dim_count = kernel->dim_count; + output->layout = input->layout; + output->dim_count = 4; output->dim[0] = input->dim[0]; // N - output->dim[c] = kernel->dim[0]; + output->dim[c] = kernel_oc; output->dim[h] = (in_h + padding_h - dilation_h * (kernel_h - 1) - 1) / stride_h + 1; output->dim[w] = (in_w + padding_w - dilation_w * (kernel_w - 1) - 1) / stride_w + 1; return CSINN_TRUE; diff --git a/source/graph_ref/convolution1d.c b/source/graph_ref/convolution1d.c index cb2eca64..8dd0cd2b 100644 --- a/source/graph_ref/convolution1d.c +++ b/source/graph_ref/convolution1d.c @@ -30,16 +30,18 @@ int shl_gref_conv1d_infer_shape(struct csinn_tensor *input, struct csinn_tensor struct csinn_tensor *kernel, struct csinn_tensor *bias, struct csinn_conv1d_params *params) { - int c, w, kernel_c; - if (output->layout == CSINN_LAYOUT_NCW) { + int c, w, kernel_oc; + shl_tensor_try_nc1xc0_to_ndarray_shape(input); + if (input->layout == CSINN_LAYOUT_NCW) { c = 1; w = 2; - kernel_c = kernel->dim[0]; - } else if (output->layout == CSINN_LAYOUT_NWC) { + kernel_oc = kernel->dim[0]; + } else if (input->layout == CSINN_LAYOUT_NWC) { w = 1; c = 2; - kernel_c = kernel->dim[2]; + kernel_oc = params->group == input->dim[c] ? 
kernel->dim[2] : kernel->dim[0]; } else { + shl_debug_error("%s: Invalid input tensor layout!\n", __func__); return CSINN_UNSUPPORT_LAYOUT; } @@ -47,12 +49,13 @@ int shl_gref_conv1d_infer_shape(struct csinn_tensor *input, struct csinn_tensor int32_t kernel_w = kernel->dim[w]; int32_t padding_w = params->pad_left + params->pad_right; int32_t stride_w = params->stride_width; - int32_t dalition_w = params->dilation_width; + int32_t dilation_w = params->dilation_width; - output->dim_count = input->dim_count; + output->layout = input->layout; + output->dim_count = 3; output->dim[0] = input->dim[0]; // N - output->dim[c] = kernel_c; - output->dim[w] = (in_w + padding_w - dalition_w * (kernel_w - 1) - 1) / stride_w + 1; + output->dim[c] = kernel_oc; + output->dim[w] = (in_w + padding_w - dilation_w * (kernel_w - 1) - 1) / stride_w + 1; return CSINN_TRUE; } diff --git a/source/graph_ref/convolution3d.c b/source/graph_ref/convolution3d.c index db50149f..01ca1d37 100644 --- a/source/graph_ref/convolution3d.c +++ b/source/graph_ref/convolution3d.c @@ -30,16 +30,22 @@ int shl_gref_conv3d_infer_shape(struct csinn_tensor *input, struct csinn_tensor struct csinn_tensor *kernel, struct csinn_tensor *bias, struct csinn_conv3d_params *params) { - int d, h, w; - if (output->layout == CSINN_LAYOUT_NCDHW) { + int c, d, h, w, kernel_oc; + shl_tensor_try_nc1xc0_to_ndarray_shape(input); + if (input->layout == CSINN_LAYOUT_NCDHW) { + c = 1; d = 2; h = 3; w = 4; - } else if (output->layout == CSINN_LAYOUT_NDHWC) { + kernel_oc = kernel->dim[0]; + } else if (input->layout == CSINN_LAYOUT_NDHWC) { d = 1; h = 2; w = 3; + c = 4; + kernel_oc = params->group == input->dim[c] ? 
kernel->dim[4] : kernel->dim[0]; } else { + shl_debug_error("%s: Invalid input tensor layout!\n", __func__); return CSINN_UNSUPPORT_LAYOUT; } @@ -59,7 +65,10 @@ int shl_gref_conv3d_infer_shape(struct csinn_tensor *input, struct csinn_tensor int32_t dilation_h = params->dilation_height; int32_t dilation_w = params->dilation_width; - output->dim_count = input->dim_count; + output->layout = input->layout; + output->dim_count = 5; + output->dim[0] = input->dim[0]; + output->dim[c] = kernel_oc; output->dim[d] = (in_d + padding_d - dilation_d * (kernel_d - 1) - 1) / stride_d + 1; output->dim[h] = (in_h + padding_h - dilation_h * (kernel_h - 1) - 1) / stride_h + 1; output->dim[w] = (in_w + padding_w - dilation_w * (kernel_w - 1) - 1) / stride_w + 1; diff --git a/source/graph_ref/deconvolution.c b/source/graph_ref/deconvolution.c index 5a0bd0bb..40655d6f 100644 --- a/source/graph_ref/deconvolution.c +++ b/source/graph_ref/deconvolution.c @@ -30,14 +30,20 @@ int shl_gref_deconv2d_infer_shape(struct csinn_tensor *input, struct csinn_tenso struct csinn_tensor *kernel, struct csinn_tensor *bias, struct csinn_conv2d_params *params) { - int h, w; - if (output->layout == CSINN_LAYOUT_NCHW) { + int c, h, w, kernel_oc; + shl_tensor_try_nc1xc0_to_ndarray_shape(input); + if (input->layout == CSINN_LAYOUT_NCHW) { + c = 1; h = 2; w = 3; - } else if (output->layout == CSINN_LAYOUT_NHWC) { + kernel_oc = params->group == input->dim[c] ? kernel->dim[0] : kernel->dim[1]; + } else if (input->layout == CSINN_LAYOUT_NHWC) { h = 1; w = 2; + c = 3; + kernel_oc = params->group == input->dim[c] ? 
kernel->dim[3] : kernel->dim[0]; } else { + shl_debug_error("%s: Invalid input tensor layout!\n", __func__); return CSINN_UNSUPPORT_LAYOUT; } @@ -52,7 +58,9 @@ int shl_gref_deconv2d_infer_shape(struct csinn_tensor *input, struct csinn_tenso int32_t dilation_h = params->dilation_height; int32_t dilation_w = params->dilation_width; - output->dim_count = input->dim_count; + output->dim_count = 4; + output->dim[0] = input->dim[0]; + output->dim[c] = kernel_oc; output->dim[h] = (in_h - 1) * stride_h - padding_h + dilation_h * (kernel_h - 1) + 1; output->dim[w] = (in_w - 1) * stride_w - padding_w + dilation_w * (kernel_w - 1) + 1; diff --git a/source/graph_ref/deconvolution3d.c b/source/graph_ref/deconvolution3d.c index ae0075c6..d0951775 100644 --- a/source/graph_ref/deconvolution3d.c +++ b/source/graph_ref/deconvolution3d.c @@ -30,16 +30,22 @@ int shl_gref_deconv3d_infer_shape(struct csinn_tensor *input, struct csinn_tenso struct csinn_tensor *kernel, struct csinn_tensor *bias, struct csinn_conv3d_params *params) { - int d, h, w; + int c, d, h, w, kernel_oc; + shl_tensor_try_nc1xc0_to_ndarray_shape(input); if (output->layout == CSINN_LAYOUT_NCDHW) { + c = 1; d = 2; h = 3; w = 4; + kernel_oc = params->group == input->dim[c] ? kernel->dim[0] : kernel->dim[1]; } else if (output->layout == CSINN_LAYOUT_NDHWC) { d = 1; h = 2; w = 3; + c = 4; + kernel_oc = params->group == input->dim[c] ? 
kernel->dim[4] : kernel->dim[0]; } else { + shl_debug_error("%s: Invalid input tensor layout!\n", __func__); return CSINN_UNSUPPORT_LAYOUT; } @@ -59,7 +65,9 @@ int shl_gref_deconv3d_infer_shape(struct csinn_tensor *input, struct csinn_tenso int32_t dilation_h = params->dilation_height; int32_t dilation_w = params->dilation_width; - output->dim_count = input->dim_count; + output->dim_count = 5; + output->dim[0] = input->dim[0]; + output->dim[c] = kernel_oc; output->dim[d] = (in_d - 1) * stride_d - padding_d + dilation_d * (kernel_d - 1) + 1; output->dim[h] = (in_h - 1) * stride_h - padding_h + dilation_h * (kernel_h - 1) + 1; output->dim[w] = (in_w - 1) * stride_w - padding_w + dilation_w * (kernel_w - 1) + 1; diff --git a/source/graph_ref/depth_to_space.c b/source/graph_ref/depth_to_space.c index ddfba1a6..4d2dc6f9 100644 --- a/source/graph_ref/depth_to_space.c +++ b/source/graph_ref/depth_to_space.c @@ -30,11 +30,12 @@ int shl_gref_depth_to_space_infer_shape(struct csinn_tensor *input, struct csinn struct csinn_depth_to_space_params *params) { int h, w, c; - if (output->layout == CSINN_LAYOUT_NCHW) { + shl_tensor_try_nc1xc0_to_ndarray_shape(input); + if (input->layout == CSINN_LAYOUT_NCHW) { c = 1; h = 2; w = 3; - } else if (output->layout == CSINN_LAYOUT_NHWC) { + } else if (input->layout == CSINN_LAYOUT_NHWC) { h = 1; w = 2; c = 3; diff --git a/source/graph_ref/expand_dims.c b/source/graph_ref/expand_dims.c index d57ed9a8..ab270138 100644 --- a/source/graph_ref/expand_dims.c +++ b/source/graph_ref/expand_dims.c @@ -28,6 +28,7 @@ int shl_gref_expand_dims(struct csinn_tensor *input, struct csinn_tensor *output int shl_gref_expand_dims_infer_shape(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_expand_dims_params *params) { + shl_tensor_try_nc1xc0_to_ndarray_shape(input); output->dim_count = input->dim_count + 1; if (params->axis == -1) { for (int i = 0; i < input->dim_count; i++) { diff --git a/source/graph_ref/flatten.c 
b/source/graph_ref/flatten.c index 8e0f21d3..7873394d 100644 --- a/source/graph_ref/flatten.c +++ b/source/graph_ref/flatten.c @@ -28,6 +28,7 @@ int shl_gref_flatten(struct csinn_tensor *input, struct csinn_tensor *output, int shl_gref_flatten_infer_shape(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_flatten_params *params) { + shl_tensor_try_nc1xc0_to_ndarray_shape(input); int in_size = 1; for (int i = 0; i < input->dim_count; i++) { in_size *= input->dim[i]; diff --git a/source/graph_ref/fullyconnected.c b/source/graph_ref/fullyconnected.c index c7756efc..f2dd77de 100644 --- a/source/graph_ref/fullyconnected.c +++ b/source/graph_ref/fullyconnected.c @@ -30,6 +30,7 @@ int shl_gref_fullyconnected_infer_shape(struct csinn_tensor *input, struct csinn struct csinn_tensor *weights, struct csinn_tensor *bias, struct csinn_fc_params *params) { + shl_tensor_try_nc1xc0_to_ndarray_shape(input); output->dim_count = input->dim_count; for (int i = 0; i < input->dim_count; i++) { output->dim[i] = input->dim[i]; diff --git a/source/graph_ref/gather.c b/source/graph_ref/gather.c index 9d6cd324..be4b3004 100644 --- a/source/graph_ref/gather.c +++ b/source/graph_ref/gather.c @@ -28,6 +28,7 @@ int shl_gref_gather(struct csinn_tensor *input, struct csinn_tensor *indices, int shl_gref_gather_infer_shape(struct csinn_tensor *input, struct csinn_tensor *indices, struct csinn_tensor *output, struct csinn_gather_params *params) { + shl_tensor_try_nc1xc0_to_ndarray_shape(input); int32_t axis = params->axis; int32_t indices_dim_count = indices->dim_count; // if indices is a single number diff --git a/source/graph_ref/im2col.c b/source/graph_ref/im2col.c index da50744c..55234c1d 100644 --- a/source/graph_ref/im2col.c +++ b/source/graph_ref/im2col.c @@ -30,15 +30,17 @@ int shl_gref_im2col_infer_shape(struct csinn_tensor *input, struct csinn_tensor struct csinn_im2col_params *params) { int c, h, w; - if (output->layout == CSINN_LAYOUT_NCHW) { + 
shl_tensor_try_nc1xc0_to_ndarray_shape(input); + if (input->layout == CSINN_LAYOUT_NCHW) { c = 1; h = 2; w = 3; - } else if (output->layout == CSINN_LAYOUT_NHWC) { + } else if (input->layout == CSINN_LAYOUT_NHWC) { h = 1; w = 2; c = 3; } else { + shl_debug_error("%s: Invalid input tensor layout!\n", __func__); return CSINN_UNSUPPORT_LAYOUT; } diff --git a/source/graph_ref/instance_norm.c b/source/graph_ref/instance_norm.c index f6e68b72..2c2b6ff8 100644 --- a/source/graph_ref/instance_norm.c +++ b/source/graph_ref/instance_norm.c @@ -30,6 +30,7 @@ int shl_gref_instance_norm_infer_shape(struct csinn_tensor *input, struct csinn_ struct csinn_tensor *bias, struct csinn_tensor *output, struct csinn_instance_norm_params *params) { + shl_tensor_try_nc1xc0_to_ndarray_shape(input); output->dim_count = input->dim_count; for (int i = 0; i < input->dim_count; i++) { output->dim[i] = input->dim[i]; diff --git a/source/graph_ref/matmul.c b/source/graph_ref/matmul.c index 53b549d7..ee6181ca 100644 --- a/source/graph_ref/matmul.c +++ b/source/graph_ref/matmul.c @@ -29,9 +29,25 @@ int shl_gref_matmul(struct csinn_tensor *mat0, struct csinn_tensor *mat1, int shl_gref_matmul_infer_shape(struct csinn_tensor *mat0, struct csinn_tensor *mat1, struct csinn_tensor *output, struct csinn_matmul_params *params) { - output->dim_count = mat0->dim_count; + shl_tensor_try_nc1xc0_to_ndarray_shape(mat0); + shl_tensor_try_nc1xc0_to_ndarray_shape(mat1); + + output->dim_count = mat0->dim_count > mat1->dim_count ? mat0->dim_count : mat1->dim_count; for (int i = 0; i < output->dim_count - 2; i++) { - output->dim[i] = mat0->dim[i]; + const int d1 = mat0->dim_count - 3 - i; + const int d2 = mat1->dim_count - 3 - i; + const int s1 = d1 >= 0 ? mat0->dim[d1] : 1; + const int s2 = d2 >= 0 ? 
mat1->dim[d2] : 1; + if (s1 == s2) { + output->dim[output->dim_count - 3 - i] = s1; + } else if (s1 == 1) { + output->dim[output->dim_count - 3 - i] = s2; + } else if (s2 == 1) { + output->dim[output->dim_count - 3 - i] = s1; + } else { + shl_debug_error("%s: Invalid shapes for matmul broadcast!\n", __func__); + return CSINN_FALSE; + } } output->dim[output->dim_count - 2] = mat0->dim[mat0->dim_count - (params->trans_a ? 1 : 2)]; output->dim[output->dim_count - 1] = mat1->dim[mat1->dim_count - (params->trans_b ? 2 : 1)]; diff --git a/source/graph_ref/one_hot.c b/source/graph_ref/one_hot.c index 63cb6388..a04ff52b 100644 --- a/source/graph_ref/one_hot.c +++ b/source/graph_ref/one_hot.c @@ -28,6 +28,7 @@ int shl_gref_one_hot(struct csinn_tensor *input, struct csinn_tensor *output, int shl_gref_one_hot_infer_shape(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_one_hot_params *params) { + shl_tensor_try_nc1xc0_to_ndarray_shape(input); output->dim_count = input->dim_count + 1; if (params->axis == -1) { for (int i = 0; i < input->dim_count; i++) { diff --git a/source/graph_ref/pad.c b/source/graph_ref/pad.c index 580f3023..756a0349 100644 --- a/source/graph_ref/pad.c +++ b/source/graph_ref/pad.c @@ -29,16 +29,7 @@ int shl_gref_pad_infer_shape(struct csinn_tensor *input, struct csinn_tensor *ou struct csinn_pad_params *params) { int h, w; - if (output->layout == CSINN_LAYOUT_NCHW) { - h = 2; - w = 3; - } else if (output->layout == CSINN_LAYOUT_NHWC) { - h = 1; - w = 2; - } else { - return CSINN_UNSUPPORT_LAYOUT; - } - + shl_tensor_try_nc1xc0_to_ndarray_shape(input); output->dim_count = input->dim_count; for (int i = 0; i < output->dim_count; i++) { output->dim[i] = input->dim[i] + params->pad_before[i] + params->pad_after[i]; diff --git a/source/graph_ref/sequence_mask.c b/source/graph_ref/sequence_mask.c index edaee19c..c07ca5e6 100644 --- a/source/graph_ref/sequence_mask.c +++ b/source/graph_ref/sequence_mask.c @@ -29,6 +29,8 @@ int 
shl_gref_sequence_mask_infer_shape(struct csinn_tensor *input0, struct csinn struct csinn_tensor *output, struct csinn_sequence_mask_params *params) { + shl_tensor_try_nc1xc0_to_ndarray_shape(input0); + shl_tensor_try_nc1xc0_to_ndarray_shape(input1); int maxlen = 0; if (input1->dim_count > 0) { int32_t *input1_data = (int32_t *)input1->data; diff --git a/source/graph_ref/setup.c b/source/graph_ref/setup.c index 1fbff724..7f62f5c1 100644 --- a/source/graph_ref/setup.c +++ b/source/graph_ref/setup.c @@ -17,8 +17,8 @@ */ #include "shl_gref.h" -#include "shl_tvmgen.h" #include "shl_utils.h" +#include "tvmgen/shl_tvmgen.h" void shl_gref_set_output_number(int number, struct csinn_session *sess) { @@ -590,31 +590,46 @@ static void graph_match_session(struct shl_ref_graph *graph, struct csinn_sessio } } +static int find_layer_index_by_name(char *name, struct shl_node **layers, int len) +{ + for (int i = 0; i < len; i++) { + for (int j = 0; j < layers[i]->out_num; j++) { + if (strcmp(layers[i]->out[j]->name, name) == 0) { + return i; + } + } + } + return -1; +} + /* use tensor name to match same */ static void merge_output(struct shl_ref_graph *ggraph, struct shl_ref_graph **sgraphs, int subgraph_num) { for (int i = 0; i < subgraph_num; i++) { struct shl_ref_graph *sgraph = sgraphs[i]; - /* sub graph last layer */ - struct shl_node *snode = sgraph->layer[sgraph->layer_index - 2]; - for (int j = 0; j < snode->out_num; j++) { - char *sname = snode->out[j]->name; - /* match layer input */ - for (int k = 0; k < ggraph->layer_index; k++) { - struct shl_node *glayer = ggraph->layer[k]; - /* TODO: free node in ggraph */ - for (int m = 0; m < glayer->in_num; m++) { - if (strcmp(glayer->in[m]->name, sname) == 0) { - glayer->in[m] = snode->out[j]; + for (int l = 0; l < sgraph->output_num; l++) { + int slayer_index = find_layer_index_by_name(sgraph->output[l]->name, sgraph->layer, + sgraph->layer_index); + struct shl_node *slayer = sgraph->layer[slayer_index]; + for (int j = 0; j < 
slayer->out_num; j++) { + char *sname = slayer->out[j]->name; + /* match layer input */ + for (int k = 0; k < ggraph->layer_index; k++) { + struct shl_node *glayer = ggraph->layer[k]; + /* TODO: free node in ggraph */ + for (int m = 0; m < glayer->in_num; m++) { + if (strcmp(glayer->in[m]->name, sname) == 0) { + glayer->in[m] = slayer->out[j]; + } } } - } - /* match graph output */ - for (int n = 0; n < ggraph->output_num; n++) { - struct shl_node *gnode = ggraph->output[n]; - if (strcmp(gnode->name, sname) == 0) { - ggraph->output[n] = snode->out[j]; + /* match graph output */ + for (int n = 0; n < ggraph->output_num; n++) { + struct shl_node *gnode = ggraph->output[n]; + if (strcmp(gnode->name, sname) == 0) { + ggraph->output[n] = slayer->out[j]; + } } } } @@ -721,6 +736,8 @@ static void session_dynamic_infer_shape(struct csinn_session *sess) case CSINN_OP_RELU6: case CSINN_OP_SIGMOID: case CSINN_OP_SOFTMAX: + case CSINN_OP_SQRT: + case CSINN_OP_ERF: shl_gref_siso_infer_shape(n->in[0]->data, n->out[0]->data, params); break; case CSINN_OP_ADD: @@ -791,7 +808,12 @@ static void session_dynamic_infer_shape(struct csinn_session *sess) break; case CSINN_OP_GLOBAL_AVGPOOL2D: case CSINN_OP_GLOBAL_MAXPOOL2D: - shl_gref_global_pooling2d_infer_shape(n->in[0]->data, n->out[0]->data, (struct csinn_pool_params *)params); + shl_gref_global_pooling2d_infer_shape(n->in[0]->data, n->out[0]->data, + (struct csinn_pool_params *)params); + break; + case CSINN_OP_MEAN: + shl_gref_mean_infer_shape(n->in[0]->data, n->out[0]->data, + (struct csinn_reduce_params *)params); break; default: shl_debug_error("[infer_shape]:unknown op %d\n", n->type); @@ -866,6 +888,12 @@ int shl_gref_session_run(struct csinn_session *sess) #ifdef SHL_LAYER_BENCHMARK if (sess->profiler_level == CSINN_PROFILER_LEVEL_TIMER || sess->profiler_level == CSINN_PROFILER_LEVEL_ALL) { + // warm-up + int warm_count = 3; + for (int t = 0; t < warm_count; t++) { + shl_subgraph_run(n); + } + uint64_t start_time = 
shl_get_timespec(); shl_subgraph_run(n); uint64_t end_time = shl_get_timespec(); diff --git a/source/graph_ref/shape.c b/source/graph_ref/shape.c index a47dd1a3..d4b30a18 100644 --- a/source/graph_ref/shape.c +++ b/source/graph_ref/shape.c @@ -28,6 +28,7 @@ int shl_gref_shape(struct csinn_tensor *input, struct csinn_tensor *output, int shl_gref_shape_infer_shape(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_shape_params *params) { + shl_tensor_try_nc1xc0_to_ndarray_shape(input); output->dim_count = input->dim_count; for (int i = 0; i < output->dim_count; i++) { output->dim[i] = 1; diff --git a/source/graph_ref/slice.c b/source/graph_ref/slice.c index 161da709..44cc8ab7 100644 --- a/source/graph_ref/slice.c +++ b/source/graph_ref/slice.c @@ -28,6 +28,7 @@ int shl_gref_slice(struct csinn_tensor *input, struct csinn_tensor *output, int shl_gref_slice_infer_shape(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_slice_params *params) { + shl_tensor_try_nc1xc0_to_ndarray_shape(input); output->dim_count = input->dim_count; for (int i = 0; i < output->dim_count; i++) { output->dim[i] = params->end[i] - params->begin[i]; diff --git a/source/graph_ref/space_to_batch.c b/source/graph_ref/space_to_batch.c index 53368619..9e974a1a 100644 --- a/source/graph_ref/space_to_batch.c +++ b/source/graph_ref/space_to_batch.c @@ -30,6 +30,7 @@ int shl_gref_space_to_batch_infer_shape(struct csinn_tensor *input, struct csinn struct csinn_space_to_batch_params *params) { int h, w, c; + shl_tensor_try_nc1xc0_to_ndarray_shape(input); if (output->layout == CSINN_LAYOUT_NCHW) { c = 1; h = 2; @@ -39,6 +40,7 @@ int shl_gref_space_to_batch_infer_shape(struct csinn_tensor *input, struct csinn w = 2; c = 3; } else { + shl_debug_error("%s: Invalid input tensor layout!\n", __func__); return CSINN_UNSUPPORT_LAYOUT; } diff --git a/source/graph_ref/space_to_batch_nd.c b/source/graph_ref/space_to_batch_nd.c index 32d50026..08c89460 100644 --- 
a/source/graph_ref/space_to_batch_nd.c +++ b/source/graph_ref/space_to_batch_nd.c @@ -29,6 +29,7 @@ int shl_gref_space_to_batch_nd(struct csinn_tensor *input, struct csinn_tensor * int shl_gref_space_to_batch_nd_infer_shape(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_space_to_batch_nd_params *params) { + shl_tensor_try_nc1xc0_to_ndarray_shape(input); int32_t block_size = 1; for (int i = 0; i < params->spatial_dim_cnt; i++) { block_size *= params->block_shape[i]; diff --git a/source/graph_ref/space_to_depth.c b/source/graph_ref/space_to_depth.c index 2119573a..172a56c5 100644 --- a/source/graph_ref/space_to_depth.c +++ b/source/graph_ref/space_to_depth.c @@ -30,6 +30,7 @@ int shl_gref_space_to_depth_infer_shape(struct csinn_tensor *input, struct csinn struct csinn_space_to_depth_params *params) { int h, w, c; + shl_tensor_try_nc1xc0_to_ndarray_shape(input); if (output->layout == CSINN_LAYOUT_NCHW) { c = 1; h = 2; @@ -39,6 +40,7 @@ int shl_gref_space_to_depth_infer_shape(struct csinn_tensor *input, struct csinn w = 2; c = 3; } else { + shl_debug_error("%s: Invalid input tensor layout!\n", __func__); return CSINN_UNSUPPORT_LAYOUT; } diff --git a/source/graph_ref/split.c b/source/graph_ref/split.c index 195f9330..42def37b 100644 --- a/source/graph_ref/split.c +++ b/source/graph_ref/split.c @@ -41,6 +41,7 @@ int shl_gref_split(struct csinn_tensor *input, struct csinn_tensor **output, int shl_gref_split_infer_shape(struct csinn_tensor *input, struct csinn_tensor **output, struct csinn_split_params *params) { + shl_tensor_try_nc1xc0_to_ndarray_shape(input); int32_t axis = params->axis; int32_t out_num = params->output_num; int32_t *split_index = params->split_index; diff --git a/source/graph_ref/squeeze.c b/source/graph_ref/squeeze.c index 1fa5119b..8845e11c 100644 --- a/source/graph_ref/squeeze.c +++ b/source/graph_ref/squeeze.c @@ -28,6 +28,7 @@ int shl_gref_squeeze(struct csinn_tensor *input, struct csinn_tensor *output, int 
shl_gref_squeeze_infer_shape(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_squeeze_params *params) { + shl_tensor_try_nc1xc0_to_ndarray_shape(input); if (params->axis_num == -1) { int j = 0; for (int i = 0; i < input->dim_count; i++) { diff --git a/source/graph_ref/strided_slice.c b/source/graph_ref/strided_slice.c index ba1565f4..b5a4e26c 100644 --- a/source/graph_ref/strided_slice.c +++ b/source/graph_ref/strided_slice.c @@ -28,6 +28,7 @@ int shl_gref_strided_slice(struct csinn_tensor *input, struct csinn_tensor *outp int shl_gref_strided_slice_infer_shape(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_strided_slice_params *params) { + shl_tensor_try_nc1xc0_to_ndarray_shape(input); for (int i = 0; i < params->slice_count; i++) { if (params->begin[i] < -input->dim[i]) params->begin[i] = -input->dim[i]; if (params->begin[i] < 0) params->begin[i] += input->dim[i]; diff --git a/source/graph_ref/subgraph.c b/source/graph_ref/subgraph.c index 08f1ed16..61e19d9e 100644 --- a/source/graph_ref/subgraph.c +++ b/source/graph_ref/subgraph.c @@ -517,6 +517,7 @@ int shl_subgraph_setup(struct shl_node *n) case CSINN_OP_UNPOOLING: case CSINN_OP_UNSTACK: case CSINN_OP_YUV_RGB_SCALE: + case CSINN_OP_DATA_CONVERT: output = node->out[0]->data; output->sess = sub_sess; ret = func(input0, output, params); diff --git a/source/graph_ref/tile.c b/source/graph_ref/tile.c index c117f8b4..164188d4 100644 --- a/source/graph_ref/tile.c +++ b/source/graph_ref/tile.c @@ -28,6 +28,7 @@ int shl_gref_tile(struct csinn_tensor *input, struct csinn_tensor *output, int shl_gref_tile_infer_shape(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tile_params *params) { + shl_tensor_try_nc1xc0_to_ndarray_shape(input); output->dim_count = input->dim_count; for (int i = 0; i < params->reps_num; i++) { output->dim[i] = input->dim[i] * params->reps[i]; diff --git a/source/graph_ref/topk.c b/source/graph_ref/topk.c index 6f3a6047..4875e407 
100644 --- a/source/graph_ref/topk.c +++ b/source/graph_ref/topk.c @@ -28,6 +28,7 @@ int shl_gref_topk(struct csinn_tensor *input, struct csinn_tensor *output1, int shl_gref_topk_infer_shape(struct csinn_tensor *input, struct csinn_tensor *output1, struct csinn_tensor *output2, struct csinn_topk_params *params) { + shl_tensor_try_nc1xc0_to_ndarray_shape(input); output1->dim_count = input->dim_count; output2->dim_count = input->dim_count; for (int i = 0; i < input->dim_count - 1; i++) { diff --git a/source/graph_ref/transpose.c b/source/graph_ref/transpose.c index a77f49ee..f8c3cd7c 100644 --- a/source/graph_ref/transpose.c +++ b/source/graph_ref/transpose.c @@ -28,6 +28,7 @@ int shl_gref_transpose(struct csinn_tensor *input, struct csinn_tensor *output, int shl_gref_transpose_infer_shape(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_transpose_params *params) { + shl_tensor_try_nc1xc0_to_ndarray_shape(input); int32_t *perm = params->permute; output->dim_count = input->dim_count; for (int i = 0; i < params->permute_num; i++) { diff --git a/source/graph_ref/utils.c b/source/graph_ref/utils.c index 4e43145d..c29c345a 100644 --- a/source/graph_ref/utils.c +++ b/source/graph_ref/utils.c @@ -94,6 +94,8 @@ int shl_gref_sidcso_op(struct csinn_tensor *input, struct csinn_tensor *output, int shl_gref_siso_infer_shape(struct csinn_tensor *input, struct csinn_tensor *output, void *params) { + shl_tensor_try_nc1xc0_to_ndarray_shape(input); + output->layout = input->layout; output->dim_count = input->dim_count; for (int i = 0; i < input->dim_count; i++) { output->dim[i] = input->dim[i]; @@ -104,6 +106,8 @@ int shl_gref_siso_infer_shape(struct csinn_tensor *input, struct csinn_tensor *o int shl_gref_diso_infer_shape(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, void *params) { + shl_tensor_try_nc1xc0_to_ndarray_shape(input0); + shl_tensor_try_nc1xc0_to_ndarray_shape(input1); int32_t dim_count = input0->dim_count > 
input1->dim_count ? input0->dim_count : input1->dim_count; @@ -123,6 +127,7 @@ int shl_gref_diso_infer_shape(struct csinn_tensor *input0, struct csinn_tensor * return CSINN_FALSE; } } + output->layout = input0->dim_count >= input1->dim_count ? input0->layout : input1->layout; output->dim_count = dim_count; return CSINN_TRUE; } @@ -130,18 +135,18 @@ int shl_gref_diso_infer_shape(struct csinn_tensor *input0, struct csinn_tensor * int shl_gref_pooling2d_infer_shape(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_pool_params *params) { - int n, c, h, w; - if (output->layout == CSINN_LAYOUT_NCHW) { - n = 0; + int c, h, w; + shl_tensor_try_nc1xc0_to_ndarray_shape(input); + if (input->layout == CSINN_LAYOUT_NCHW) { c = 1; h = 2; w = 3; - } else if (output->layout == CSINN_LAYOUT_NHWC) { - n = 0; + } else if (input->layout == CSINN_LAYOUT_NHWC) { h = 1; w = 2; c = 3; } else { + shl_debug_error("%s: Invalid input tensor layout!\n", __func__); return CSINN_UNSUPPORT_LAYOUT; } @@ -160,8 +165,9 @@ int shl_gref_pooling2d_infer_shape(struct csinn_tensor *input, struct csinn_tens ceil_h = stride_h - 1; ceil_w = stride_w - 1; } - output->dim_count = input->dim_count; - output->dim[n] = input->dim[n]; + output->layout = input->layout; + output->dim_count = 4; + output->dim[0] = input->dim[0]; output->dim[c] = input->dim[c]; output->dim[h] = (in_h + padding_h - kernel_h + ceil_h) / stride_h + 1; output->dim[w] = (in_w + padding_w - kernel_w + ceil_w) / stride_w + 1; @@ -172,16 +178,20 @@ int shl_gref_pooling2d_infer_shape(struct csinn_tensor *input, struct csinn_tens int shl_gref_pooling3d_infer_shape(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_pool_params *params) { - int d, h, w; - if (output->layout == CSINN_LAYOUT_NCDHW) { + int c, d, h, w; + shl_tensor_try_nc1xc0_to_ndarray_shape(input); + if (input->layout == CSINN_LAYOUT_NCDHW) { + c = 1; d = 2; h = 3; w = 4; - } else if (output->layout == CSINN_LAYOUT_NDHWC) { + } else if 
(input->layout == CSINN_LAYOUT_NDHWC) { d = 1; h = 2; w = 3; + c = 4; } else { + shl_debug_error("%s: Invalid input tensor layout!\n", __func__); return CSINN_UNSUPPORT_LAYOUT; } @@ -206,7 +216,10 @@ int shl_gref_pooling3d_infer_shape(struct csinn_tensor *input, struct csinn_tens ceil_h = stride_h - 1; ceil_w = stride_w - 1; } - output->dim_count = input->dim_count; + output->layout = input->layout; + output->dim_count = 5; + output->dim[0] = input->dim[0]; + output->dim[c] = input->dim[c]; output->dim[d] = (in_d + padding_d - kernel_d + ceil_d) / stride_d + 1; output->dim[h] = (in_h + padding_h - kernel_h + ceil_h) / stride_h + 1; output->dim[w] = (in_w + padding_w - kernel_w + ceil_w) / stride_w + 1; @@ -218,18 +231,21 @@ int shl_gref_global_pooling2d_infer_shape(struct csinn_tensor *input, struct csi struct csinn_pool_params *params) { int c, h, w; - if (output->layout == CSINN_LAYOUT_NCHW) { + shl_tensor_try_nc1xc0_to_ndarray_shape(input); + if (input->layout == CSINN_LAYOUT_NCHW) { c = 1; h = 2; w = 3; - } else if (output->layout == CSINN_LAYOUT_NHWC) { + } else if (input->layout == CSINN_LAYOUT_NHWC) { h = 1; w = 2; c = 3; } else { + shl_debug_error("%s: Invalid input tensor layout!\n", __func__); return CSINN_UNSUPPORT_LAYOUT; } - output->dim_count = input->dim_count; + output->layout = input->layout; + output->dim_count = 4; output->dim[0] = input->dim[0]; output->dim[c] = input->dim[c]; output->dim[h] = 1; @@ -241,6 +257,7 @@ int shl_gref_global_pooling2d_infer_shape(struct csinn_tensor *input, struct csi int shl_gref_reduce_infer_shape(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_reduce_params *params) { + shl_tensor_try_nc1xc0_to_ndarray_shape(input); if (params->axis[0] == -1) { output->dim_count = 1; output->dim[0] = 1; @@ -260,6 +277,7 @@ int shl_gref_reduce_infer_shape(struct csinn_tensor *input, struct csinn_tensor int shl_gref_segment_infer_shape(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor 
*output, struct csinn_segment_params *params) { + shl_tensor_try_nc1xc0_to_ndarray_shape(input0); output->dim_count = input0->dim_count; output->dim[0] = params->num_segments; for (int i = 1; i < output->dim_count; i++) { @@ -271,6 +289,7 @@ int shl_gref_segment_infer_shape(struct csinn_tensor *input0, struct csinn_tenso int shl_gref_stride_infer_shape(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_reduce_params *params) { + shl_tensor_try_nc1xc0_to_ndarray_shape(input); output->dim_count = input->dim_count - 1; for (int i = 0; i < input->dim_count; i++) { if (i < params->axis[0]) { @@ -281,3 +300,23 @@ int shl_gref_stride_infer_shape(struct csinn_tensor *input, struct csinn_tensor } return CSINN_TRUE; } + +void shl_tensor_try_nc1xc0_to_ndarray_shape(struct csinn_tensor *t) +{ + if (t->layout >= CSINN_LAYOUT_NC1C0 && t->layout <= CSINN_LAYOUT_NC1DHWC0) { + int in_c1 = t->dim[1]; + int in_c0 = t->dim[t->dim_count - 1]; + t->dim[1] = in_c1 * in_c0; + t->dim[t->dim_count - 1] = 0; + t->dim_count = t->dim_count - 1; + } + if (t->layout == CSINN_LAYOUT_NC1DHWC0) { + t->layout = CSINN_LAYOUT_NCDHW; + } else if (t->layout == CSINN_LAYOUT_NC1HWC0) { + t->layout = CSINN_LAYOUT_NCHW; + } else if (t->layout == CSINN_LAYOUT_NC1WC0) { + t->layout = CSINN_LAYOUT_NCW; + } else if (t->layout == CSINN_LAYOUT_NC1C0) { + t->layout = CSINN_LAYOUT_NC; + } +} diff --git a/source/graph_ref/where.c b/source/graph_ref/where.c index bfeb6f2d..c63ec03a 100644 --- a/source/graph_ref/where.c +++ b/source/graph_ref/where.c @@ -51,6 +51,10 @@ int shl_gref_where_infer_shape(struct csinn_tensor *condition, struct csinn_tens struct csinn_tensor *y, struct csinn_tensor *output, struct csinn_where_params *params) { + shl_tensor_try_nc1xc0_to_ndarray_shape(condition); + shl_tensor_try_nc1xc0_to_ndarray_shape(x); + shl_tensor_try_nc1xc0_to_ndarray_shape(y); + if (x->data == NULL || y->data == NULL) { // Return the indices of non-zero elements int c_size = 1; diff --git 
a/source/graph_ref/where_softmax.c b/source/graph_ref/where_softmax.c index 3ed60d24..e2ac8583 100644 --- a/source/graph_ref/where_softmax.c +++ b/source/graph_ref/where_softmax.c @@ -44,6 +44,8 @@ int shl_gref_where_softmax_infer_shape(struct csinn_tensor *condition, struct cs struct csinn_tensor *output, struct csinn_where_softmax_params *params) { + shl_tensor_try_nc1xc0_to_ndarray_shape(condition); + shl_tensor_try_nc1xc0_to_ndarray_shape(y); int shape_rank = 0; shape_rank = condition->dim_count > shape_rank ? condition->dim_count : shape_rank; shape_rank = y->dim_count > shape_rank ? y->dim_count : shape_rank; diff --git a/source/nn2/format.c b/source/nn2/format.c index 94d2b81b..3b735188 100644 --- a/source/nn2/format.c +++ b/source/nn2/format.c @@ -20,12 +20,12 @@ #include "shl_gref.h" #include "shl_utils.h" -char *shl_bm_header_str() +void shl_bm_header_str(char *buffer) { - static char ret_str[4096] = + static char ret_str[96] = "Heterogeneous Honey Badger binary model\n\nbinary model version 2.0\n\nHHB_VERSION "; - csinn_version(ret_str + 79); - return ret_str; + memcpy(buffer, ret_str, 79); + csinn_version(buffer + 79); } float check_bm_version(char *header_str) @@ -42,7 +42,9 @@ float check_bm_version(char *header_str) void shl_dump_bm_header(FILE *f) { - char *header = shl_bm_header_str(); + /* make sure all memory is set to zero. 
*/ + char *header = shl_mem_calloc(4096, 1); + shl_bm_header_str(header); fwrite(header, 1, 4096, f); } @@ -525,7 +527,8 @@ static char *layer_data_dump(struct shl_node *layer, int *size) int slice_size = stride_slice_params->slice_count * sizeof(int32_t); ret = shl_mem_realloc(ret, extend_size + slice_size * 3, extend_size); - struct csinn_strided_slice_params *ret_stride_slice_params = (struct csinn_strided_slice_params *)ret; + struct csinn_strided_slice_params *ret_stride_slice_params = + (struct csinn_strided_slice_params *)ret; ret_stride_slice_params->begin = (int32_t *)offset_to_ptr(extend_size); memcpy((char *)ret + extend_size, stride_slice_params->begin, slice_size); ret_stride_slice_params->end = (int32_t *)offset_to_ptr(extend_size + slice_size); @@ -942,6 +945,333 @@ int shl_dump_bm_graph_struct_section(FILE *f, struct shl_ref_graph *ggraph) return size; } +#ifdef SHL_EXPORT_MODEL +void shl_export_model_print(struct csinn_session *sess) +{ + static struct csinn_callback cb_map[CSINN_OP_AND_UTILS_SIZE]; + memset(cb_map, 0, sizeof(struct csinn_callback) * CSINN_OP_AND_UTILS_SIZE); + + cb_map[CSINN_OP_ADD].est = csinn_add; + cb_map[CSINN_OP_ARGMAX].est = csinn_argmax; + cb_map[CSINN_OP_AVGPOOL2D].est = csinn_avgpool2d; + cb_map[CSINN_OP_BN].est = csinn_batch_normalization; + cb_map[CSINN_OP_BATCH_TO_SPACE_ND].est = csinn_batch_to_space_nd; + cb_map[CSINN_OP_CONCAT].est = csinn_concat; + cb_map[CSINN_OP_CONV2D].est = csinn_conv2d; + cb_map[CSINN_OP_DEPTHWISE_CONV2D].est = csinn_depthwise_conv2d; + cb_map[CSINN_OP_GROUP_CONV2D].est = csinn_group_conv2d; + cb_map[CSINN_OP_CROP].est = csinn_crop; + cb_map[CSINN_OP_DECONV2D].est = csinn_deconv2d; + cb_map[CSINN_OP_DEPTH_TO_SPACE].est = csinn_depth_to_space; + cb_map[CSINN_OP_DIV].est = csinn_div; + cb_map[CSINN_OP_FLATTEN].est = csinn_flatten; + cb_map[CSINN_OP_FULLYCONNECTED].est = csinn_fullyconnected; + cb_map[CSINN_OP_GLOBAL_AVGPOOL2D].est = csinn_global_avgpool2d; + cb_map[CSINN_OP_GLOBAL_MAXPOOL2D].est 
= csinn_global_maxpool2d; + cb_map[CSINN_OP_L2N].est = csinn_l2_normalization; + cb_map[CSINN_OP_LEAKY_RELU].est = csinn_leaky_relu; + cb_map[CSINN_OP_LRN].est = csinn_lrn; + cb_map[CSINN_OP_MAXIMUM].est = csinn_maximum; + cb_map[CSINN_OP_MAXPOOL2D].est = csinn_maxpool2d; + cb_map[CSINN_OP_MEAN].est = csinn_mean; + cb_map[CSINN_OP_MINIMUM].est = csinn_minimum; + cb_map[CSINN_OP_MUL].est = csinn_mul; + cb_map[CSINN_OP_NEGATIVE].est = csinn_negative; + cb_map[CSINN_OP_PAD].est = csinn_pad; + cb_map[CSINN_OP_PRELU].est = csinn_prelu; + cb_map[CSINN_OP_RELU].est = csinn_relu; + cb_map[CSINN_OP_RELU1].est = csinn_relu1; + cb_map[CSINN_OP_RELU6].est = csinn_relu6; + cb_map[CSINN_OP_RESHAPE].est = csinn_reshape; + cb_map[CSINN_OP_RESIZE].est = csinn_resize; + cb_map[CSINN_OP_SIGMOID].est = csinn_sigmoid; + cb_map[CSINN_OP_SOFTMAX].est = csinn_softmax; + cb_map[CSINN_OP_SPACE_TO_BATCH_ND].est = csinn_space_to_batch_nd; + cb_map[CSINN_OP_SPACE_TO_DEPTH].est = csinn_space_to_depth; + cb_map[CSINN_OP_SPLIT].est = csinn_split; + cb_map[CSINN_OP_SQUEEZE].est = csinn_squeeze; + cb_map[CSINN_OP_STRIDED_SLICE].est = csinn_strided_slice; + cb_map[CSINN_OP_SUB].est = csinn_sub; + cb_map[CSINN_OP_TANH].est = csinn_tanh; + cb_map[CSINN_OP_TRANSPOSE].est = csinn_transpose; + cb_map[CSINN_OP_ROIPOOL].est = csinn_roipool; + cb_map[CSINN_OP_PROPOSAL].est = csinn_proposal; + cb_map[CSINN_OP_UNPOOLING].est = csinn_unpooling; + cb_map[CSINN_OP_MAXPOOL2D_LOCAT].est = csinn_maxpool2d_locat; + cb_map[CSINN_OP_SQRT].est = csinn_sqrt; + cb_map[CSINN_OP_MATMUL].est = csinn_matmul; + cb_map[CSINN_OP_DATA_CONVERT].est = csinn_data_convert; + + // to print structure info + shl_debug_set_level(CSINN_DEBUG_LEVEL_INFO); + + struct shl_ref_graph *g = shl_gref_get_graph(sess); + for (int i = 0; i < g->layer_index; i++) { + struct shl_node *node = g->layer[i]; + if (node->type == CSINN_SUBGRAPH) { + shl_debug_info("There is a subgrah that is ignored temporarily(TODO)\n"); + } else if (node->type >= 0 && 
node->type < CSINN_OP_SIZE) { + struct csinn_params_base *params = node->data; + struct csinn_callback *cb = params->cb; + int (*func)(); + func = cb_map[node->type].est; + + params->sess->base_run_mode = CSINN_RM_LAYER; + cb->exec = NULL; + + struct csinn_tensor **inputs; + struct csinn_tensor **outputs; + switch (node->type) { + case CSINN_OP_ABS: + case CSINN_OP_ACOS: + case CSINN_OP_ACOSH: + case CSINN_OP_ANY: + case CSINN_OP_ARGMAX: + case CSINN_OP_ARGMIN: + case CSINN_OP_ASIN: + case CSINN_OP_ASINH: + case CSINN_OP_ATAN: + case CSINN_OP_ATANH: + case CSINN_OP_AVGPOOL2D: + case CSINN_OP_AVGPOOL3D: + case CSINN_OP_BATCH_TO_SPACE: + case CSINN_OP_BATCH_TO_SPACE_ND: + case CSINN_OP_BROADCOST: + case CSINN_OP_CEIL: + case CSINN_OP_CLIP: + case CSINN_OP_COL2IM: + case CSINN_OP_COS: + case CSINN_OP_COSH: + case CSINN_OP_CROP: + case CSINN_OP_CUMPROD: + case CSINN_OP_CUMSUM: + case CSINN_OP_DATA_CONVERT: + case CSINN_OP_DEPTH_TO_SPACE: + case CSINN_OP_ELU: + case CSINN_OP_ERF: + case CSINN_OP_EXP: + case CSINN_OP_EXPAND_DIMS: + case CSINN_OP_EXPM1: + case CSINN_OP_FLATTEN: + case CSINN_OP_FLOOR: + case CSINN_OP_GLOBAL_AVGPOOL2D: + case CSINN_OP_GLOBAL_MAXPOOL2D: + case CSINN_OP_HARD_SIGMOID: + case CSINN_OP_IM2COL: + case CSINN_OP_ISNAN: + case CSINN_OP_L2N: + case CSINN_OP_L2POOL2D: + case CSINN_OP_LEAKY_RELU: + case CSINN_OP_LOG_SOFTMAX: + case CSINN_OP_LOG: + case CSINN_OP_LOG1P: + case CSINN_OP_LOGICAL_NOT: + case CSINN_OP_LRN: + case CSINN_OP_MAX: + case CSINN_OP_MAXPOOL2D: + case CSINN_OP_MAXPOOL2D_LOCAT: + case CSINN_OP_MAXPOOL3D: + case CSINN_OP_MEAN: + case CSINN_OP_MEAN_STRIDE: + case CSINN_OP_MIN: + case CSINN_OP_NDARRAY_SIZE: + case CSINN_OP_NEGATIVE: + case CSINN_OP_NOT: + case CSINN_OP_ONE_HOT: + case CSINN_OP_PAD: + case CSINN_OP_PROD: + case CSINN_OP_REDUCE_LOGSUMEXP: + case CSINN_OP_REDUCE_MAX: + case CSINN_OP_REDUCE_MEAN: + case CSINN_OP_REDUCE_MIN: + case CSINN_OP_REDUCE_PROD: + case CSINN_OP_REDUCE_SUM: + case CSINN_OP_RELU: + case CSINN_OP_RELU1: 
+ case CSINN_OP_RELU6: + case CSINN_OP_RELUN: + case CSINN_OP_REORG: + case CSINN_OP_RESHAPE: + case CSINN_OP_RESIZE: + case CSINN_OP_REVERSE: + case CSINN_OP_ROUND: + case CSINN_OP_RSQRT: + case CSINN_OP_SHAPE: + case CSINN_OP_SHUFFLE_CHANNEL: + case CSINN_OP_SIGMOID: + case CSINN_OP_SIGN: + case CSINN_OP_SIN: + case CSINN_OP_SINH: + case CSINN_OP_SLICE: + case CSINN_OP_SOFTMAX: + case CSINN_OP_SOFTPLUS: + case CSINN_OP_SOFTRELU: + case CSINN_OP_SOFTSIGN: + case CSINN_OP_SPACE_TO_BATCH: + case CSINN_OP_SPACE_TO_BATCH_ND: + case CSINN_OP_SPACE_TO_DEPTH: + case CSINN_OP_SQRT: + case CSINN_OP_SQUARE: + case CSINN_OP_SQUEEZE: + case CSINN_OP_STACK: + case CSINN_OP_STRIDED_SLICE: + case CSINN_OP_SUM: + case CSINN_OP_TAN: + case CSINN_OP_TANH: + case CSINN_OP_THRESHOLD_RELU: + case CSINN_OP_TILE: + case CSINN_OP_TRANSPOSE: + case CSINN_OP_TRUNC: + case CSINN_OP_UNPOOLING: + case CSINN_OP_UNSTACK: + case CSINN_OP_CAST: + case CSINN_OP_YUV_RGB_SCALE: + func(node->in[0]->data, node->out[0]->data, params); + break; + case CSINN_OP_ADD: + case CSINN_OP_AND: + case CSINN_OP_DIV: + case CSINN_OP_EQUANL: + case CSINN_OP_FLOOR_DIVIDE: + case CSINN_OP_FLOOR_MOD: + case CSINN_OP_GATHER_ND: + case CSINN_OP_GATHER: + case CSINN_OP_GREATHER_EQUAL: + case CSINN_OP_GREATHER: + case CSINN_OP_LESS_EQUAL: + case CSINN_OP_LESS: + case CSINN_OP_LOGICAL_AND: + case CSINN_OP_LOGICAL_OR: + case CSINN_OP_LOGICAL_XOR: + case CSINN_OP_MATMUL: + case CSINN_OP_MAXIMUM: + case CSINN_OP_MINIMUM: + case CSINN_OP_MOD: + case CSINN_OP_MUL: + case CSINN_OP_NON_MAX_SUPPRESSION: + case CSINN_OP_NOT_EQUAL: + case CSINN_OP_OR: + case CSINN_OP_POWER: + case CSINN_OP_PRELU: + case CSINN_OP_SEQUENCE_MASK: + case CSINN_OP_SEGMENT_MAX: + case CSINN_OP_UNSORTED_SEGMENT_MAX: + case CSINN_OP_SEGMENT_MEAN: + case CSINN_OP_UNSORTED_SEGMENT_MEAN: + case CSINN_OP_SEGMENT_MIN: + case CSINN_OP_UNSORTED_SEGMENT_MIN: + case CSINN_OP_SEGMENT_PROD: + case CSINN_OP_UNSORTED_SEGMENT_PROD: + case CSINN_OP_SEGMENT_SUM: + case 
CSINN_OP_UNSORTED_SEGMENT_SUM: + case CSINN_OP_SUB: + case CSINN_OP_XOR: + func(node->in[0]->data, node->in[1]->data, node->out[0]->data, params); + break; + case CSINN_OP_CONV1D: + case CSINN_OP_CONV2D: + case CSINN_OP_CONV2D_RELU: + case CSINN_OP_CONV2D_RELU6: + case CSINN_OP_CONV2D_CHANNEL: + case CSINN_OP_CONV2D_CHANNEL_RELU: + case CSINN_OP_CONV2D_CHANNEL_RELU6: + case CSINN_OP_DEPTHWISE_CONV1D: + case CSINN_OP_DEPTHWISE_CONV2D: + case CSINN_OP_DEPTHWISE_CONV2D_RELU: + case CSINN_OP_DEPTHWISE_CONV2D_RELU6: + case CSINN_OP_DEPTHWISE_CONV2D_CHANNEL: + case CSINN_OP_DEPTHWISE_CONV2D_CHANNEL_RELU: + case CSINN_OP_DEPTHWISE_CONV2D_CHANNEL_RELU6: + case CSINN_OP_GROUP_CONV2D: + case CSINN_OP_GROUP_CONV2D_RELU: + case CSINN_OP_GROUP_CONV2D_RELU6: + case CSINN_OP_GROUP_CONV2D_CHANNEL: + case CSINN_OP_GROUP_CONV2D_CHANNEL_RELU: + case CSINN_OP_CONV3D: + case CSINN_OP_DECONV2D: + case CSINN_OP_DEPTHWISE_DECONV2D: + case CSINN_OP_GROUP_DECONV2D: + case CSINN_OP_DECONV3D: + case CSINN_OP_FULLYCONNECTED: + case CSINN_OP_LAYER_NORM: + case CSINN_OP_CACHE_MATMUL: + case CSINN_OP_CACHE_CONV1D: + func(node->in[0]->data, node->out[0]->data, node->in[1]->data, + node->in[2]->data, params); + break; + case CSINN_OP_FSMN: + func(node->in[0]->data, node->in[1]->data, node->in[2]->data, node->in[3]->data, + node->in[4]->data, node->out[0]->data, params); + break; + case CSINN_OP_CONCAT: + inputs = shl_mem_alloc(sizeof(struct csinn_tensor *) * + ((struct csinn_concat_params *)params)->inputs_count); + for (int i = 0; i < ((struct csinn_concat_params *)params)->inputs_count; i++) { + inputs[i] = node->in[i]->data; + } + func(inputs, node->out[0]->data, params); + shl_mem_free(inputs); + break; + case CSINN_OP_SPLIT: + outputs = shl_mem_alloc(sizeof(struct csinn_tensor *) * + ((struct csinn_split_params *)params)->output_num); + for (int i = 0; i < ((struct csinn_split_params *)params)->output_num; i++) { + outputs[i] = node->out[i]->data; + } + func(node->in[0]->data, outputs, 
params); + shl_mem_free(outputs); + break; + case CSINN_OP_WHERE: + func(node->in[0]->data, node->in[1]->data, node->in[2]->data, + node->out[0]->data, params); + break; + case CSINN_OP_WHERE_SOFTMAX: + func(node->in[0]->data, node->in[1]->data, node->out[0]->data, params); + break; + case CSINN_OP_ALL: + shl_debug_error("unsupported CSINN_OP_ALL\n"); + break; + case CSINN_OP_ARANGE: + shl_debug_error("unsupported CSINN_OP_ARANGE\n"); + break; + case CSINN_OP_BN: + shl_debug_error("unsupported CSINN_OP_BN\n"); + break; + case CSINN_OP_MIN_STRIDE: + shl_debug_error("unsupported CSINN_OP_MIN_STRIDE\n"); + break; + case CSINN_OP_PROPOSAL: + shl_debug_error("unsupported CSINN_OP_PROPOSAL\n"); + break; + case CSINN_OP_PSROIPOOLING: + shl_debug_error("unsupported CSINN_OP_PSROIPOOLING\n"); + break; + case CSINN_OP_ROIALIGN: + shl_debug_error("unsupported CSINN_OP_ROIALIGN\n"); + break; + case CSINN_OP_ROIPOOL: + shl_debug_error("unsupported CSINN_OP_ROIPOOL\n"); + break; + case CSINN_OP_SCATTER_ND: + shl_debug_error("unsupported CSINN_OP_SCATTER_ND\n"); + break; + case CSINN_OP_SELECT: + shl_debug_error("unsupported CSINN_OP_SELECT\n"); + break; + case CSINN_OP_TOPK: + shl_debug_error("unsupported CSINN_OP_TOPK\n"); + break; + default: + shl_debug_error("unknown op\n"); + } + } + } + + // restore debug level + shl_debug_set_level(sess->debug_level); +} +#endif + /** * @addtogroup SESSION * @{ diff --git a/source/nn2/setup.c b/source/nn2/setup.c index 00672e82..da30b0a3 100644 --- a/source/nn2/setup.c +++ b/source/nn2/setup.c @@ -29,6 +29,7 @@ void shl_target_init_rvv(); void shl_target_init_rvm(); void shl_target_init_e907(); void shl_target_init_c920(); +void shl_target_init_c920v2(); static int __shl_has_init; @@ -64,6 +65,9 @@ void shl_init() #ifdef SHL_BUILD_C920 shl_target_init_c920(); #endif +#ifdef SHL_BUILD_C920V2 + shl_target_init_c920v2(); +#endif } /** diff --git a/source/nn2/utils.c b/source/nn2/utils.c index 351e8517..c84341fe 100644 --- a/source/nn2/utils.c 
+++ b/source/nn2/utils.c @@ -662,6 +662,28 @@ static void nhwc_int4_to_float(struct csinn_tensor *dest, struct csinn_tensor *s } } +static void int4_to_float(struct csinn_tensor *dest, struct csinn_tensor *src) +{ + int8_t *src_data = src->data; + float *dest_data = dest->data; + + int32_t size = csinn_tensor_size(src); + for (int i = 0; i < size; i++) { + int in_index = i / 2; + int8_t src_tmp = 0; + float ret = 0; + /* int4 little endian */ + if (i % 2) { + src_tmp = src_data[in_index] & 0xf0; + ret = int4_to_float_base(src_tmp >> 4, src, 0); + } else { + src_tmp = (src_data[in_index] & 0xf) << 4; + ret = int4_to_float_base(src_tmp >> 4, src, 0); + } + dest_data[i] = ret; + } +} + static void nchw_float_to_int4(struct csinn_tensor *dest, struct csinn_tensor *src, int n, int inner_size) { @@ -704,6 +726,24 @@ static void nhwc_float_to_int4(struct csinn_tensor *dest, struct csinn_tensor *s } } +static void float_to_int4(struct csinn_tensor *dest, struct csinn_tensor *src) +{ + float *src_data = src->data; + int8_t *dest_data = dest->data; + + int32_t size = csinn_tensor_size(src); + for (int i = 0; i < size; i++) { + int input_val = float_to_int4_base(src_data[i], dest, 0); + int out_index = i / 2; + /* int4 little endian */ + if (i % 2) { + dest_data[out_index] = (dest_data[out_index] & 0xf) | (input_val << 4); + } else { + dest_data[out_index] = (dest_data[out_index] & 0xf0) | input_val; + } + } +} + static void nchw_uint8_to_float(struct csinn_tensor *dest, struct csinn_tensor *src, int n, int inner_size) { @@ -732,6 +772,17 @@ static void nhwc_uint8_to_float(struct csinn_tensor *dest, struct csinn_tensor * } } +static void uint8_to_float(struct csinn_tensor *dest, struct csinn_tensor *src) +{ + uint8_t *src_data = src->data; + float *dest_data = dest->data; + + int32_t size = csinn_tensor_size(src); + for (int i = 0; i < size; i++) { + dest_data[i] = uint8_to_float_base(src_data[i], src, 0); + } +} + static void nchw_float_to_uint8(struct csinn_tensor *dest, 
struct csinn_tensor *src, int n, int inner_size) { @@ -759,6 +810,17 @@ static void nhwc_float_to_uint8(struct csinn_tensor *dest, struct csinn_tensor * } } +static void float_to_uint8(struct csinn_tensor *dest, struct csinn_tensor *src) +{ + float *src_data = src->data; + uint8_t *dest_data = dest->data; + + int32_t size = csinn_tensor_size(src); + for (int i = 0; i < size; i++) { + dest_data[i] = float_to_uint8_base(src_data[i], dest, 0); + } +} + static void nchw_int8_to_float(struct csinn_tensor *dest, struct csinn_tensor *src, int n, int inner_size) { @@ -808,6 +870,17 @@ static void nc1hwc0_int8_to_float(struct csinn_tensor *dest, struct csinn_tensor } } +static void int8_to_float(struct csinn_tensor *dest, struct csinn_tensor *src) +{ + int8_t *src_data = src->data; + float *dest_data = dest->data; + + int32_t size = csinn_tensor_size(src); + for (int i = 0; i < size; i++) { + dest_data[i] = int8_to_float_base(src_data[i], src, 0); + } +} + static void nchw_float_to_int8(struct csinn_tensor *dest, struct csinn_tensor *src, int n, int inner_size) { @@ -836,6 +909,17 @@ static void nhwc_float_to_int8(struct csinn_tensor *dest, struct csinn_tensor *s } } +static void float_to_int8(struct csinn_tensor *dest, struct csinn_tensor *src) +{ + float *src_data = src->data; + int8_t *dest_data = dest->data; + + int32_t size = csinn_tensor_size(src); + for (int i = 0; i < size; i++) { + dest_data[i] = float_to_int8_base(src_data[i], dest, 0); + } +} + static void nchw_int16_to_float(struct csinn_tensor *dest, struct csinn_tensor *src, int n, int inner_size) { @@ -864,6 +948,17 @@ static void nhwc_int16_to_float(struct csinn_tensor *dest, struct csinn_tensor * } } +static void int16_to_float(struct csinn_tensor *dest, struct csinn_tensor *src) +{ + int16_t *src_data = src->data; + float *dest_data = dest->data; + + int32_t size = csinn_tensor_size(src); + for (int i = 0; i < size; i++) { + dest_data[i] = int16_to_float_base(src_data[i], src, 0); + } +} + static void 
nchw_float_to_int16(struct csinn_tensor *dest, struct csinn_tensor *src, int n, int inner_size) { @@ -892,6 +987,17 @@ static void nhwc_float_to_int16(struct csinn_tensor *dest, struct csinn_tensor * } } +static void float_to_int16(struct csinn_tensor *dest, struct csinn_tensor *src) +{ + float *src_data = src->data; + int16_t *dest_data = dest->data; + + int32_t size = csinn_tensor_size(src); + for (int i = 0; i < size; i++) { + dest_data[i] = float_to_int16_base(src_data[i], dest, 0); + } +} + static void nchw_float_to_int32(struct csinn_tensor *dest, struct csinn_tensor *src, int n, int inner_size) { @@ -948,7 +1054,29 @@ static void nhwc_int32_to_float(struct csinn_tensor *dest, struct csinn_tensor * } } -static void csinn_f16_to_float(struct csinn_tensor *dest, struct csinn_tensor *src) +static void float_to_int32(struct csinn_tensor *dest, struct csinn_tensor *src) +{ + float *src_data = src->data; + int32_t *dest_data = dest->data; + + int32_t size = csinn_tensor_size(src); + for (int i = 0; i < size; i++) { + dest_data[i] = float_to_int32_base(src_data[i], dest, 0); + } +} + +static void int32_to_float(struct csinn_tensor *dest, struct csinn_tensor *src) +{ + int32_t *src_data = src->data; + float *dest_data = dest->data; + + int32_t size = csinn_tensor_size(src); + for (int i = 0; i < size; i++) { + dest_data[i] = int32_to_float_base(src_data[i], src, 0); + } +} + +static void f16_to_float(struct csinn_tensor *dest, struct csinn_tensor *src) { int16_t *src_data = src->data; float *dest_data = dest->data; @@ -964,7 +1092,7 @@ static void csinn_f16_to_float(struct csinn_tensor *dest, struct csinn_tensor *s } } -static void csinn_float_to_f16(struct csinn_tensor *dest, struct csinn_tensor *src) +static void float_to_f16(struct csinn_tensor *dest, struct csinn_tensor *src) { float *src_data = src->data; int16_t *dest_data = dest->data; @@ -1080,6 +1208,7 @@ static int tensor_dtype_convert_weight(struct csinn_tensor *dest, struct csinn_t case CSINN_LAYOUT_OI: 
case CSINN_LAYOUT_OIW: case CSINN_LAYOUT_OIHW: + case CSINN_LAYOUT_IOHW: case CSINN_LAYOUT_OIDHW: case CSINN_LAYOUT_O1HW: case CSINN_LAYOUT_OWI: @@ -1101,6 +1230,7 @@ static int tensor_dtype_convert_weight(struct csinn_tensor *dest, struct csinn_t case CSINN_LAYOUT_OI: case CSINN_LAYOUT_OIW: case CSINN_LAYOUT_OIHW: + case CSINN_LAYOUT_IOHW: case CSINN_LAYOUT_OIDHW: case CSINN_LAYOUT_O1HW: case CSINN_LAYOUT_OWI: @@ -1121,6 +1251,7 @@ static int tensor_dtype_convert_weight(struct csinn_tensor *dest, struct csinn_t case CSINN_LAYOUT_OI: case CSINN_LAYOUT_OIW: case CSINN_LAYOUT_OIHW: + case CSINN_LAYOUT_IOHW: case CSINN_LAYOUT_OIDHW: case CSINN_LAYOUT_O1HW: case CSINN_LAYOUT_OWI: @@ -1140,6 +1271,7 @@ static int tensor_dtype_convert_weight(struct csinn_tensor *dest, struct csinn_t case CSINN_LAYOUT_OI: case CSINN_LAYOUT_OIW: case CSINN_LAYOUT_OIHW: + case CSINN_LAYOUT_IOHW: case CSINN_LAYOUT_OIDHW: case CSINN_LAYOUT_O1HW: case CSINN_LAYOUT_OWI: @@ -1159,6 +1291,7 @@ static int tensor_dtype_convert_weight(struct csinn_tensor *dest, struct csinn_t case CSINN_LAYOUT_OI: case CSINN_LAYOUT_OIW: case CSINN_LAYOUT_OIHW: + case CSINN_LAYOUT_IOHW: case CSINN_LAYOUT_OIDHW: case CSINN_LAYOUT_O1HW: case CSINN_LAYOUT_OWI: @@ -1178,6 +1311,7 @@ static int tensor_dtype_convert_weight(struct csinn_tensor *dest, struct csinn_t case CSINN_LAYOUT_OI: case CSINN_LAYOUT_OIW: case CSINN_LAYOUT_OIHW: + case CSINN_LAYOUT_IOHW: case CSINN_LAYOUT_OIDHW: case CSINN_LAYOUT_O1HW: case CSINN_LAYOUT_OWI: @@ -1197,6 +1331,7 @@ static int tensor_dtype_convert_weight(struct csinn_tensor *dest, struct csinn_t case CSINN_LAYOUT_OI: case CSINN_LAYOUT_OIW: case CSINN_LAYOUT_OIHW: + case CSINN_LAYOUT_IOHW: case CSINN_LAYOUT_OIDHW: case CSINN_LAYOUT_O1HW: case CSINN_LAYOUT_OWI: @@ -1216,6 +1351,7 @@ static int tensor_dtype_convert_weight(struct csinn_tensor *dest, struct csinn_t case CSINN_LAYOUT_OI: case CSINN_LAYOUT_OIW: case CSINN_LAYOUT_OIHW: + case CSINN_LAYOUT_IOHW: case CSINN_LAYOUT_OIDHW: case 
CSINN_LAYOUT_O1HW: case CSINN_LAYOUT_OWI: @@ -1235,6 +1371,7 @@ static int tensor_dtype_convert_weight(struct csinn_tensor *dest, struct csinn_t case CSINN_LAYOUT_OI: case CSINN_LAYOUT_OIW: case CSINN_LAYOUT_OIHW: + case CSINN_LAYOUT_IOHW: case CSINN_LAYOUT_OIDHW: case CSINN_LAYOUT_O1HW: case CSINN_LAYOUT_OWI: @@ -1249,9 +1386,9 @@ static int tensor_dtype_convert_weight(struct csinn_tensor *dest, struct csinn_t break; } } else if (dest->dtype == CSINN_DTYPE_FLOAT16 && src->dtype == CSINN_DTYPE_FLOAT32) { - csinn_float_to_f16(dest, src); + float_to_f16(dest, src); } else if (dest->dtype == CSINN_DTYPE_FLOAT32 && src->dtype == CSINN_DTYPE_FLOAT16) { - csinn_f16_to_float(dest, src); + f16_to_float(dest, src); } else if (dest->dtype == CSINN_DTYPE_BFLOAT16 && src->dtype == CSINN_DTYPE_FLOAT32) { float_to_bf16(dest, src); } else if (dest->dtype == CSINN_DTYPE_FLOAT32 && src->dtype == CSINN_DTYPE_BFLOAT16) { @@ -1283,7 +1420,7 @@ static int tensor_dtype_convert_activation(struct csinn_tensor *dest, struct csi } else if (src->layout >= CSINN_LAYOUT_NWC && src->layout <= CSINN_LAYOUT_NDHWC) { nhwc_int4_to_float(dest, src, n, inner_size); } else { - return CSINN_FALSE; + int4_to_float(dest, src); } } } else if (dest->dtype == CSINN_DTYPE_INT4 && src->dtype == CSINN_DTYPE_FLOAT32) { @@ -1293,7 +1430,7 @@ static int tensor_dtype_convert_activation(struct csinn_tensor *dest, struct csi } else if (src->layout >= CSINN_LAYOUT_NWC && src->layout <= CSINN_LAYOUT_NDHWC) { nhwc_float_to_int4(dest, src, n, inner_size); } else { - return CSINN_FALSE; + float_to_int4(dest, src); } } } else if (dest->dtype == CSINN_DTYPE_FLOAT32 && src->dtype == CSINN_DTYPE_UINT8) { @@ -1303,7 +1440,7 @@ static int tensor_dtype_convert_activation(struct csinn_tensor *dest, struct csi } else if (src->layout >= CSINN_LAYOUT_NWC && src->layout <= CSINN_LAYOUT_NDHWC) { nhwc_uint8_to_float(dest, src, n, inner_size); } else { - return CSINN_FALSE; + uint8_to_float(dest, src); } } } else if (dest->dtype == 
CSINN_DTYPE_UINT8 && src->dtype == CSINN_DTYPE_FLOAT32) { @@ -1313,7 +1450,7 @@ static int tensor_dtype_convert_activation(struct csinn_tensor *dest, struct csi } else if (src->layout >= CSINN_LAYOUT_NWC && src->layout <= CSINN_LAYOUT_NDHWC) { nhwc_float_to_uint8(dest, src, n, inner_size); } else { - return CSINN_FALSE; + float_to_uint8(dest, src); } } } else if (dest->dtype == CSINN_DTYPE_FLOAT32 && src->dtype == CSINN_DTYPE_INT8) { @@ -1324,6 +1461,8 @@ static int tensor_dtype_convert_activation(struct csinn_tensor *dest, struct csi nhwc_int8_to_float(dest, src, n, inner_size); } else if (src->layout >= CSINN_LAYOUT_NC1C0 && src->layout <= CSINN_LAYOUT_NC1DHWC0) { nc1hwc0_int8_to_float(dest, src, n, inner_size); + } else { + int8_to_float(dest, src); } } } else if (dest->dtype == CSINN_DTYPE_INT8 && src->dtype == CSINN_DTYPE_FLOAT32) { @@ -1333,7 +1472,7 @@ static int tensor_dtype_convert_activation(struct csinn_tensor *dest, struct csi } else if (src->layout >= CSINN_LAYOUT_NWC && src->layout <= CSINN_LAYOUT_NDHWC) { nhwc_float_to_int8(dest, src, n, inner_size); } else { - return CSINN_FALSE; + float_to_int8(dest, src); } } } else if (dest->dtype == CSINN_DTYPE_FLOAT32 && src->dtype == CSINN_DTYPE_INT16) { @@ -1343,7 +1482,7 @@ static int tensor_dtype_convert_activation(struct csinn_tensor *dest, struct csi } else if (src->layout >= CSINN_LAYOUT_NWC && src->layout <= CSINN_LAYOUT_NDHWC) { nhwc_int16_to_float(dest, src, n, inner_size); } else { - return CSINN_FALSE; + int16_to_float(dest, src); } } } else if (dest->dtype == CSINN_DTYPE_INT16 && src->dtype == CSINN_DTYPE_FLOAT32) { @@ -1353,7 +1492,7 @@ static int tensor_dtype_convert_activation(struct csinn_tensor *dest, struct csi } else if (src->layout >= CSINN_LAYOUT_NWC && src->layout <= CSINN_LAYOUT_NDHWC) { nhwc_float_to_int16(dest, src, n, inner_size); } else { - return CSINN_FALSE; + float_to_int16(dest, src); } } } else if (dest->dtype == CSINN_DTYPE_FLOAT32 && src->dtype == CSINN_DTYPE_INT32) { @@ 
-1363,7 +1502,7 @@ static int tensor_dtype_convert_activation(struct csinn_tensor *dest, struct csi } else if (src->layout >= CSINN_LAYOUT_NWC && src->layout <= CSINN_LAYOUT_NDHWC) { nhwc_int32_to_float(dest, src, n, inner_size); } else { - return CSINN_FALSE; + int32_to_float(dest, src); } } } else if (dest->dtype == CSINN_DTYPE_INT32 && src->dtype == CSINN_DTYPE_FLOAT32) { @@ -1373,13 +1512,13 @@ static int tensor_dtype_convert_activation(struct csinn_tensor *dest, struct csi } else if (src->layout >= CSINN_LAYOUT_NWC && src->layout <= CSINN_LAYOUT_NDHWC) { nhwc_float_to_int32(dest, src, n, inner_size); } else { - return CSINN_FALSE; + float_to_int32(dest, src); } } } else if (dest->dtype == CSINN_DTYPE_FLOAT16 && src->dtype == CSINN_DTYPE_FLOAT32) { - csinn_float_to_f16(dest, src); + float_to_f16(dest, src); } else if (dest->dtype == CSINN_DTYPE_FLOAT32 && src->dtype == CSINN_DTYPE_FLOAT16) { - csinn_f16_to_float(dest, src); + f16_to_float(dest, src); } else if (dest->dtype == CSINN_DTYPE_BFLOAT16 && src->dtype == CSINN_DTYPE_FLOAT32) { float_to_bf16(dest, src); } else if (dest->dtype == CSINN_DTYPE_FLOAT32 && src->dtype == CSINN_DTYPE_BFLOAT16) { @@ -1430,6 +1569,7 @@ static int csinn_tensor_dtype_convert(struct csinn_tensor *dest, struct csinn_te case CSINN_LAYOUT_OIW: case CSINN_LAYOUT_OWI: case CSINN_LAYOUT_OIHW: + case CSINN_LAYOUT_IOHW: case CSINN_LAYOUT_OHWI: case CSINN_LAYOUT_OIDHW: case CSINN_LAYOUT_ODHWI: diff --git a/source/reference/abs.c b/source/reference/abs.c index 19f4f3e9..73e4492c 100644 --- a/source/reference/abs.c +++ b/source/reference/abs.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_abs_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_siso_params *params) diff --git a/source/reference/acos.c b/source/reference/acos.c index 202fb973..6354522e 100644 --- a/source/reference/acos.c +++ b/source/reference/acos.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_acos_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_siso_params *params) diff --git a/source/reference/acosh.c b/source/reference/acosh.c index c5efdb73..5498e802 100644 --- a/source/reference/acosh.c +++ b/source/reference/acosh.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_acosh_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_siso_params *params) diff --git a/source/reference/add.c b/source/reference/add.c index 0cbaac9f..57286653 100644 --- a/source/reference/add.c +++ b/source/reference/add.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" static void element_add_f32(float *src0, float *src1, float *dest, int input_idx, int output_idx) { diff --git a/source/reference/and.c b/source/reference/and.c index 87e7c2b4..692e02fb 100644 --- a/source/reference/and.c +++ b/source/reference/and.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_and_u32(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, struct csinn_diso_params *params) diff --git a/source/reference/arange.c b/source/reference/arange.c index 23a5899f..e68dd76a 100644 --- a/source/reference/arange.c +++ b/source/reference/arange.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_arange_f32(struct csinn_tensor *output, struct csinn_arange_params *params) { diff --git a/source/reference/argmax.c b/source/reference/argmax.c index 456a1cdb..7af83569 100644 --- a/source/reference/argmax.c +++ b/source/reference/argmax.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" struct ArgPos { float value; diff --git a/source/reference/argmin.c b/source/reference/argmin.c index e4c5dcc0..bb5e2799 100644 --- a/source/reference/argmin.c +++ b/source/reference/argmin.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" struct ArgPos { float value; diff --git a/source/reference/asin.c b/source/reference/asin.c index ef84571d..95be937a 100644 --- a/source/reference/asin.c +++ b/source/reference/asin.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_asin_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_siso_params *params) diff --git a/source/reference/asinh.c b/source/reference/asinh.c index eceb21c6..d225a5da 100644 --- a/source/reference/asinh.c +++ b/source/reference/asinh.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_asinh_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_siso_params *params) diff --git a/source/reference/atan.c b/source/reference/atan.c index 4bded0ef..1cfa4681 100644 --- a/source/reference/atan.c +++ b/source/reference/atan.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_atan_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_siso_params *params) diff --git a/source/reference/atanh.c b/source/reference/atanh.c index ddf23c61..2c90cb2a 100644 --- a/source/reference/atanh.c +++ b/source/reference/atanh.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_atanh_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_siso_params *params) diff --git a/source/reference/averagepool.c b/source/reference/averagepool.c index 3069540f..745705ed 100644 --- a/source/reference/averagepool.c +++ b/source/reference/averagepool.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_avgpool2d_nhwc_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_pool_params *params) diff --git a/source/reference/averagepool3d.c b/source/reference/averagepool3d.c index 226432c0..55a56b3b 100644 --- a/source/reference/averagepool3d.c +++ b/source/reference/averagepool3d.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_avgpool3d_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_pool_params *params) diff --git a/source/reference/batch_normalization.c b/source/reference/batch_normalization.c index 9d36c924..a37e626b 100644 --- a/source/reference/batch_normalization.c +++ b/source/reference/batch_normalization.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_ref.h" +#include "reference/ref.h" /* https://github.com/tensorflow/tensorflow/blob/v2.3.0/tensorflow/python/ops/nn_impl.py#L1474-L1542 */ diff --git a/source/reference/batch_to_space.c b/source/reference/batch_to_space.c index 941f5b2d..a3b76a96 100644 --- a/source/reference/batch_to_space.c +++ b/source/reference/batch_to_space.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" // the input->data is a 4-D Tensor with shape [batch, depth, height, width]. int shl_ref_batch_to_space_f32(struct csinn_tensor *input, struct csinn_tensor *output, diff --git a/source/reference/broadcast_to.c b/source/reference/broadcast_to.c index 87ac9071..06b03e7d 100644 --- a/source/reference/broadcast_to.c +++ b/source/reference/broadcast_to.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_broadcast_to_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_broadcast_to_params *params) diff --git a/source/reference/cache_conv1d.c b/source/reference/cache_conv1d.c index e4525e1b..aea9767d 100644 --- a/source/reference/cache_conv1d.c +++ b/source/reference/cache_conv1d.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_cache_conv1d_init(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *weight, struct csinn_tensor *bias, diff --git a/source/reference/cache_matmul.c b/source/reference/cache_matmul.c index e31e0fb5..bc1e80d4 100644 --- a/source/reference/cache_matmul.c +++ b/source/reference/cache_matmul.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_ref.h" +#include "reference/ref.h" // asr data buffer void asr_buffer_init(struct csinn_asr_buffer_t *buffer, size_t buffer_size, size_t data_lenth) diff --git a/source/reference/cast.c b/source/reference/cast.c index ce9463af..98bf7480 100644 --- a/source/reference/cast.c +++ b/source/reference/cast.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_bytes_for_dtype(enum csinn_dtype_enum dtype) { diff --git a/source/reference/ceil.c b/source/reference/ceil.c index 8ca57c3e..b5b104e9 100644 --- a/source/reference/ceil.c +++ b/source/reference/ceil.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_ceil_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_siso_params *params) diff --git a/source/reference/clip.c b/source/reference/clip.c index 4a59873f..0c0bcd57 100644 --- a/source/reference/clip.c +++ b/source/reference/clip.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_clip_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_clip_params *params) diff --git a/source/reference/col2im.c b/source/reference/col2im.c index 00858a95..5d1c8179 100644 --- a/source/reference/col2im.c +++ b/source/reference/col2im.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_col2im_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, struct csinn_col2im_params *params) diff --git a/source/reference/concat.c b/source/reference/concat.c index 24a50b17..1e19bdab 100644 --- a/source/reference/concat.c +++ b/source/reference/concat.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_concat_f32(struct csinn_tensor **input, struct csinn_tensor *output, struct csinn_concat_params *params) diff --git a/source/reference/convolution.c b/source/reference/convolution.c index 7b16a728..0f3b5a0d 100644 --- a/source/reference/convolution.c +++ b/source/reference/convolution.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" #ifdef SHL_AVX_OPT #include "conv_avx.h" #endif diff --git a/source/reference/convolution1d.c b/source/reference/convolution1d.c index 8efed882..5428715c 100644 --- a/source/reference/convolution1d.c +++ b/source/reference/convolution1d.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" /* TODO: direct conv1d calculation */ int shl_ref_conv1d_f32(struct csinn_tensor *input, struct csinn_tensor *output, diff --git a/source/reference/convolution3d.c b/source/reference/convolution3d.c index 06186e7d..4ce39b74 100644 --- a/source/reference/convolution3d.c +++ b/source/reference/convolution3d.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_conv3d_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, struct csinn_tensor *bias, diff --git a/source/reference/convolution_channel.c b/source/reference/convolution_channel.c index 4f7ce69e..323f9977 100644 --- a/source/reference/convolution_channel.c +++ b/source/reference/convolution_channel.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_ref.h" +#include "reference/ref.h" static float shl_ref_uint8_to_float_channel(uint8_t i, float scale, int32_t zero_point) { diff --git a/source/reference/convolution_relu.c b/source/reference/convolution_relu.c index 22520465..16a654cc 100644 --- a/source/reference/convolution_relu.c +++ b/source/reference/convolution_relu.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_conv2d_relu_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, struct csinn_tensor *bias, diff --git a/source/reference/convolution_relu6.c b/source/reference/convolution_relu6.c index 35b26bfc..3b239ca8 100644 --- a/source/reference/convolution_relu6.c +++ b/source/reference/convolution_relu6.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_conv2d_relu6_quant(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, struct csinn_tensor *bias, diff --git a/source/reference/cos.c b/source/reference/cos.c index 34a1e542..fe410ca3 100644 --- a/source/reference/cos.c +++ b/source/reference/cos.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_cos_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_siso_params *params) diff --git a/source/reference/cosh.c b/source/reference/cosh.c index effaacd3..f2846f31 100644 --- a/source/reference/cosh.c +++ b/source/reference/cosh.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_cosh_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_siso_params *params) diff --git a/source/reference/cumprod.c b/source/reference/cumprod.c index dcfaea65..092513f3 100644 --- a/source/reference/cumprod.c +++ b/source/reference/cumprod.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_cumprod_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_cumprod_params *params) diff --git a/source/reference/cumsum.c b/source/reference/cumsum.c index 8cc79ffd..d51b4615 100644 --- a/source/reference/cumsum.c +++ b/source/reference/cumsum.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_cumsum_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_cumsum_params *params) diff --git a/source/reference/data_convert.c b/source/reference/data_convert.c index 106a1e1b..654557e8 100644 --- a/source/reference/data_convert.c +++ b/source/reference/data_convert.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_data_convert_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_siso_params *params) diff --git a/source/reference/deconvolution.c b/source/reference/deconvolution.c index 98a33461..25e41201 100644 --- a/source/reference/deconvolution.c +++ b/source/reference/deconvolution.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_ref.h" +#include "reference/ref.h" static int shl_ref_deconv2d_nhwc_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, struct csinn_tensor *bias, diff --git a/source/reference/deconvolution3d.c b/source/reference/deconvolution3d.c index 3f107611..1152bab2 100644 --- a/source/reference/deconvolution3d.c +++ b/source/reference/deconvolution3d.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" // input: NCDHW // kernel: IODHW diff --git a/source/reference/depth_to_space.c b/source/reference/depth_to_space.c index 9b6fbfaa..9e5b8743 100644 --- a/source/reference/depth_to_space.c +++ b/source/reference/depth_to_space.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" // the input->data is a 4-D Tensor with shape [batch, depth, height, width]. int shl_ref_depth_to_space_nchw_f32(struct csinn_tensor *input, struct csinn_tensor *output, diff --git a/source/reference/div.c b/source/reference/div.c index 27d35691..f220214f 100644 --- a/source/reference/div.c +++ b/source/reference/div.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" static void element_div_f32(float *src0, float *src1, float *dest, int input_idx, int output_idx) { diff --git a/source/reference/elu.c b/source/reference/elu.c index c947309a..1302fdf0 100644 --- a/source/reference/elu.c +++ b/source/reference/elu.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" static float elu(float x) { return x < 0.0 ? exp(x) - 1 : x; } diff --git a/source/reference/equal.c b/source/reference/equal.c index 477718c1..ef14e573 100644 --- a/source/reference/equal.c +++ b/source/reference/equal.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_equal_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, struct csinn_diso_params *params) diff --git a/source/reference/erf.c b/source/reference/erf.c index d954a0f5..0326814c 100644 --- a/source/reference/erf.c +++ b/source/reference/erf.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_erf_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_siso_params *params) diff --git a/source/reference/exp.c b/source/reference/exp.c index 3534edf3..77edd142 100644 --- a/source/reference/exp.c +++ b/source/reference/exp.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_exp_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_siso_params *params) diff --git a/source/reference/expand_dims.c b/source/reference/expand_dims.c index 9022518b..393c7ea4 100644 --- a/source/reference/expand_dims.c +++ b/source/reference/expand_dims.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_expand_dims_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_expand_dims_params *params) diff --git a/source/reference/expm1.c b/source/reference/expm1.c index 0d62e1fb..9ef3d68e 100644 --- a/source/reference/expm1.c +++ b/source/reference/expm1.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_expm1_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_siso_params *params) diff --git a/source/reference/flatten.c b/source/reference/flatten.c index 962504a1..661caba1 100644 --- a/source/reference/flatten.c +++ b/source/reference/flatten.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_flatten_init(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_reshape_params *params) diff --git a/source/reference/floor.c b/source/reference/floor.c index cdf511cb..1890617e 100644 --- a/source/reference/floor.c +++ b/source/reference/floor.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_floor_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_siso_params *params) diff --git a/source/reference/floor_divide.c b/source/reference/floor_divide.c index 584957b2..8e1c86e1 100644 --- a/source/reference/floor_divide.c +++ b/source/reference/floor_divide.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_floor_divide_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, struct csinn_diso_params *params) diff --git a/source/reference/floor_mod.c b/source/reference/floor_mod.c index e6925055..3e9e92d9 100644 --- a/source/reference/floor_mod.c +++ b/source/reference/floor_mod.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_floor_mod_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, struct csinn_diso_params *params) diff --git a/source/reference/fsmn.c b/source/reference/fsmn.c index 858ea2ce..80a3193c 100644 --- a/source/reference/fsmn.c +++ b/source/reference/fsmn.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" static float fsmn(float x) { return x > 0 ? x : 0; } diff --git a/source/reference/fullyconnected.c b/source/reference/fullyconnected.c index 047c4b98..9169e6b1 100644 --- a/source/reference/fullyconnected.c +++ b/source/reference/fullyconnected.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_fullyconnected_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *weights, struct csinn_tensor *bias, diff --git a/source/reference/gather.c b/source/reference/gather.c index e66ca0ab..7aa62d37 100644 --- a/source/reference/gather.c +++ b/source/reference/gather.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_gather_f32(struct csinn_tensor *input, struct csinn_tensor *indices, struct csinn_tensor *output, struct csinn_gather_params *params) diff --git a/source/reference/gather_nd.c b/source/reference/gather_nd.c index 4c30c35d..ddbdf038 100644 --- a/source/reference/gather_nd.c +++ b/source/reference/gather_nd.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" static int Multiplication(int32_t *input, int s, int e) { diff --git a/source/reference/global_averagepool.c b/source/reference/global_averagepool.c index 4fcb3c0b..a7a872b8 100644 --- a/source/reference/global_averagepool.c +++ b/source/reference/global_averagepool.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_global_avgpool2d_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_pool_params *params) diff --git a/source/reference/global_maxpool.c b/source/reference/global_maxpool.c index 41df8c53..acb0e715 100644 --- a/source/reference/global_maxpool.c +++ b/source/reference/global_maxpool.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_global_maxpool2d_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_pool_params *params) diff --git a/source/reference/greater.c b/source/reference/greater.c index c006c92f..95639397 100644 --- a/source/reference/greater.c +++ b/source/reference/greater.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_greater_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, struct csinn_diso_params *params) diff --git a/source/reference/greater_equal.c b/source/reference/greater_equal.c index 8f32b524..1a4a9603 100644 --- a/source/reference/greater_equal.c +++ b/source/reference/greater_equal.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_greater_equal_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, struct csinn_diso_params *params) diff --git a/source/reference/hard_sigmoid.c b/source/reference/hard_sigmoid.c index b690e38d..e0b94219 100644 --- a/source/reference/hard_sigmoid.c +++ b/source/reference/hard_sigmoid.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_hard_sigmoid_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_sigmoid_params *params) diff --git a/source/reference/im2col.c b/source/reference/im2col.c index 6d7ea32f..202412b8 100644 --- a/source/reference/im2col.c +++ b/source/reference/im2col.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_ref.h" +#include "reference/ref.h" // input_data layout:NCHW // https://github.com/pjreddie/darknet/blob/master/src/im2col.c diff --git a/source/reference/instance_norm.c b/source/reference/instance_norm.c index 342acffa..84eece20 100644 --- a/source/reference/instance_norm.c +++ b/source/reference/instance_norm.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_instance_norm_f32(struct csinn_tensor *input, struct csinn_tensor *scales, struct csinn_tensor *bias, struct csinn_tensor *output, diff --git a/source/reference/isnan.c b/source/reference/isnan.c index c3408d4a..8a98c9ad 100644 --- a/source/reference/isnan.c +++ b/source/reference/isnan.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_isnan_bool_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_siso_params *params) diff --git a/source/reference/l2_normalization.c b/source/reference/l2_normalization.c index b3045c3e..ba7c57d1 100644 --- a/source/reference/l2_normalization.c +++ b/source/reference/l2_normalization.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" /* https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/kernels/internal/reference/l2normalization.h */ diff --git a/source/reference/l2pool.c b/source/reference/l2pool.c index 82002a97..34edcaa9 100644 --- a/source/reference/l2pool.c +++ b/source/reference/l2pool.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_l2pool_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_pool_params *params) diff --git a/source/reference/layer_norm.c b/source/reference/layer_norm.c index c1d3ee1c..c9ca2b93 100644 --- a/source/reference/layer_norm.c +++ b/source/reference/layer_norm.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_layer_norm_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *gamma, struct csinn_tensor *beta, diff --git a/source/reference/leaky_relu.c b/source/reference/leaky_relu.c index 9414181a..36ffae84 100644 --- a/source/reference/leaky_relu.c +++ b/source/reference/leaky_relu.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_leaky_relu_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_relu_params *params) diff --git a/source/reference/less.c b/source/reference/less.c index befd03e1..ce0b2fbf 100644 --- a/source/reference/less.c +++ b/source/reference/less.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_less_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, struct csinn_diso_params *params) diff --git a/source/reference/less_equal.c b/source/reference/less_equal.c index 8f40c0aa..cec0513b 100644 --- a/source/reference/less_equal.c +++ b/source/reference/less_equal.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_less_equal_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, struct csinn_diso_params *params) diff --git a/source/reference/log.c b/source/reference/log.c index 20f0c83c..bf59cea1 100644 --- a/source/reference/log.c +++ b/source/reference/log.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_log_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_siso_params *params) diff --git a/source/reference/log1p.c b/source/reference/log1p.c index 4849e1d6..8eefa2ee 100644 --- a/source/reference/log1p.c +++ b/source/reference/log1p.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_log1p_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_siso_params *params) diff --git a/source/reference/log_softmax.c b/source/reference/log_softmax.c index 914132c8..2412fc52 100644 --- a/source/reference/log_softmax.c +++ b/source/reference/log_softmax.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" /* logsoftmax = logits - log(reduce_sum(exp(logits), axis)) */ int shl_ref_log_softmax_f32(struct csinn_tensor *input, struct csinn_tensor *output, diff --git a/source/reference/logical_and.c b/source/reference/logical_and.c index 786473d4..19e389b5 100644 --- a/source/reference/logical_and.c +++ b/source/reference/logical_and.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_logical_and_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, struct csinn_diso_params *params) diff --git a/source/reference/logical_not.c b/source/reference/logical_not.c index 1819abd4..1e3717de 100644 --- a/source/reference/logical_not.c +++ b/source/reference/logical_not.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_logical_not_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_siso_params *params) diff --git a/source/reference/logical_or.c b/source/reference/logical_or.c index 945c1f35..58ba063f 100644 --- a/source/reference/logical_or.c +++ b/source/reference/logical_or.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_logical_or_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, struct csinn_diso_params *params) diff --git a/source/reference/logical_xor.c b/source/reference/logical_xor.c index b626bea8..6d233f2c 100644 --- a/source/reference/logical_xor.c +++ b/source/reference/logical_xor.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_logical_xor_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, struct csinn_diso_params *params) diff --git a/source/reference/lrn.c b/source/reference/lrn.c index d4709dfc..3343e781 100644 --- a/source/reference/lrn.c +++ b/source/reference/lrn.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" static int shl_ref_lrn_nhwc_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_lrn_params *params) diff --git a/source/reference/matmul.c b/source/reference/matmul.c index 01993c6f..aa7ca2eb 100644 --- a/source/reference/matmul.c +++ b/source/reference/matmul.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_matmul_f32(struct csinn_tensor *mat0, struct csinn_tensor *mat1, struct csinn_tensor *output, struct csinn_matmul_params *params) diff --git a/source/reference/max.c b/source/reference/max.c index 6a8387ba..e70931c6 100644 --- a/source/reference/max.c +++ b/source/reference/max.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_max_stride_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_reduce_params *params) diff --git a/source/reference/maximum.c b/source/reference/maximum.c index 2666e7ba..282f876e 100644 --- a/source/reference/maximum.c +++ b/source/reference/maximum.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" static void element_maximum_f32(float *src0, float *src1, float *dest, int input_idx, int output_idx) diff --git a/source/reference/maxpool.c b/source/reference/maxpool.c index 205a0a36..a76f18a6 100644 --- a/source/reference/maxpool.c +++ b/source/reference/maxpool.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" static int shl_ref_maxpool2d_nhwc_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_pool_params *params) diff --git a/source/reference/maxpool2d_locat.c b/source/reference/maxpool2d_locat.c index 4fbc2160..6e1c66bb 100644 --- a/source/reference/maxpool2d_locat.c +++ b/source/reference/maxpool2d_locat.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_ref.h" +#include "reference/ref.h" static int shl_ref_maxpool2d_locat_nhwc_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_pool_params *params) diff --git a/source/reference/maxpool3d.c b/source/reference/maxpool3d.c index 35c69ac2..f0639c27 100644 --- a/source/reference/maxpool3d.c +++ b/source/reference/maxpool3d.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_maxpool3d_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_pool_params *params) diff --git a/source/reference/mean.c b/source/reference/mean.c index 8ccaf372..b74f80c3 100644 --- a/source/reference/mean.c +++ b/source/reference/mean.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_mean_stride_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_reduce_params *params) diff --git a/source/reference/min.c b/source/reference/min.c index d4e118d1..70a82433 100644 --- a/source/reference/min.c +++ b/source/reference/min.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_min_stride_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_reduce_params *params) diff --git a/source/reference/minimum.c b/source/reference/minimum.c index b68e0218..ba547356 100644 --- a/source/reference/minimum.c +++ b/source/reference/minimum.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" static void element_minimum_f32(float *src0, float *src1, float *dest, int input_idx, int output_idx) diff --git a/source/reference/mod.c b/source/reference/mod.c index 1c4e3247..1d3a2ce0 100644 --- a/source/reference/mod.c +++ b/source/reference/mod.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_ref.h" +#include "reference/ref.h" static void element_mod_f32(float *src0, float *src1, float *dest, int input_idx, int output_idx) { diff --git a/source/reference/mul.c b/source/reference/mul.c index 781f2d65..16d1e580 100644 --- a/source/reference/mul.c +++ b/source/reference/mul.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" static void element_mul_f32(float *src0, float *src1, float *dest, int input_idx, int output_idx) { diff --git a/source/reference/ndarray_size.c b/source/reference/ndarray_size.c index 8ba447b0..a32fe987 100644 --- a/source/reference/ndarray_size.c +++ b/source/reference/ndarray_size.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_ndarray_size_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_ndarray_size_params *params) diff --git a/source/reference/negative.c b/source/reference/negative.c index 4320bda1..7f9016da 100644 --- a/source/reference/negative.c +++ b/source/reference/negative.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_negative_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_siso_params *params) diff --git a/source/reference/non_max_suppression.c b/source/reference/non_max_suppression.c index a9b6e3d0..6314ee93 100644 --- a/source/reference/non_max_suppression.c +++ b/source/reference/non_max_suppression.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" static int find_max_score_idx(const float *scores, int *flag, int len) { diff --git a/source/reference/not.c b/source/reference/not.c index 930dc829..4b6a8733 100644 --- a/source/reference/not.c +++ b/source/reference/not.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_not_u32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_siso_params *params) diff --git a/source/reference/not_equal.c b/source/reference/not_equal.c index 47991e9f..2baef40a 100644 --- a/source/reference/not_equal.c +++ b/source/reference/not_equal.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_not_equal_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, struct csinn_diso_params *params) diff --git a/source/reference/one_hot.c b/source/reference/one_hot.c index 66a818c3..b2b1ee10 100644 --- a/source/reference/one_hot.c +++ b/source/reference/one_hot.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" /* from tensorflow/lite/kernels/one_hot.cc */ diff --git a/source/reference/or.c b/source/reference/or.c index 36940da3..fdc404ed 100644 --- a/source/reference/or.c +++ b/source/reference/or.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_or_u32(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, struct csinn_diso_params *params) diff --git a/source/reference/pad.c b/source/reference/pad.c index 433a59c9..6faf2883 100644 --- a/source/reference/pad.c +++ b/source/reference/pad.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" static int shl_ref_pad_nhwc_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_pad_params *params) diff --git a/source/reference/power.c b/source/reference/power.c index c47b1fd2..3c71a10a 100644 --- a/source/reference/power.c +++ b/source/reference/power.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_ref.h" +#include "reference/ref.h" static void element_pow_f32(float *src0, float *src1, float *dest, int input_idx, int output_idx) { diff --git a/source/reference/prelu.c b/source/reference/prelu.c index 7337a8f0..51f7e6ac 100644 --- a/source/reference/prelu.c +++ b/source/reference/prelu.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_prelu_f32(struct csinn_tensor *input, struct csinn_tensor *alpha, struct csinn_tensor *output, struct csinn_prelu_params *params) diff --git a/source/reference/prod.c b/source/reference/prod.c index fd5fea73..4f6b6462 100644 --- a/source/reference/prod.c +++ b/source/reference/prod.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_prod_stride_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_reduce_params *params) diff --git a/source/reference/proposal.c b/source/reference/proposal.c index 48c024d6..6a4ecfba 100644 --- a/source/reference/proposal.c +++ b/source/reference/proposal.c @@ -18,7 +18,7 @@ #include -#include "shl_ref.h" +#include "reference/ref.h" #define MAX(a, b) (a > b ? a : b) #define MIN(a, b) (a > b ? b : a) diff --git a/source/reference/psroipooling.c b/source/reference/psroipooling.c index c84b4b9f..253d3e4d 100644 --- a/source/reference/psroipooling.c +++ b/source/reference/psroipooling.c @@ -18,7 +18,7 @@ #include -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_psroipooling_f32(struct csinn_tensor *data, struct csinn_tensor *rois, struct csinn_tensor *output, struct csinn_psroipooling_params *params) diff --git a/source/reference/reduce_logsumexp.c b/source/reference/reduce_logsumexp.c index c0908d82..654f828b 100644 --- a/source/reference/reduce_logsumexp.c +++ b/source/reference/reduce_logsumexp.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_reduce_logsumexp_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_reduce_params *params) diff --git a/source/reference/reduce_max.c b/source/reference/reduce_max.c index a678d723..78793448 100644 --- a/source/reference/reduce_max.c +++ b/source/reference/reduce_max.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_reduce_max_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_reduce_params *params) diff --git a/source/reference/reduce_mean.c b/source/reference/reduce_mean.c index 0f4838b0..d5ccdab9 100644 --- a/source/reference/reduce_mean.c +++ b/source/reference/reduce_mean.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_reduce_mean_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_reduce_params *params) diff --git a/source/reference/reduce_min.c b/source/reference/reduce_min.c index 0b9f39ff..23be996e 100644 --- a/source/reference/reduce_min.c +++ b/source/reference/reduce_min.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_reduce_min_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_reduce_params *params) diff --git a/source/reference/reduce_prod.c b/source/reference/reduce_prod.c index 871625a7..e321c80f 100644 --- a/source/reference/reduce_prod.c +++ b/source/reference/reduce_prod.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_reduce_prod_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_reduce_params *params) diff --git a/source/reference/reduce_sum.c b/source/reference/reduce_sum.c index 7d8d0ec8..67869ed4 100644 --- a/source/reference/reduce_sum.c +++ b/source/reference/reduce_sum.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_reduce_sum_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_reduce_params *params) diff --git a/source/reference/relu.c b/source/reference/relu.c index 22a27ea6..9d614d84 100644 --- a/source/reference/relu.c +++ b/source/reference/relu.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" static float relu(float x) { return x > 0 ? x : 0; } diff --git a/source/reference/relu1.c b/source/reference/relu1.c index 6bb33005..d844d842 100644 --- a/source/reference/relu1.c +++ b/source/reference/relu1.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" static float relu1(float x) { return fmin(x > 0 ? x : 0, 1); } diff --git a/source/reference/relu6.c b/source/reference/relu6.c index d0fc4952..bb79568f 100644 --- a/source/reference/relu6.c +++ b/source/reference/relu6.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" static float relu6(float x) { return fmin(x > 0 ? x : 0, 6); } diff --git a/source/reference/relun.c b/source/reference/relun.c index 00702f74..d00bc102 100644 --- a/source/reference/relun.c +++ b/source/reference/relun.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" static float relun(float x, float y) { return fmin(x > 0.0 ? 
x : 0.0, y); } diff --git a/source/reference/reshape.c b/source/reference/reshape.c index 36b8667e..b14e5a8f 100644 --- a/source/reference/reshape.c +++ b/source/reference/reshape.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_reshape_init(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_reshape_params *params) diff --git a/source/reference/resize.c b/source/reference/resize.c index 20909f0d..06f725e7 100644 --- a/source/reference/resize.c +++ b/source/reference/resize.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" static void shl_ref_resize_bilinear_nhwc_f32(struct csinn_tensor *input, struct csinn_tensor *output, bool align_corners) diff --git a/source/reference/reverse.c b/source/reference/reverse.c index 55945f54..3ba1bcbf 100644 --- a/source/reference/reverse.c +++ b/source/reference/reverse.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" static int Multiplication(struct csinn_tensor *input, int s, int e) { diff --git a/source/reference/roialign.c b/source/reference/roialign.c index bcd079ee..e1d39aec 100644 --- a/source/reference/roialign.c +++ b/source/reference/roialign.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_ref.h" +#include "reference/ref.h" // https://github.com/AceCoooool/RoIAlign-RoIPool-pytorch/blob/master/roialign/roi_align_cpu.cpp diff --git a/source/reference/roipool.c b/source/reference/roipool.c index 27706873..d0dd12e9 100644 --- a/source/reference/roipool.c +++ b/source/reference/roipool.c @@ -18,7 +18,7 @@ #include -#include "shl_ref.h" +#include "reference/ref.h" // https://github.com/pytorch/pytorch/blob/master/caffe2/operators/roi_pool_op.cc // defalut input layout: NCHW diff --git a/source/reference/round.c b/source/reference/round.c index fe286ad0..9076c450 100644 --- a/source/reference/round.c +++ b/source/reference/round.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_round_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_siso_params *params) diff --git a/source/reference/rsqrt.c b/source/reference/rsqrt.c index 5f4cc78e..05c08c12 100644 --- a/source/reference/rsqrt.c +++ b/source/reference/rsqrt.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_rsqrt_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_siso_params *params) diff --git a/source/reference/scatter.c b/source/reference/scatter.c index b6b7ce25..cea82a6a 100644 --- a/source/reference/scatter.c +++ b/source/reference/scatter.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_scatter_nd_f32(struct csinn_tensor *input, struct csinn_tensor *indices, struct csinn_tensor *updates, struct csinn_tensor *output, diff --git a/source/reference/segment_max.c b/source/reference/segment_max.c index 77b38285..41f7edba 100644 --- a/source/reference/segment_max.c +++ b/source/reference/segment_max.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_unsorted_segment_max_f32(struct csinn_tensor *input, struct csinn_tensor *segment_ids, struct csinn_tensor *output, diff --git a/source/reference/segment_mean.c b/source/reference/segment_mean.c index 4d0153bc..eb39b17f 100644 --- a/source/reference/segment_mean.c +++ b/source/reference/segment_mean.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_unsorted_segment_mean_f32(struct csinn_tensor *input, struct csinn_tensor *segment_ids, struct csinn_tensor *output, diff --git a/source/reference/segment_min.c b/source/reference/segment_min.c index 24b7a217..6c700b23 100644 --- a/source/reference/segment_min.c +++ b/source/reference/segment_min.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_unsorted_segment_min_f32(struct csinn_tensor *input, struct csinn_tensor *segment_ids, struct csinn_tensor *output, diff --git a/source/reference/segment_prod.c b/source/reference/segment_prod.c index 2cd1dee7..a96ec4d7 100644 --- a/source/reference/segment_prod.c +++ b/source/reference/segment_prod.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_unsorted_segment_prod_f32(struct csinn_tensor *input, struct csinn_tensor *segment_ids, struct csinn_tensor *output, diff --git a/source/reference/segment_sum.c b/source/reference/segment_sum.c index bf814653..d7d45962 100644 --- a/source/reference/segment_sum.c +++ b/source/reference/segment_sum.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_unsorted_segment_sum_f32(struct csinn_tensor *input, struct csinn_tensor *segment_ids, struct csinn_tensor *output, diff --git a/source/reference/select.c b/source/reference/select.c index 3cfa3bb4..bd71c237 100644 --- a/source/reference/select.c +++ b/source/reference/select.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_select_f32(struct csinn_tensor *condition, struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, diff --git a/source/reference/setup.c b/source/reference/setup.c index dfd49f9b..09e046f1 100644 --- a/source/reference/setup.c +++ b/source/reference/setup.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" void shl_ref_nn_init(struct csinn_tensor *input, struct csinn_tensor *output) { @@ -675,8 +675,8 @@ static void *setup_cb_map() } #ifndef CONFIG_C_REFERENCE_RESHAPE_DISABLED - cb_map[CSINN_OP_RESHAPE][CSINN_DTYPE_INT64].exec = shl_ref_reshape; - cb_map[CSINN_OP_RESHAPE][CSINN_DTYPE_INT64].init = shl_ref_reshape_init; + cb_map[CSINN_OP_RESHAPE][CSINN_DTYPE_INT64].exec = shl_ref_reshape; + cb_map[CSINN_OP_RESHAPE][CSINN_DTYPE_INT64].init = shl_ref_reshape_init; #endif #ifndef CONFIG_C_REFERENCE_CONCAT_DISABLED diff --git a/source/reference/shape.c b/source/reference/shape.c index 8ebb727b..a23f20ad 100644 --- a/source/reference/shape.c +++ b/source/reference/shape.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_shape_i32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_shape_params *params) diff --git a/source/reference/shuffle_channel.c b/source/reference/shuffle_channel.c index 4d4eee23..f14c188c 100644 --- a/source/reference/shuffle_channel.c +++ b/source/reference/shuffle_channel.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_ref.h" +#include "reference/ref.h" static int shl_ref_shuffle_channel_nhwc_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_shuffle_channel_params *params) diff --git a/source/reference/sigmoid.c b/source/reference/sigmoid.c index 061dba0b..18e2d9ce 100644 --- a/source/reference/sigmoid.c +++ b/source/reference/sigmoid.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_sigmoid_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_sigmoid_params *params) diff --git a/source/reference/sign.c b/source/reference/sign.c index 6dc81e95..0e76050a 100644 --- a/source/reference/sign.c +++ b/source/reference/sign.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" float sign(float v) { diff --git a/source/reference/sin.c b/source/reference/sin.c index 4da2270b..f8d1d374 100644 --- a/source/reference/sin.c +++ b/source/reference/sin.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_sin_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_siso_params *params) diff --git a/source/reference/sinh.c b/source/reference/sinh.c index 6ee3b6d7..9548f1f2 100644 --- a/source/reference/sinh.c +++ b/source/reference/sinh.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_sinh_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_siso_params *params) diff --git a/source/reference/slice.c b/source/reference/slice.c index a507346c..ac346ada 100644 --- a/source/reference/slice.c +++ b/source/reference/slice.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_slice_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_slice_params *params) diff --git a/source/reference/softmax.c b/source/reference/softmax.c index aefb51b9..1553fda9 100644 --- a/source/reference/softmax.c +++ b/source/reference/softmax.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_softmax_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_softmax_params *params) diff --git a/source/reference/softplus.c b/source/reference/softplus.c index 56db2bf9..41490331 100644 --- a/source/reference/softplus.c +++ b/source/reference/softplus.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_softplus_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_siso_params *params) diff --git a/source/reference/softrelu.c b/source/reference/softrelu.c index 17b57a5a..94ad1f5d 100644 --- a/source/reference/softrelu.c +++ b/source/reference/softrelu.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" static float softrelu(float x, float y) { return log(1 + exp(fmax(fmin(x, y), y))); } diff --git a/source/reference/softsign.c b/source/reference/softsign.c index cf597c57..36665db2 100644 --- a/source/reference/softsign.c +++ b/source/reference/softsign.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_softsign_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_siso_params *params) diff --git a/source/reference/space_to_batch.c b/source/reference/space_to_batch.c index d6063292..68aa0f64 100644 --- a/source/reference/space_to_batch.c +++ b/source/reference/space_to_batch.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_ref.h" +#include "reference/ref.h" // tf.nn.space_to_batch:the input mast a 4-D Tensor with shape [batch, height, width, depth]. diff --git a/source/reference/space_to_depth.c b/source/reference/space_to_depth.c index 193c6143..a3daecec 100644 --- a/source/reference/space_to_depth.c +++ b/source/reference/space_to_depth.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" // the input->data is a 4-D Tensor with shape [batch, depth, height, width]. int shl_ref_space_to_depth_f32(struct csinn_tensor *input, struct csinn_tensor *output, diff --git a/source/reference/split.c b/source/reference/split.c index 3012ba0b..9344bb34 100644 --- a/source/reference/split.c +++ b/source/reference/split.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_split_f32(struct csinn_tensor *input, struct csinn_tensor **output, struct csinn_split_params *params) diff --git a/source/reference/sqrt.c b/source/reference/sqrt.c index d8eecf78..d9a1732e 100644 --- a/source/reference/sqrt.c +++ b/source/reference/sqrt.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_sqrt_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_siso_params *params) diff --git a/source/reference/square.c b/source/reference/square.c index fcc1f34a..08ac16b2 100644 --- a/source/reference/square.c +++ b/source/reference/square.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_square_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_siso_params *params) diff --git a/source/reference/squeeze.c b/source/reference/squeeze.c index f69d6768..41e9e376 100644 --- a/source/reference/squeeze.c +++ b/source/reference/squeeze.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_squeeze(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_squeeze_params *params) diff --git a/source/reference/stack.c b/source/reference/stack.c index 3ffb54da..a346b793 100644 --- a/source/reference/stack.c +++ b/source/reference/stack.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_stack_f32(struct csinn_tensor **input, struct csinn_tensor *output, struct csinn_stack_params *params) diff --git a/source/reference/strided_slice.c b/source/reference/strided_slice.c index 6ec070a9..76f5a523 100644 --- a/source/reference/strided_slice.c +++ b/source/reference/strided_slice.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" /* fixme: */ int shl_ref_strided_slice(struct csinn_tensor *input, struct csinn_tensor *output, @@ -103,7 +103,7 @@ int shl_ref_strided_slice_f32(struct csinn_tensor *input, struct csinn_tensor *o float *input_data = (float *)input->data; float *output_data = (float *)output->data; - for (int i = 0; i < params->slice_count; i++) { + for (int i = 0; i < input->dim_count; i++) { if (params->begin[i] < -input->dim[i]) params->begin[i] = -input->dim[i]; if (params->begin[i] < 0) params->begin[i] += input->dim[i]; if (params->begin[i] > input->dim[i]) params->begin[i] = input->dim[i]; diff --git a/source/reference/sub.c b/source/reference/sub.c index 30e3cf09..09fa6a20 100644 --- a/source/reference/sub.c +++ b/source/reference/sub.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" static void element_sub_f32(float *src0, float *src1, float *dest, int input_idx, int output_idx) { diff --git a/source/reference/sum.c b/source/reference/sum.c index f213d960..62cfb9a7 100644 --- a/source/reference/sum.c +++ b/source/reference/sum.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_sum_stride_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_reduce_params *params) diff --git a/source/reference/tan.c b/source/reference/tan.c index 9b9e83c9..054b5892 100644 --- a/source/reference/tan.c +++ b/source/reference/tan.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_tan_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_siso_params *params) diff --git a/source/reference/tanh.c b/source/reference/tanh.c index fd5bbd01..fc98b9c4 100644 --- a/source/reference/tanh.c +++ b/source/reference/tanh.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_tanh_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_siso_params *params) diff --git a/source/reference/threshold_relu.c b/source/reference/threshold_relu.c index 0b34d1a0..5248aee3 100644 --- a/source/reference/threshold_relu.c +++ b/source/reference/threshold_relu.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" static float threshold_relu(float x, float theta) { return x > theta ? x : 0; } diff --git a/source/reference/tile.c b/source/reference/tile.c index 96b256d1..2c0f222c 100644 --- a/source/reference/tile.c +++ b/source/reference/tile.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" static int Multiplication(int32_t *dim, int s, int e) { diff --git a/source/reference/topk.c b/source/reference/topk.c index a9f462bb..90694c05 100644 --- a/source/reference/topk.c +++ b/source/reference/topk.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_topk_f32(struct csinn_tensor *input, struct csinn_tensor *output1, struct csinn_tensor *output2, struct csinn_topk_params *params) diff --git a/source/reference/transpose.c b/source/reference/transpose.c index 39e68ca9..1744f82e 100644 --- a/source/reference/transpose.c +++ b/source/reference/transpose.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_transpose_init(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_transpose_params *params) diff --git a/source/reference/trunc.c b/source/reference/trunc.c index 914b8667..c062e513 100644 --- a/source/reference/trunc.c +++ b/source/reference/trunc.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_trunc_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_siso_params *params) diff --git a/source/reference/unpooling.c b/source/reference/unpooling.c index b6c45c6b..611ba5ae 100644 --- a/source/reference/unpooling.c +++ b/source/reference/unpooling.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" static int shl_ref_unpooling_nhwc_f32(struct csinn_tensor *input, struct csinn_tensor *mask, struct csinn_tensor *output, diff --git a/source/reference/unstack.c b/source/reference/unstack.c index 3ea1198d..c5e6fb82 100644 --- a/source/reference/unstack.c +++ b/source/reference/unstack.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_unstack_f32(struct csinn_tensor *input, struct csinn_tensor **output, struct csinn_unstack_params *params) diff --git a/source/reference/utils.c b/source/reference/utils.c index c376b134..4f7e00b0 100644 --- a/source/reference/utils.c +++ b/source/reference/utils.c @@ -18,7 +18,7 @@ #include -#include "shl_ref.h" +#include "reference/ref.h" int32_t shl_ref_max_internal_s32(int32_t a, int32_t b) { diff --git a/source/reference/where.c b/source/reference/where.c index c867bf72..49f27d61 100644 --- a/source/reference/where.c +++ b/source/reference/where.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_where_f32(struct csinn_tensor *condition, struct csinn_tensor *x, struct csinn_tensor *y, struct csinn_tensor *output, diff --git a/source/reference/where_softmax.c b/source/reference/where_softmax.c index 052bb36a..0b8c6d74 100644 --- a/source/reference/where_softmax.c +++ b/source/reference/where_softmax.c @@ -18,7 +18,7 @@ #include -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_where_softmax_f32(struct csinn_tensor *condition, struct csinn_tensor *y, struct csinn_tensor *output, diff --git a/source/reference/xor.c b/source/reference/xor.c index 57c92c8b..3178de6b 100644 --- a/source/reference/xor.c +++ b/source/reference/xor.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_ref.h" +#include "reference/ref.h" int shl_ref_xor_u32(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, struct csinn_diso_params *params) diff --git a/source/reference/yuv_rgb_scale.c b/source/reference/yuv_rgb_scale.c index 2d8e0838..c40007f6 100644 --- a/source/reference/yuv_rgb_scale.c +++ b/source/reference/yuv_rgb_scale.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_ref.h" +#include "reference/ref.h" /* https://github.com/tensorflow/tensorflow/blob/v2.3.0/tensorflow/python/ops/image_ops_impl.py#L3279-L3325 * line 3279*/ diff --git a/source/thead_matrix/avgpool.c b/source/thead_matrix/avgpool.c index 73a3f6a7..e65399a1 100644 --- a/source/thead_matrix/avgpool.c +++ b/source/thead_matrix/avgpool.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvm.h" +#include "rvm/rvm.h" int shl_rvm_avgpool2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_pool_params *params) diff --git a/source/thead_matrix/convolution.c b/source/thead_matrix/convolution.c index 8e880e37..5b055dde 100644 --- a/source/thead_matrix/convolution.c +++ b/source/thead_matrix/convolution.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvm.h" +#include "rvm/rvm.h" int shl_rvm_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, struct csinn_tensor *bias, @@ -30,8 +30,8 @@ int shl_rvm_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *ou int32_t kernel_w = kernel->dim[2]; int32_t stride_h = params->stride_height; int32_t stride_w = params->stride_width; - int32_t dalition_h = params->dilation_height; - int32_t dalition_w = params->dilation_width; + int32_t dilation_h = params->dilation_height; + int32_t dilation_w = params->dilation_width; int32_t group = params->group; const int mcols = csrr_xrlenb() / 2; struct csinn_callback *cb = params->base.cb; @@ -41,14 +41,14 @@ int shl_rvm_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *ou has_reordered = true; } - if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && - dalition_w == 1) { + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && + dilation_w == 1) { if (!has_reordered) { shl_rvm_conv1x1s1_gemm_reorder_kernel_fp16(kernel, params); } cb->exec = 
shl_rvm_conv1x1s1_gemm_fp16; } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && - dalition_h == 1 && dalition_w == 1 && group == 1) { + dilation_h == 1 && dilation_w == 1 && group == 1) { params->conv_extra.conv_mode = CSINN_WINOGRAD; struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); if ((in_h < 13) && (in_w < 13)) { @@ -80,8 +80,8 @@ int shl_rvm_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *ou int32_t kernel_w = kernel->dim[2]; int32_t stride_h = params->stride_height; int32_t stride_w = params->stride_width; - int32_t dalition_h = params->dilation_height; - int32_t dalition_w = params->dilation_width; + int32_t dilation_h = params->dilation_height; + int32_t dilation_w = params->dilation_width; int32_t group = params->group; struct csinn_callback *cb = params->base.cb; bool has_reordered = false; @@ -118,8 +118,8 @@ int shl_rvm_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *ou &(kernel->qinfo[i].shift)); } - if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && - dalition_w == 1) { + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && + dilation_w == 1) { if (!has_reordered) { shl_rvm_conv1x1s1_gemm_reorder_kernel_int8(kernel, params); } diff --git a/source/thead_matrix/convolution_1x1_fp16_matrix.c b/source/thead_matrix/convolution_1x1_fp16_matrix.c index 5fea5e35..89432755 100644 --- a/source/thead_matrix/convolution_1x1_fp16_matrix.c +++ b/source/thead_matrix/convolution_1x1_fp16_matrix.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvm.h" +#include "rvm/rvm.h" void shl_rvm_conv1x1s1_gemm_reorder_kernel_fp16(struct csinn_tensor *kernel, struct csinn_conv2d_params *params) diff --git a/source/thead_matrix/convolution_1x1_int8_matrix.c b/source/thead_matrix/convolution_1x1_int8_matrix.c index b0e17bbe..7bb041cb 100644 --- a/source/thead_matrix/convolution_1x1_int8_matrix.c +++ b/source/thead_matrix/convolution_1x1_int8_matrix.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvm.h" +#include "rvm/rvm.h" void shl_rvm_conv1x1s1_gemm_reorder_kernel_int8(struct csinn_tensor *kernel, struct csinn_conv2d_params *params) diff --git a/source/thead_matrix/convolution_3x3_fp16_matrix.c b/source/thead_matrix/convolution_3x3_fp16_matrix.c index fc485b2f..4f944bf9 100644 --- a/source/thead_matrix/convolution_3x3_fp16_matrix.c +++ b/source/thead_matrix/convolution_3x3_fp16_matrix.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvm.h" +#include "rvm/rvm.h" static inline void wg_bxf3s1_reorder_kernel_nhwc_fp16(__fp16 *dst, const __fp16 *src, int N, int K, int mcols) diff --git a/source/thead_matrix/convolution_gemm_fp16_matrix.c b/source/thead_matrix/convolution_gemm_fp16_matrix.c index 1cbdc7bb..b307d65e 100644 --- a/source/thead_matrix/convolution_gemm_fp16_matrix.c +++ b/source/thead_matrix/convolution_gemm_fp16_matrix.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvm.h" +#include "rvm/rvm.h" static void im2col_gemm_reorder_kernel_nhwc_per_group_fp16(__fp16 *src, __fp16 *dst, int out_c, int in_c, int maxk) diff --git a/source/thead_matrix/convolution_gemm_int8_matrix.c b/source/thead_matrix/convolution_gemm_int8_matrix.c index 8c881dc7..4bc16217 100644 --- a/source/thead_matrix/convolution_gemm_int8_matrix.c +++ b/source/thead_matrix/convolution_gemm_int8_matrix.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvm.h" +#include "rvm/rvm.h" static void im2col_gemm_reorder_kernel_per_group_int8_matrix(int8_t *src, int8_t *dst, int out_c, int in_c, int maxk) diff --git a/source/thead_matrix/depthwise_convolution.c b/source/thead_matrix/depthwise_convolution.c index 35dfd6f4..d63a71f6 100644 --- a/source/thead_matrix/depthwise_convolution.c +++ b/source/thead_matrix/depthwise_convolution.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvm.h" +#include "rvm/rvm.h" int shl_rvm_depthwise_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, struct csinn_tensor *bias, diff --git a/source/thead_matrix/gemm_fp16_matrix.c b/source/thead_matrix/gemm_fp16_matrix.c index 0dd5448c..b385969a 100644 --- a/source/thead_matrix/gemm_fp16_matrix.c +++ b/source/thead_matrix/gemm_fp16_matrix.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvm.h" +#include "rvm/rvm.h" void gemm_fp16_nhwc_matrix_rowxn(__fp16 *output, const __fp16 *kernel, const __fp16 *input, const __fp16 *bias, int row, int k, int n); diff --git a/source/thead_matrix/gemm_fp16_matrix_intrinsic.c b/source/thead_matrix/gemm_fp16_matrix_intrinsic.c index ac591baa..36d4e3f5 100644 --- a/source/thead_matrix/gemm_fp16_matrix_intrinsic.c +++ b/source/thead_matrix/gemm_fp16_matrix_intrinsic.c @@ -16,8 +16,8 @@ * limitations under the License. 
*/ -#include "shl_thead_rvm.h" - +#include "rvm/rvm.h" +#if 0 static void gemm_fp16_nhwc_matrix_2rowxn(__fp16 *output, const __fp16 *kernel, const __fp16 *input, const __fp16 *bias, int mrows, int K, int N) { @@ -274,3 +274,4 @@ void shl_rvm_nhwc_gemm_fp16_intrinsic(__fp16 *dst, const __fp16 *sa, const __fp1 bias_shadow = NULL; } } +#endif \ No newline at end of file diff --git a/source/thead_matrix/gemm_int8_matrix.c b/source/thead_matrix/gemm_int8_matrix.c index 150c1f18..8b893931 100644 --- a/source/thead_matrix/gemm_int8_matrix.c +++ b/source/thead_matrix/gemm_int8_matrix.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvm.h" +#include "rvm/rvm.h" #ifndef MATRIX_PW_I32 static void requantize_m4_nhwc(int8_t *dst, int32_t *src, int row, int col, int32_t out_zp, diff --git a/source/thead_matrix/maxpool.c b/source/thead_matrix/maxpool.c index 83af4df5..ea38f66f 100644 --- a/source/thead_matrix/maxpool.c +++ b/source/thead_matrix/maxpool.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvm.h" +#include "rvm/rvm.h" int shl_rvm_maxpool2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_pool_params *params) diff --git a/source/thead_matrix/setup.c b/source/thead_matrix/setup.c index 531e6ada..b89437a6 100644 --- a/source/thead_matrix/setup.c +++ b/source/thead_matrix/setup.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvm.h" +#include "rvm/rvm.h" #define RVM_OP_PATTERN_MAX 60 static struct shl_cb_table shl_rvm_cb_table[RVM_OP_PATTERN_MAX]; diff --git a/source/thead_matrix/utils.c b/source/thead_matrix/utils.c index db731114..0d74a4e2 100644 --- a/source/thead_matrix/utils.c +++ b/source/thead_matrix/utils.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvm.h" +#include "rvm/rvm.h" int csrr_xrlenb() { diff --git a/source/thead_rvv/CMakeLists.txt b/source/thead_rvv/CMakeLists.txt index 923b0071..8b16b4e8 100644 --- a/source/thead_rvv/CMakeLists.txt +++ b/source/thead_rvv/CMakeLists.txt @@ -5,6 +5,7 @@ if(CONFIG_THEAD_RVV_SOURCE) list(APPEND THEAD_RVV_SRCS source/thead_rvv/reorder.c) list(APPEND THEAD_RVV_SRCS source/thead_rvv/data_convert.c) list(APPEND THEAD_RVV_SRCS source/thead_rvv/capability.c) + list(APPEND THEAD_RVV_SRCS source/thead_rvv/binary_broadcast.c) endif() if(CONFIG_THEAD_RVV_ADD_FP32) @@ -120,6 +121,16 @@ if(CONFIG_THEAD_RVV_CONVOLUTION_INT4) list(APPEND THEAD_RVV_SRCS source/thead_rvv/int4/convolution.c) endif() +if(CONFIG_THEAD_RVV_CONVOLUTION1D_FP32) + list(APPEND THEAD_RVV_SRCS source/thead_rvv/fp32/convolution1d.c) + list(APPEND THEAD_RVV_SRCS source/thead_rvv/fp32/convolution1d_gemm_fp32.c) +endif() + +if(CONFIG_THEAD_RVV_CONVOLUTION1D_FP16) + list(APPEND THEAD_RVV_SRCS source/thead_rvv/fp16/convolution1d.c) + list(APPEND THEAD_RVV_SRCS source/thead_rvv/fp16/convolution1d_gemm_fp16.c) +endif() + if(CONFIG_THEAD_RVV_CONVOLUTION1D_INT8) list(APPEND THEAD_RVV_SRCS source/thead_rvv/int8/convolution1d.c) list(APPEND THEAD_RVV_SRCS source/thead_rvv/int8/convolution1d_1_int8.c) @@ -158,6 +169,40 @@ if(CONFIG_THEAD_RVV_DEPTHWISE_CONVOLUTION1D_INT8) list(APPEND THEAD_RVV_SRCS source/thead_rvv/int8/depthwise_convolution1d_int8.c) endif() +if(CONFIG_THEAD_RVV_DECONVOLUTION_FP32) + list(APPEND THEAD_RVV_SRCS source/thead_rvv/fp32/deconvolution_gemm_fp32.c) + list(APPEND THEAD_RVV_SRCS source/thead_rvv/fp32/deconvolution.c) +endif() + +if(CONFIG_THEAD_RVV_DECONVOLUTION_FP16) + list(APPEND THEAD_RVV_SRCS source/thead_rvv/fp16/deconvolution_gemm_fp16.c) + list(APPEND THEAD_RVV_SRCS source/thead_rvv/fp16/deconvolution.c) +endif() + +if(CONFIG_THEAD_RVV_DIV_FP32) + list(APPEND THEAD_RVV_SRCS source/thead_rvv/fp32/div.c) +endif() + +if(CONFIG_THEAD_RVV_DIV_FP16) + list(APPEND 
THEAD_RVV_SRCS source/thead_rvv/fp16/div.c) +endif() + +if(CONFIG_THEAD_RVV_DIV_INT8) + list(APPEND THEAD_RVV_SRCS source/thead_rvv/int8/div.c) +endif() + +if(CONFIG_THEAD_RVV_ERF_FP32) + list(APPEND THEAD_RVV_SRCS source/thead_rvv/fp32/erf.c) +endif() + +if(CONFIG_THEAD_RVV_ERF_FP16) + list(APPEND THEAD_RVV_SRCS source/thead_rvv/fp16/erf.c) +endif() + +if(CONFIG_THEAD_RVV_ERF_INT8) + list(APPEND THEAD_RVV_SRCS source/thead_rvv/int8/erf.c) +endif() + if(CONFIG_THEAD_RVV_FULLYCONNECTED_FP32) list(APPEND THEAD_RVV_SRCS source/thead_rvv/fp32/fullyconnected_fp32.c) list(APPEND THEAD_RVV_SRCS source/thead_rvv/fp32/fullyconnected.c) @@ -177,6 +222,18 @@ if(CONFIG_THEAD_RVV_FULLYCONNECTED_INT4) list(APPEND THEAD_RVV_SRCS source/thead_rvv/int4/fullyconnected_int4.c) endif() +if(CONFIG_THEAD_RVV_GATHER_FP32) + list(APPEND THEAD_RVV_SRCS source/thead_rvv/fp32/gather.c) +endif() + +if(CONFIG_THEAD_RVV_GATHER_FP16) + list(APPEND THEAD_RVV_SRCS source/thead_rvv/fp16/gather.c) +endif() + +if(CONFIG_THEAD_RVV_GATHER_INT8) + list(APPEND THEAD_RVV_SRCS source/thead_rvv/int8/gather.c) +endif() + if(CONFIG_THEAD_RVV_GEMM_FP32) list(APPEND THEAD_RVV_SRCS source/thead_rvv/fp32/gemm_fp32_packn.c) list(APPEND THEAD_RVV_SRCS source/thead_rvv/fp32/gemm_fp32.c) @@ -269,7 +326,9 @@ if(CONFIG_THEAD_RVV_MATMUL_FP16) endif() if(CONFIG_THEAD_RVV_MATMUL_INT8) + list(APPEND THEAD_RVV_SRCS source/thead_rvv/int8/matmul.c) list(APPEND THEAD_RVV_SRCS source/thead_rvv/int8/matmul_int8.c) + list(APPEND THEAD_RVV_SRCS source/thead_rvv/int8/matmul_int8_dot.c) endif() if(CONFIG_THEAD_RVV_MAXPOOL_FP32) @@ -338,6 +397,10 @@ if(CONFIG_THEAD_RVV_PRELU_INT8) list(APPEND THEAD_RVV_SRCS source/thead_rvv/int8/prelu.c) endif() +if(CONFIG_THEAD_RVV_REDUCE_SUM_INT8) + list(APPEND THEAD_RVV_SRCS source/thead_rvv/int8/reduce_sum.c) +endif() + if(CONFIG_THEAD_RVV_RELU_FP32) list(APPEND THEAD_RVV_SRCS source/thead_rvv/fp32/relu.c) endif() @@ -358,6 +421,10 @@ if(CONFIG_THEAD_RVV_RELU6_FP16) list(APPEND THEAD_RVV_SRCS 
source/thead_rvv/fp16/relu6.c) endif() +if(CONFIG_THEAD_RVV_RELU6_FP16) + list(APPEND THEAD_RVV_SRCS source/thead_rvv/int8/relu6.c) +endif() + if(CONFIG_THEAD_RVV_RESHAPE_FP32) list(APPEND THEAD_RVV_SRCS source/thead_rvv/fp32/reshape.c) endif() @@ -378,6 +445,22 @@ if(CONFIG_THEAD_RVV_SIGMOID_FP16) list(APPEND THEAD_RVV_SRCS source/thead_rvv/fp16/sigmoid.c) endif() +if(CONFIG_THEAD_RVV_SIGMOID_INT8) + list(APPEND THEAD_RVV_SRCS source/thead_rvv/int8/sigmoid.c) +endif() + +if(CONFIG_THEAD_RVV_SUB_FP32) + list(APPEND THEAD_RVV_SRCS source/thead_rvv/fp32/sub.c) +endif() + +if(CONFIG_THEAD_RVV_SUB_FP16) + list(APPEND THEAD_RVV_SRCS source/thead_rvv/fp16/sub.c) +endif() + +if(CONFIG_THEAD_RVV_SUB_INT8) + list(APPEND THEAD_RVV_SRCS source/thead_rvv/int8/sub.c) +endif() + if(CONFIG_THEAD_RVV_SOFTMAX_FP32) list(APPEND THEAD_RVV_SRCS source/thead_rvv/fp32/softmax.c) endif() @@ -386,8 +469,12 @@ if(CONFIG_THEAD_RVV_SOFTMAX_FP16) list(APPEND THEAD_RVV_SRCS source/thead_rvv/fp16/softmax.c) endif() -if(CONFIG_THEAD_RVV_REDUCE_SUM_INT8) - list(APPEND THEAD_RVV_SRCS source/thead_rvv/int8/reduce_sum.c) +if(CONFIG_THEAD_RVV_SOFTMAX_INT8) + list(APPEND THEAD_RVV_SRCS source/thead_rvv/int8/softmax.c) +endif() + +if(CONFIG_THEAD_RVV_STRIDED_SLICE_FP16) + list(APPEND THEAD_RVV_SRCS source/thead_rvv/fp16/strided_slice.c) endif() if(CONFIG_THEAD_RVV_TRANSPOSE_FP32) @@ -401,15 +488,3 @@ endif() if(CONFIG_THEAD_RVV_TRANSPOSE_INT8) list(APPEND THEAD_RVV_SRCS source/thead_rvv/int8/transpose.c) endif() - -if(CONFIG_THEAD_RVV_GATHER_FP16) - list(APPEND THEAD_RVV_SRCS source/thead_rvv/fp16/gather.c) -endif() - -if(CONFIG_THEAD_RVV_GATHER_INT8) - list(APPEND THEAD_RVV_SRCS source/thead_rvv/int8/gather.c) -endif() - -if(CONFIG_THEAD_RVV_STRIDED_SLICE_FP16) - list(APPEND THEAD_RVV_SRCS source/thead_rvv/fp16/strided_slice.c) -endif() \ No newline at end of file diff --git a/source/thead_rvv/Kconfig b/source/thead_rvv/Kconfig index a52f3030..be29b2db 100644 --- a/source/thead_rvv/Kconfig +++ 
b/source/thead_rvv/Kconfig @@ -124,28 +124,42 @@ config THEAD_RVV_DEPTHWISE_CONVOLUTION_FP32 bool "Layer depthwise convolution fp32" default y help - Select SHL build v extension optimized convolution + Select SHL build v extension optimized depthwise_convolution config THEAD_RVV_DEPTHWISE_CONVOLUTION_FP16 depends on THEAD_RVV_SOURCE bool "Layer depthwise convolution fp16" default y help - Select SHL build v extension optimized convolution + Select SHL build v extension optimized depthwise_convolution config THEAD_RVV_DEPTHWISE_CONVOLUTION_INT8 depends on THEAD_RVV_SOURCE bool "Layer depthwise convolution int8" default y help - Select SHL build v extension optimized convolution + Select SHL build v extension optimized depthwise_convolution config THEAD_RVV_DEPTHWISE_CONVOLUTION_INT4 depends on THEAD_RVV_SOURCE bool "Layer depthwise convolution int4" default y help - Select SHL build v extension optimized convolution + Select SHL build v extension optimized depthwise_convolution + +config THEAD_RVV_CONVOLUTION1D_FP32 + depends on THEAD_RVV_SOURCE + bool "Layer convolution1d fp32" + default y + help + Select SHL build v extension optimized convolution1d + +config THEAD_RVV_CONVOLUTION1D_FP16 + depends on THEAD_RVV_SOURCE + bool "Layer convolution1d fp16" + default y + help + Select SHL build v extension optimized convolution1d config THEAD_RVV_CONVOLUTION1D_INT8 depends on THEAD_RVV_SOURCE @@ -159,7 +173,56 @@ config THEAD_RVV_DEPTHWISE_CONVOLUTION1D_INT8 bool "Layer depthwise convolution1d int8" default y help - Select SHL build v extension optimized convolution1d + Select SHL build v extension optimized depthwise_convolution1d + +config THEAD_RVV_DECONVOLUTION_FP32 + depends on THEAD_RVV_SOURCE + bool "Layer deconvolution fp32" + default y + help + Select SHL build v extension optimized deconvolution + +config THEAD_RVV_DECONVOLUTION_FP16 + depends on THEAD_RVV_SOURCE + bool "Layer deconvolution fp16" + default y + help + Select SHL build v extension optimized 
deconvolution + +config THEAD_RVV_DIV_FP32 + depends on THEAD_RVV_SOURCE + bool "Layer div fp32" + default y + help + Select SHL build v extension optimized div + +config THEAD_RVV_DIV_FP16 + depends on THEAD_RVV_SOURCE + bool "Layer div fp16" + default y + help + Select SHL build v extension optimized div + +config THEAD_RVV_DIV_INT8 + depends on THEAD_RVV_SOURCE + bool "Layer div int8" + default y + help + Select SHL build v extension optimized div + +config CONFIG_THEAD_RVV_ERF_FP32 + depends on THEAD_RVV_SOURCE + bool "Layer erf fp32" + default y + help + Select SHL build v extension optimized erf + +config CONFIG_THEAD_RVV_ERF_FP16 + depends on THEAD_RVV_SOURCE + bool "Layer erf fp16" + default y + help + Select SHL build v extension optimized erf config THEAD_RVV_FULLYCONNECTED_FP32 depends on THEAD_RVV_SOURCE @@ -189,33 +252,54 @@ config THEAD_RVV_FULLYCONNECTED_INT4 help Select SHL build v extension optimized fullyconnected +config THEAD_RVV_GATHER_FP32 + depends on THEAD_RVV_SOURCE + bool "Layer gather fp32" + default y + help + Select SHL build v extension optimized gather + +config THEAD_RVV_GATHER_FP16 + depends on THEAD_RVV_SOURCE + bool "Layer gather fp16" + default y + help + Select SHL build v extension optimized gather + +config THEAD_RVV_GATHER_INT8 + depends on THEAD_RVV_SOURCE + bool "Layer gather int8" + default y + help + Select SHL build v extension optimized gather + config THEAD_RVV_GEMM_FP32 depends on THEAD_RVV_SOURCE bool "Layer GEMM fp32" default y help - Select SHL build v extension optimized fullyconnected + Select SHL build v extension optimized gemm config THEAD_RVV_GEMM_FP16 depends on THEAD_RVV_SOURCE bool "Layer GEMM fp16" default y help - Select SHL build v extension optimized fullyconnected + Select SHL build v extension optimized gemm config THEAD_RVV_GEMM_INT8 depends on THEAD_RVV_SOURCE bool "Layer GEMM int8" default y help - Select SHL build v extension optimized fullyconnected + Select SHL build v extension optimized gemm 
config THEAD_RVV_GEMM_INT4 depends on THEAD_RVV_SOURCE bool "Layer GEMM int4" default y help - Select SHL build v extension optimized fullyconnected + Select SHL build v extension optimized gemm config THEAD_RVV_GLOBAL_AVERAGEPOOL_FP32 depends on THEAD_RVV_SOURCE @@ -406,6 +490,13 @@ config THEAD_RVV_PRELU_INT8 help Select SHL build v extension optimized prelu +config THEAD_RVV_REDUCE_SUM_INT8 + depends on THEAD_RVV_SOURCE + bool "Layer reduce_sum int8" + default y + help + Select SHL build v extension optimized reduce_sum + config THEAD_RVV_RELU_FP32 depends on THEAD_RVV_SOURCE bool "Layer relu fp32" @@ -476,6 +567,27 @@ config THEAD_RVV_SIGMOID_FP16 help Select SHL build v extension optimized sigmoid +config THEAD_RVV_SUB_FP32 + depends on THEAD_RVV_SOURCE + bool "Layer sub fp32" + default y + help + Select SHL build v extension optimized sub + +config THEAD_RVV_SUB_FP16 + depends on THEAD_RVV_SOURCE + bool "Layer sub fp16" + default y + help + Select SHL build v extension optimized sub + +config THEAD_RVV_SUB_INT8 + depends on THEAD_RVV_SOURCE + bool "Layer sub int8" + default y + help + Select SHL build v extension optimized sub + config THEAD_RVV_SOFTMAX_FP32 depends on THEAD_RVV_SOURCE bool "Layer softmax fp32" @@ -490,12 +602,12 @@ config THEAD_RVV_SOFTMAX_FP16 help Select SHL build v extension optimized softmax -config THEAD_RVV_REDUCE_SUM_INT8 +config THEAD_RVV_STRIDED_SLICE_FP16 depends on THEAD_RVV_SOURCE - bool "Layer reduce_sum int8" + bool "Layer strided_slice fp16" default y help - Select SHL build v extension optimized reduce_sum + Select SHL build v extension optimized strided_slice config THEAD_RVV_TRANSPOSE_FP32 depends on THEAD_RVV_SOURCE @@ -518,25 +630,4 @@ config THEAD_RVV_TRANSPOSE_INT8 help Select SHL build v extension optimized transpose -config THEAD_RVV_GATHER_FP16 - depends on THEAD_RVV_SOURCE - bool "Layer gather fp16" - default y - help - Select SHL build v extension optimized gather - -config THEAD_RVV_GATHER_INT8 - depends on 
THEAD_RVV_SOURCE - bool "Layer gather int8" - default y - help - Select SHL build v extension optimized gather - -config THEAD_RVV_STRIDED_SLICE_FP16 - depends on THEAD_RVV_SOURCE - bool "Layer strided_slice fp16" - default y - help - Select SHL build v extension optimized strided_slice - endmenu diff --git a/source/thead_rvv/binary_broadcast.c b/source/thead_rvv/binary_broadcast.c new file mode 100644 index 00000000..20693555 --- /dev/null +++ b/source/thead_rvv/binary_broadcast.c @@ -0,0 +1,569 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "rvv/rvv.h" + +static int check_input_dim_count(struct csinn_tensor *input, struct csinn_tensor *output) +{ + int target_dim_count = output->dim_count; + if (output->layout >= CSINN_LAYOUT_NC1C0 && output->layout <= CSINN_LAYOUT_NC1DHWC0) { + target_dim_count -= 1; + } + if (input->layout >= CSINN_LAYOUT_NC1C0 && input->layout <= CSINN_LAYOUT_NC1DHWC0) { + target_dim_count += 1; + } + if (input->dim_count <= target_dim_count) { + return CSINN_TRUE; + } + return CSINN_FALSE; +} + +static void fill_input_dim(struct csinn_tensor *input, struct csinn_tensor *output) +{ + int in_dim_count = input->dim_count; + int target_dim_count = output->dim_count; + if (output->layout >= CSINN_LAYOUT_NC1C0 && output->layout <= CSINN_LAYOUT_NC1DHWC0) { + target_dim_count -= 1; + } + if (input->layout >= CSINN_LAYOUT_NC1C0 && input->layout <= CSINN_LAYOUT_NC1DHWC0) { + target_dim_count += 1; + } + if (in_dim_count < target_dim_count) { + input->dim_count = target_dim_count; + if (input->layout >= CSINN_LAYOUT_NC1C0 && input->layout <= CSINN_LAYOUT_NC1DHWC0) { + if (input->dim_count == 3) + input->layout = CSINN_LAYOUT_NC1C0; + else if (input->dim_count == 4) + input->layout = CSINN_LAYOUT_NC1WC0; + else if (input->dim_count == 5) + input->layout = CSINN_LAYOUT_NC1HWC0; + else if (input->dim_count == 6) + input->layout = CSINN_LAYOUT_NC1DHWC0; + } else if (input->layout >= CSINN_LAYOUT_N && input->layout <= CSINN_LAYOUT_NCDHW) { + if (input->dim_count == 1) + input->layout = CSINN_LAYOUT_N; + else if (input->dim_count == 2) + input->layout = CSINN_LAYOUT_NC; + else if (input->dim_count == 3) + input->layout = CSINN_LAYOUT_NCW; + else if (input->dim_count == 4) + input->layout = CSINN_LAYOUT_NCHW; + else if (input->dim_count == 5) + input->layout = CSINN_LAYOUT_NCDHW; + else if (input->dim_count == 6) + input->layout = CSINN_LAYOUT_NLCDHW; + } + for (int i = target_dim_count - 1; i >= target_dim_count - in_dim_count; i--) { + input->dim[i] = input->dim[i - 
(target_dim_count - in_dim_count)]; + } + for (int i = target_dim_count - in_dim_count - 1; i >= 0; i--) { + input->dim[i] = 1; + } + } +} + +static int check_broadcast_rule(struct csinn_tensor *input, struct csinn_tensor *output) +{ + for (int i = 0; i < input->dim_count; i++) { + if ((input->dim[input->dim_count - i - 1] != output->dim[output->dim_count - i - 1]) && + (input->dim[input->dim_count - i - 1] != 1)) { + return CSINN_FALSE; + } + } + return CSINN_TRUE; +} + +static int broadcast_get_index(int32_t *dim, int32_t *idx, int32_t dim_count) +{ + int res = 0; + for (int i = 0; i < dim_count; i++) { + if (dim[i] != 1) { + int tmp = idx[i]; + for (int j = i + 1; j < dim_count; j++) { + tmp *= dim[j]; + } + res += tmp; + } + } + return res; +} + +static int layout_try_ndarray_to_nc1xc0(struct csinn_tensor *t, int packn) +{ + if (t->layout >= CSINN_LAYOUT_NC && t->layout <= CSINN_LAYOUT_NCDHW) { + if (t->dim[1] % packn == 0) { + t->dim[1] /= packn; + t->dim_count = t->dim_count + 1; + t->dim[t->dim_count - 1] = packn; + } else if (t->dim[1] == 1) { + t->dim_count = t->dim_count + 1; + t->dim[t->dim_count - 1] = 1; + } else { + shl_debug_error("The dimension of tensor do not meet the rules of broadcast!"); + return CSINN_FALSE; + } + if (t->layout == CSINN_LAYOUT_NCDHW) { + t->layout = CSINN_LAYOUT_NC1DHWC0; + } else if (t->layout == CSINN_LAYOUT_NCHW) { + t->layout = CSINN_LAYOUT_NC1HWC0; + } else if (t->layout == CSINN_LAYOUT_NCW) { + t->layout = CSINN_LAYOUT_NC1WC0; + } else if (t->layout == CSINN_LAYOUT_NC) { + t->layout = CSINN_LAYOUT_NC1C0; + } + return CSINN_TRUE; + } + return CSINN_FALSE; +} + +static int layout_try_nc1xc0_to_ndarray(struct csinn_tensor *t) +{ + if (t->layout >= CSINN_LAYOUT_NC1C0 && t->layout <= CSINN_LAYOUT_NC1DHWC0) { + int in_c1 = t->dim[1]; + int in_c0 = t->dim[t->dim_count - 1]; + t->dim[1] = in_c1 * in_c0; + t->dim[t->dim_count - 1] = 0; + t->dim_count = t->dim_count - 1; + if (t->layout == CSINN_LAYOUT_NC1DHWC0) { + t->layout = 
CSINN_LAYOUT_NCDHW; + } else if (t->layout == CSINN_LAYOUT_NC1HWC0) { + t->layout = CSINN_LAYOUT_NCHW; + } else if (t->layout == CSINN_LAYOUT_NC1WC0) { + t->layout = CSINN_LAYOUT_NCW; + } else if (t->layout == CSINN_LAYOUT_NC1C0) { + t->layout = CSINN_LAYOUT_NC; + } + return CSINN_TRUE; + } + return CSINN_FALSE; +} + +static void transform_layout_weight_to_activation(struct csinn_tensor *t) +{ + if (t->layout == CSINN_LAYOUT_O) { + t->layout = CSINN_LAYOUT_N; + } else if (t->layout == CSINN_LAYOUT_O) { + t->layout = CSINN_LAYOUT_NC; + } else if (t->layout == CSINN_LAYOUT_OI) { + t->layout = CSINN_LAYOUT_NCW; + } else if (t->layout == CSINN_LAYOUT_OIHW) { + t->layout = CSINN_LAYOUT_NCHW; + } else if (t->layout == CSINN_LAYOUT_OIDHW) { + t->layout = CSINN_LAYOUT_NCDHW; + } +} + +static void tensor_try_ndarray_to_nc1xc0_fp32(struct csinn_tensor *t) +{ + const int packn = csrr_vlenb() / sizeof(float); + int batch = t->dim[0]; + int in_c = t->dim[1]; + int inner_size = 1; + for (int i = 2; i < t->dim_count; i++) { + inner_size *= t->dim[i]; + } + + if (layout_try_ndarray_to_nc1xc0(t, packn)) { + if (t->dim[t->dim_count - 1] != 1) { + float *src = t->data; + float *dst = (float *)shl_mem_alloc(csinn_tensor_byte_size(t)); + + int vl = vsetvl_e32m1(packn); + int batch_size = in_c * inner_size; + + float *out_ptr = dst; + for (int b = 0; b < batch; b++) { + for (int c = 0; c + packn - 1 < in_c; c += packn) { + float *in_ptr = src + b * batch_size + c * inner_size; + for (int i = 0; i < inner_size; i++) { + vfloat32m1_t _tmp = vlse32_v_f32m1(in_ptr, inner_size * sizeof(float), vl); + in_ptr++; + vse32_v_f32m1(out_ptr, _tmp, vl); + out_ptr += vl; + } + } + } + shl_mem_free(t->data); + t->data = dst; + } + } +} + +static void tensor_try_ndarray_to_nc1xc0_fp16(struct csinn_tensor *t) +{ + const int packn = csrr_vlenb() / sizeof(__fp16); + int batch = t->dim[0]; + int in_c = t->dim[1]; + int inner_size = 1; + for (int i = 2; i < t->dim_count; i++) { + inner_size *= t->dim[i]; + 
} + + if (layout_try_ndarray_to_nc1xc0(t, packn)) { + if (t->dim[t->dim_count - 1] != 1) { + __fp16 *src = t->data; + __fp16 *dst = (__fp16 *)shl_mem_alloc(csinn_tensor_byte_size(t)); + + int vl = vsetvl_e16m1(packn); + int batch_size = in_c * inner_size; + + __fp16 *out_ptr = dst; + for (int b = 0; b < batch; b++) { + for (int c = 0; c + packn - 1 < in_c; c += packn) { + __fp16 *in_ptr = src + b * batch_size + c * inner_size; + for (int i = 0; i < inner_size; i++) { + vfloat16m1_t _tmp = vlse16_v_f16m1(in_ptr, inner_size * sizeof(__fp16), vl); + in_ptr++; + vse16_v_f16m1(out_ptr, _tmp, vl); + out_ptr += vl; + } + } + } + shl_mem_free(t->data); + t->data = dst; + } + } +} + +static void tensor_try_ndarray_to_nc1xc0_int8(struct csinn_tensor *t) +{ + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + int batch = t->dim[0]; + int in_c = t->dim[1]; + int inner_size = 1; + for (int i = 2; i < t->dim_count; i++) { + inner_size *= t->dim[i]; + } + + if (layout_try_ndarray_to_nc1xc0(t, packn)) { + if (t->dim[t->dim_count - 1] != 1) { + int8_t *src = t->data; + int8_t *dst = (int8_t *)shl_mem_alloc(csinn_tensor_byte_size(t)); + + int vl = vsetvl_e8m1(packn); + int batch_size = in_c * inner_size; + + int8_t *out_ptr = dst; + for (int b = 0; b < batch; b++) { + for (int c = 0; c + packn - 1 < in_c; c += packn) { + int8_t *in_ptr = src + b * batch_size + c * inner_size; + for (int i = 0; i < inner_size; i++) { + vint8m1_t _tmp = vlse8_v_i8m1(in_ptr, inner_size * sizeof(int8_t), vl); + in_ptr++; + vse8_v_i8m1(out_ptr, _tmp, vl); + out_ptr += vl; + } + } + } + shl_mem_free(t->data); + t->data = dst; + } + } +} + +int shl_rvv_binary_op_broadcast_fp32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, void *binary_op_callback[]) +{ + if (!check_input_dim_count(input0, output)) { + shl_debug_error("input0 dim_count greater than output!\n"); + return CSINN_FALSE; + } + if (!check_input_dim_count(input1, output)) { + shl_debug_error("input1 
dim_count greater than output!\n"); + return CSINN_FALSE; + } + + fill_input_dim(input0, output); + fill_input_dim(input1, output); + + const int packn = csrr_vlenb() / sizeof(float); + + struct csinn_tensor *in1_extra; + bool in1_extra_flag = false; + + if (input0->layout >= CSINN_LAYOUT_NC1C0 && input0->layout <= CSINN_LAYOUT_NC1DHWC0) { + if (input1->is_const) { + in1_extra = csinn_alloc_tensor(NULL); + csinn_tensor_copy(in1_extra, input1); + in1_extra->data = shl_mem_alloc(csinn_tensor_byte_size(input1)); + memcpy(in1_extra->data, input1->data, csinn_tensor_byte_size(input1)); + transform_layout_weight_to_activation(in1_extra); + in1_extra_flag = true; + input1 = in1_extra; + } + tensor_try_ndarray_to_nc1xc0_fp32(input1); + layout_try_ndarray_to_nc1xc0(output, packn); + } else if (input0->layout >= CSINN_LAYOUT_N && input0->layout <= CSINN_LAYOUT_NCDHW) { + if (input1->layout >= CSINN_LAYOUT_NC1C0 && input1->layout <= CSINN_LAYOUT_NC1DHWC0) { + shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp32(input1); + } + layout_try_nc1xc0_to_ndarray(output); + } + + if (!check_broadcast_rule(input0, output)) { + shl_debug_error("The dimension of input0 do not meet the rules of broadcast!\n"); + return CSINN_FALSE; + } + if (!check_broadcast_rule(input1, output)) { + shl_debug_error("The dimension of input1 do not meet the rules of broadcast!\n"); + return CSINN_FALSE; + } + + float *input0_data = (float *)input0->data; + float *input1_data = (float *)input1->data; + float *output_data = (float *)output->data; + + int32_t *in0_dim = input0->dim; + int32_t *in1_dim = input1->dim; + int32_t *out_dim = output->dim; + int32_t dim_count = output->dim_count; + + int32_t *idx = (int32_t *)shl_mem_alloc(dim_count * sizeof(int32_t)); + int cur = 0; + + void (*binary_op)(); + if (in0_dim[dim_count - 1] == in1_dim[dim_count - 1]) { + binary_op = binary_op_callback[CSINN_BROADCAST_VV]; + } else if (in1_dim[dim_count - 1] == 1) { + binary_op = binary_op_callback[CSINN_BROADCAST_VS]; + } else 
if (in0_dim[dim_count - 1] == 1) { + binary_op = binary_op_callback[CSINN_BROADCAST_SV]; + } + + while (idx[0] < out_dim[0]) { + if (cur == dim_count - 1) { + float *in0_ptr = input0_data + broadcast_get_index(in0_dim, idx, dim_count); + float *in1_ptr = input1_data + broadcast_get_index(in1_dim, idx, dim_count); + float *out_ptr = output_data + broadcast_get_index(out_dim, idx, dim_count); + binary_op(in0_ptr, in1_ptr, out_ptr, out_dim[cur]); + cur -= 1; + idx[cur] += 1; + } else { + if (idx[cur] < out_dim[cur]) { + cur += 1; + } else { + idx[cur] = 0; + cur -= 1; + idx[cur] += 1; + } + } + } + + if (in1_extra_flag) { + shl_mem_free(in1_extra->data); + csinn_free_tensor(in1_extra); + } + + return CSINN_TRUE; +} + +int shl_rvv_binary_op_broadcast_fp16(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, void *binary_op_callback[]) +{ + if (!check_input_dim_count(input0, output)) { + shl_debug_error("input0 dim_count greater than output!\n"); + return CSINN_FALSE; + } + if (!check_input_dim_count(input1, output)) { + shl_debug_error("input1 dim_count greater than output!\n"); + return CSINN_FALSE; + } + + fill_input_dim(input0, output); + fill_input_dim(input1, output); + + const int packn = csrr_vlenb() / sizeof(__fp16); + + struct csinn_tensor *in1_extra; + bool in1_extra_flag = false; + + if (input0->layout >= CSINN_LAYOUT_NC1C0 && input0->layout <= CSINN_LAYOUT_NC1DHWC0) { + if (input1->is_const) { + in1_extra = csinn_alloc_tensor(NULL); + csinn_tensor_copy(in1_extra, input1); + in1_extra->data = shl_mem_alloc(csinn_tensor_byte_size(input1)); + memcpy(in1_extra->data, input1->data, csinn_tensor_byte_size(input1)); + transform_layout_weight_to_activation(in1_extra); + in1_extra_flag = true; + input1 = in1_extra; + } + tensor_try_ndarray_to_nc1xc0_fp16(input1); + layout_try_ndarray_to_nc1xc0(output, packn); + } else if (input0->layout >= CSINN_LAYOUT_N && input0->layout <= CSINN_LAYOUT_NCDHW) { + if (input1->layout >= 
CSINN_LAYOUT_NC1C0 && input1->layout <= CSINN_LAYOUT_NC1DHWC0) { + shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp16(input1); + } + layout_try_nc1xc0_to_ndarray(output); + } + + if (!check_broadcast_rule(input0, output)) { + shl_debug_error("The dimension of input0 do not meet the rules of broadcast!\n"); + return CSINN_FALSE; + } + if (!check_broadcast_rule(input1, output)) { + shl_debug_error("The dimension of input1 do not meet the rules of broadcast!\n"); + return CSINN_FALSE; + } + + __fp16 *input0_data = (__fp16 *)input0->data; + __fp16 *input1_data = (__fp16 *)input1->data; + __fp16 *output_data = (__fp16 *)output->data; + + int32_t *in0_dim = input0->dim; + int32_t *in1_dim = input1->dim; + int32_t *out_dim = output->dim; + int32_t dim_count = output->dim_count; + + int32_t *idx = (int32_t *)shl_mem_alloc(dim_count * sizeof(int32_t)); + int cur = 0; + + void (*binary_op)(); + if (in0_dim[dim_count - 1] == in1_dim[dim_count - 1]) { + binary_op = binary_op_callback[CSINN_BROADCAST_VV]; + } else if (in1_dim[dim_count - 1] == 1) { + binary_op = binary_op_callback[CSINN_BROADCAST_VS]; + } else if (in0_dim[dim_count - 1] == 1) { + binary_op = binary_op_callback[CSINN_BROADCAST_SV]; + } + + while (idx[0] < out_dim[0]) { + if (cur == dim_count - 1) { + __fp16 *in0_ptr = input0_data + broadcast_get_index(in0_dim, idx, dim_count); + __fp16 *in1_ptr = input1_data + broadcast_get_index(in1_dim, idx, dim_count); + __fp16 *out_ptr = output_data + broadcast_get_index(out_dim, idx, dim_count); + binary_op(in0_ptr, in1_ptr, out_ptr, out_dim[cur]); + cur -= 1; + idx[cur] += 1; + } else { + if (idx[cur] < out_dim[cur]) { + cur += 1; + } else { + idx[cur] = 0; + cur -= 1; + idx[cur] += 1; + } + } + } + + if (in1_extra_flag) { + shl_mem_free(in1_extra->data); + csinn_free_tensor(in1_extra); + } + + return CSINN_TRUE; +} + +int shl_rvv_binary_op_broadcast_int8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, void *binary_op_callback[]) +{ + if 
(!check_input_dim_count(input0, output)) { + shl_debug_error("input0 dim_count greater than output!\n"); + return CSINN_FALSE; + } + if (!check_input_dim_count(input1, output)) { + shl_debug_error("input1 dim_count greater than output!\n"); + return CSINN_FALSE; + } + + fill_input_dim(input0, output); + fill_input_dim(input1, output); + + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + + struct csinn_tensor *in1_extra; + bool in1_extra_flag = false; + + if (input0->layout >= CSINN_LAYOUT_NC1C0 && input0->layout <= CSINN_LAYOUT_NC1DHWC0) { + if (input1->is_const) { + in1_extra = csinn_alloc_tensor(NULL); + csinn_tensor_copy(in1_extra, input1); + in1_extra->data = shl_mem_alloc(csinn_tensor_byte_size(input1)); + memcpy(in1_extra->data, input1->data, csinn_tensor_byte_size(input1)); + transform_layout_weight_to_activation(in1_extra); + in1_extra_flag = true; + input1 = in1_extra; + } + tensor_try_ndarray_to_nc1xc0_int8(input1); + layout_try_ndarray_to_nc1xc0(output, packn); + } else if (input0->layout >= CSINN_LAYOUT_N && input0->layout <= CSINN_LAYOUT_NCDHW) { + if (input1->layout >= CSINN_LAYOUT_NC1C0 && input1->layout <= CSINN_LAYOUT_NC1DHWC0) { + shl_rvv_tensor_nc1xc0_to_ndarray_replace_int8(input1); + } + layout_try_nc1xc0_to_ndarray(output); + } + + if (!check_broadcast_rule(input0, output)) { + shl_debug_error("The dimension of input0 do not meet the rules of broadcast!\n"); + return CSINN_FALSE; + } + if (!check_broadcast_rule(input1, output)) { + shl_debug_error("The dimension of input1 do not meet the rules of broadcast!\n"); + return CSINN_FALSE; + } + + int8_t *input0_data = (int8_t *)input0->data; + int8_t *input1_data = (int8_t *)input1->data; + int8_t *output_data = (int8_t *)output->data; + + int32_t *in0_dim = input0->dim; + int32_t *in1_dim = input1->dim; + int32_t *out_dim = output->dim; + int32_t dim_count = output->dim_count; + + int32_t *idx = (int32_t *)shl_mem_alloc(dim_count * sizeof(int32_t)); + int cur = 0; + + void (*binary_op)(); + 
if (in0_dim[dim_count - 1] == in1_dim[dim_count - 1]) { + binary_op = binary_op_callback[CSINN_BROADCAST_VV]; + } else if (in1_dim[dim_count - 1] == 1) { + binary_op = binary_op_callback[CSINN_BROADCAST_VS]; + } else if (in0_dim[dim_count - 1] == 1) { + binary_op = binary_op_callback[CSINN_BROADCAST_SV]; + } + + float scale[3] = {input0->qinfo->scale, input1->qinfo->scale, output->qinfo->scale}; + int32_t zero_point[3] = {input0->qinfo->zero_point, input1->qinfo->zero_point, + output->qinfo->zero_point}; + + while (idx[0] < out_dim[0]) { + if (cur == dim_count - 1) { + int8_t *in0_ptr = input0_data + broadcast_get_index(in0_dim, idx, dim_count); + int8_t *in1_ptr = input1_data + broadcast_get_index(in1_dim, idx, dim_count); + int8_t *out_ptr = output_data + broadcast_get_index(out_dim, idx, dim_count); + binary_op(in0_ptr, in1_ptr, out_ptr, out_dim[cur], scale, zero_point); + cur -= 1; + idx[cur] += 1; + } else { + if (idx[cur] < out_dim[cur]) { + cur += 1; + } else { + idx[cur] = 0; + cur -= 1; + idx[cur] += 1; + } + } + } + + if (in1_extra_flag) { + shl_mem_free(in1_extra->data); + csinn_free_tensor(in1_extra); + } + + return CSINN_TRUE; +} diff --git a/source/thead_rvv/capability.c b/source/thead_rvv/capability.c index bf8e1b36..1a51a3d0 100644 --- a/source/thead_rvv/capability.c +++ b/source/thead_rvv/capability.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" static int common_all_support(struct csinn_tensor *input, struct csinn_params_base *base) { @@ -132,16 +132,28 @@ int shl_rvv_conv1d_cap(struct csinn_tensor *input, struct csinn_tensor *output, { int32_t kernel_w = kernel->dim[2]; int32_t stride_w = params->stride_width; - int32_t dalition_w = params->dilation_width; + int32_t dilation_w = params->dilation_width; int32_t group = params->group; - if (input->dtype == CSINN_DTYPE_FLOAT16) { + if (input->dtype == CSINN_DTYPE_FLOAT32) { if (group == 1) { - if (kernel_w == 1 && stride_w == 1 && dalition_w == 1) { + return CSINN_OPT_INTRINSIC; + } + // dwconv1d + else if (group == input->dim[1] && kernel->dim[1] == 1) { + if (bias->data != NULL && bias->dim_count != 0) { return CSINN_OPT_INTRINSIC; } else { return CSINN_OPT_C_REFERENCE; } } + // group conv1d + else { + return CSINN_OPT_C_REFERENCE; + } + } else if (input->dtype == CSINN_DTYPE_FLOAT16) { + if (group == 1) { + return CSINN_OPT_INTRINSIC; + } // dwconv1d else if (group == input->dim[1] && kernel->dim[1] == 1) { if (bias->data != NULL && bias->dim_count != 0) { @@ -159,6 +171,19 @@ int shl_rvv_conv1d_cap(struct csinn_tensor *input, struct csinn_tensor *output, return CSINN_OPT_UNSUPPORTED; } +int shl_rvv_deconv2d_cap(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + if (input->dtype == CSINN_DTYPE_FLOAT32) { + return CSINN_OPT_INTRINSIC; + } else if (input->dtype == CSINN_DTYPE_FLOAT16) { + return CSINN_OPT_INTRINSIC; + } else { + return CSINN_OPT_UNSUPPORTED; + } +} + int shl_rvv_fullyconnected_cap(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *weights, struct csinn_tensor *bias, struct csinn_fc_params *params) @@ -408,103 +433,55 @@ int shl_rvv_avgpool2d_cap(struct csinn_tensor *input, struct csinn_tensor *outpu return CSINN_OPT_UNSUPPORTED; } -static int 
c920_tail_coincide(struct csinn_tensor *input0, struct csinn_tensor *input1) +int shl_rvv_add_cap(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - int flag = 1; - int i = 0, j = 0; - for (i = input1->dim_count - 1, j = input0->dim_count - 1; i >= 0; i--, j--) { - if (input0->dim[j] != input1->dim[i]) { - flag = 0; - break; - } - } - flag = 1; - for (; i >= 0; i--) { - if (input1->dim[i] != 1) { - flag = 0; - break; - } + if (input0->dtype == CSINN_DTYPE_FLOAT16) { + return CSINN_OPT_INTRINSIC; + } else if (input0->dtype == CSINN_DTYPE_FLOAT32) { + return CSINN_OPT_INTRINSIC; + } else if (input0->dtype == CSINN_DTYPE_INT8) { + return CSINN_OPT_INTRINSIC; } - return flag; + return CSINN_OPT_UNSUPPORTED; } -int shl_rvv_add_cap(struct csinn_tensor *input0, struct csinn_tensor *input1, +int shl_rvv_sub_cap(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, struct csinn_diso_params *params) { - int in_size0 = csinn_tensor_size(input0); - int in_size1 = csinn_tensor_size(input1); if (input0->dtype == CSINN_DTYPE_FLOAT16) { - if (in_size1 == 1) { - return CSINN_OPT_INTRINSIC; - } else if (in_size0 == in_size1) { - return CSINN_OPT_INTRINSIC; - } else if (c920_tail_coincide(input0, input1)) { - return CSINN_OPT_INTRINSIC; - } else { - return CSINN_OPT_C_REFERENCE; - } + return CSINN_OPT_INTRINSIC; } else if (input0->dtype == CSINN_DTYPE_FLOAT32) { - if (in_size1 == 1) { - return CSINN_OPT_INTRINSIC; - } else if (in_size0 == in_size1) { - return CSINN_OPT_INTRINSIC; - } else if (c920_tail_coincide(input0, input1)) { - return CSINN_OPT_INTRINSIC; - } else { - return CSINN_OPT_C_REFERENCE; - } + return CSINN_OPT_INTRINSIC; } else if (input0->dtype == CSINN_DTYPE_INT8) { - if (in_size1 == 1) { - return CSINN_OPT_INTRINSIC; - } else if (in_size0 == in_size1) { - return CSINN_OPT_INTRINSIC; - } else if (c920_tail_coincide(input0, input1)) { - return 
CSINN_OPT_INTRINSIC; - } else { - return CSINN_OPT_C_REFERENCE; - } + return CSINN_OPT_INTRINSIC; } - return CSINN_OPT_UNSUPPORTED; } int shl_rvv_mul_cap(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, struct csinn_diso_params *params) { - int in_size0 = csinn_tensor_size(input0); - int in_size1 = csinn_tensor_size(input1); if (input0->dtype == CSINN_DTYPE_FLOAT16) { - if (in_size1 == 1) { - return CSINN_OPT_INTRINSIC; - } else if (in_size0 == in_size1) { - return CSINN_OPT_INTRINSIC; - } else if (c920_tail_coincide(input0, input1)) { - return CSINN_OPT_INTRINSIC; - } else { - return CSINN_OPT_C_REFERENCE; - } + return CSINN_OPT_INTRINSIC; } else if (input0->dtype == CSINN_DTYPE_FLOAT32) { - if (in_size1 == 1) { - return CSINN_OPT_INTRINSIC; - } else if (in_size0 == in_size1) { - return CSINN_OPT_INTRINSIC; - } else if (c920_tail_coincide(input0, input1)) { - return CSINN_OPT_INTRINSIC; - } else { - return CSINN_OPT_C_REFERENCE; - } + return CSINN_OPT_INTRINSIC; } else if (input0->dtype == CSINN_DTYPE_INT8) { - if (in_size1 == 1) { - return CSINN_OPT_INTRINSIC; - } else if (in_size0 == in_size1) { - return CSINN_OPT_INTRINSIC; - } else if (c920_tail_coincide(input0, input1)) { - return CSINN_OPT_INTRINSIC; - } else { - return CSINN_OPT_C_REFERENCE; - } + return CSINN_OPT_INTRINSIC; } + return CSINN_OPT_UNSUPPORTED; +} +int shl_rvv_div_cap(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) +{ + if (input0->dtype == CSINN_DTYPE_FLOAT16) { + return CSINN_OPT_INTRINSIC; + } else if (input0->dtype == CSINN_DTYPE_FLOAT32) { + return CSINN_OPT_INTRINSIC; + } else if (input0->dtype == CSINN_DTYPE_INT8) { + return CSINN_OPT_INTRINSIC; + } return CSINN_OPT_UNSUPPORTED; } @@ -529,7 +506,7 @@ int shl_rvv_relu_cap(struct csinn_tensor *input, struct csinn_tensor *output, int shl_rvv_relu6_cap(struct csinn_tensor *input, struct csinn_tensor *output, struct 
csinn_relu_params *params) { - return float_all_support(input, &(params->base)); + return common_all_support(input, &(params->base)); } int shl_rvv_global_avgpool2d_cap(struct csinn_tensor *input, struct csinn_tensor *output, @@ -553,13 +530,13 @@ int shl_rvv_reshape_cap(struct csinn_tensor *input, struct csinn_tensor *output, int shl_rvv_sigmoid_cap(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_sigmoid_params *params) { - return float_all_support(input, &(params->base)); + return common_all_support(input, &(params->base)); } int shl_rvv_softmax_cap(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_softmax_params *params) { - return float_all_support(input, &(params->base)); + return common_all_support(input, &(params->base)); } int shl_rvv_reduce_sum_cap(struct csinn_tensor *input, struct csinn_tensor *output, @@ -585,7 +562,7 @@ int shl_rvv_layer_norm_cap(struct csinn_tensor *input, struct csinn_tensor *outp if (params->center == false || params->scale == false) { return CSINN_OPT_UNSUPPORTED; } - return float_all_support(input, &(params->base)); + return common_all_support(input, &(params->base)); } int shl_rvv_clip_cap(struct csinn_tensor *input, struct csinn_tensor *output, @@ -648,25 +625,24 @@ int shl_rvv_transpose_cap(struct csinn_tensor *input, struct csinn_tensor *outpu int shl_rvv_matmul_cap(struct csinn_tensor *mat0, struct csinn_tensor *mat1, struct csinn_tensor *output, struct csinn_matmul_params *params) { - const int dims_count = mat0->dim_count; int batches_a = 1; int batches_b = 1; /* compute the outer size */ - for (int i = 0; i < dims_count - 2; i++) { + for (int i = 0; i < mat0->dim_count - 2; i++) { batches_a *= mat0->dim[i]; + } + for (int i = 0; i < mat1->dim_count - 2; i++) { batches_b *= mat1->dim[i]; } if (mat0->dtype == CSINN_DTYPE_FLOAT32 && mat1->dtype == CSINN_DTYPE_FLOAT32 || mat0->dtype == CSINN_DTYPE_FLOAT16 && (mat1->dtype == CSINN_DTYPE_FLOAT16 || mat1->dtype == CSINN_DTYPE_INT8)) { - 
if (batches_a == batches_b) { - if (!params->trans_a && !params->trans_b) { + if (!params->trans_a && !params->trans_b) { + if (batches_a == batches_b) { return CSINN_OPT_INTRINSIC; - } - } else if (batches_a > 1 && batches_b == 1) { - if (!params->trans_a && !params->trans_b) { + } else if (batches_a > 1 && batches_b == 1) { return CSINN_OPT_INTRINSIC; } } @@ -688,10 +664,18 @@ int shl_rvv_matmul_cap(struct csinn_tensor *mat0, struct csinn_tensor *mat1, int shl_rvv_gather_cap(struct csinn_tensor *input, struct csinn_tensor *indices, struct csinn_tensor *output, struct csinn_gather_params *params) { - if (input->dtype == CSINN_DTYPE_INT8) { - if (input->dtype == CSINN_DTYPE_INT8 && output->dtype == CSINN_DTYPE_FLOAT16) { + if (input->dtype == CSINN_DTYPE_FLOAT32) { + if (indices->dtype == CSINN_DTYPE_INT64 && output->dtype == CSINN_DTYPE_FLOAT32) { + return CSINN_OPT_INTRINSIC; + } + } else if (input->dtype == CSINN_DTYPE_FLOAT16) { + if (indices->dtype == CSINN_DTYPE_INT64 && output->dtype == CSINN_DTYPE_FLOAT16) { return CSINN_OPT_INTRINSIC; - } else if (input->dtype == CSINN_DTYPE_INT8 && output->dtype == CSINN_DTYPE_INT8) { + } + } else if (input->dtype == CSINN_DTYPE_INT8) { + if (indices->dtype == CSINN_DTYPE_INT64 && output->dtype == CSINN_DTYPE_FLOAT16) { + return CSINN_OPT_INTRINSIC; + } else if (indices->dtype == CSINN_DTYPE_INT64 && output->dtype == CSINN_DTYPE_INT8) { return CSINN_OPT_INTRINSIC; } else { return CSINN_OPT_C_REFERENCE; @@ -700,3 +684,9 @@ int shl_rvv_gather_cap(struct csinn_tensor *input, struct csinn_tensor *indices, return CSINN_OPT_UNSUPPORTED; } + +int shl_rvv_erf_cap(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_clip_params *params) +{ + return common_all_support(input, &(params->base)); +} diff --git a/source/thead_rvv/data_convert.c b/source/thead_rvv/data_convert.c index eba5569b..e52a39e4 100644 --- a/source/thead_rvv/data_convert.c +++ b/source/thead_rvv/data_convert.c @@ -16,7 +16,7 @@ * limitations 
under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" #ifdef XTHEADVDOT int shl_rvv_data_convert_init(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_siso_params *params) diff --git a/source/thead_rvv/fp16/add.c b/source/thead_rvv/fp16/add.c index e2feabf9..a3767ccc 100644 --- a/source/thead_rvv/fp16/add.c +++ b/source/thead_rvv/fp16/add.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* note: VLEN = 128/256 @@ -61,6 +61,45 @@ static void broadcast_single_1_add_fp16(struct csinn_tensor *input0, struct csin } } +static inline void add_vv_f16m4(__fp16 *in0, __fp16 *in1, __fp16 *out, int32_t size) +{ + while (size > 0) { + int vl = vsetvl_e16m4(size); + vfloat16m4_t _a = vle16_v_f16m4(in0, vl); + vfloat16m4_t _b = vle16_v_f16m4(in1, vl); + vfloat16m4_t _c = vfadd_vv_f16m4(_a, _b, vl); + vse16_v_f16m4(out, _c, vl); + in0 += vl; + in1 += vl; + out += vl; + size -= vl; + } +} + +static inline void add_vf_f16m4(__fp16 *in0, __fp16 *in1, __fp16 *out, int32_t size) +{ + while (size > 0) { + int vl = vsetvl_e16m4(size); + vfloat16m4_t _a = vle16_v_f16m4(in0, vl); + vfloat16m4_t _c = vfadd_vf_f16m4(_a, in1[0], vl); + vse16_v_f16m4(out, _c, vl); + in0 += vl; + out += vl; + size -= vl; + } +} + +static inline void add_fv_f16m4(__fp16 *in0, __fp16 *in1, __fp16 *out, int32_t size) +{ + add_vf_f16m4(in1, in0, out, size); +} + +void *add_cb_fp16[] = { + [CSINN_BROADCAST_VV] = add_vv_f16m4, + [CSINN_BROADCAST_VS] = add_vf_f16m4, + [CSINN_BROADCAST_SV] = add_fv_f16m4, +}; + int shl_rvv_add_fp16(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, struct csinn_diso_params *params) { @@ -89,8 +128,7 @@ int shl_rvv_add_fp16(struct csinn_tensor *input0, struct csinn_tensor *input1, } broadcast_single_1_add_fp16(input0, input1, output); } else { - /* TODO: recursive opt */ - return 
shl_ref_add_quant(input0, input1, output, params); + return shl_rvv_binary_op_broadcast_fp16(input0, input1, output, add_cb_fp16); } return CSINN_TRUE; } diff --git a/source/thead_rvv/fp16/avgpool.c b/source/thead_rvv/fp16/avgpool.c index 8addb04b..8f2443b1 100644 --- a/source/thead_rvv/fp16/avgpool.c +++ b/source/thead_rvv/fp16/avgpool.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" int shl_rvv_avgpool2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_pool_params *params) @@ -47,6 +47,8 @@ int shl_rvv_avgpool2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor if (shl_is_first_layer_input(input, sess)) { elempack = 1; } + } else if (sess->base_run_mode == CSINN_RM_LAYER) { + elempack = in_c % packn == 0 ? packn : 1; } // global avgpool2d diff --git a/source/thead_rvv/fp16/avgpool_2x2_fp16.c b/source/thead_rvv/fp16/avgpool_2x2_fp16.c index d1b7be92..ba85167d 100644 --- a/source/thead_rvv/fp16/avgpool_2x2_fp16.c +++ b/source/thead_rvv/fp16/avgpool_2x2_fp16.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* note: VLEN = 128/256 diff --git a/source/thead_rvv/fp16/avgpool_2x2_fp16_packn.c b/source/thead_rvv/fp16/avgpool_2x2_fp16_packn.c index 9780b802..4c6cacd7 100644 --- a/source/thead_rvv/fp16/avgpool_2x2_fp16_packn.c +++ b/source/thead_rvv/fp16/avgpool_2x2_fp16_packn.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * note: support flexible vlen diff --git a/source/thead_rvv/fp16/avgpool_3x3_fp16.c b/source/thead_rvv/fp16/avgpool_3x3_fp16.c index d3ddb2f9..4ec17a4b 100644 --- a/source/thead_rvv/fp16/avgpool_3x3_fp16.c +++ b/source/thead_rvv/fp16/avgpool_3x3_fp16.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* note: VLEN = 128/256 diff --git a/source/thead_rvv/fp16/avgpool_3x3_fp16_packn.c b/source/thead_rvv/fp16/avgpool_3x3_fp16_packn.c index 6249d402..d72f984c 100644 --- a/source/thead_rvv/fp16/avgpool_3x3_fp16_packn.c +++ b/source/thead_rvv/fp16/avgpool_3x3_fp16_packn.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * note: support flexible vlen diff --git a/source/thead_rvv/fp16/avgpool_fp16_nhwc.c b/source/thead_rvv/fp16/avgpool_fp16_nhwc.c index 1888ca70..1c0df2b6 100644 --- a/source/thead_rvv/fp16/avgpool_fp16_nhwc.c +++ b/source/thead_rvv/fp16/avgpool_fp16_nhwc.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" #define max(a, b) ((a) > (b) ? (a) : (b)) #define min(a, b) ((a) < (b) ? (a) : (b)) diff --git a/source/thead_rvv/fp16/avgpool_fp16_packn.c b/source/thead_rvv/fp16/avgpool_fp16_packn.c index d5ee8492..67d904aa 100644 --- a/source/thead_rvv/fp16/avgpool_fp16_packn.c +++ b/source/thead_rvv/fp16/avgpool_fp16_packn.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * constrain: input channel % packn = 0 diff --git a/source/thead_rvv/fp16/clip.c b/source/thead_rvv/fp16/clip.c index bb068f6f..1ea5f220 100644 --- a/source/thead_rvv/fp16/clip.c +++ b/source/thead_rvv/fp16/clip.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" int shl_rvv_clip_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_clip_params *params) diff --git a/source/thead_rvv/fp16/concat.c b/source/thead_rvv/fp16/concat.c index 9c333d3f..095103c9 100644 --- a/source/thead_rvv/fp16/concat.c +++ b/source/thead_rvv/fp16/concat.c @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" static int shl_rvv_concat_ndarray_fp16(struct csinn_tensor **input, struct csinn_tensor *output, struct csinn_concat_params *params) @@ -51,7 +51,9 @@ static int shl_rvv_concat_ndarray_fp16(struct csinn_tensor **input, struct csinn __fp16 in_scale = input_item->qinfo->scale; int copy_size = input_item->dim[params->axis] * base_inner_size; __fp16 *input_ptr = input_item_data + k * copy_size; - + if ((fabs(in_scale - 1) > FLT_EPSILON || fabs(out_scale - 1) > FLT_EPSILON)) { + shl_rvv_requantize_fp16(input_ptr, in_scale / out_scale, copy_size); + } while (copy_size > 0) { vl = vsetvl_e16m2(copy_size); vfloat16m2_t _input = vle16_v_f16m2(input_ptr, vl); diff --git a/source/thead_rvv/fp16/convolution.c b/source/thead_rvv/fp16/convolution.c index 43dfa9a5..cadb955f 100644 --- a/source/thead_rvv/fp16/convolution.c +++ b/source/thead_rvv/fp16/convolution.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" int shl_rvv_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, struct csinn_tensor *bias, @@ -30,8 +30,8 @@ int shl_rvv_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *ou int32_t kernel_w = kernel->dim[3]; int32_t stride_h = params->stride_height; int32_t stride_w = params->stride_width; - int32_t dalition_h = params->dilation_height; - int32_t dalition_w = params->dilation_width; + int32_t dilation_h = params->dilation_height; + int32_t dilation_w = params->dilation_width; struct csinn_callback *cb = params->base.cb; const int packn = csrr_vlenb() / sizeof(__fp16); @@ -48,13 +48,18 @@ int shl_rvv_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *ou if (shl_is_first_layer_input(input, sess)) { in_elempack = 1; } + } else if (sess->base_run_mode == CSINN_RM_LAYER) { + in_elempack = in_c % packn == 0 ? packn : 1; + out_elempack = out_c % packn == 0 ? packn : 1; } bool binary_model_op_init = shl_rvv_get_binary_model_op_init(sess); if (input->layout == CSINN_LAYOUT_NHWC) { + kernel_h = kernel->dim[1]; + kernel_w = kernel->dim[2]; if (params->group == 1 && kernel_h == 3 && kernel_w == 3 && stride_h == 1 && - stride_w == 1 && dalition_h == 1 && dalition_w == 1) { + stride_w == 1 && dilation_h == 1 && dilation_w == 1) { params->conv_extra.conv_mode = CSINN_DIRECT; shl_rvv_conv3x3s1_direct_reorder_kernel_pack4n_fp16(kernel, params); cb->exec = shl_rvv_conv3x3s1_direct_fp16_nhwc; @@ -64,19 +69,27 @@ int shl_rvv_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *ou // packn if (in_elempack % packn == 0 && out_elempack % packn == 0) { - if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && - dalition_w == 1) { + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && + dilation_w == 1) { params->conv_extra.conv_mode = CSINN_GEMM; if 
(!binary_model_op_init) { - shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_fp16(kernel, params); + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_fp16_w_int8(kernel, params); + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_fp16(kernel, params); + } } cb->exec = shl_rvv_conv1x1s1_gemm_packn_fp16; } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && - dalition_h == 1 && dalition_w == 1) { - if (params->group > 1) { + dilation_h == 1 && dilation_w == 1) { + if (params->group > 1 || (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8)) { params->conv_extra.conv_mode = CSINN_GEMM; if (!binary_model_op_init) { - shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp16(kernel, params); + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp16_w_int8(kernel, params); + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp16(kernel, params); + } } cb->exec = shl_rvv_conv_im2col_gemm_packn_fp16; return CSINN_TRUE; @@ -96,7 +109,11 @@ int shl_rvv_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *ou } else { params->conv_extra.conv_mode = CSINN_GEMM; if (!binary_model_op_init) { - shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp16(kernel, params); + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp16_w_int8(kernel, params); + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp16(kernel, params); + } } cb->exec = shl_rvv_conv_im2col_gemm_packn_fp16; } @@ -105,15 +122,23 @@ int shl_rvv_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *ou // pack1ton if (in_elempack % packn != 0 && out_elempack % packn == 0) { params->conv_extra.conv_mode = CSINN_GEMM; - if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 
&& stride_w == 1 && dalition_h == 1 && - dalition_w == 1) { + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && + dilation_w == 1) { if (!binary_model_op_init) { - shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_fp16(kernel, params); + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_fp16_w_int8(kernel, params); + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_fp16(kernel, params); + } } cb->exec = shl_rvv_conv1x1s1_gemm_pack1ton_fp16; } else { if (!binary_model_op_init) { - shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp16(kernel, params); + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp16_w_int8(kernel, params); + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp16(kernel, params); + } } cb->exec = shl_rvv_conv_im2col_gemm_pack1ton_fp16; } @@ -122,15 +147,23 @@ int shl_rvv_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *ou // packnto1 if (in_elempack % packn == 0 && out_elempack % packn != 0) { params->conv_extra.conv_mode = CSINN_GEMM; - if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && - dalition_w == 1) { + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && + dilation_w == 1) { if (!binary_model_op_init) { - shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_fp16(kernel, params); + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_fp16_w_int8(kernel, params); + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_fp16(kernel, params); + } } cb->exec = shl_rvv_conv1x1s1_gemm_packnto1_fp16; } else { if (!binary_model_op_init) { - 
shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp16(kernel, params); + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp16_w_int8(kernel, params); + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp16(kernel, params); + } } cb->exec = shl_rvv_conv_im2col_gemm_packnto1_fp16; } @@ -139,15 +172,23 @@ int shl_rvv_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *ou // pack1 if (in_elempack % packn != 0 && out_elempack % packn != 0) { params->conv_extra.conv_mode = CSINN_GEMM; - if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && - dalition_w == 1) { + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && + dilation_w == 1) { if (!binary_model_op_init) { - shl_rvv_conv1x1s1_gemm_reorder_kernel_fp16(kernel, params); + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_fp16_w_int8(kernel, params); + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_fp16(kernel, params); + } } cb->exec = shl_rvv_conv1x1s1_gemm_fp16; } else { if (!binary_model_op_init) { - shl_rvv_conv_im2col_gemm_reorder_kernel_fp16(kernel, params); + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_rvv_conv_im2col_gemm_reorder_kernel_fp16_w_int8(kernel, params); + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + shl_rvv_conv_im2col_gemm_reorder_kernel_fp16(kernel, params); + } } cb->exec = shl_rvv_conv_im2col_gemm_fp16; } diff --git a/source/thead_rvv/fp16/convolution1d.c b/source/thead_rvv/fp16/convolution1d.c new file mode 100644 index 00000000..eb95d864 --- /dev/null +++ b/source/thead_rvv/fp16/convolution1d.c @@ -0,0 +1,51 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "rvv/rvv.h" + +int shl_rvv_conv1d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv1d_params *params) +{ + int32_t out_c = kernel->dim[0] / params->group; + int32_t in_c = kernel->dim[1]; + int32_t in_w = input->dim[2]; + int32_t kernel_w = kernel->dim[2]; + int32_t stride_w = params->stride_width; + int32_t dalition_w = params->dilation_width; + + struct csinn_callback *cb = params->base.cb; + + struct csinn_session *sess = params->base.sess; + if (sess->base_run_mode == CSINN_RM_CPU_GRAPH) { + struct shl_rvv_option *option = shl_rvv_get_graph_option(sess); + } + + bool binary_model_op_init = shl_rvv_get_binary_model_op_init(sess); + + // pack1 + if (!binary_model_op_init) { + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_rvv_conv1d_im2col_gemm_reorder_kernel_fp16_w_int8(kernel, params); + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + shl_rvv_conv1d_im2col_gemm_reorder_kernel_fp16(kernel, params); + } + } + cb->exec = shl_rvv_conv1d_im2col_gemm_fp16; + return CSINN_TRUE; +} diff --git a/source/thead_rvv/fp16/convolution1d_gemm_fp16.c b/source/thead_rvv/fp16/convolution1d_gemm_fp16.c new file mode 100644 index 00000000..b52fd3bb --- /dev/null +++ b/source/thead_rvv/fp16/convolution1d_gemm_fp16.c @@ -0,0 +1,237 @@ +/* + * Copyright (C) 
2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "rvv/rvv.h" + +/************************************************************************************* + * reorder kernel_data inplace, means the origin kernel_data be destoried. + * The reason to do this is that the packaging process must not consume more memory. + **************************************************************************************/ +void shl_rvv_conv1d_im2col_gemm_reorder_kernel_fp16(struct csinn_tensor *kernel, + struct csinn_conv1d_params *params) +{ + __fp16 *kernel_data = (__fp16 *)kernel->data; + int group = params->group; + + int m = kernel->dim[0] / group; + int k = kernel->dim[1] * kernel->dim[2]; + + __fp16 *pa_reorder = (__fp16 *)shl_mem_alloc(group * m * k * sizeof(__fp16)); + for (int g = 0; g < group; g++) { + shl_rvv_reorder_kernel_n8_fp16(kernel_data + g * m * k, pa_reorder + g * m * k, m, k, k); + } + memcpy(kernel_data, pa_reorder, group * m * k * sizeof(__fp16)); + shl_mem_free(pa_reorder); +} + +/************************************************************************************* + * reorder kernel_data inplace, means the origin kernel_data be destoried. + * The reason to do this is that the packaging process must not consume more memory. 
+ **************************************************************************************/ +void shl_rvv_conv1d_im2col_gemm_reorder_kernel_fp16_w_int8(struct csinn_tensor *kernel, + struct csinn_conv1d_params *params) +{ + int8_t *kernel_data = (int8_t *)kernel->data; + int group = params->group; + + int m = kernel->dim[0] / group; + int k = kernel->dim[1] * kernel->dim[2]; + + int8_t *pa_reorder = (int8_t *)shl_mem_alloc(group * m * k * sizeof(int8_t)); + for (int g = 0; g < group; g++) { + shl_rvv_reorder_kernel_n8_fp16_w_int8(kernel_data + g * m * k, pa_reorder + g * m * k, m, k, + k); + } + memcpy(kernel_data, pa_reorder, group * m * k * sizeof(int8_t)); + shl_mem_free(pa_reorder); +} + +/************************************************************************************* + * Per-channel dequantize int8 -> fp16 + ************************************************************************************/ +void shl_rvv_conv1d_im2col_gemm_dequantize_per_channel_i8_to_f16(struct csinn_tensor *kernel, + struct csinn_conv1d_params *params, + __fp16 *kernel_fp16) +{ + int8_t *kernel_int8 = (int8_t *)kernel->data; + const int group = params->group; + const int m = kernel->dim[0] / group; + const int k = kernel->dim[1] * kernel->dim[2]; + for (int g = 0; g < group; g++) { + int8_t *ksrc = kernel_int8 + g * m * k; + __fp16 *kdst = kernel_fp16 + g * m * k; + int i = 0; + int vl = 8; + for (; i + 7 < m; i += 8) { + int oc = g * m + i; + vint32m4_t _z32 = vlse32_v_i32m4(&(kernel->qinfo[oc].zero_point), + sizeof(struct csinn_quant_info), vl); + vint16m2_t _z16 = vnclip_wx_i16m2(_z32, 0, vl); + vint8m1_t _z = vnclip_wx_i8m1(_z16, 0, vl); + vfloat32m4_t _s32 = + vlse32_v_f32m4(&(kernel->qinfo[oc].scale), sizeof(struct csinn_quant_info), vl); + vfloat16m2_t _s = vfncvt_f_f_w_f16m2(_s32, vl); + for (int j = 0; j < k; j++) { + vint8m1_t _i8 = vle8_v_i8m1(ksrc, vl); + vfloat16m2_t _f16 = shl_rvv_vdeq_vv_f16m2(_i8, _z, _s, vl); + vse16_v_f16m2(kdst, _f16, vl); + ksrc += vl; + kdst += vl; 
+ } + } + vl = 4; + for (; i + 3 < m; i += 4) { + int oc = g * m + i; + vint32m4_t _z32 = vlse32_v_i32m4(&(kernel->qinfo[oc].zero_point), + sizeof(struct csinn_quant_info), vl); + vint16m2_t _z16 = vnclip_wx_i16m2(_z32, 0, vl); + vint8m1_t _z = vnclip_wx_i8m1(_z16, 0, vl); + vfloat32m4_t _s32 = + vlse32_v_f32m4(&(kernel->qinfo[oc].scale), sizeof(struct csinn_quant_info), vl); + vfloat16m2_t _s = vfncvt_f_f_w_f16m2(_s32, vl); + for (int j = 0; j < k; j++) { + vint8m1_t _i8 = vle8_v_i8m1(ksrc, vl); + vfloat16m2_t _f16 = shl_rvv_vdeq_vv_f16m2(_i8, _z, _s, vl); + vse16_v_f16m2(kdst, _f16, vl); + ksrc += vl; + kdst += vl; + } + } + vl = 2; + for (; i + 1 < m; i += 2) { + int oc = g * m + i; + vint32m4_t _z32 = vlse32_v_i32m4(&(kernel->qinfo[oc].zero_point), + sizeof(struct csinn_quant_info), vl); + vint16m2_t _z16 = vnclip_wx_i16m2(_z32, 0, vl); + vint8m1_t _z = vnclip_wx_i8m1(_z16, 0, vl); + vfloat32m4_t _s32 = + vlse32_v_f32m4(&(kernel->qinfo[oc].scale), sizeof(struct csinn_quant_info), vl); + vfloat16m2_t _s = vfncvt_f_f_w_f16m2(_s32, vl); + for (int j = 0; j < k; j++) { + vint8m1_t _i8 = vle8_v_i8m1(ksrc, vl); + vfloat16m2_t _f16 = shl_rvv_vdeq_vv_f16m2(_i8, _z, _s, vl); + vse16_v_f16m2(kdst, _f16, vl); + ksrc += vl; + kdst += vl; + } + } + vl = 1; + for (; i < m; i++) { + int oc = g * m + i; + int32_t zp = kernel->qinfo[oc].zero_point; + float scale = kernel->qinfo[oc].scale; + shl_rvv_dequantize_i8_to_f16(ksrc, kdst, k, zp, scale); + } + } +} + +int shl_rvv_conv1d_im2col_gemm_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv1d_params *params) +{ + if (input->layout == CSINN_LAYOUT_NC1WC0) { + shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp16(input); + } + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = NULL; + __fp16 *bias_data = (__fp16 *)bias->data; + + int32_t group = params->group; + int32_t batch = 
input->dim[0]; + int32_t in_ch = input->dim[1]; + int32_t in_width = input->dim[2]; + + int32_t out_ch = kernel->dim[0]; + int32_t out_width = output->dim[2]; + + int32_t kernel_w = kernel->dim[2]; + int32_t stride_w = params->stride_width; + int32_t pad_left = params->pad_left; + int32_t dilation_w = params->dilation_width; + + int32_t m = out_ch / group; + int32_t k = in_ch / group * kernel_w; + int32_t n = out_width; + + __fp16 *kernel_fp16 = NULL; + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + int size = csinn_tensor_size(kernel); + kernel_fp16 = (__fp16 *)shl_mem_alloc(size * sizeof(__fp16)); + if (kernel->quant_channel > 1) { + shl_rvv_conv1d_im2col_gemm_dequantize_per_channel_i8_to_f16(kernel, params, + kernel_fp16); + } else { + int8_t *kernel_int8 = (int8_t *)kernel->data; + int32_t zp = kernel->qinfo->zero_point; + float scale = kernel->qinfo->scale; + shl_rvv_dequantize_i8_to_f16(kernel_int8, kernel_fp16, size, zp, scale); + } + kernel_data = kernel_fp16; + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + kernel_data = (__fp16 *)kernel->data; + } else { + shl_debug_error("kernel unsupport dtype: %d\n", kernel->dtype); + return CSINN_FALSE; + } + + __fp16 *im2col_data = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); + __fp16 *pb_reorder = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + // im2col + __fp16 *data_col = im2col_data; + __fp16 *channel_data = input_data; + for (int c = 0; c < in_ch / group; c++) { + for (int kw = 0; kw < kernel_w; kw++) { + int in_col = -pad_left + kw * dilation_w; + for (int ow1 = 0; ow1 < out_width; ow1++) { + if (in_col < in_width && in_col >= 0) { + *data_col++ = channel_data[in_col]; + } else { + *data_col++ = 0.0f; + } + in_col += stride_w; + } + } + channel_data += in_width; + } + + __fp16 *pa = kernel_data + g * m * k; + __fp16 *pb = pb_reorder; + __fp16 *pc = output_data; + + // pack + 
shl_rvv_reorder_input_z16_fp16(im2col_data, pb, k, n, n); + // GEMM + shl_rvv_gemm_8x16_fp16(pc, pa, pb, bias_data + g * m, m, k, n, n); + input_data += in_ch / group * in_width; + output_data += m * n; + } + } + shl_mem_free(pb_reorder); + shl_mem_free(im2col_data); + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_mem_free(kernel_fp16); + return CSINN_TRUE; + } + // requantize + shl_rvv_sidcso_op_requantize_fp16(input, output, kernel); + return CSINN_TRUE; +} \ No newline at end of file diff --git a/source/thead_rvv/fp16/convolution_1x1_fp16.c b/source/thead_rvv/fp16/convolution_1x1_fp16.c index 5a7deb6f..8a05cb3b 100644 --- a/source/thead_rvv/fp16/convolution_1x1_fp16.c +++ b/source/thead_rvv/fp16/convolution_1x1_fp16.c @@ -16,35 +16,33 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" void shl_rvv_conv1x1s1_gemm_reorder_kernel_fp16(struct csinn_tensor *kernel, struct csinn_conv2d_params *params) { - __fp16 *kernel_data = (__fp16 *)kernel->data; - int group = params->group; - - int m = kernel->dim[0] / group; // out_ch - int k = kernel->dim[1]; // in_ch ( kernel->dim[2] = kernel->dim[3] = 1) + shl_rvv_conv_im2col_gemm_reorder_kernel_fp16(kernel, params); +} - __fp16 *pa_reorder = (__fp16 *)shl_mem_alloc(group * m * k * sizeof(__fp16)); - for (int g = 0; g < group; g++) { - shl_rvv_reorder_kernel_n8_fp16(kernel_data + g * m * k, pa_reorder + g * m * k, m, k, k); - } - memcpy(kernel_data, pa_reorder, group * m * k * sizeof(__fp16)); - shl_mem_free(pa_reorder); +void shl_rvv_conv1x1s1_gemm_reorder_kernel_fp16_w_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_rvv_conv_im2col_gemm_reorder_kernel_fp16_w_int8(kernel, params); } -int shl_rvv_conv1x1s1_gemm_fp16(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_tensor *kernel, struct csinn_tensor *bias, - struct csinn_conv2d_params *params) +int shl_rvv_common_conv1x1_gemm_fp16(struct csinn_tensor *input, 
struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + void (*reorder_input)(__fp16 *, __fp16 *, int, int, int), + void (*gemm)(__fp16 *, const __fp16 *, const __fp16 *, + __fp16 *, int, int, int, int)) { if (input->layout == CSINN_LAYOUT_NC1HWC0) { shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp16(input); } __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; - __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *kernel_data = NULL; __fp16 *bias_data = (__fp16 *)bias->data; int32_t group = params->group; @@ -58,6 +56,26 @@ int shl_rvv_conv1x1s1_gemm_fp16(struct csinn_tensor *input, struct csinn_tensor int32_t k = in_ch / group; int32_t n = out_h * out_w; + __fp16 *kernel_fp16 = NULL; + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + int size = csinn_tensor_size(kernel); + kernel_fp16 = (__fp16 *)shl_mem_alloc(size * sizeof(__fp16)); + if (kernel->quant_channel > 1) { + shl_rvv_conv_im2col_gemm_dequantize_per_channel_i8_to_f16(kernel, params, kernel_fp16); + } else { + int8_t *kernel_int8 = (int8_t *)kernel->data; + int32_t zp = kernel->qinfo->zero_point; + float scale = kernel->qinfo->scale; + shl_rvv_dequantize_i8_to_f16(kernel_int8, kernel_fp16, size, zp, scale); + } + kernel_data = kernel_fp16; + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + kernel_data = (__fp16 *)kernel->data; + } else { + shl_debug_error("kernel unsupport dtype: %d\n", kernel->dtype); + return CSINN_FALSE; + } + __fp16 *pb_reorder = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); for (int i = 0; i < batch; i++) { @@ -67,16 +85,28 @@ int shl_rvv_conv1x1s1_gemm_fp16(struct csinn_tensor *input, struct csinn_tensor __fp16 *pc = output_data; // pack - shl_rvv_reorder_input_z16_fp16(input_data, pb, k, n, n); + reorder_input(input_data, pb, k, n, n); // GEMM - shl_rvv_gemm_8x16_fp16(pc, pa, pb, bias_data + g * m, m, k, n, n); + gemm(pc, pa, pb, bias_data + g * m, m, k, 
n, n); input_data += k * n; output_data += m * n; } } shl_mem_free(pb_reorder); + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_mem_free(kernel_fp16); + return CSINN_TRUE; + } // requantize shl_rvv_sidcso_op_requantize_fp16(input, output, kernel); return CSINN_TRUE; } + +int shl_rvv_conv1x1s1_gemm_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + return shl_rvv_common_conv1x1_gemm_fp16(input, output, kernel, bias, params, + shl_rvv_reorder_input_z16_fp16, shl_rvv_gemm_8x16_fp16); +} diff --git a/source/thead_rvv/fp16/convolution_1x1_fp16_pack1ton.c b/source/thead_rvv/fp16/convolution_1x1_fp16_pack1ton.c index 02aecba6..518ed8d9 100644 --- a/source/thead_rvv/fp16/convolution_1x1_fp16_pack1ton.c +++ b/source/thead_rvv/fp16/convolution_1x1_fp16_pack1ton.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************************************* * reorder kernel_data inplace, means the origin kernel_data be destoried. @@ -28,9 +28,21 @@ void shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_fp16(struct csinn_tensor *ke shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp16(kernel, params); } -int shl_rvv_conv1x1s1_gemm_pack1ton_fp16(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_tensor *kernel, struct csinn_tensor *bias, - struct csinn_conv2d_params *params) +/************************************************************************************* + * reorder kernel_data inplace, means the origin kernel_data be destoried. + * The reason to do this is that the packaging process must not consume more memory. 
+ **************************************************************************************/ +void shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_fp16_w_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp16_w_int8(kernel, params); +} + +int shl_rvv_common_conv1x1_gemm_pack1ton_fp16( + struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, + struct csinn_tensor *bias, struct csinn_conv2d_params *params, + void (*reorder_input)(__fp16 *, __fp16 *, int, int, int, int), + void (*gemm)(__fp16 *, const __fp16 *, const __fp16 *, __fp16 *, int, int, int, bool)) { if (input->layout == CSINN_LAYOUT_NC1HWC0) { shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp16(input); @@ -44,7 +56,7 @@ int shl_rvv_conv1x1s1_gemm_pack1ton_fp16(struct csinn_tensor *input, struct csin } __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; - __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *kernel_data = NULL; __fp16 *bias_data = (__fp16 *)bias->data; int32_t group = params->group; @@ -60,6 +72,27 @@ int shl_rvv_conv1x1s1_gemm_pack1ton_fp16(struct csinn_tensor *input, struct csin int32_t k = in_c / group; int32_t n = out_h * out_w; + __fp16 *kernel_fp16 = NULL; + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + int size = csinn_tensor_size(kernel); + kernel_fp16 = (__fp16 *)shl_mem_alloc(size * sizeof(__fp16)); + if (kernel->quant_channel > 1) { + shl_rvv_conv_im2col_gemm_pack1ton_dequantize_per_channel_i8_to_f16(kernel, params, + kernel_fp16); + } else { + int8_t *kernel_int8 = (int8_t *)kernel->data; + int32_t zp = kernel->qinfo->zero_point; + float scale = kernel->qinfo->scale; + shl_rvv_dequantize_i8_to_f16(kernel_int8, kernel_fp16, size, zp, scale); + } + kernel_data = kernel_fp16; + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + kernel_data = (__fp16 *)kernel->data; + } else { + shl_debug_error("kernel unsupport dtype: %d\n", 
kernel->dtype); + return CSINN_FALSE; + } + __fp16 *pb_reorder = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); __fp16 *input_ncxhwx = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); @@ -73,10 +106,10 @@ int shl_rvv_conv1x1s1_gemm_pack1ton_fp16(struct csinn_tensor *input, struct csin shl_rvv_reorder_input_pack1ton_fp16(input_data, input_ncxhwx, k, out_h, out_w); // reorder(pack) - shl_rvv_reorder_input_z12_pack1ton_fp16(input_ncxhwx, in_ptr, k, 1, n, n); + reorder_input(input_ncxhwx, in_ptr, k, 1, n, n); // gemm - shl_rvv_ncxhwx_gemm_12xpack2n_fp16(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k, n, n); + gemm(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k, n, false); input_data += k * n; output_data += m * n; @@ -84,7 +117,20 @@ int shl_rvv_conv1x1s1_gemm_pack1ton_fp16(struct csinn_tensor *input, struct csin } shl_mem_free(pb_reorder); shl_mem_free(input_ncxhwx); + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_mem_free(kernel_fp16); + return CSINN_TRUE; + } // requantize shl_rvv_sidcso_op_requantize_fp16(input, output, kernel); return CSINN_TRUE; } + +int shl_rvv_conv1x1s1_gemm_pack1ton_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + return shl_rvv_common_conv1x1_gemm_pack1ton_fp16(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_pack1ton_fp16, + shl_rvv_ncxhwx_gemm_12xpack2n_fp16); +} diff --git a/source/thead_rvv/fp16/convolution_1x1_fp16_packn.c b/source/thead_rvv/fp16/convolution_1x1_fp16_packn.c index be2f5030..14c6dda3 100644 --- a/source/thead_rvv/fp16/convolution_1x1_fp16_packn.c +++ b/source/thead_rvv/fp16/convolution_1x1_fp16_packn.c @@ -16,7 +16,13 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" + +void shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_fp16_w_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp16_w_int8(kernel, params); +} void shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_fp16(struct csinn_tensor *kernel, struct csinn_conv2d_params *params) @@ -24,9 +30,12 @@ void shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_fp16(struct csinn_tensor *kerne shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp16(kernel, params); } -int shl_rvv_conv1x1s1_gemm_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_tensor *kernel, struct csinn_tensor *bias, - struct csinn_conv2d_params *params) +int shl_rvv_common_conv1x1_gemm_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + void (*reorder_input)(__fp16 *, __fp16 *, int, int, int), + void (*gemm)(__fp16 *, const __fp16 *, const __fp16 *, + __fp16 *, int, int, int, bool)) { if (input->layout == CSINN_LAYOUT_NCHW) { shl_rvv_tensor_ndarray_to_nc1xc0_replace_fp16(input); @@ -40,7 +49,7 @@ int shl_rvv_conv1x1s1_gemm_packn_fp16(struct csinn_tensor *input, struct csinn_t } __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; - __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *kernel_data = NULL; __fp16 *bias_data = (__fp16 *)bias->data; int32_t group = params->group; @@ -54,6 +63,27 @@ int shl_rvv_conv1x1s1_gemm_packn_fp16(struct csinn_tensor *input, struct csinn_t int32_t k = in_ch / group; int32_t n = out_h * out_w; + __fp16 *kernel_fp16 = NULL; + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + int size = csinn_tensor_size(kernel); + kernel_fp16 = (__fp16 *)shl_mem_alloc(size * sizeof(__fp16)); + if (kernel->quant_channel > 1) { + shl_rvv_conv_im2col_gemm_packn_dequantize_per_channel_i8_to_f16(kernel, 
params, + kernel_fp16); + } else { + int8_t *kernel_int8 = (int8_t *)kernel->data; + int32_t zp = kernel->qinfo->zero_point; + float scale = kernel->qinfo->scale; + shl_rvv_dequantize_i8_to_f16(kernel_int8, kernel_fp16, size, zp, scale); + } + kernel_data = kernel_fp16; + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + kernel_data = (__fp16 *)kernel->data; + } else { + shl_debug_error("kernel unsupport dtype: %d\n", kernel->dtype); + return CSINN_FALSE; + } + __fp16 *pb_reorder = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); for (int i = 0; i < batch; i++) { @@ -64,16 +94,29 @@ int shl_rvv_conv1x1s1_gemm_packn_fp16(struct csinn_tensor *input, struct csinn_t __fp16 *bias_ptr = bias_data ? (bias_data + g * m) : NULL; // pack - shl_rvv_reorder_input_z12_packn_fp16(input_data, in_ptr, k, n, n); + reorder_input(input_data, in_ptr, k, n, n); // GEMM - shl_rvv_ncxhwx_gemm_12xpack2n_fp16(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k, n, n); + gemm(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k, n, false); input_data += k * n; output_data += m * n; } } shl_mem_free(pb_reorder); + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_mem_free(kernel_fp16); + return CSINN_TRUE; + } // requantize shl_rvv_sidcso_op_requantize_fp16(input, output, kernel); return CSINN_TRUE; } + +int shl_rvv_conv1x1s1_gemm_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + return shl_rvv_common_conv1x1_gemm_packn_fp16(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_packn_fp16, + shl_rvv_ncxhwx_gemm_12xpack2n_fp16); +} diff --git a/source/thead_rvv/fp16/convolution_1x1_fp16_packnto1.c b/source/thead_rvv/fp16/convolution_1x1_fp16_packnto1.c index 1cb7e48b..e64c4809 100644 --- a/source/thead_rvv/fp16/convolution_1x1_fp16_packnto1.c +++ b/source/thead_rvv/fp16/convolution_1x1_fp16_packnto1.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" void shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_fp16(struct csinn_tensor *kernel, struct csinn_conv2d_params *params) @@ -24,16 +24,24 @@ void shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_fp16(struct csinn_tensor *ke shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp16(kernel, params); } -int shl_rvv_conv1x1s1_gemm_packnto1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_tensor *kernel, struct csinn_tensor *bias, - struct csinn_conv2d_params *params) +void shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_fp16_w_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp16_w_int8(kernel, params); +} + +int shl_rvv_common_conv1x1_gemm_packnto1_fp16( + struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, + struct csinn_tensor *bias, struct csinn_conv2d_params *params, + void (*reorder_input)(__fp16 *, __fp16 *, int, int, int), + void (*gemm)(__fp16 *, const __fp16 *, const __fp16 *, __fp16 *, int, int, int, bool)) { if (input->layout == CSINN_LAYOUT_NCHW) { shl_rvv_tensor_ndarray_to_nc1xc0_replace_fp16(input); } __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; - __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *kernel_data = NULL; __fp16 *bias_data = (__fp16 *)bias->data; int32_t group = params->group; @@ -47,6 +55,27 @@ int shl_rvv_conv1x1s1_gemm_packnto1_fp16(struct csinn_tensor *input, struct csin int32_t k = in_ch / group; int32_t n = out_h * out_w; + __fp16 *kernel_fp16 = NULL; + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + int size = csinn_tensor_size(kernel); + kernel_fp16 = (__fp16 *)shl_mem_alloc(size * sizeof(__fp16)); + if (kernel->quant_channel > 1) { + shl_rvv_conv_im2col_gemm_packnto1_dequantize_per_channel_i8_to_f16(kernel, params, + kernel_fp16); + } else { + int8_t *kernel_int8 = (int8_t 
*)kernel->data; + int32_t zp = kernel->qinfo->zero_point; + float scale = kernel->qinfo->scale; + shl_rvv_dequantize_i8_to_f16(kernel_int8, kernel_fp16, size, zp, scale); + } + kernel_data = kernel_fp16; + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + kernel_data = (__fp16 *)kernel->data; + } else { + shl_debug_error("kernel unsupport dtype: %d\n", kernel->dtype); + return CSINN_FALSE; + } + __fp16 *pb_reorder = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); __fp16 *output_ncxhwx = (__fp16 *)shl_mem_alloc(m * n * sizeof(__fp16)); @@ -58,10 +87,9 @@ int shl_rvv_conv1x1s1_gemm_packnto1_fp16(struct csinn_tensor *input, struct csin __fp16 *bias_ptr = bias_data ? (bias_data + g * m) : NULL; // pack - shl_rvv_reorder_input_z12_packn_fp16(input_data, in_ptr, k, n, n); + reorder_input(input_data, in_ptr, k, n, n); // GEMM - shl_rvv_ncxhwx_gemm_12xpack2n_fp16(output_ncxhwx, kernel_ptr, in_ptr, bias_ptr, m, k, n, - n); + gemm(output_ncxhwx, kernel_ptr, in_ptr, bias_ptr, m, k, n, false); shl_rvv_reorder_input_packnto1_fp16(output_ncxhwx, output_data, m, out_h, out_w); @@ -71,7 +99,20 @@ int shl_rvv_conv1x1s1_gemm_packnto1_fp16(struct csinn_tensor *input, struct csin } shl_mem_free(pb_reorder); shl_mem_free(output_ncxhwx); + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_mem_free(kernel_fp16); + return CSINN_TRUE; + } // requantize shl_rvv_sidcso_op_requantize_fp16(input, output, kernel); return CSINN_TRUE; } + +int shl_rvv_conv1x1s1_gemm_packnto1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + return shl_rvv_common_conv1x1_gemm_packnto1_fp16(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_packn_fp16, + shl_rvv_ncxhwx_gemm_12xpack2n_fp16); +} diff --git a/source/thead_rvv/fp16/convolution_3x3_fp16_packn.c b/source/thead_rvv/fp16/convolution_3x3_fp16_packn.c index c04333e9..c1ceefa8 100644 --- 
a/source/thead_rvv/fp16/convolution_3x3_fp16_packn.c +++ b/source/thead_rvv/fp16/convolution_3x3_fp16_packn.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* note: VLEN = 128/256 ... diff --git a/source/thead_rvv/fp16/convolution_direct_fp16.c b/source/thead_rvv/fp16/convolution_direct_fp16.c index d9523f97..1c483f0c 100644 --- a/source/thead_rvv/fp16/convolution_direct_fp16.c +++ b/source/thead_rvv/fp16/convolution_direct_fp16.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * pack4n = vlenb / sizeof(__fp16) * 4 diff --git a/source/thead_rvv/fp16/convolution_gemm_fp16.c b/source/thead_rvv/fp16/convolution_gemm_fp16.c index 05a13e74..d45e6258 100644 --- a/source/thead_rvv/fp16/convolution_gemm_fp16.c +++ b/source/thead_rvv/fp16/convolution_gemm_fp16.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************************************* * reorder kernel_data inplace, means the origin kernel_data be destoried. @@ -39,16 +39,122 @@ void shl_rvv_conv_im2col_gemm_reorder_kernel_fp16(struct csinn_tensor *kernel, shl_mem_free(pa_reorder); } -int shl_rvv_conv_im2col_gemm_fp16(struct csinn_tensor *input, struct csinn_tensor *output, +/************************************************************************************* + * reorder kernel_data inplace, means the origin kernel_data be destoried. + * The reason to do this is that the packaging process must not consume more memory. 
+ **************************************************************************************/ +void shl_rvv_conv_im2col_gemm_reorder_kernel_fp16_w_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + int8_t *kernel_data = (int8_t *)kernel->data; + int group = params->group; + + int m = kernel->dim[0] / group; // m = out_ch / group + int k = kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; + + int8_t *pa_reorder = (int8_t *)shl_mem_alloc(group * m * k * sizeof(int8_t)); + for (int g = 0; g < group; g++) { + shl_rvv_reorder_kernel_n8_fp16_w_int8(kernel_data + g * m * k, pa_reorder + g * m * k, m, k, + k); + } + memcpy(kernel_data, pa_reorder, group * m * k * sizeof(int8_t)); + shl_mem_free(pa_reorder); +} + +/************************************************************************************* + * Per-channel dequantize int8 -> fp16 + ************************************************************************************/ +void shl_rvv_conv_im2col_gemm_dequantize_per_channel_i8_to_f16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params, + __fp16 *kernel_fp16) +{ + int8_t *kernel_int8 = (int8_t *)kernel->data; + const int group = params->group; + const int m = kernel->dim[0] / group; + int32_t kh = kernel->dim[2]; + int32_t kw = kernel->dim_count == 4 ? 
kernel->dim[3] : 1; // adapt conv1d1s1 + const int k = kernel->dim[1] * kh * kw; + for (int g = 0; g < group; g++) { + int8_t *ksrc = kernel_int8 + g * m * k; + __fp16 *kdst = kernel_fp16 + g * m * k; + int i = 0; + int vl = 8; + for (; i + 7 < m; i += 8) { + int oc = g * m + i; + vint32m4_t _z32 = vlse32_v_i32m4(&(kernel->qinfo[oc].zero_point), + sizeof(struct csinn_quant_info), vl); + vint16m2_t _z16 = vnclip_wx_i16m2(_z32, 0, vl); + vint8m1_t _z = vnclip_wx_i8m1(_z16, 0, vl); + vfloat32m4_t _s32 = + vlse32_v_f32m4(&(kernel->qinfo[oc].scale), sizeof(struct csinn_quant_info), vl); + vfloat16m2_t _s = vfncvt_f_f_w_f16m2(_s32, vl); + for (int j = 0; j < k; j++) { + vint8m1_t _i8 = vle8_v_i8m1(ksrc, vl); + vfloat16m2_t _f16 = shl_rvv_vdeq_vv_f16m2(_i8, _z, _s, vl); + vse16_v_f16m2(kdst, _f16, vl); + ksrc += vl; + kdst += vl; + } + } + vl = 4; + for (; i + 3 < m; i += 4) { + int oc = g * m + i; + vint32m4_t _z32 = vlse32_v_i32m4(&(kernel->qinfo[oc].zero_point), + sizeof(struct csinn_quant_info), vl); + vint16m2_t _z16 = vnclip_wx_i16m2(_z32, 0, vl); + vint8m1_t _z = vnclip_wx_i8m1(_z16, 0, vl); + vfloat32m4_t _s32 = + vlse32_v_f32m4(&(kernel->qinfo[oc].scale), sizeof(struct csinn_quant_info), vl); + vfloat16m2_t _s = vfncvt_f_f_w_f16m2(_s32, vl); + for (int j = 0; j < k; j++) { + vint8m1_t _i8 = vle8_v_i8m1(ksrc, vl); + vfloat16m2_t _f16 = shl_rvv_vdeq_vv_f16m2(_i8, _z, _s, vl); + vse16_v_f16m2(kdst, _f16, vl); + ksrc += vl; + kdst += vl; + } + } + vl = 2; + for (; i + 1 < m; i += 2) { + int oc = g * m + i; + vint32m4_t _z32 = vlse32_v_i32m4(&(kernel->qinfo[oc].zero_point), + sizeof(struct csinn_quant_info), vl); + vint16m2_t _z16 = vnclip_wx_i16m2(_z32, 0, vl); + vint8m1_t _z = vnclip_wx_i8m1(_z16, 0, vl); + vfloat32m4_t _s32 = + vlse32_v_f32m4(&(kernel->qinfo[oc].scale), sizeof(struct csinn_quant_info), vl); + vfloat16m2_t _s = vfncvt_f_f_w_f16m2(_s32, vl); + for (int j = 0; j < k; j++) { + vint8m1_t _i8 = vle8_v_i8m1(ksrc, vl); + vfloat16m2_t _f16 = 
shl_rvv_vdeq_vv_f16m2(_i8, _z, _s, vl); + vse16_v_f16m2(kdst, _f16, vl); + ksrc += vl; + kdst += vl; + } + } + vl = 1; + for (; i < m; i++) { + int oc = g * m + i; + int32_t zp = kernel->qinfo[oc].zero_point; + float scale = kernel->qinfo[oc].scale; + shl_rvv_dequantize_i8_to_f16(ksrc, kdst, k, zp, scale); + } + } +} + +int shl_rvv_common_conv_gemm_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, struct csinn_tensor *bias, - struct csinn_conv2d_params *params) + struct csinn_conv2d_params *params, + void (*reorder_input)(__fp16 *, __fp16 *, int, int, int), + void (*gemm)(__fp16 *, const __fp16 *, const __fp16 *, __fp16 *, + int, int, int, int)) { if (input->layout == CSINN_LAYOUT_NC1HWC0) { shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp16(input); } __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; - __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *kernel_data = NULL; __fp16 *bias_data = (__fp16 *)bias->data; int32_t group = params->group; @@ -72,6 +178,26 @@ int shl_rvv_conv_im2col_gemm_fp16(struct csinn_tensor *input, struct csinn_tenso int32_t k = in_ch / group * ksize_h * ksize_w; int32_t n = out_height * out_width; + __fp16 *kernel_fp16 = NULL; + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + int size = csinn_tensor_size(kernel); + kernel_fp16 = (__fp16 *)shl_mem_alloc(size * sizeof(__fp16)); + if (kernel->quant_channel > 1) { + shl_rvv_conv_im2col_gemm_dequantize_per_channel_i8_to_f16(kernel, params, kernel_fp16); + } else { + int8_t *kernel_int8 = (int8_t *)kernel->data; + int32_t zp = kernel->qinfo->zero_point; + float scale = kernel->qinfo->scale; + shl_rvv_dequantize_i8_to_f16(kernel_int8, kernel_fp16, size, zp, scale); + } + kernel_data = kernel_fp16; + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + kernel_data = (__fp16 *)kernel->data; + } else { + shl_debug_error("kernel unsupport dtype: %d\n", kernel->dtype); + return CSINN_FALSE; + } + __fp16 
*im2col_data = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); __fp16 *pb_reorder = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); @@ -113,16 +239,29 @@ int shl_rvv_conv_im2col_gemm_fp16(struct csinn_tensor *input, struct csinn_tenso __fp16 *pc = output_data; // pack - shl_rvv_reorder_input_z16_fp16(im2col_data, pb, k, n, n); + reorder_input(im2col_data, pb, k, n, n); // GEMM - shl_rvv_gemm_8x16_fp16(pc, pa, pb, bias_data + g * m, m, k, n, n); + gemm(pc, pa, pb, bias_data + g * m, m, k, n, n); + input_data += in_ch / group * in_height * in_width; output_data += m * n; } } shl_mem_free(pb_reorder); shl_mem_free(im2col_data); + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_mem_free(kernel_fp16); + return CSINN_TRUE; + } // requantize shl_rvv_sidcso_op_requantize_fp16(input, output, kernel); return CSINN_TRUE; -} \ No newline at end of file +} + +int shl_rvv_conv_im2col_gemm_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + return shl_rvv_common_conv_gemm_fp16(input, output, kernel, bias, params, + shl_rvv_reorder_input_z16_fp16, shl_rvv_gemm_8x16_fp16); +} diff --git a/source/thead_rvv/fp16/convolution_gemm_fp16_pack1ton.c b/source/thead_rvv/fp16/convolution_gemm_fp16_pack1ton.c index 6e64445d..7430e249 100644 --- a/source/thead_rvv/fp16/convolution_gemm_fp16_pack1ton.c +++ b/source/thead_rvv/fp16/convolution_gemm_fp16_pack1ton.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * packn = vlenb / sizeof(__fp16) @@ -117,9 +117,117 @@ void shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp16(struct csinn_tensor * shl_mem_free(pa_reorder); } -int shl_rvv_conv_im2col_gemm_pack1ton_fp16(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_tensor *kernel, struct csinn_tensor *bias, - struct csinn_conv2d_params *params) +/************************************************************* + * packn = vlenb / sizeof(__fp16) + * maxk = ksize_h * ksize_w + * constrain: out_c % packn = 0 and in_ch % packn can != 0 + * layout: [out_c/pack2n, in_c/packn*maxk*packn + maxk*in_c%packn, pack2n] + * [out_c/packna, in_c/packnb*maxk*packnb + maxk*in_c%packnb, packna] + ************************************************************/ +static void im2col_gemm_reorder_kernel_pack1ton_per_group_fp16_w_int8(int8_t *src, int8_t *dst, + int out_c, int in_c, int maxk) +{ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int pack2n = packn * 2; + + int vl = vsetvl_e16m2(pack2n); + int oc = 0; + // [out_c/pack2n, in_c/packn*maxk*packn + maxk*in_c%packn, pack2n] + for (; oc + pack2n - 1 < out_c; oc += pack2n) { + int8_t *k0 = src + oc * in_c * maxk; + int8_t *g0 = dst + oc * in_c * maxk; + + int ic = 0; + for (; ic + packn - 1 < in_c; ic += packn) { + for (int k = 0; k < maxk; k++) { + for (int p = 0; p < packn; p++) { + vint8m1_t _tmp = + vlse8_v_i8m1(k0 + ((ic + p) * maxk + k), in_c * maxk * sizeof(int8_t), vl); + vse8_v_i8m1(g0, _tmp, vl); + g0 += vl; + } + } + } + if (ic < in_c) { + int tail_c = in_c & (packn - 1); + for (int k = 0; k < maxk; k++) { + for (int p = 0; p < tail_c; p++) { + vint8m1_t _tmp = + vlse8_v_i8m1(k0 + ((ic + p) * maxk + k), in_c * maxk * sizeof(int8_t), vl); + vse8_v_i8m1(g0, _tmp, vl); + g0 += vl; + } + } + } + } + vl = vsetvl_e16m1(packn); + // [out_c/packn, in_c/packnb*maxk*packnb + maxk*in_c%packnb, packn] 
+ for (; oc + packn - 1 < out_c; oc += packn) { + int8_t *k0 = src + oc * in_c * maxk; + int8_t *g0 = dst + oc * in_c * maxk; + + int ic = 0; + for (; ic + packn - 1 < in_c; ic += packn) { + for (int k = 0; k < maxk; k++) { + for (int p = 0; p < packn; p++) { + vint8m1_t _tmp = + vlse8_v_i8m1(k0 + ((ic + p) * maxk + k), in_c * maxk * sizeof(int8_t), vl); + vse8_v_i8m1(g0, _tmp, vl); + g0 += vl; + } + } + } + if (ic < in_c) { + int tail_c = in_c & (packn - 1); + for (int k = 0; k < maxk; k++) { + for (int p = 0; p < tail_c; p++) { + vint8m1_t _tmp = + vlse8_v_i8m1(k0 + ((ic + p) * maxk + k), in_c * maxk * sizeof(int8_t), vl); + vse8_v_i8m1(g0, _tmp, vl); + g0 += vl; + } + } + } + } +} + +/************************************************************************************* + * reorder kernel_data inplace, means the origin kernel_data be destoried. + * The reason to do this is that the packaging process must not consume more memory. + **************************************************************************************/ +void shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp16_w_int8( + struct csinn_tensor *kernel, struct csinn_conv2d_params *params) +{ + int8_t *kernel_data = (int8_t *)kernel->data; + int group = params->group; + + int out_c = kernel->dim[0]; + int out_cp = out_c / group; // per-group out channel + int in_c = kernel->dim[1]; + int maxk = kernel->dim[2] * kernel->dim[3]; + + int8_t *pa_reorder = (int8_t *)shl_mem_alloc(out_c * in_c * maxk * sizeof(int8_t)); + for (int g = 0; g < group; g++) { + int8_t *ker_ptr = kernel_data + g * out_cp * in_c * maxk; + int8_t *ker_tm_ptr = pa_reorder + g * out_cp * in_c * maxk; + im2col_gemm_reorder_kernel_pack1ton_per_group_fp16_w_int8(ker_ptr, ker_tm_ptr, out_cp, in_c, + maxk); + } + memcpy(kernel_data, pa_reorder, out_c * in_c * maxk * sizeof(int8_t)); + shl_mem_free(pa_reorder); +} + +void shl_rvv_conv_im2col_gemm_pack1ton_dequantize_per_channel_i8_to_f16( + struct csinn_tensor *kernel, struct 
csinn_conv2d_params *params, __fp16 *kernel_fp16) +{ + shl_rvv_conv_im2col_gemm_packn_dequantize_per_channel_i8_to_f16(kernel, params, kernel_fp16); +} + +int shl_rvv_common_conv_gemm_pack1ton_fp16( + struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, + struct csinn_tensor *bias, struct csinn_conv2d_params *params, + void (*reorder_input)(__fp16 *, __fp16 *, int, int, int, int), + void (*gemm)(__fp16 *, const __fp16 *, const __fp16 *, __fp16 *, int, int, int, bool)) { if (input->layout == CSINN_LAYOUT_NC1HWC0) { shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp16(input); @@ -133,7 +241,7 @@ int shl_rvv_conv_im2col_gemm_pack1ton_fp16(struct csinn_tensor *input, struct cs } __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; - __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *kernel_data = NULL; __fp16 *bias_data = (__fp16 *)bias->data; int32_t group = params->group; @@ -157,6 +265,27 @@ int shl_rvv_conv_im2col_gemm_pack1ton_fp16(struct csinn_tensor *input, struct cs int32_t maxk = ksize_h * ksize_w; int32_t n = out_h * out_w; + __fp16 *kernel_fp16 = NULL; + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + int size = csinn_tensor_size(kernel); + kernel_fp16 = (__fp16 *)shl_mem_alloc(size * sizeof(__fp16)); + if (kernel->quant_channel > 1) { + shl_rvv_conv_im2col_gemm_pack1ton_dequantize_per_channel_i8_to_f16(kernel, params, + kernel_fp16); + } else { + int8_t *kernel_int8 = (int8_t *)kernel->data; + int32_t zp = kernel->qinfo->zero_point; + float scale = kernel->qinfo->scale; + shl_rvv_dequantize_i8_to_f16(kernel_int8, kernel_fp16, size, zp, scale); + } + kernel_data = kernel_fp16; + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + kernel_data = (__fp16 *)kernel->data; + } else { + shl_debug_error("kernel unsupport dtype: %d\n", kernel->dtype); + return CSINN_FALSE; + } + for (int i = 0; i < batch; i++) { for (int g = 0; g < group; g++) { // padding @@ -207,21 +336,33 @@ int 
shl_rvv_conv_im2col_gemm_pack1ton_fp16(struct csinn_tensor *input, struct cs // reorder(pack) __fp16 *reorder_buf = (__fp16 *)shl_mem_alloc(in_cp * maxk * n * sizeof(__fp16)); - shl_rvv_reorder_input_z12_pack1ton_fp16(im2col_buf, reorder_buf, in_cp, maxk, n, n); + reorder_input(im2col_buf, reorder_buf, in_cp, maxk, n, n); shl_mem_free(im2col_buf); // gemm __fp16 *ker_ptr = kernel_data + g * m * maxk * in_cp; __fp16 *bias_ptr = bias_data ? (bias_data + g * m) : NULL; - shl_rvv_ncxhwx_gemm_12xpack2n_fp16(output_data, ker_ptr, reorder_buf, bias_ptr, m, - in_cp * maxk, n, n); + gemm(output_data, ker_ptr, reorder_buf, bias_ptr, m, in_cp * maxk, n, false); shl_mem_free(reorder_buf); input_data += in_cp * in_h * in_w; output_data += m * n; } } + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_mem_free(kernel_fp16); + return CSINN_TRUE; + } // requantize shl_rvv_sidcso_op_requantize_fp16(input, output, kernel); return CSINN_TRUE; } + +int shl_rvv_conv_im2col_gemm_pack1ton_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + return shl_rvv_common_conv_gemm_pack1ton_fp16(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_pack1ton_fp16, + shl_rvv_ncxhwx_gemm_12xpack2n_fp16); +} diff --git a/source/thead_rvv/fp16/convolution_gemm_fp16_packn.c b/source/thead_rvv/fp16/convolution_gemm_fp16_packn.c index 69c7b518..bc0ac8ca 100644 --- a/source/thead_rvv/fp16/convolution_gemm_fp16_packn.c +++ b/source/thead_rvv/fp16/convolution_gemm_fp16_packn.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * packn = vlenb / sizeof(__fp16) @@ -93,9 +93,145 @@ void shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp16(struct csinn_tensor *ker shl_mem_free(pa_reorder); } -int shl_rvv_conv_im2col_gemm_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, +/************************************************************* + * packn = vlenb / sizeof(__fp16) + * maxk = ksize_h * ksize_w + * constrain: out_c % packn = 0 and in_ch % packn = 0 + * layout: [out_c/pack2n, in_c/packn, maxk, packn, pack2n] + * [out_c/packna, in_c/packnb, maxk, packnb, packna] + ************************************************************/ +static void im2col_gemm_reorder_kernel_packn_per_group_fp16_w_int8(int8_t *src, int8_t *dst, + int out_c, int in_c, int maxk) +{ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int pack2n = packn * 2; + + int vl = vsetvl_e16m2(pack2n); + int oc = 0; + // [out_c/pack2n, in_c/packn, maxk, packn, pack2n] + for (; oc + pack2n - 1 < out_c; oc += pack2n) { + int8_t *k0 = src + oc * in_c * maxk; + int8_t *g0 = dst + oc * in_c / packn * maxk * packn; + + for (int ic = 0; ic + packn - 1 < in_c; ic += packn) { + for (int k = 0; k < maxk; k++) { + for (int p = 0; p < packn; p++) { + vint8m1_t _tmp = + vlse8_v_i8m1(k0 + ((ic + p) * maxk + k), in_c * maxk * sizeof(int8_t), vl); + vse8_v_i8m1(g0, _tmp, vl); + g0 += vl; + } + } + } + } + vl = vsetvl_e16m1(packn); + // [out_c/packn, in_c/packn, maxk, packn, packn] + for (; oc + packn - 1 < out_c; oc += packn) { + int8_t *k0 = src + oc * in_c * maxk; + int8_t *g0 = dst + oc * in_c / packn * maxk * packn; + + for (int ic = 0; ic + packn - 1 < in_c; ic += packn) { + for (int k = 0; k < maxk; k++) { + for (int p = 0; p < packn; p++) { + vint8m1_t _tmp = + vlse8_v_i8m1(k0 + ((ic + p) * maxk + k), in_c * maxk * sizeof(int8_t), vl); + vse8_v_i8m1(g0, _tmp, vl); + g0 += vl; + } + } + } + } +} + 
+/************************************************************************************* + * reorder kernel_data inplace, which means the original kernel_data will be destroyed. + * The reason to do this is that the packaging process must not consume more memory. + **************************************************************************************/ +void shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp16_w_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + int8_t *kernel_data = (int8_t *)kernel->data; + int group = params->group; + + int out_c = kernel->dim[0]; + int out_cp = out_c / group; // per-group out channel + int in_c = kernel->dim[1]; + int maxk = kernel->dim[2] * kernel->dim[3]; + + int8_t *pa_reorder = (int8_t *)shl_mem_alloc(out_c * in_c * maxk * sizeof(int8_t)); + for (int g = 0; g < group; g++) { + int8_t *ker_ptr = kernel_data + g * out_cp * in_c * maxk; + int8_t *ker_tm_ptr = pa_reorder + g * out_cp * in_c * maxk; + im2col_gemm_reorder_kernel_packn_per_group_fp16_w_int8(ker_ptr, ker_tm_ptr, out_cp, in_c, + maxk); + } + memcpy(kernel_data, pa_reorder, out_c * in_c * maxk * sizeof(int8_t)); + shl_mem_free(pa_reorder); +} + +/************************************************************************************* + * Per-channel dequantize int8 -> fp16 + ************************************************************************************/ +void shl_rvv_conv_im2col_gemm_packn_dequantize_per_channel_i8_to_f16( + struct csinn_tensor *kernel, struct csinn_conv2d_params *params, __fp16 *kernel_fp16) +{ + int8_t *kernel_int8 = (int8_t *)kernel->data; + int group = params->group; + const int m = kernel->dim[0] / group; + const int k = kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; + + const int packn = csrr_vlenb() / sizeof(__fp16); + const int pack2n = packn * 2; + + for (int g = 0; g < group; g++) { + int8_t *ksrc = kernel_int8 + g * m * k; + __fp16 *kdst = kernel_fp16 + g * m * k; + int i = 0; + int vl = vsetvl_e16m2(pack2n); + for (; i +
pack2n - 1 < m; i += pack2n) { + int oc = g * m + i; + vint32m4_t _z32 = vlse32_v_i32m4(&(kernel->qinfo[oc].zero_point), + sizeof(struct csinn_quant_info), vl); + vint16m2_t _z16 = vnclip_wx_i16m2(_z32, 0, vl); + vint8m1_t _z = vnclip_wx_i8m1(_z16, 0, vl); + vfloat32m4_t _s32 = + vlse32_v_f32m4(&(kernel->qinfo[oc].scale), sizeof(struct csinn_quant_info), vl); + vfloat16m2_t _s = vfncvt_f_f_w_f16m2(_s32, vl); + for (int j = 0; j < k; j++) { + vint8m1_t _i8 = vle8_v_i8m1(ksrc, vl); + vfloat16m2_t _f16 = shl_rvv_vdeq_vv_f16m2(_i8, _z, _s, vl); + vse16_v_f16m2(kdst, _f16, vl); + ksrc += vl; + kdst += vl; + } + } + vl = vsetvl_e16m1(packn); + for (; i + packn - 1 < m; i += packn) { + int oc = g * m + i; + vint32m4_t _z32 = vlse32_v_i32m4(&(kernel->qinfo[oc].zero_point), + sizeof(struct csinn_quant_info), vl); + vint16m2_t _z16 = vnclip_wx_i16m2(_z32, 0, vl); + vint8m1_t _z = vnclip_wx_i8m1(_z16, 0, vl); + vfloat32m4_t _s32 = + vlse32_v_f32m4(&(kernel->qinfo[oc].scale), sizeof(struct csinn_quant_info), vl); + vfloat16m2_t _s = vfncvt_f_f_w_f16m2(_s32, vl); + for (int j = 0; j < k; j++) { + vint8m1_t _i8 = vle8_v_i8m1(ksrc, vl); + vfloat16m2_t _f16 = shl_rvv_vdeq_vv_f16m2(_i8, _z, _s, vl); + vse16_v_f16m2(kdst, _f16, vl); + ksrc += vl; + kdst += vl; + } + } + } +} + +int shl_rvv_common_conv_gemm_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, struct csinn_tensor *bias, - struct csinn_conv2d_params *params) + struct csinn_conv2d_params *params, + void (*reorder_input)(__fp16 *, __fp16 *, int, int, int), + void (*gemm)(__fp16 *, const __fp16 *, const __fp16 *, + __fp16 *, int, int, int, bool)) { if (input->layout == CSINN_LAYOUT_NCHW) { shl_rvv_tensor_ndarray_to_nc1xc0_replace_fp16(input); @@ -109,7 +245,7 @@ int shl_rvv_conv_im2col_gemm_packn_fp16(struct csinn_tensor *input, struct csinn } __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; - __fp16 *kernel_data = (__fp16 *)kernel->data; 
+ __fp16 *kernel_data = NULL; __fp16 *bias_data = (__fp16 *)bias->data; int32_t group = params->group; @@ -133,6 +269,27 @@ int shl_rvv_conv_im2col_gemm_packn_fp16(struct csinn_tensor *input, struct csinn int32_t maxk = ksize_h * ksize_w; int32_t n = out_h * out_w; + __fp16 *kernel_fp16 = NULL; + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + int size = csinn_tensor_size(kernel); + kernel_fp16 = (__fp16 *)shl_mem_alloc(size * sizeof(__fp16)); + if (kernel->quant_channel > 1) { + shl_rvv_conv_im2col_gemm_packn_dequantize_per_channel_i8_to_f16(kernel, params, + kernel_fp16); + } else { + int8_t *kernel_int8 = (int8_t *)kernel->data; + int32_t zp = kernel->qinfo->zero_point; + float scale = kernel->qinfo->scale; + shl_rvv_dequantize_i8_to_f16(kernel_int8, kernel_fp16, size, zp, scale); + } + kernel_data = kernel_fp16; + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + kernel_data = (__fp16 *)kernel->data; + } else { + shl_debug_error("kernel unsupport dtype: %d\n", kernel->dtype); + return CSINN_FALSE; + } + for (int i = 0; i < batch; i++) { for (int g = 0; g < group; g++) { // padding @@ -176,21 +333,33 @@ int shl_rvv_conv_im2col_gemm_packn_fp16(struct csinn_tensor *input, struct csinn // reorder(pack) __fp16 *reorder_buf = (__fp16 *)shl_mem_alloc(in_cp * maxk * n * sizeof(__fp16)); - shl_rvv_reorder_input_z12_packn_fp16(im2col_buf, reorder_buf, in_cp * maxk, n, n); + reorder_input(im2col_buf, reorder_buf, in_cp * maxk, n, n); shl_mem_free(im2col_buf); // gemm __fp16 *ker_ptr = kernel_data + g * m * maxk * in_cp; __fp16 *bias_ptr = bias_data ? 
(bias_data + g * m) : NULL; - shl_rvv_ncxhwx_gemm_12xpack2n_fp16(output_data, ker_ptr, reorder_buf, bias_ptr, m, - in_cp * maxk, n, n); + gemm(output_data, ker_ptr, reorder_buf, bias_ptr, m, in_cp * maxk, n, false); shl_mem_free(reorder_buf); input_data += in_cp * in_h * in_w; output_data += m * n; } } + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_mem_free(kernel_fp16); + return CSINN_TRUE; + } // requantize shl_rvv_sidcso_op_requantize_fp16(input, output, kernel); return CSINN_TRUE; } + +int shl_rvv_conv_im2col_gemm_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + return shl_rvv_common_conv_gemm_packn_fp16(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_packn_fp16, + shl_rvv_ncxhwx_gemm_12xpack2n_fp16); +} diff --git a/source/thead_rvv/fp16/convolution_gemm_fp16_packnto1.c b/source/thead_rvv/fp16/convolution_gemm_fp16_packnto1.c index 4215d730..79fde62d 100644 --- a/source/thead_rvv/fp16/convolution_gemm_fp16_packnto1.c +++ b/source/thead_rvv/fp16/convolution_gemm_fp16_packnto1.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * packn = vlenb / sizeof(__fp16) @@ -111,16 +111,188 @@ void shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp16(struct csinn_tensor * shl_mem_free(pa_reorder); } -int shl_rvv_conv_im2col_gemm_packnto1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, +/************************************************************* + * packn = vlenb / sizeof(__fp16) + * maxk = ksize_h * ksize_w + * constrain: out_c % packn != 0 and in_ch % packn = 0 + * layout: [out_c/pack2n, in_c/packn, maxk, packn, pack2n] + * [out_c/packna, in_c/packnb, maxk, packnb, packna] + * [out_c/tail, in_c/packnb, maxk, packnb, tail] + ************************************************************/ +static void im2col_gemm_reorder_kernel_packnto1_per_group_fp16_w_int8(int8_t *src, int8_t *dst, + int out_c, int in_c, int maxk) +{ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int pack2n = packn * 2; + + int vl = vsetvl_e16m2(pack2n); + int oc = 0; + // [out_c/pack2n, in_c/packn, maxk, packn, pack2n] + for (; oc + pack2n - 1 < out_c; oc += pack2n) { + int8_t *k0 = src + oc * in_c * maxk; + int8_t *g0 = dst + oc * in_c / packn * maxk * packn; + + for (int ic = 0; ic + packn - 1 < in_c; ic += packn) { + for (int k = 0; k < maxk; k++) { + for (int p = 0; p < packn; p++) { + vint8m1_t _tmp = + vlse8_v_i8m1(k0 + ((ic + p) * maxk + k), in_c * maxk * sizeof(int8_t), vl); + vse8_v_i8m1(g0, _tmp, vl); + g0 += vl; + } + } + } + } + vl = vsetvl_e16m1(packn); + // [out_c/packn, in_c/packn, maxk, packn, packn] + for (; oc + packn - 1 < out_c; oc += packn) { + int8_t *k0 = src + oc * in_c * maxk; + int8_t *g0 = dst + oc * in_c / packn * maxk * packn; + + for (int ic = 0; ic + packn - 1 < in_c; ic += packn) { + for (int k = 0; k < maxk; k++) { + for (int p = 0; p < packn; p++) { + vint8m1_t _tmp = + vlse8_v_i8m1(k0 + ((ic + p) * maxk + k), in_c * maxk * sizeof(int8_t), vl); + 
vse8_v_i8m1(g0, _tmp, vl); + g0 += vl; + } + } + } + } + // [out_c/tail, in_c/packnb, maxk, packnb, tail] + if (oc < out_c) { + vl = vsetvl_e16m1(out_c - oc); + int8_t *k0 = src + oc * in_c * maxk; + int8_t *g0 = dst + oc * in_c / packn * maxk * packn; + + for (int ic = 0; ic + packn - 1 < in_c; ic += packn) { + for (int k = 0; k < maxk; k++) { + for (int p = 0; p < packn; p++) { + vint8m1_t _tmp = + vlse8_v_i8m1(k0 + ((ic + p) * maxk + k), in_c * maxk * sizeof(int8_t), vl); + vse8_v_i8m1(g0, _tmp, vl); + g0 += vl; + } + } + } + } +} + +/************************************************************************************* + * reorder kernel_data inplace, which means the original kernel_data will be destroyed. + * The reason to do this is that the packaging process must not consume more memory. + **************************************************************************************/ +void shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp16_w_int8( + struct csinn_tensor *kernel, struct csinn_conv2d_params *params) +{ + int8_t *kernel_data = (int8_t *)kernel->data; + int group = params->group; + + int out_c = kernel->dim[0]; + int out_cp = out_c / group; // per-group out channel + int in_c = kernel->dim[1]; + int maxk = kernel->dim[2] * kernel->dim[3]; + + int8_t *pa_reorder = (int8_t *)shl_mem_alloc(out_c * in_c * maxk * sizeof(int8_t)); + for (int g = 0; g < group; g++) { + int8_t *ker_ptr = kernel_data + g * out_cp * in_c * maxk; + int8_t *ker_tm_ptr = pa_reorder + g * out_cp * in_c * maxk; + im2col_gemm_reorder_kernel_packnto1_per_group_fp16_w_int8(ker_ptr, ker_tm_ptr, out_cp, in_c, + maxk); + } + memcpy(kernel_data, pa_reorder, out_c * in_c * maxk * sizeof(int8_t)); + shl_mem_free(pa_reorder); +} + +/************************************************************************************* + * Per-channel dequantize int8 -> fp16 + ************************************************************************************/ +void
shl_rvv_conv_im2col_gemm_packnto1_dequantize_per_channel_i8_to_f16( + struct csinn_tensor *kernel, struct csinn_conv2d_params *params, __fp16 *kernel_fp16) +{ + int8_t *kernel_int8 = (int8_t *)kernel->data; + int group = params->group; + const int m = kernel->dim[0] / group; + const int k = kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; + + const int packn = csrr_vlenb() / sizeof(__fp16); + const int pack2n = packn * 2; + + for (int g = 0; g < group; g++) { + int8_t *ksrc = kernel_int8 + g * m * k; + __fp16 *kdst = kernel_fp16 + g * m * k; + int i = 0; + int vl = vsetvl_e16m2(pack2n); + for (; i + pack2n - 1 < m; i += pack2n) { + int oc = g * m + i; + vint32m4_t _z32 = vlse32_v_i32m4(&(kernel->qinfo[oc].zero_point), + sizeof(struct csinn_quant_info), vl); + vint16m2_t _z16 = vnclip_wx_i16m2(_z32, 0, vl); + vint8m1_t _z = vnclip_wx_i8m1(_z16, 0, vl); + vfloat32m4_t _s32 = + vlse32_v_f32m4(&(kernel->qinfo[oc].scale), sizeof(struct csinn_quant_info), vl); + vfloat16m2_t _s = vfncvt_f_f_w_f16m2(_s32, vl); + for (int j = 0; j < k; j++) { + vint8m1_t _i8 = vle8_v_i8m1(ksrc, vl); + vfloat16m2_t _f16 = shl_rvv_vdeq_vv_f16m2(_i8, _z, _s, vl); + vse16_v_f16m2(kdst, _f16, vl); + ksrc += vl; + kdst += vl; + } + } + vl = vsetvl_e16m1(packn); + for (; i + packn - 1 < m; i += packn) { + int oc = g * m + i; + vint32m4_t _z32 = vlse32_v_i32m4(&(kernel->qinfo[oc].zero_point), + sizeof(struct csinn_quant_info), vl); + vint16m2_t _z16 = vnclip_wx_i16m2(_z32, 0, vl); + vint8m1_t _z = vnclip_wx_i8m1(_z16, 0, vl); + vfloat32m4_t _s32 = + vlse32_v_f32m4(&(kernel->qinfo[oc].scale), sizeof(struct csinn_quant_info), vl); + vfloat16m2_t _s = vfncvt_f_f_w_f16m2(_s32, vl); + for (int j = 0; j < k; j++) { + vint8m1_t _i8 = vle8_v_i8m1(ksrc, vl); + vfloat16m2_t _f16 = shl_rvv_vdeq_vv_f16m2(_i8, _z, _s, vl); + vse16_v_f16m2(kdst, _f16, vl); + ksrc += vl; + kdst += vl; + } + } + vl = vsetvl_e16m1(m - i); + if (i < m) { + int oc = g * m + i; + vint32m4_t _z32 = 
vlse32_v_i32m4(&(kernel->qinfo[oc].zero_point), + sizeof(struct csinn_quant_info), vl); + vint16m2_t _z16 = vnclip_wx_i16m2(_z32, 0, vl); + vint8m1_t _z = vnclip_wx_i8m1(_z16, 0, vl); + vfloat32m4_t _s32 = + vlse32_v_f32m4(&(kernel->qinfo[oc].scale), sizeof(struct csinn_quant_info), vl); + vfloat16m2_t _s = vfncvt_f_f_w_f16m2(_s32, vl); + for (int j = 0; j < k; j++) { + vint8m1_t _i8 = vle8_v_i8m1(ksrc, vl); + vfloat16m2_t _f16 = shl_rvv_vdeq_vv_f16m2(_i8, _z, _s, vl); + vse16_v_f16m2(kdst, _f16, vl); + ksrc += vl; + kdst += vl; + } + } + } +} + +int shl_rvv_common_conv_gemm_packnto1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, struct csinn_tensor *bias, - struct csinn_conv2d_params *params) + struct csinn_conv2d_params *params, + void (*reorder_input)(__fp16 *, __fp16 *, int, int, int), + void (*gemm)(__fp16 *, const __fp16 *, const __fp16 *, + __fp16 *, int, int, int, bool)) { if (input->layout == CSINN_LAYOUT_NCHW) { shl_rvv_tensor_ndarray_to_nc1xc0_replace_fp16(input); } __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; - __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *kernel_data = NULL; __fp16 *bias_data = (__fp16 *)bias->data; int32_t group = params->group; @@ -143,6 +315,27 @@ int shl_rvv_conv_im2col_gemm_packnto1_fp16(struct csinn_tensor *input, struct cs int32_t maxk = ksize_h * ksize_w; int32_t n = out_h * out_w; + __fp16 *kernel_fp16 = NULL; + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + int size = csinn_tensor_size(kernel); + kernel_fp16 = (__fp16 *)shl_mem_alloc(size * sizeof(__fp16)); + if (kernel->quant_channel > 1) { + shl_rvv_conv_im2col_gemm_packnto1_dequantize_per_channel_i8_to_f16(kernel, params, + kernel_fp16); + } else { + int8_t *kernel_int8 = (int8_t *)kernel->data; + int32_t zp = kernel->qinfo->zero_point; + float scale = kernel->qinfo->scale; + shl_rvv_dequantize_i8_to_f16(kernel_int8, kernel_fp16, size, zp, scale); + } + 
kernel_data = kernel_fp16; + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + kernel_data = (__fp16 *)kernel->data; + } else { + shl_debug_error("kernel unsupport dtype: %d\n", kernel->dtype); + return CSINN_FALSE; + } + __fp16 *output_ncxhwx = (__fp16 *)shl_mem_alloc(m * n * sizeof(__fp16)); for (int i = 0; i < batch; i++) { @@ -188,14 +381,13 @@ int shl_rvv_conv_im2col_gemm_packnto1_fp16(struct csinn_tensor *input, struct cs // reorder(pack) __fp16 *reorder_buf = (__fp16 *)shl_mem_alloc(in_cp * maxk * n * sizeof(__fp16)); - shl_rvv_reorder_input_z12_packn_fp16(im2col_buf, reorder_buf, in_cp * maxk, n, n); + reorder_input(im2col_buf, reorder_buf, in_cp * maxk, n, n); shl_mem_free(im2col_buf); // gemm __fp16 *ker_ptr = kernel_data + g * m * maxk * in_cp; __fp16 *bias_ptr = bias_data ? (bias_data + g * m) : NULL; - shl_rvv_ncxhwx_gemm_12xpack2n_fp16(output_ncxhwx, ker_ptr, reorder_buf, bias_ptr, m, - in_cp * maxk, n, n); + gemm(output_ncxhwx, ker_ptr, reorder_buf, bias_ptr, m, in_cp * maxk, n, false); shl_rvv_reorder_input_packnto1_fp16(output_ncxhwx, output_data, m, out_h, out_w); shl_mem_free(reorder_buf); @@ -205,7 +397,20 @@ int shl_rvv_conv_im2col_gemm_packnto1_fp16(struct csinn_tensor *input, struct cs } } shl_mem_free(output_ncxhwx); + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_mem_free(kernel_fp16); + return CSINN_TRUE; + } // requantize shl_rvv_sidcso_op_requantize_fp16(input, output, kernel); return CSINN_TRUE; } + +int shl_rvv_conv_im2col_gemm_packnto1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + return shl_rvv_common_conv_gemm_packnto1_fp16(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_packn_fp16, + shl_rvv_ncxhwx_gemm_12xpack2n_fp16); +} diff --git a/source/thead_rvv/fp16/deconvolution.c b/source/thead_rvv/fp16/deconvolution.c new file mode 100644 index 00000000..10227ae6 --- /dev/null +++ 
b/source/thead_rvv/fp16/deconvolution.c @@ -0,0 +1,49 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "rvv/rvv.h" +#include "shl_debug.h" + +int shl_rvv_deconv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + struct csinn_callback *cb = params->base.cb; + + struct csinn_session *sess = params->base.sess; + + if (sess->base_run_mode == CSINN_RM_CPU_GRAPH) { + struct shl_rvv_option *option = shl_rvv_get_graph_option(sess); + } + + bool binary_model_op_init = shl_rvv_get_binary_model_op_init(sess); + + // pack1 + params->conv_extra.conv_mode = CSINN_GEMM; + + if (!binary_model_op_init) { + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_rvv_deconv2d_gemm_col2im_reorder_kernel_fp16_w_int8(kernel, params); + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + shl_rvv_deconv2d_gemm_col2im_reorder_kernel_fp16(kernel, params); + } + } + cb->exec = shl_rvv_deconv2d_gemm_col2im_fp16; + + return CSINN_TRUE; +} diff --git a/source/thead_rvv/fp16/deconvolution_gemm_fp16.c b/source/thead_rvv/fp16/deconvolution_gemm_fp16.c new file mode 100644 index 00000000..d4ed45a7 --- /dev/null +++ b/source/thead_rvv/fp16/deconvolution_gemm_fp16.c @@ -0,0 +1,324 @@ +/* + * Copyright (C) 2016-2023 T-Head 
Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "rvv/rvv.h" + +static void transpose_10_fp16(__fp16 *src, __fp16 *dst, int inner_size, int outer_size) +{ + for (int i = 0; i < outer_size; i++) { + int size = inner_size; + __fp16 *d_ptr = dst + i; + while (size > 0) { + int vl = vsetvl_e16m4(size); + vfloat16m4_t _in = vle16_v_f16m4(src, vl); + src += vl; + vsse16_v_f16m4(d_ptr, outer_size * sizeof(__fp16), _in, vl); + d_ptr += vl * outer_size; + size -= vl; + } + } +} + +static void transpose_10_int8(int8_t *src, int8_t *dst, int inner_size, int outer_size) +{ + for (int i = 0; i < outer_size; i++) { + int size = inner_size; + int8_t *d_ptr = dst + i; + while (size > 0) { + int vl = vsetvl_e8m4(size); + vint8m4_t _in = vle8_v_i8m4(src, vl); + src += vl; + vsse8_v_i8m4(d_ptr, outer_size * sizeof(int8_t), _in, vl); + d_ptr += vl * outer_size; + size -= vl; + } + } +} + +// Kernel:[IC,OC,KH,KW] --> [OC,KH,KW,IC] +void shl_rvv_deconv2d_gemm_col2im_reorder_kernel_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *data_buf = shl_mem_alloc(kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * + kernel->dim[3] * sizeof(__fp16)); + + int inner_size = kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; + int outer_size = kernel->dim[0]; + + transpose_10_fp16(kernel_data, data_buf, inner_size, 
outer_size); + + int group = params->group; + + int k = kernel->dim[0]; + int m = kernel->dim[1] * kernel->dim[2] * kernel->dim[3] / group; + for (int g = 0; g < group; g++) { + shl_rvv_reorder_kernel_n8_fp16(data_buf + g * m * k, kernel_data + g * m * k, m, k, k); + } + shl_mem_free(data_buf); +} + +void shl_rvv_deconv2d_gemm_col2im_reorder_kernel_fp16_w_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + int8_t *kernel_data = (int8_t *)kernel->data; + int8_t *data_buf = shl_mem_alloc(kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * + kernel->dim[3] * sizeof(int8_t)); + + int inner_size = kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; + int outer_size = kernel->dim[0]; + + transpose_10_int8(kernel_data, data_buf, inner_size, outer_size); + + int group = params->group; + + int k = kernel->dim[0]; + int m = kernel->dim[1] * kernel->dim[2] * kernel->dim[3] / group; + for (int g = 0; g < group; g++) { + shl_rvv_reorder_kernel_n8_fp16_w_int8(data_buf + g * m * k, kernel_data + g * m * k, m, k, + k); + } + shl_mem_free(data_buf); +} + +/************************************************************************************* + * Per-channel dequantize int8 -> fp16 + ************************************************************************************/ +void shl_rvv_deconv2d_gemm_col2im_dequantize_per_channel_i8_to_f16( + struct csinn_tensor *kernel, struct csinn_conv2d_params *params, __fp16 *kernel_fp16) +{ + int8_t *kernel_int8 = (int8_t *)kernel->data; + const int group = params->group; + const int m = kernel->dim[1] * kernel->dim[2] * kernel->dim[3] / group; + const int k = kernel->dim[0]; + for (int g = 0; g < group; g++) { + int8_t *ksrc = kernel_int8 + g * m * k; + __fp16 *kdst = kernel_fp16 + g * m * k; + int i = 0; + int vl = 8; + for (; i + 7 < m; i += 8) { + int oc = g * m + i; + vint32m4_t _z32 = vlse32_v_i32m4(&(kernel->qinfo[oc].zero_point), + sizeof(struct csinn_quant_info), vl); + vint16m2_t _z16 = vnclip_wx_i16m2(_z32, 0, vl); + 
vint8m1_t _z = vnclip_wx_i8m1(_z16, 0, vl); + vfloat32m4_t _s32 = + vlse32_v_f32m4(&(kernel->qinfo[oc].scale), sizeof(struct csinn_quant_info), vl); + vfloat16m2_t _s = vfncvt_f_f_w_f16m2(_s32, vl); + for (int j = 0; j < k; j++) { + vint8m1_t _i8 = vle8_v_i8m1(ksrc, vl); + vfloat16m2_t _f16 = shl_rvv_vdeq_vv_f16m2(_i8, _z, _s, vl); + vse16_v_f16m2(kdst, _f16, vl); + ksrc += vl; + kdst += vl; + } + } + vl = 4; + for (; i + 3 < m; i += 4) { + int oc = g * m + i; + vint32m4_t _z32 = vlse32_v_i32m4(&(kernel->qinfo[oc].zero_point), + sizeof(struct csinn_quant_info), vl); + vint16m2_t _z16 = vnclip_wx_i16m2(_z32, 0, vl); + vint8m1_t _z = vnclip_wx_i8m1(_z16, 0, vl); + vfloat32m4_t _s32 = + vlse32_v_f32m4(&(kernel->qinfo[oc].scale), sizeof(struct csinn_quant_info), vl); + vfloat16m2_t _s = vfncvt_f_f_w_f16m2(_s32, vl); + for (int j = 0; j < k; j++) { + vint8m1_t _i8 = vle8_v_i8m1(ksrc, vl); + vfloat16m2_t _f16 = shl_rvv_vdeq_vv_f16m2(_i8, _z, _s, vl); + vse16_v_f16m2(kdst, _f16, vl); + ksrc += vl; + kdst += vl; + } + } + vl = 2; + for (; i + 1 < m; i += 2) { + int oc = g * m + i; + vint32m4_t _z32 = vlse32_v_i32m4(&(kernel->qinfo[oc].zero_point), + sizeof(struct csinn_quant_info), vl); + vint16m2_t _z16 = vnclip_wx_i16m2(_z32, 0, vl); + vint8m1_t _z = vnclip_wx_i8m1(_z16, 0, vl); + vfloat32m4_t _s32 = + vlse32_v_f32m4(&(kernel->qinfo[oc].scale), sizeof(struct csinn_quant_info), vl); + vfloat16m2_t _s = vfncvt_f_f_w_f16m2(_s32, vl); + for (int j = 0; j < k; j++) { + vint8m1_t _i8 = vle8_v_i8m1(ksrc, vl); + vfloat16m2_t _f16 = shl_rvv_vdeq_vv_f16m2(_i8, _z, _s, vl); + vse16_v_f16m2(kdst, _f16, vl); + ksrc += vl; + kdst += vl; + } + } + vl = 1; + for (; i < m; i++) { + int oc = g * m + i; + int32_t zp = kernel->qinfo[oc].zero_point; + float scale = kernel->qinfo[oc].scale; + shl_rvv_dequantize_i8_to_f16(ksrc, kdst, k, zp, scale); + } + } +} + +int shl_rvv_deconv2d_gemm_col2im_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + if (input->layout == CSINN_LAYOUT_NC1HWC0) { + shl_debug_info("Data Format: NC1HWC0\n"); +
shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp16(input); + } else if (input->layout != CSINN_LAYOUT_NCHW) { + shl_debug_error("Unsupported data format\n"); + return CSINN_FALSE; + } + + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = NULL; + __fp16 *bias_data = (__fp16 *)bias->data; + + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + + int32_t out_c = output->dim[1]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + int32_t kernel_h = kernel->dim[2]; + int32_t kernel_w = kernel->dim[3]; + + int32_t group = params->group; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + int32_t pad_t = params->pad_top; + int32_t pad_l = params->pad_left; + int32_t pad_d = params->pad_down; + int32_t pad_r = params->pad_right; + int32_t dilation_h = params->dilation_height; + int32_t dilation_w = params->dilation_width; + int32_t out_pad_h = params->out_pad_height; + int32_t out_pad_w = params->out_pad_width; + + int32_t m = out_c / group * kernel_h * kernel_w; + int32_t k = in_c / group; + int32_t n = in_h * in_w; + + __fp16 *kernel_fp16 = NULL; + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + int size = csinn_tensor_size(kernel); + kernel_fp16 = (__fp16 *)shl_mem_alloc(size * sizeof(__fp16)); + if (kernel->quant_channel > 1) { + shl_rvv_deconv2d_gemm_col2im_dequantize_per_channel_i8_to_f16(kernel, params, + kernel_fp16); + } else { + int8_t *kernel_int8 = (int8_t *)kernel->data; + int32_t zp = kernel->qinfo->zero_point; + float scale = kernel->qinfo->scale; + shl_rvv_dequantize_i8_to_f16(kernel_int8, kernel_fp16, size, zp, scale); + } + kernel_data = kernel_fp16; + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + kernel_data = (__fp16 *)kernel->data; + } else { + shl_debug_error("kernel unsupport dtype: %d\n", kernel->dtype); + return CSINN_FALSE; + } + + 
__fp16 *reorder_buf = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); + __fp16 *output_buf = (__fp16 *)shl_mem_alloc(batch * group * m * n * sizeof(__fp16)); + const int vlen = csrr_vlenb() * 8; + + __fp16 *output_buf_ptr = output_buf; + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + if (vlen == 128) { + // Pack + shl_rvv_reorder_input_z16_fp16(input_data, reorder_buf, k, n, n); + // Gemm + shl_rvv_gemm_8x16_fp16(output_buf_ptr, (kernel_data + g * m * k), reorder_buf, NULL, + m, k, n, n); + } else { + shl_debug_error("The vector length is temporarily not supported."); + } + input_data += k * n; + output_buf_ptr += m * n; + } + } + shl_mem_free(reorder_buf); + + col2im_cpu_ext(output_buf, bias_data, batch, out_c, out_h, out_w, kernel_h, kernel_w, pad_t, + pad_l, stride_h, stride_w, dilation_h, dilation_w, output_data); + + shl_mem_free(output_buf); + + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_mem_free(kernel_fp16); + return CSINN_TRUE; + } + + // requantize + shl_rvv_sidcso_op_requantize_fp16(input, output, kernel); + return CSINN_TRUE; +} \ No newline at end of file diff --git a/source/thead_rvv/fp16/depthwise_convolution.c b/source/thead_rvv/fp16/depthwise_convolution.c index bb56f249..fd22a602 100644 --- a/source/thead_rvv/fp16/depthwise_convolution.c +++ b/source/thead_rvv/fp16/depthwise_convolution.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" int shl_rvv_depthwise_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, struct csinn_tensor *bias, @@ -46,13 +46,20 @@ int shl_rvv_depthwise_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_ in_elempack = 1; out_elempack = 1; // dwconv2d out_channel pack is same as in_channel } + } else if (sess->base_run_mode == CSINN_RM_LAYER) { + in_elempack = in_c % packn == 0 ? packn : 1; + out_elempack = out_c % packn == 0 ? 
packn : 1; } bool binary_model_op_init = shl_rvv_get_binary_model_op_init(sess); if (in_elempack % packn == 0 && out_elempack % packn == 0) { if (!binary_model_op_init) { - shl_rvv_dwconv_reorder_kernel_packn_fp16(kernel, params); + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_rvv_dwconv_reorder_kernel_packn_fp16_w_int8(kernel, params); + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + shl_rvv_dwconv_reorder_kernel_packn_fp16(kernel, params); + } } if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1) { cb->exec = shl_rvv_dwconv3x3s1_packn_fp16; diff --git a/source/thead_rvv/fp16/depthwise_convolution_3x3_fp16.c b/source/thead_rvv/fp16/depthwise_convolution_3x3_fp16.c index e2f59449..b64f5299 100644 --- a/source/thead_rvv/fp16/depthwise_convolution_3x3_fp16.c +++ b/source/thead_rvv/fp16/depthwise_convolution_3x3_fp16.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* note: VLEN = 128/256 @@ -30,7 +30,7 @@ int shl_rvv_dwconv3x3s1_fp16(struct csinn_tensor *input, struct csinn_tensor *ou } __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; - __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *kernel_data = NULL; __fp16 *bias_data = (__fp16 *)bias->data; int32_t batch = input->dim[0]; @@ -42,6 +42,32 @@ int shl_rvv_dwconv3x3s1_fp16(struct csinn_tensor *input, struct csinn_tensor *ou int32_t out_h = output->dim[2]; int32_t out_w = output->dim[3]; + __fp16 *kernel_fp16 = NULL; + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + int size = csinn_tensor_size(kernel); + int8_t *kernel_int8 = (int8_t *)kernel->data; + kernel_fp16 = (__fp16 *)shl_mem_alloc(size * sizeof(__fp16)); + if (kernel->quant_channel > 1) { + const int maxk = kernel->dim[2] * kernel->dim[3]; + for (int c = 0; c < in_c; c++) { + int32_t zp = kernel->qinfo[c].zero_point; + float scale = 
kernel->qinfo[c].scale; + shl_rvv_dequantize_i8_to_f16(kernel_int8 + c * maxk, kernel_fp16 + c * maxk, maxk, + zp, scale); + } + } else { + int32_t zp = kernel->qinfo->zero_point; + float scale = kernel->qinfo->scale; + shl_rvv_dequantize_i8_to_f16(kernel_int8, kernel_fp16, size, zp, scale); + } + kernel_data = kernel_fp16; + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + kernel_data = (__fp16 *)kernel->data; + } else { + shl_debug_error("kernel unsupport dtype: %d\n", kernel->dtype); + return CSINN_FALSE; + } + __fp16 *input_padd_buf = (__fp16 *)shl_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * (in_w + params->pad_left + params->pad_right) * sizeof(__fp16)); @@ -340,6 +366,10 @@ int shl_rvv_dwconv3x3s1_fp16(struct csinn_tensor *input, struct csinn_tensor *ou } } shl_mem_free(input_padd_buf); + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_mem_free(kernel_fp16); + return CSINN_TRUE; + } // requantize shl_rvv_sidcso_op_requantize_fp16(input, output, kernel); return CSINN_TRUE; @@ -354,7 +384,7 @@ int shl_rvv_dwconv3x3s2_fp16(struct csinn_tensor *input, struct csinn_tensor *ou } __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; - __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *kernel_data = NULL; __fp16 *bias_data = (__fp16 *)bias->data; int32_t batch = input->dim[0]; @@ -366,6 +396,32 @@ int shl_rvv_dwconv3x3s2_fp16(struct csinn_tensor *input, struct csinn_tensor *ou int32_t out_h = output->dim[2]; int32_t out_w = output->dim[3]; + __fp16 *kernel_fp16 = NULL; + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + int size = csinn_tensor_size(kernel); + int8_t *kernel_int8 = (int8_t *)kernel->data; + kernel_fp16 = (__fp16 *)shl_mem_alloc(size * sizeof(__fp16)); + if (kernel->quant_channel > 1) { + const int maxk = kernel->dim[2] * kernel->dim[3]; + for (int c = 0; c < in_c; c++) { + int32_t zp = kernel->qinfo[c].zero_point; + float scale = kernel->qinfo[c].scale; + 
shl_rvv_dequantize_i8_to_f16(kernel_int8 + c * maxk, kernel_fp16 + c * maxk, maxk, + zp, scale); + } + } else { + int32_t zp = kernel->qinfo->zero_point; + float scale = kernel->qinfo->scale; + shl_rvv_dequantize_i8_to_f16(kernel_int8, kernel_fp16, size, zp, scale); + } + kernel_data = kernel_fp16; + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + kernel_data = (__fp16 *)kernel->data; + } else { + shl_debug_error("kernel unsupport dtype: %d\n", kernel->dtype); + return CSINN_FALSE; + } + __fp16 *input_padd_buf = (__fp16 *)shl_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * (in_w + params->pad_left + params->pad_right) * sizeof(__fp16)); @@ -515,6 +571,10 @@ int shl_rvv_dwconv3x3s2_fp16(struct csinn_tensor *input, struct csinn_tensor *ou } shl_mem_free(input_padd_buf); + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_mem_free(kernel_fp16); + return CSINN_TRUE; + } // requantize shl_rvv_sidcso_op_requantize_fp16(input, output, kernel); return CSINN_TRUE; diff --git a/source/thead_rvv/fp16/depthwise_convolution_3x3_fp16_packn.c b/source/thead_rvv/fp16/depthwise_convolution_3x3_fp16_packn.c index 7a96b4b2..af730c7b 100644 --- a/source/thead_rvv/fp16/depthwise_convolution_3x3_fp16_packn.c +++ b/source/thead_rvv/fp16/depthwise_convolution_3x3_fp16_packn.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* note: VLEN = 128/256 ... 
flexible vlen @@ -36,7 +36,7 @@ int shl_rvv_dwconv3x3s1_packn_fp16(struct csinn_tensor *input, struct csinn_tens } __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; - __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *kernel_data = NULL; __fp16 *bias_data = (__fp16 *)bias->data; int32_t batch = input->dim[0]; @@ -49,6 +49,44 @@ int shl_rvv_dwconv3x3s1_packn_fp16(struct csinn_tensor *input, struct csinn_tens const int packn = csrr_vlenb() / sizeof(__fp16); const int vl = vsetvl_e16m1(packn); + __fp16 *kernel_fp16 = NULL; + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + int size = csinn_tensor_size(kernel); + int8_t *kernel_int8 = (int8_t *)kernel->data; + kernel_fp16 = (__fp16 *)shl_mem_alloc(size * sizeof(__fp16)); + if (kernel->quant_channel > 1) { + const int maxk = kernel->dim[2] * kernel->dim[3]; + for (int oc = 0; oc + packn - 1 < in_c; oc += packn) { + int8_t *ksrc = kernel_int8 + oc * maxk; + __fp16 *kdst = kernel_fp16 + oc * maxk; + vint32m4_t _z32 = vlse32_v_i32m4(&(kernel->qinfo[oc].zero_point), + sizeof(struct csinn_quant_info), vl); + vint16m2_t _z16 = vnclip_wx_i16m2(_z32, 0, vl); + vint8m1_t _z = vnclip_wx_i8m1(_z16, 0, vl); + vfloat32m4_t _s32 = + vlse32_v_f32m4(&(kernel->qinfo[oc].scale), sizeof(struct csinn_quant_info), vl); + vfloat16m2_t _s = vfncvt_f_f_w_f16m2(_s32, vl); + for (int k = 0; k < maxk; k++) { + vint8m1_t _i8 = vle8_v_i8m1(ksrc, vl); + vfloat16m2_t _f16 = shl_rvv_vdeq_vv_f16m2(_i8, _z, _s, vl); + vse16_v_f16m2(kdst, _f16, vl); + ksrc += vl; + kdst += vl; + } + } + } else { + int32_t zp = kernel->qinfo->zero_point; + float scale = kernel->qinfo->scale; + shl_rvv_dequantize_i8_to_f16(kernel_int8, kernel_fp16, size, zp, scale); + } + kernel_data = kernel_fp16; + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + kernel_data = (__fp16 *)kernel->data; + } else { + shl_debug_error("kernel unsupport dtype: %d\n", kernel->dtype); + return CSINN_FALSE; + } + __fp16 *input_padd_buf 
= (__fp16 *)shl_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * (in_w + params->pad_left + params->pad_right) * sizeof(__fp16)); @@ -534,6 +572,10 @@ int shl_rvv_dwconv3x3s1_packn_fp16(struct csinn_tensor *input, struct csinn_tens } } shl_mem_free(input_padd_buf); + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_mem_free(kernel_fp16); + return CSINN_TRUE; + } // requantize shl_rvv_sidcso_op_requantize_fp16(input, output, kernel); return CSINN_TRUE; @@ -554,7 +596,7 @@ int shl_rvv_dwconv3x3s2_packn_fp16(struct csinn_tensor *input, struct csinn_tens } __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; - __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *kernel_data = NULL; __fp16 *bias_data = (__fp16 *)bias->data; int32_t batch = input->dim[0]; @@ -567,6 +609,44 @@ int shl_rvv_dwconv3x3s2_packn_fp16(struct csinn_tensor *input, struct csinn_tens const int packn = csrr_vlenb() / sizeof(__fp16); const int vl = vsetvl_e16m1(packn); + __fp16 *kernel_fp16 = NULL; + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + int size = csinn_tensor_size(kernel); + int8_t *kernel_int8 = (int8_t *)kernel->data; + kernel_fp16 = (__fp16 *)shl_mem_alloc(size * sizeof(__fp16)); + if (kernel->quant_channel > 1) { + const int maxk = kernel->dim[2] * kernel->dim[3]; + for (int oc = 0; oc + packn - 1 < in_c; oc += packn) { + int8_t *ksrc = kernel_int8 + oc * maxk; + __fp16 *kdst = kernel_fp16 + oc * maxk; + vint32m4_t _z32 = vlse32_v_i32m4(&(kernel->qinfo[oc].zero_point), + sizeof(struct csinn_quant_info), vl); + vint16m2_t _z16 = vnclip_wx_i16m2(_z32, 0, vl); + vint8m1_t _z = vnclip_wx_i8m1(_z16, 0, vl); + vfloat32m4_t _s32 = + vlse32_v_f32m4(&(kernel->qinfo[oc].scale), sizeof(struct csinn_quant_info), vl); + vfloat16m2_t _s = vfncvt_f_f_w_f16m2(_s32, vl); + for (int k = 0; k < maxk; k++) { + vint8m1_t _i8 = vle8_v_i8m1(ksrc, vl); + vfloat16m2_t _f16 = shl_rvv_vdeq_vv_f16m2(_i8, _z, _s, vl); + 
vse16_v_f16m2(kdst, _f16, vl); + ksrc += vl; + kdst += vl; + } + } + } else { + int32_t zp = kernel->qinfo->zero_point; + float scale = kernel->qinfo->scale; + shl_rvv_dequantize_i8_to_f16(kernel_int8, kernel_fp16, size, zp, scale); + } + kernel_data = kernel_fp16; + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + kernel_data = (__fp16 *)kernel->data; + } else { + shl_debug_error("kernel unsupport dtype: %d\n", kernel->dtype); + return CSINN_FALSE; + } + __fp16 *input_padd_buf = (__fp16 *)shl_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * (in_w + params->pad_left + params->pad_right) * sizeof(__fp16)); @@ -784,6 +864,10 @@ int shl_rvv_dwconv3x3s2_packn_fp16(struct csinn_tensor *input, struct csinn_tens } } shl_mem_free(input_padd_buf); + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_mem_free(kernel_fp16); + return CSINN_TRUE; + } // requantize shl_rvv_sidcso_op_requantize_fp16(input, output, kernel); return CSINN_TRUE; @@ -812,3 +896,27 @@ void shl_rvv_dwconv_reorder_kernel_packn_fp16(struct csinn_tensor *kernel, memcpy(kernel_data, kernel_trans, out_ch * maxk * sizeof(__fp16)); shl_mem_free(kernel_trans); } + +void shl_rvv_dwconv_reorder_kernel_packn_fp16_w_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + int8_t *kernel_data = (int8_t *)kernel->data; + const int out_ch = kernel->dim[0]; + const int maxk = kernel->dim[2] * kernel->dim[3]; + int8_t *kernel_trans = (int8_t *)shl_mem_alloc(out_ch * maxk * sizeof(int8_t)); + + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + + for (int oc = 0; oc + packn - 1 < out_ch; oc += packn) { + int8_t *ksrc = kernel_data + oc * maxk; + int8_t *kdst = kernel_trans + oc * maxk; + for (int ic = 0; ic < maxk; ic++) { + vint8m1_t _tmp = vlse8_v_i8m1(ksrc + ic, maxk * sizeof(int8_t), vl); + vse8_v_i8m1(kdst, _tmp, vl); + kdst += vl; + } + } + memcpy(kernel_data, kernel_trans, out_ch * maxk * sizeof(int8_t)); + 
shl_mem_free(kernel_trans); +} \ No newline at end of file diff --git a/source/thead_rvv/fp16/depthwise_convolution_fp16_nhwc.c b/source/thead_rvv/fp16/depthwise_convolution_fp16_nhwc.c index a1ba2f13..d50c0b78 100644 --- a/source/thead_rvv/fp16/depthwise_convolution_fp16_nhwc.c +++ b/source/thead_rvv/fp16/depthwise_convolution_fp16_nhwc.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * note: VLEN = 128/256 ... flexible vlen diff --git a/source/thead_rvv/fp16/depthwise_convolution_fp16_packn.c b/source/thead_rvv/fp16/depthwise_convolution_fp16_packn.c index 3a87080d..b554a494 100644 --- a/source/thead_rvv/fp16/depthwise_convolution_fp16_packn.c +++ b/source/thead_rvv/fp16/depthwise_convolution_fp16_packn.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * note: VLEN = 128/256 ... 
flexible vlen @@ -36,7 +36,7 @@ int shl_rvv_dwconv_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *o } __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; - __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *kernel_data = NULL; __fp16 *bias_data = (__fp16 *)bias->data; int batch = input->dim[0]; @@ -58,6 +58,44 @@ int shl_rvv_dwconv_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *o const int packn = csrr_vlenb() / sizeof(__fp16); const int vl = vsetvl_e16m1(packn); + __fp16 *kernel_fp16 = NULL; + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + int size = csinn_tensor_size(kernel); + int8_t *kernel_int8 = (int8_t *)kernel->data; + kernel_fp16 = (__fp16 *)shl_mem_alloc(size * sizeof(__fp16)); + if (kernel->quant_channel > 1) { + const int maxk = kernel->dim[2] * kernel->dim[3]; + for (int oc = 0; oc + packn - 1 < in_c; oc += packn) { + int8_t *ksrc = kernel_int8 + oc * maxk; + __fp16 *kdst = kernel_fp16 + oc * maxk; + vint32m4_t _z32 = vlse32_v_i32m4(&(kernel->qinfo[oc].zero_point), + sizeof(struct csinn_quant_info), vl); + vint16m2_t _z16 = vnclip_wx_i16m2(_z32, 0, vl); + vint8m1_t _z = vnclip_wx_i8m1(_z16, 0, vl); + vfloat32m4_t _s32 = + vlse32_v_f32m4(&(kernel->qinfo[oc].scale), sizeof(struct csinn_quant_info), vl); + vfloat16m2_t _s = vfncvt_f_f_w_f16m2(_s32, vl); + for (int k = 0; k < maxk; k++) { + vint8m1_t _i8 = vle8_v_i8m1(ksrc, vl); + vfloat16m2_t _f16 = shl_rvv_vdeq_vv_f16m2(_i8, _z, _s, vl); + vse16_v_f16m2(kdst, _f16, vl); + ksrc += vl; + kdst += vl; + } + } + } else { + int32_t zp = kernel->qinfo->zero_point; + float scale = kernel->qinfo->scale; + shl_rvv_dequantize_i8_to_f16(kernel_int8, kernel_fp16, size, zp, scale); + } + kernel_data = kernel_fp16; + } else if (kernel->dtype == CSINN_DTYPE_FLOAT16) { + kernel_data = (__fp16 *)kernel->data; + } else { + shl_debug_error("kernel unsupport dtype: %d\n", kernel->dtype); + return CSINN_FALSE; + } + __fp16 *input_padd_buf = 
(__fp16 *)shl_mem_alloc(in_c * padded_in_h * padded_in_w * sizeof(__fp16)); @@ -98,6 +136,10 @@ int shl_rvv_dwconv_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *o output_data += out_c * out_h * out_w; } shl_mem_free(input_padd_buf); + if (kernel->is_const && kernel->dtype == CSINN_DTYPE_INT8) { + shl_mem_free(kernel_fp16); + return CSINN_TRUE; + } // requantize shl_rvv_sidcso_op_requantize_fp16(input, output, kernel); return CSINN_TRUE; diff --git a/source/thead_rvv/fp16/div.c b/source/thead_rvv/fp16/div.c new file mode 100644 index 00000000..15495530 --- /dev/null +++ b/source/thead_rvv/fp16/div.c @@ -0,0 +1,72 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "rvv/rvv.h" + +static inline void div_vv_f16m4(__fp16 *in0, __fp16 *in1, __fp16 *out, int32_t size) +{ + while (size > 0) { + int vl = vsetvl_e16m4(size); + vfloat16m4_t _a = vle16_v_f16m4(in0, vl); + vfloat16m4_t _b = vle16_v_f16m4(in1, vl); + vfloat16m4_t _c = vfdiv_vv_f16m4(_a, _b, vl); + vse16_v_f16m4(out, _c, vl); + in0 += vl; + in1 += vl; + out += vl; + size -= vl; + } +} + +static inline void div_vf_f16m4(__fp16 *in0, __fp16 *in1, __fp16 *out, int32_t size) +{ + while (size > 0) { + int vl = vsetvl_e16m4(size); + vfloat16m4_t _a = vle16_v_f16m4(in0, vl); + vfloat16m4_t _c = vfdiv_vf_f16m4(_a, in1[0], vl); + vse16_v_f16m4(out, _c, vl); + in0 += vl; + out += vl; + size -= vl; + } +} + +static inline void div_fv_f16m4(__fp16 *in0, __fp16 *in1, __fp16 *out, int32_t size) +{ + while (size > 0) { + int vl = vsetvl_e16m4(size); + vfloat16m4_t _b = vle16_v_f16m4(in1, vl); + vfloat16m4_t _c = vfrdiv_vf_f16m4(_b, in0[0], vl); + vse16_v_f16m4(out, _c, vl); + in1 += vl; + out += vl; + size -= vl; + } +} + +void *div_cb_fp16[] = { + [CSINN_BROADCAST_VV] = div_vv_f16m4, + [CSINN_BROADCAST_VS] = div_vf_f16m4, + [CSINN_BROADCAST_SV] = div_fv_f16m4, +}; + +int shl_rvv_div_fp16(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) +{ + return shl_rvv_binary_op_broadcast_fp16(input0, input1, output, div_cb_fp16); +} diff --git a/source/thead_rvv/fp16/erf.c b/source/thead_rvv/fp16/erf.c new file mode 100644 index 00000000..f7011c27 --- /dev/null +++ b/source/thead_rvv/fp16/erf.c @@ -0,0 +1,86 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "rvv/rvv.h" + +#define a1 0.0705230784 +#define a2 0.0422820123 +#define a3 0.0092705272 +#define a4 0.0001520143 +#define a5 0.0002765672 +#define a6 0.0000430638 + +static inline vfloat16m4_t vfpow16_v_f16m4(vfloat16m4_t _x, int vl) +{ + vfloat16m4_t _x2 = vfmul_vv_f16m4(_x, _x, vl); + vfloat16m4_t _x4 = vfmul_vv_f16m4(_x2, _x2, vl); + vfloat16m4_t _x8 = vfmul_vv_f16m4(_x4, _x4, vl); + vfloat16m4_t _x16 = vfmul_vv_f16m4(_x8, _x8, vl); + return _x16; +} + +/************************************************************************************* + * erf(x) = 1 - 1 / (1 + a1*x + a2*x^2 + a3*x^3 + a4*x^4 + a5*x^5 + a6*x^6)^16 + **************************************************************************************/ +int shl_rvv_erf_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + int size = csinn_tensor_size(input); + + while (size > 0) { + int vl = vsetvl_e16m4(size); + vfloat16m4_t _x = vle16_v_f16m4(input_data, vl); + input_data += vl; + + vbool4_t _mask = vmflt_vf_f16m4_b4(_x, 0.0f, vl); + _x = vfmul_vf_f16m4_m(_mask, _x, _x, -1.0f, vl); + + vfloat16m4_t _x2 = vfmul_vv_f16m4(_x, _x, vl); + vfloat16m4_t _x3 = vfmul_vv_f16m4(_x2, _x, vl); + vfloat16m4_t _x4 = vfmul_vv_f16m4(_x2, _x2, vl); + vfloat16m4_t _x5 = vfmul_vv_f16m4(_x3, _x2, vl); + vfloat16m4_t _x6 = vfmul_vv_f16m4(_x3, _x3, vl); + _x = vfmul_vf_f16m4(_x, a1, vl); + _x2 = vfmul_vf_f16m4(_x2, a2, vl); + _x3 = vfmul_vf_f16m4(_x3, a3, vl); + 
_x4 = vfmul_vf_f16m4(_x4, a4, vl); + _x5 = vfmul_vf_f16m4(_x5, a5, vl); + _x6 = vfmul_vf_f16m4(_x6, a6, vl); + + vfloat16m4_t _t = vfmv_v_f_f16m4(1.0f, vl); + _t = vfadd_vv_f16m4(_t, _x, vl); + _t = vfadd_vv_f16m4(_t, _x2, vl); + _t = vfadd_vv_f16m4(_t, _x3, vl); + _t = vfadd_vv_f16m4(_t, _x4, vl); + _t = vfadd_vv_f16m4(_t, _x5, vl); + _t = vfadd_vv_f16m4(_t, _x6, vl); + + vfloat16m4_t _pow = vfpow16_v_f16m4(_t, vl); + vfloat16m4_t _y = vfrdiv_vf_f16m4(_pow, -1.0f, vl); + _y = vfadd_vf_f16m4(_y, 1.0f, vl); + _y = vfmul_vf_f16m4_m(_mask, _y, _y, -1.0f, vl); + + vse16_v_f16m4(output_data, _y, vl); + output_data += vl; + size -= vl; + } + + return CSINN_TRUE; +} diff --git a/source/thead_rvv/fp16/fullyconnected.c b/source/thead_rvv/fp16/fullyconnected.c index 67b178c0..f88d1bfe 100644 --- a/source/thead_rvv/fp16/fullyconnected.c +++ b/source/thead_rvv/fp16/fullyconnected.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" int shl_rvv_fullyconnected_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *weights, struct csinn_tensor *bias, diff --git a/source/thead_rvv/fp16/fullyconnected_fp16.c b/source/thead_rvv/fp16/fullyconnected_fp16.c index ed128fa0..befa51da 100644 --- a/source/thead_rvv/fp16/fullyconnected_fp16.c +++ b/source/thead_rvv/fp16/fullyconnected_fp16.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* note: VLEN = 128/256 @@ -89,6 +89,10 @@ int shl_rvv_fullyconnected_packn_fp16(struct csinn_tensor *input, struct csinn_t struct csinn_tensor *weights, struct csinn_tensor *bias, struct csinn_fc_params *params) { + if (input->layout >= CSINN_LAYOUT_NC1C0 && input->layout <= CSINN_LAYOUT_NC1DHWC0) { + shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp16(input); + } + __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; __fp16 *weights_data = (__fp16 *)weights->data; @@ -115,13 +119,23 @@ int shl_rvv_fullyconnected_packn_fp16(struct csinn_tensor *input, struct csinn_t __fp16 *weights_fp16 = NULL; if (weights->is_const && weights->dtype == CSINN_DTYPE_INT8) { - // TODO: support per-channel quantization - int32_t zp = weights->qinfo->zero_point; - float scale = weights->qinfo->scale; int size = csinn_tensor_size(weights); int8_t *weights_int8 = (int8_t *)weights->data; weights_fp16 = (__fp16 *)shl_mem_alloc(size * sizeof(__fp16)); - shl_rvv_dequantize_i8_to_f16(weights_int8, weights_fp16, size, zp, scale); + if (weights->quant_channel == 1) { + int32_t zp = weights->qinfo->zero_point; + float scale = weights->qinfo->scale; + shl_rvv_dequantize_i8_to_f16(weights_int8, weights_fp16, size, zp, scale); + } else if (weights->quant_channel == output_depth) { + // support channel quantization + for (int c = 0; c < output_depth; c++) { + int32_t zp = weights->qinfo[c].zero_point; + float scale = weights->qinfo[c].scale; + shl_rvv_dequantize_i8_to_f16(weights_int8 + c * accum_depth, + weights_fp16 + c * accum_depth, accum_depth, zp, + scale); + } + } weights_data = weights_fp16; } else if (weights->dtype == CSINN_DTYPE_FLOAT16) { weights_data = (__fp16 *)weights->data; diff --git a/source/thead_rvv/fp16/gather.c b/source/thead_rvv/fp16/gather.c index ce88e78b..13fa5b43 100644 --- a/source/thead_rvv/fp16/gather.c +++ 
b/source/thead_rvv/fp16/gather.c @@ -16,11 +16,15 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" int shl_rvv_gather_fp16(struct csinn_tensor *input, struct csinn_tensor *indices, struct csinn_tensor *output, struct csinn_gather_params *params) { + if (input->layout >= CSINN_LAYOUT_NC1C0 && input->layout <= CSINN_LAYOUT_NC1DHWC0) { + shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp16(input); + } + int input_size = csinn_tensor_size(input); if (input_size == 0) { return CSINN_TRUE; diff --git a/source/thead_rvv/fp16/gemm_fp16.c b/source/thead_rvv/fp16/gemm_fp16.c index cc583ece..136ea068 100644 --- a/source/thead_rvv/fp16/gemm_fp16.c +++ b/source/thead_rvv/fp16/gemm_fp16.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************************ * input matrix and kernel matrix have been reordered diff --git a/source/thead_rvv/fp16/gemm_fp16_block.c b/source/thead_rvv/fp16/gemm_fp16_block.c index 01078622..8eef76f1 100644 --- a/source/thead_rvv/fp16/gemm_fp16_block.c +++ b/source/thead_rvv/fp16/gemm_fp16_block.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * packn = vlenb / sizeof(__fp16) @@ -677,8 +677,8 @@ static inline void gemm_12xpack2n_fp16(__fp16 *dst, const __fp16 *sa, const __fp /************************************************************* * packn = vlenb / sizeof(__fp16) - * m_blk: M_BLK, M_BLK/2, M_BLK/4, ..., 12 - * n_blk: N_BLK, N_BLK/2, N_BLK/4, ..., pack2n + * m_blk: M_BLK, M_tail + * n_blk: N_BLK, N_tail * k_blk: K_BLK, K_tail * * dst - output: [m, n] @@ -700,28 +700,17 @@ void shl_rvv_gemm_block_12xpack2n_fp16(__fp16 *dst, const __fp16 *sa, const __fp bias = (__fp16 *)shl_mem_alloc(m * sizeof(__fp16)); } - const int packn = csrr_vlenb() / sizeof(__fp16); - - const int MIN_M_BLK = 12; - const int MIN_N_BLK = packn * 2; - int m_block = M_BLK; int m_idx = 0; while (m_idx < m) { - while (!(m_idx + m_block - 1 < m)) { - m_block /= 2; - } - if (m_block < MIN_M_BLK) { + if (m - m_idx < m_block) { m_block = m - m_idx; } int n_block = N_BLK; int n_idx = 0; while (n_idx < n) { - while (!(n_idx + n_block - 1 < n)) { - n_block /= 2; - } - if (n_block < MIN_N_BLK) { + if (n - n_idx < n_block) { n_block = n - n_idx; } diff --git a/source/thead_rvv/fp16/gemm_fp16_packn.c b/source/thead_rvv/fp16/gemm_fp16_packn.c index 2ce6e717..6920cd0b 100644 --- a/source/thead_rvv/fp16/gemm_fp16_packn.c +++ b/source/thead_rvv/fp16/gemm_fp16_packn.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * note: VLEN = 128/256 ... 
flexible vlen @@ -28,8 +28,9 @@ * sa - kernel: [m/pack2n, k, pack2n] [m/packn, k, packn] * sb - input: [n/8, k, 8] **************************************************************/ +// XXX: unsupported fuse relu void shl_rvv_ncxhwx_gemm_8xpack2n_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, - __fp16 *bias, int m, int k, int n, int ldc) + __fp16 *bias, int m, int k, int n, bool fuse_relu) { __fp16 *kernel_data = (__fp16 *)sa; __fp16 *input_data = (__fp16 *)sb; @@ -397,8 +398,9 @@ void shl_rvv_ncxhwx_gemm_8xpack2n_fp16(__fp16 *dst, const __fp16 *sa, const __fp * sa - kernel: [m/pack2n, k, pack2n] [m/packn, k, packn] * sb - input: [n/12, k, 12] **************************************************************/ +// XXX: unsupported fuse relu void shl_rvv_ncxhwx_gemm_12xpack2n_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, - __fp16 *bias, int m, int k, int n, int ldc) + __fp16 *bias, int m, int k, int n, bool fuse_relu) { __fp16 *kernel_data = (__fp16 *)sa; __fp16 *input_data = (__fp16 *)sb; diff --git a/source/thead_rvv/fp16/global_avgpool.c b/source/thead_rvv/fp16/global_avgpool.c index 789b3ab9..d18ead02 100644 --- a/source/thead_rvv/fp16/global_avgpool.c +++ b/source/thead_rvv/fp16/global_avgpool.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* note: VLEN = 128/256 diff --git a/source/thead_rvv/fp16/global_avgpool_nhwc.c b/source/thead_rvv/fp16/global_avgpool_nhwc.c index 36beb17b..1bb81e1d 100644 --- a/source/thead_rvv/fp16/global_avgpool_nhwc.c +++ b/source/thead_rvv/fp16/global_avgpool_nhwc.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* note: VLEN = 128/256 diff --git a/source/thead_rvv/fp16/global_avgpool_packn.c b/source/thead_rvv/fp16/global_avgpool_packn.c index 3c400be8..42eecea0 100644 --- a/source/thead_rvv/fp16/global_avgpool_packn.c +++ b/source/thead_rvv/fp16/global_avgpool_packn.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" int shl_rvv_global_avgpool2d_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_pool_params *params) diff --git a/source/thead_rvv/fp16/global_maxpool.c b/source/thead_rvv/fp16/global_maxpool.c index 9567fb6e..a3ba43a2 100644 --- a/source/thead_rvv/fp16/global_maxpool.c +++ b/source/thead_rvv/fp16/global_maxpool.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* note: VLEN = 128/256 diff --git a/source/thead_rvv/fp16/global_maxpool_nhwc.c b/source/thead_rvv/fp16/global_maxpool_nhwc.c index 1f6c607c..fdd8e0ce 100644 --- a/source/thead_rvv/fp16/global_maxpool_nhwc.c +++ b/source/thead_rvv/fp16/global_maxpool_nhwc.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* note: VLEN = 128/256 diff --git a/source/thead_rvv/fp16/global_maxpool_packn.c b/source/thead_rvv/fp16/global_maxpool_packn.c index 78e3351e..5252929c 100644 --- a/source/thead_rvv/fp16/global_maxpool_packn.c +++ b/source/thead_rvv/fp16/global_maxpool_packn.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * note: VLEN = 128/256 ... 
flexible vlen diff --git a/source/thead_rvv/fp16/layer_norm.c b/source/thead_rvv/fp16/layer_norm.c index be39df6c..68ad1d3c 100644 --- a/source/thead_rvv/fp16/layer_norm.c +++ b/source/thead_rvv/fp16/layer_norm.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* note: support flexible vlen diff --git a/source/thead_rvv/fp16/leaky_relu.c b/source/thead_rvv/fp16/leaky_relu.c index 8c05daf3..4407632d 100644 --- a/source/thead_rvv/fp16/leaky_relu.c +++ b/source/thead_rvv/fp16/leaky_relu.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* note: VLEN = 128/256 ... diff --git a/source/thead_rvv/fp16/matmul.c b/source/thead_rvv/fp16/matmul.c index 0cad43ae..447082ac 100644 --- a/source/thead_rvv/fp16/matmul.c +++ b/source/thead_rvv/fp16/matmul.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" #define MATMUL_M_BLK 64 #define MATMUL_K_BLK 64 @@ -26,6 +26,13 @@ int shl_rvv_matmul_block_fp16(struct csinn_tensor *mat0, struct csinn_tensor *ma struct csinn_tensor *output, struct csinn_matmul_params *params, const int M_BLK, const int K_BLK, const int N_BLK) { + if (mat0->layout >= CSINN_LAYOUT_NC1C0 && mat0->layout <= CSINN_LAYOUT_NC1DHWC0) { + shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp16(mat0); + } + if (mat1->layout >= CSINN_LAYOUT_NC1C0 && mat1->layout <= CSINN_LAYOUT_NC1DHWC0) { + shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp16(mat1); + } + __fp16 *mat0_data = (__fp16 *)mat0->data; __fp16 *mat1_data = (__fp16 *)mat1->data; __fp16 *output_data = (__fp16 *)output->data; @@ -37,15 +44,17 @@ int shl_rvv_matmul_block_fp16(struct csinn_tensor *mat0, struct csinn_tensor *ma /* compute the outer size */ for (int i = 0; i < dims_count - 2; i++) { batches_a *= mat0->dim[i]; + } + for (int i = 0; i < mat1->dim_count - 2; i++) { batches_b *= mat1->dim[i]; } const int dim_m = mat0->dim[dims_count - (params->trans_a ? 1 : 2)]; const int dim_k = mat0->dim[dims_count - (params->trans_a ? 2 : 1)]; - const int dim_n = mat1->dim[dims_count - (params->trans_b ? 2 : 1)]; + const int dim_n = mat1->dim[mat1->dim_count - (params->trans_b ? 
2 : 1)]; - if (batches_a == batches_b) { - if (!params->trans_a && !params->trans_b) { + if (!params->trans_a && !params->trans_b) { + if (batches_a == batches_b) { __fp16 *in0 = (__fp16 *)shl_mem_alloc(dim_m * dim_k * sizeof(__fp16)); __fp16 *in1; if (!(mat1->is_const)) { @@ -74,11 +83,7 @@ int shl_rvv_matmul_block_fp16(struct csinn_tensor *mat0, struct csinn_tensor *ma } // requantize shl_rvv_sidcso_op_requantize_fp16(mat0, output, mat1); - } else { - shl_ref_matmul_quant(mat0, mat1, output, params); - } - } else if (batches_a > 1 && batches_b == 1) { - if (!params->trans_a && !params->trans_b) { + } else if (batches_a > 1 && batches_b == 1) { __fp16 *in0 = (__fp16 *)shl_mem_alloc(dim_m * dim_k * sizeof(__fp16)); __fp16 *in1; if (!(mat1->is_const)) { @@ -109,8 +114,7 @@ int shl_rvv_matmul_block_fp16(struct csinn_tensor *mat0, struct csinn_tensor *ma return CSINN_FALSE; } } else { - shl_debug_error("matmul unsupported this broadcast\n"); - return CSINN_FALSE; + return shl_ref_matmul_quant(mat0, mat1, output, params); } return CSINN_TRUE; @@ -121,6 +125,10 @@ int shl_rvv_matmul_block_fp16_w_int8(struct csinn_tensor *mat0, struct csinn_ten struct csinn_matmul_params *params, const int M_BLK, const int K_BLK, const int N_BLK) { + if (mat0->layout >= CSINN_LAYOUT_NC1C0 && mat0->layout <= CSINN_LAYOUT_NC1DHWC0) { + shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp16(mat0); + } + __fp16 *mat0_data = (__fp16 *)mat0->data; int8_t *mat1_data = (int8_t *)mat1->data; __fp16 *output_data = (__fp16 *)output->data; @@ -132,12 +140,14 @@ int shl_rvv_matmul_block_fp16_w_int8(struct csinn_tensor *mat0, struct csinn_ten /* compute the outer size */ for (int i = 0; i < dims_count - 2; i++) { batches_a *= mat0->dim[i]; + } + for (int i = 0; i < mat1->dim_count - 2; i++) { batches_b *= mat1->dim[i]; } const int dim_m = mat0->dim[dims_count - (params->trans_a ? 1 : 2)]; const int dim_k = mat0->dim[dims_count - (params->trans_a ? 
2 : 1)]; - const int dim_n = mat1->dim[dims_count - (params->trans_b ? 2 : 1)]; + const int dim_n = mat1->dim[mat1->dim_count - (params->trans_b ? 2 : 1)]; int32_t zp = mat1->qinfo->zero_point; float scale = mat1->qinfo->scale; @@ -145,8 +155,8 @@ int shl_rvv_matmul_block_fp16_w_int8(struct csinn_tensor *mat0, struct csinn_ten int api = params->base.api; int size1 = csinn_tensor_size(mat1); - if (batches_a == batches_b) { - if (!params->trans_a && !params->trans_b) { + if (!params->trans_a && !params->trans_b) { + if (batches_a == batches_b) { __fp16 *in0 = (__fp16 *)shl_mem_alloc(dim_m * dim_k * sizeof(__fp16)); __fp16 *in1 = (__fp16 *)shl_mem_alloc(size1 * sizeof(__fp16)); shl_rvv_dequantize_i8_to_f16(mat1_data, in1, size1, zp, scale); @@ -162,11 +172,7 @@ int shl_rvv_matmul_block_fp16_w_int8(struct csinn_tensor *mat0, struct csinn_ten } shl_mem_free(in0); shl_mem_free(in1); - } else { - shl_ref_matmul_quant(mat0, mat1, output, params); - } - } else if (batches_a > 1 && batches_b == 1) { - if (!params->trans_a && !params->trans_b) { + } else if (batches_a > 1 && batches_b == 1) { __fp16 *in0 = (__fp16 *)shl_mem_alloc(dim_m * dim_k * sizeof(__fp16)); __fp16 *in1 = (__fp16 *)shl_mem_alloc(size1 * sizeof(__fp16)); shl_rvv_dequantize_i8_to_f16(mat1_data, in1, size1, zp, scale); @@ -187,8 +193,7 @@ int shl_rvv_matmul_block_fp16_w_int8(struct csinn_tensor *mat0, struct csinn_ten return CSINN_FALSE; } } else { - shl_debug_error("matmul unsupported this broadcast\n"); - return CSINN_FALSE; + return shl_ref_matmul_quant(mat0, mat1, output, params); } return CSINN_TRUE; @@ -234,7 +239,7 @@ static inline void reorder_matb_pack2nxk_fp16_w_int8(int8_t *src, int8_t *dst, i * packn = vlenb / sizeof(__fp16) * src: [k, n] * dst: [n/n_blk, k/k_blk, n_blk/pack2n, k_blk, pack2n] - * n_blk: N_BLK, N_BLK/2, N_BLK/4, ..., pack2n + * n_blk: N_BLK, N_tail * k_blk: K_BLK, K_tail ************************************************************/ void 
shl_rvv_matmul_reorder_weight_fp16_w_int8(struct csinn_tensor *mat1, const int K_BLK, @@ -289,7 +294,7 @@ void shl_rvv_matmul_reorder_weight_fp16_w_int8(struct csinn_tensor *mat1, const * packn = vlenb / sizeof(__fp16) * src: [k, n] * dst: [n/n_blk, k/k_blk, n_blk/pack2n, k_blk, pack2n] - * n_blk: N_BLK, N_BLK/2, N_BLK/4, ..., pack2n + * n_blk: N_BLK, N_tail * k_blk: K_BLK, K_tail ************************************************************/ void shl_rvv_matmul_reorder_weight_fp16(struct csinn_tensor *mat1, const int K_BLK, const int N_BLK) @@ -329,20 +334,23 @@ int shl_rvv_matmul_init_fp16(struct csinn_tensor *mat0, struct csinn_tensor *mat struct csinn_tensor *output, struct csinn_matmul_params *params) { struct csinn_callback *cb = params->base.cb; - if (mat0->dtype == CSINN_DTYPE_FLOAT16) { - if (mat1->is_const && mat1->dtype == CSINN_DTYPE_INT8) { - shl_rvv_matmul_reorder_weight_fp16_w_int8(mat1, MATMUL_K_BLK, MATMUL_N_BLK); - } else if (mat1->dtype == CSINN_DTYPE_FLOAT16) { - if (mat1->is_const) { - shl_rvv_matmul_reorder_weight_fp16(mat1, MATMUL_K_BLK, MATMUL_N_BLK); + if (!params->trans_a && !params->trans_b) { + if (mat0->dtype == CSINN_DTYPE_FLOAT16) { + if (mat1->is_const && mat1->dtype == CSINN_DTYPE_INT8) { + shl_rvv_matmul_reorder_weight_fp16_w_int8(mat1, MATMUL_K_BLK, MATMUL_N_BLK); + } else if (mat1->dtype == CSINN_DTYPE_FLOAT16) { + if (mat1->is_const) { + shl_rvv_matmul_reorder_weight_fp16(mat1, MATMUL_K_BLK, MATMUL_N_BLK); + } } - } else { - shl_debug_error("mat1 unsupported dtype: %d\n", mat1->dtype); - return CSINN_FALSE; + cb->exec = shl_rvv_matmul_fp16; } - cb->exec = shl_rvv_matmul_fp16; - } else { - shl_debug_error("mat0 unsupported dtype: %d\n", mat0->dtype); + } + if (cb->exec == NULL) { + shl_debug_warning( + "matmul is not optimized to achieve under this condition, call reference func " + "replaced.\n"); + cb->exec = shl_ref_matmul_quant; } return CSINN_TRUE; } diff --git a/source/thead_rvv/fp16/maxpool.c 
b/source/thead_rvv/fp16/maxpool.c index cd16498e..8e8a1a0b 100644 --- a/source/thead_rvv/fp16/maxpool.c +++ b/source/thead_rvv/fp16/maxpool.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" int shl_rvv_maxpool2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_pool_params *params) { @@ -46,6 +46,8 @@ int shl_rvv_maxpool2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor if (shl_is_first_layer_input(input, sess)) { elempack = 1; } + } else if (sess->base_run_mode == CSINN_RM_LAYER) { + elempack = in_c % packn == 0 ? packn : 1; } // global maxpool2d // TODO: remove diff --git a/source/thead_rvv/fp16/maxpool_2x2_fp16.c b/source/thead_rvv/fp16/maxpool_2x2_fp16.c index f326825d..cb788945 100644 --- a/source/thead_rvv/fp16/maxpool_2x2_fp16.c +++ b/source/thead_rvv/fp16/maxpool_2x2_fp16.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* note: VLEN = 128/256 diff --git a/source/thead_rvv/fp16/maxpool_2x2_fp16_packn.c b/source/thead_rvv/fp16/maxpool_2x2_fp16_packn.c index 8bff589f..acd9b357 100644 --- a/source/thead_rvv/fp16/maxpool_2x2_fp16_packn.c +++ b/source/thead_rvv/fp16/maxpool_2x2_fp16_packn.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * note: support flexible vlen diff --git a/source/thead_rvv/fp16/maxpool_3x3_fp16.c b/source/thead_rvv/fp16/maxpool_3x3_fp16.c index 50acf262..95210ad5 100644 --- a/source/thead_rvv/fp16/maxpool_3x3_fp16.c +++ b/source/thead_rvv/fp16/maxpool_3x3_fp16.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* note: VLEN = 128/256 diff --git a/source/thead_rvv/fp16/maxpool_3x3_fp16_packn.c b/source/thead_rvv/fp16/maxpool_3x3_fp16_packn.c index 5a2f91c1..6675c9ae 100644 --- a/source/thead_rvv/fp16/maxpool_3x3_fp16_packn.c +++ b/source/thead_rvv/fp16/maxpool_3x3_fp16_packn.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * note: support flexible vlen diff --git a/source/thead_rvv/fp16/maxpool_fp16_nhwc.c b/source/thead_rvv/fp16/maxpool_fp16_nhwc.c index 98abb580..f2776ee4 100644 --- a/source/thead_rvv/fp16/maxpool_fp16_nhwc.c +++ b/source/thead_rvv/fp16/maxpool_fp16_nhwc.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * note: support flexible vlen diff --git a/source/thead_rvv/fp16/maxpool_fp16_packn.c b/source/thead_rvv/fp16/maxpool_fp16_packn.c index 91096396..97af0e64 100644 --- a/source/thead_rvv/fp16/maxpool_fp16_packn.c +++ b/source/thead_rvv/fp16/maxpool_fp16_packn.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * note: support flexible vlen diff --git a/source/thead_rvv/fp16/mul.c b/source/thead_rvv/fp16/mul.c index b57599ec..6c9f9a8b 100644 --- a/source/thead_rvv/fp16/mul.c +++ b/source/thead_rvv/fp16/mul.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" static void elementwise_mul_fp16(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output) @@ -58,6 +58,45 @@ static void broadcast_single_1_mul_fp16(struct csinn_tensor *input0, struct csin } } +static inline void mul_vv_f16m4(__fp16 *in0, __fp16 *in1, __fp16 *out, int32_t size) +{ + while (size > 0) { + int vl = vsetvl_e16m4(size); + vfloat16m4_t _a = vle16_v_f16m4(in0, vl); + vfloat16m4_t _b = vle16_v_f16m4(in1, vl); + vfloat16m4_t _c = vfmul_vv_f16m4(_a, _b, vl); + vse16_v_f16m4(out, _c, vl); + in0 += vl; + in1 += vl; + out += vl; + size -= vl; + } +} + +static inline void mul_vf_f16m4(__fp16 *in0, __fp16 *in1, __fp16 *out, int32_t size) +{ + while (size > 0) { + int vl = vsetvl_e16m4(size); + vfloat16m4_t _a = vle16_v_f16m4(in0, vl); + vfloat16m4_t _c = vfmul_vf_f16m4(_a, in1[0], vl); + vse16_v_f16m4(out, _c, vl); + in0 += vl; + out += vl; + size -= vl; + } +} + +static inline void mul_fv_f16m4(__fp16 *in0, __fp16 *in1, __fp16 *out, int32_t size) +{ + mul_vf_f16m4(in1, in0, out, size); +} + +void *mul_cb_fp16[] = { + [CSINN_BROADCAST_VV] = mul_vv_f16m4, + [CSINN_BROADCAST_VS] = mul_vf_f16m4, + [CSINN_BROADCAST_SV] = mul_fv_f16m4, +}; + int shl_rvv_mul_fp16(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, struct csinn_diso_params *params) { @@ -87,8 +126,7 @@ int shl_rvv_mul_fp16(struct csinn_tensor *input0, struct csinn_tensor *input1, // requantize shl_rvv_sidcso_op_requantize_fp16(input0, output, input1); } else { - /* TODO: recursive opt */ - return shl_ref_mul_quant(input0, input1, output, params); + return shl_rvv_binary_op_broadcast_fp16(input0, input1, output, mul_cb_fp16); } return CSINN_TRUE; } diff --git a/source/thead_rvv/fp16/pad.c b/source/thead_rvv/fp16/pad.c index 5032c176..3d0988eb 100644 --- a/source/thead_rvv/fp16/pad.c +++ b/source/thead_rvv/fp16/pad.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" void shl_rvv_pad_input_fp16(const __fp16 *input, __fp16 *input_padded, int inc, int inh, int inw, int padded_h, int padded_w, int pad_top, int pad_left) { diff --git a/source/thead_rvv/fp16/prelu.c b/source/thead_rvv/fp16/prelu.c index b3015d10..8d653bf1 100644 --- a/source/thead_rvv/fp16/prelu.c +++ b/source/thead_rvv/fp16/prelu.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" int shl_rvv_prelu_fp16(struct csinn_tensor *input, struct csinn_tensor *alpha, struct csinn_tensor *output, struct csinn_prelu_params *params) diff --git a/source/thead_rvv/fp16/relu.c b/source/thead_rvv/fp16/relu.c index 5aaa1b65..7b96f6a1 100644 --- a/source/thead_rvv/fp16/relu.c +++ b/source/thead_rvv/fp16/relu.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" int shl_rvv_relu_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_relu_params *params) diff --git a/source/thead_rvv/fp16/relu6.c b/source/thead_rvv/fp16/relu6.c index d3d2c1d7..5f022ef9 100644 --- a/source/thead_rvv/fp16/relu6.c +++ b/source/thead_rvv/fp16/relu6.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" int shl_rvv_relu6_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_relu_params *params) diff --git a/source/thead_rvv/fp16/reshape.c b/source/thead_rvv/fp16/reshape.c index 3df57d5b..df97575b 100644 --- a/source/thead_rvv/fp16/reshape.c +++ b/source/thead_rvv/fp16/reshape.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" int shl_rvv_reshape_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_reshape_params *params) @@ -25,7 +25,7 @@ int shl_rvv_reshape_fp16(struct csinn_tensor *input, struct csinn_tensor *output __fp16 *output_data = (__fp16 *)output->data; shl_gref_reshape_infer_shape(input, output, params); - if (input->layout >= CSINN_LAYOUT_NC1WC0 && input->layout <= CSINN_LAYOUT_NC1DHWC0) { + if (input->layout >= CSINN_LAYOUT_NC1C0 && input->layout <= CSINN_LAYOUT_NC1DHWC0) { const int packn = csrr_vlenb() / sizeof(__fp16); const int vl = vsetvl_e16m1(packn); int outer_size = input->dim[0] * input->dim[1]; // batch fuse to outer diff --git a/source/thead_rvv/fp16/sigmoid.c b/source/thead_rvv/fp16/sigmoid.c index 312be2e4..6068615d 100644 --- a/source/thead_rvv/fp16/sigmoid.c +++ b/source/thead_rvv/fp16/sigmoid.c @@ -16,8 +16,8 @@ * limitations under the License. */ +#include "rvv/rvv.h" #include "rvv_mathfun_fp16.h" -#include "shl_thead_rvv.h" int shl_rvv_sigmoid_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_sigmoid_params *params) diff --git a/source/thead_rvv/fp16/softmax.c b/source/thead_rvv/fp16/softmax.c index 8cf21de5..ad3e58a9 100644 --- a/source/thead_rvv/fp16/softmax.c +++ b/source/thead_rvv/fp16/softmax.c @@ -16,8 +16,8 @@ * limitations under the License. */ +#include "rvv/rvv.h" #include "rvv_mathfun_fp16.h" -#include "shl_thead_rvv.h" static inline __fp16 fast_exp16(__fp16 y) { diff --git a/source/thead_rvv/fp16/strided_slice.c b/source/thead_rvv/fp16/strided_slice.c index ce519ebc..301f3c9f 100644 --- a/source/thead_rvv/fp16/strided_slice.c +++ b/source/thead_rvv/fp16/strided_slice.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" static int get_index(int32_t *dim, int32_t *idx, int32_t dim_count) { @@ -30,10 +30,14 @@ static int get_index(int32_t *dim, int32_t *idx, int32_t dim_count) int shl_rvv_strided_slice_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_strided_slice_params *params) { + if (input->layout >= CSINN_LAYOUT_NC1C0 && input->layout <= CSINN_LAYOUT_NC1DHWC0) { + shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp16(input); + } + __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; - for (int i = 0; i < params->slice_count; i++) { + for (int i = 0; i < input->dim_count; i++) { if (params->begin[i] < -input->dim[i]) params->begin[i] = -input->dim[i]; if (params->begin[i] < 0) params->begin[i] += input->dim[i]; if (params->begin[i] > input->dim[i]) params->begin[i] = input->dim[i]; diff --git a/source/thead_rvv/fp16/sub.c b/source/thead_rvv/fp16/sub.c new file mode 100644 index 00000000..7b253560 --- /dev/null +++ b/source/thead_rvv/fp16/sub.c @@ -0,0 +1,72 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "rvv/rvv.h" + +static inline void sub_vv_f16m4(__fp16 *in0, __fp16 *in1, __fp16 *out, int32_t size) +{ + while (size > 0) { + int vl = vsetvl_e16m4(size); + vfloat16m4_t _a = vle16_v_f16m4(in0, vl); + vfloat16m4_t _b = vle16_v_f16m4(in1, vl); + vfloat16m4_t _c = vfsub_vv_f16m4(_a, _b, vl); + vse16_v_f16m4(out, _c, vl); + in0 += vl; + in1 += vl; + out += vl; + size -= vl; + } +} + +static inline void sub_vf_f16m4(__fp16 *in0, __fp16 *in1, __fp16 *out, int32_t size) +{ + while (size > 0) { + int vl = vsetvl_e16m4(size); + vfloat16m4_t _a = vle16_v_f16m4(in0, vl); + vfloat16m4_t _c = vfsub_vf_f16m4(_a, in1[0], vl); + vse16_v_f16m4(out, _c, vl); + in0 += vl; + out += vl; + size -= vl; + } +} + +static inline void sub_fv_f16m4(__fp16 *in0, __fp16 *in1, __fp16 *out, int32_t size) +{ + while (size > 0) { + int vl = vsetvl_e16m4(size); + vfloat16m4_t _b = vle16_v_f16m4(in1, vl); + vfloat16m4_t _c = vfrsub_vf_f16m4(_b, in0[0], vl); + vse16_v_f16m4(out, _c, vl); + in1 += vl; + out += vl; + size -= vl; + } +} + +void *sub_cb_fp16[] = { + [CSINN_BROADCAST_VV] = sub_vv_f16m4, + [CSINN_BROADCAST_VS] = sub_vf_f16m4, + [CSINN_BROADCAST_SV] = sub_fv_f16m4, +}; + +int shl_rvv_sub_fp16(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) +{ + return shl_rvv_binary_op_broadcast_fp16(input0, input1, output, sub_cb_fp16); +} diff --git a/source/thead_rvv/fp16/transpose.c b/source/thead_rvv/fp16/transpose.c index 63d496cc..a9628c2c 100644 --- a/source/thead_rvv/fp16/transpose.c +++ b/source/thead_rvv/fp16/transpose.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" static void transpose_021_fp16(__fp16 *src, __fp16 *dst, int batch, int inner_size, int outer_size) { @@ -89,9 +89,10 @@ static int transpose_tail_coincide_fp16(struct csinn_tensor *input, struct csinn int shl_rvv_transpose_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_transpose_params *params) { - if (input->layout >= CSINN_LAYOUT_NC1WC0 && input->layout <= CSINN_LAYOUT_NC1DHWC0) { - return shl_ref_transpose_quant(input, output, params); + if (input->layout >= CSINN_LAYOUT_NC1C0 && input->layout <= CSINN_LAYOUT_NC1DHWC0) { + shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp16(input); } + if (params->permute_num == 4 && params->permute[0] == 0 && params->permute[1] == 1 && params->permute[2] == 2 && params->permute[3] == 3) { __fp16 *input_data = (__fp16 *)input->data; diff --git a/source/thead_rvv/fp32/add.c b/source/thead_rvv/fp32/add.c index e91dd036..6382d0e5 100644 --- a/source/thead_rvv/fp32/add.c +++ b/source/thead_rvv/fp32/add.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* note: VLEN = 128/256 @@ -61,6 +61,45 @@ static void broadcast_single_1_add_fp32(struct csinn_tensor *input0, struct csin } } +static inline void add_vv_f32m4(float *in0, float *in1, float *out, int32_t size) +{ + while (size > 0) { + int vl = vsetvl_e32m4(size); + vfloat32m4_t _a = vle32_v_f32m4(in0, vl); + vfloat32m4_t _b = vle32_v_f32m4(in1, vl); + vfloat32m4_t _c = vfadd_vv_f32m4(_a, _b, vl); + vse32_v_f32m4(out, _c, vl); + in0 += vl; + in1 += vl; + out += vl; + size -= vl; + } +} + +static inline void add_vf_f32m4(float *in0, float *in1, float *out, int32_t size) +{ + while (size > 0) { + int vl = vsetvl_e32m4(size); + vfloat32m4_t _a = vle32_v_f32m4(in0, vl); + vfloat32m4_t _c = vfadd_vf_f32m4(_a, in1[0], vl); + vse32_v_f32m4(out, _c, vl); + in0 += vl; + out += vl; + size -= vl; + } +} + +static inline void add_fv_f32m4(float *in0, float *in1, float *out, int32_t size) +{ + add_vf_f32m4(in1, in0, out, size); +} + +void *add_cb_fp32[] = { + [CSINN_BROADCAST_VV] = add_vv_f32m4, + [CSINN_BROADCAST_VS] = add_vf_f32m4, + [CSINN_BROADCAST_SV] = add_fv_f32m4, +}; + int shl_rvv_add_fp32(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, struct csinn_diso_params *params) { @@ -86,8 +125,7 @@ int shl_rvv_add_fp32(struct csinn_tensor *input0, struct csinn_tensor *input1, } broadcast_single_1_add_fp32(input0, input1, output); } else { - /* TODO: recursive opt */ - return shl_ref_add_quant(input0, input1, output, params); + return shl_rvv_binary_op_broadcast_fp32(input0, input1, output, add_cb_fp32); } return CSINN_TRUE; } diff --git a/source/thead_rvv/fp32/avgpool.c b/source/thead_rvv/fp32/avgpool.c index 9f573cc3..0853652d 100644 --- a/source/thead_rvv/fp32/avgpool.c +++ b/source/thead_rvv/fp32/avgpool.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" int shl_rvv_avgpool2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_pool_params *params) @@ -47,6 +47,8 @@ int shl_rvv_avgpool2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor if (shl_is_first_layer_input(input, sess)) { elempack = 1; } + } else if (sess->base_run_mode == CSINN_RM_LAYER) { + elempack = in_c % packn == 0 ? packn : 1; } // global avgpool2d diff --git a/source/thead_rvv/fp32/avgpool_2x2_fp32.c b/source/thead_rvv/fp32/avgpool_2x2_fp32.c index 9de72def..6434ba4c 100644 --- a/source/thead_rvv/fp32/avgpool_2x2_fp32.c +++ b/source/thead_rvv/fp32/avgpool_2x2_fp32.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* note: VLEN = 128/256 diff --git a/source/thead_rvv/fp32/avgpool_2x2_fp32_packn.c b/source/thead_rvv/fp32/avgpool_2x2_fp32_packn.c index caf8076a..ad0674bb 100644 --- a/source/thead_rvv/fp32/avgpool_2x2_fp32_packn.c +++ b/source/thead_rvv/fp32/avgpool_2x2_fp32_packn.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * note: support flexible vlen diff --git a/source/thead_rvv/fp32/avgpool_3x3_fp32.c b/source/thead_rvv/fp32/avgpool_3x3_fp32.c index baaded0f..5ed7bbf3 100644 --- a/source/thead_rvv/fp32/avgpool_3x3_fp32.c +++ b/source/thead_rvv/fp32/avgpool_3x3_fp32.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* note: VLEN = 128/256 diff --git a/source/thead_rvv/fp32/avgpool_3x3_fp32_packn.c b/source/thead_rvv/fp32/avgpool_3x3_fp32_packn.c index 6acd2fb9..76a494bb 100644 --- a/source/thead_rvv/fp32/avgpool_3x3_fp32_packn.c +++ b/source/thead_rvv/fp32/avgpool_3x3_fp32_packn.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * note: support flexible vlen diff --git a/source/thead_rvv/fp32/avgpool_fp32_nhwc.c b/source/thead_rvv/fp32/avgpool_fp32_nhwc.c index e48bffb2..8ff37030 100644 --- a/source/thead_rvv/fp32/avgpool_fp32_nhwc.c +++ b/source/thead_rvv/fp32/avgpool_fp32_nhwc.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" #define max(a, b) ((a) > (b) ? (a) : (b)) #define min(a, b) ((a) < (b) ? (a) : (b)) diff --git a/source/thead_rvv/fp32/avgpool_fp32_packn.c b/source/thead_rvv/fp32/avgpool_fp32_packn.c index b0124cbc..13d249e3 100644 --- a/source/thead_rvv/fp32/avgpool_fp32_packn.c +++ b/source/thead_rvv/fp32/avgpool_fp32_packn.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * constrain: input channel % packn = 0 diff --git a/source/thead_rvv/fp32/clip.c b/source/thead_rvv/fp32/clip.c index 6e13cbd5..5a7c17fb 100644 --- a/source/thead_rvv/fp32/clip.c +++ b/source/thead_rvv/fp32/clip.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * note: support flexible vlen diff --git a/source/thead_rvv/fp32/concat.c b/source/thead_rvv/fp32/concat.c index f99ed502..12f9dba6 100644 --- a/source/thead_rvv/fp32/concat.c +++ b/source/thead_rvv/fp32/concat.c @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" static int shl_rvv_concat_ndarray_fp32(struct csinn_tensor **input, struct csinn_tensor *output, struct csinn_concat_params *params) diff --git a/source/thead_rvv/fp32/convolution.c b/source/thead_rvv/fp32/convolution.c index 58347b70..0f1b00dd 100644 --- a/source/thead_rvv/fp32/convolution.c +++ b/source/thead_rvv/fp32/convolution.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" int shl_rvv_conv2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, struct csinn_tensor *bias, @@ -30,8 +30,8 @@ int shl_rvv_conv2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *ou int32_t kernel_w = kernel->dim[3]; int32_t stride_h = params->stride_height; int32_t stride_w = params->stride_width; - int32_t dalition_h = params->dilation_height; - int32_t dalition_w = params->dilation_width; + int32_t dilation_h = params->dilation_height; + int32_t dilation_w = params->dilation_width; struct csinn_callback *cb = params->base.cb; const int packn = csrr_vlenb() / sizeof(float); @@ -48,19 +48,22 @@ int shl_rvv_conv2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *ou if (shl_is_first_layer_input(input, sess)) { in_elempack = 1; } + } else if (sess->base_run_mode == CSINN_RM_LAYER) { + in_elempack = in_c % packn == 0 ? packn : 1; + out_elempack = out_c % packn == 0 ? 
packn : 1; } bool binary_model_op_init = shl_rvv_get_binary_model_op_init(sess); // packn if (in_elempack % packn == 0 && out_elempack % packn == 0) { - if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && - dalition_w == 1) { + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && + dilation_w == 1) { params->conv_extra.conv_mode = CSINN_GEMM; if (!binary_model_op_init) { shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_fp32(kernel, params); } cb->exec = shl_rvv_conv1x1s1_gemm_packn_fp32; } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && - dalition_h == 1 && dalition_w == 1) { + dilation_h == 1 && dilation_w == 1) { if (params->group > 1) { params->conv_extra.conv_mode = CSINN_GEMM; if (!binary_model_op_init) { @@ -94,8 +97,8 @@ int shl_rvv_conv2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *ou // pack1ton if (in_elempack % packn != 0 && out_elempack % packn == 0) { params->conv_extra.conv_mode = CSINN_GEMM; - if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && - dalition_w == 1) { + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && + dilation_w == 1) { if (!binary_model_op_init) { shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_fp32(kernel, params); } @@ -111,8 +114,8 @@ int shl_rvv_conv2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *ou // packnto1 if (in_elempack % packn == 0 && out_elempack % packn != 0) { params->conv_extra.conv_mode = CSINN_GEMM; - if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && - dalition_w == 1) { + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && + dilation_w == 1) { if (!binary_model_op_init) { shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_fp32(kernel, params); } @@ -128,8 +131,8 @@ int shl_rvv_conv2d_init_fp32(struct csinn_tensor *input, struct 
csinn_tensor *ou // pack1 if (in_elempack % packn != 0 && out_elempack % packn != 0) { params->conv_extra.conv_mode = CSINN_GEMM; - if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && - dalition_w == 1) { + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && + dilation_w == 1) { if (!binary_model_op_init) { shl_rvv_conv1x1s1_gemm_reorder_kernel_fp32(kernel, params); } diff --git a/source/thead_rvv/fp32/convolution1d.c b/source/thead_rvv/fp32/convolution1d.c new file mode 100644 index 00000000..3ed6992b --- /dev/null +++ b/source/thead_rvv/fp32/convolution1d.c @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "rvv/rvv.h" + +int shl_rvv_conv1d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv1d_params *params) +{ + int32_t out_c = kernel->dim[0] / params->group; + int32_t in_c = kernel->dim[1]; + int32_t in_w = input->dim[2]; + int32_t kernel_w = kernel->dim[2]; + int32_t stride_w = params->stride_width; + int32_t dilation_w = params->dilation_width; + + struct csinn_callback *cb = params->base.cb; + + struct csinn_session *sess = params->base.sess; + if (sess->base_run_mode == CSINN_RM_CPU_GRAPH) { + struct shl_rvv_option *option = shl_rvv_get_graph_option(sess); + } + + bool binary_model_op_init = shl_rvv_get_binary_model_op_init(sess); + + // pack1 + if (!binary_model_op_init) { + shl_rvv_conv1d_im2col_gemm_reorder_kernel_fp32(kernel, params); + } + cb->exec = shl_rvv_conv1d_im2col_gemm_fp32; + return CSINN_TRUE; +} diff --git a/source/thead_rvv/fp32/convolution1d_gemm_fp32.c b/source/thead_rvv/fp32/convolution1d_gemm_fp32.c new file mode 100644 index 00000000..6dbdf1c8 --- /dev/null +++ b/source/thead_rvv/fp32/convolution1d_gemm_fp32.c @@ -0,0 +1,113 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "rvv/rvv.h" + +/************************************************************************************* + * reorder kernel_data inplace, means the origin kernel_data be destroyed. + * The reason to do this is that the packaging process must not consume more memory. + **************************************************************************************/ +void shl_rvv_conv1d_im2col_gemm_reorder_kernel_fp32(struct csinn_tensor *kernel, + struct csinn_conv1d_params *params) +{ + float *kernel_data = (float *)kernel->data; + int group = params->group; + + int m = kernel->dim[0] / group; + int k = kernel->dim[1] * kernel->dim[2]; + + float *pa_reorder = (float *)shl_mem_alloc(group * m * k * sizeof(float)); + for (int g = 0; g < group; g++) { + shl_rvv_reorder_kernel_n8_fp32(kernel_data + g * m * k, pa_reorder + g * m * k, m, k, k); + } + memcpy(kernel_data, pa_reorder, group * m * k * sizeof(float)); + shl_mem_free(pa_reorder); +} + +int shl_rvv_conv1d_im2col_gemm_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv1d_params *params) +{ + if (input->layout == CSINN_LAYOUT_NC1WC0) { + shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp32(input); + } + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)kernel->data; + float *bias_data = (float *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_ch = input->dim[1]; + int32_t in_width = input->dim[2]; + + int32_t out_ch = kernel->dim[0]; + int32_t out_width = output->dim[2]; + + int32_t kernel_w = kernel->dim[2]; + int32_t stride_w = params->stride_width; + int32_t pad_left = params->pad_left; + int32_t dilation_w = params->dilation_width; + + int32_t m = out_ch / group; + int32_t k = in_ch / group * kernel_w; + int32_t n = out_width; + + float *im2col_data = (float *)shl_mem_alloc(k * n * sizeof(float)); + float 
*pb_reorder = (float *)shl_mem_alloc(k * n * sizeof(float)); + const int vlen = csrr_vlenb() * 8; + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + // im2col + float *data_col = im2col_data; + float *channel_data = input_data; + for (int c = 0; c < in_ch / group; c++) { + for (int kw = 0; kw < kernel_w; kw++) { + int in_col = -pad_left + kw * dilation_w; + for (int ow1 = 0; ow1 < out_width; ow1++) { + if (in_col < in_width && in_col >= 0) { + *data_col++ = channel_data[in_col]; + } else { + *data_col++ = 0.0f; + } + in_col += stride_w; + } + } + channel_data += in_width; + } + float *pa = kernel_data + g * m * k; + float *pb = pb_reorder; + float *pc = output_data; + if (vlen == 128) { + // pack + shl_rvv_reorder_input_z8_fp32(im2col_data, pb, k, n, n); + // GEMM + shl_rvv_gemm_8x8_fp32(pc, pa, pb, bias_data + g * m, m, k, n, n); + } else if (vlen >= 256) { + shl_rvv256_reorder_input_z16_fp32(im2col_data, pb, k, n, n); + shl_rvv256_gemm_8x16_fp32(pc, pa, pb, bias_data + g * m, m, k, n, n); + } + input_data += in_ch / group * in_width; + output_data += m * n; + } + } + shl_mem_free(pb_reorder); + shl_mem_free(im2col_data); + return CSINN_TRUE; +} diff --git a/source/thead_rvv/fp32/convolution_1x1_fp32.c b/source/thead_rvv/fp32/convolution_1x1_fp32.c index a3d4a705..63bc40cc 100644 --- a/source/thead_rvv/fp32/convolution_1x1_fp32.c +++ b/source/thead_rvv/fp32/convolution_1x1_fp32.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" void shl_rvv_conv1x1s1_gemm_reorder_kernel_fp32(struct csinn_tensor *kernel, struct csinn_conv2d_params *params) @@ -35,9 +35,12 @@ void shl_rvv_conv1x1s1_gemm_reorder_kernel_fp32(struct csinn_tensor *kernel, shl_mem_free(pa_reorder); } -int shl_rvv_conv1x1s1_gemm_fp32(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_tensor *kernel, struct csinn_tensor *bias, - struct csinn_conv2d_params *params) +int shl_rvv_common_conv1x1_gemm_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + void (*reorder_input)(float *, float *, int, int, int), + void (*gemm)(float *, const float *, const float *, float *, + int, int, int, int)) { if (input->layout == CSINN_LAYOUT_NC1HWC0) { shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp32(input); @@ -59,16 +62,19 @@ int shl_rvv_conv1x1s1_gemm_fp32(struct csinn_tensor *input, struct csinn_tensor int32_t n = out_h * out_w; float *pb_reorder = (float *)shl_mem_alloc(k * n * sizeof(float)); + const int vlen = csrr_vlenb() * 8; for (int i = 0; i < batch; i++) { for (int g = 0; g < group; g++) { float *pa = kernel_data + g * m * k; float *pb = pb_reorder; float *pc = output_data; + // pack - shl_rvv_reorder_input_z8_fp32(input_data, pb, k, n, n); + reorder_input(input_data, pb, k, n, n); // GEMM - shl_rvv_gemm_8x8_fp32(pc, pa, pb, bias_data + g * m, m, k, n, n); + gemm(pc, pa, pb, bias_data + g * m, m, k, n, n); + input_data += k * n; output_data += m * n; } @@ -76,3 +82,19 @@ int shl_rvv_conv1x1s1_gemm_fp32(struct csinn_tensor *input, struct csinn_tensor shl_mem_free(pb_reorder); return CSINN_TRUE; } + +int shl_rvv_conv1x1s1_gemm_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + const int vlen = csrr_vlenb() * 8; + if (vlen == 128) { + return 
shl_rvv_common_conv1x1_gemm_fp32(input, output, kernel, bias, params, + shl_rvv_reorder_input_z8_fp32, + shl_rvv_gemm_8x8_fp32); + } else if (vlen >= 256) { + return shl_rvv_common_conv1x1_gemm_fp32(input, output, kernel, bias, params, + shl_rvv256_reorder_input_z16_fp32, + shl_rvv256_gemm_8x16_fp32); + } +} diff --git a/source/thead_rvv/fp32/convolution_1x1_fp32_pack1ton.c b/source/thead_rvv/fp32/convolution_1x1_fp32_pack1ton.c index fd9910cf..88d4ca9a 100644 --- a/source/thead_rvv/fp32/convolution_1x1_fp32_pack1ton.c +++ b/source/thead_rvv/fp32/convolution_1x1_fp32_pack1ton.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************************************* * reorder kernel_data inplace, means the origin kernel_data be destoried. @@ -28,9 +28,11 @@ void shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_fp32(struct csinn_tensor *ke shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp32(kernel, params); } -int shl_rvv_conv1x1s1_gemm_pack1ton_fp32(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_tensor *kernel, struct csinn_tensor *bias, - struct csinn_conv2d_params *params) +int shl_rvv_common_conv1x1_gemm_pack1ton_fp32( + struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, + struct csinn_tensor *bias, struct csinn_conv2d_params *params, + void (*reorder_input)(float *, float *, int, int, int, int), + void (*gemm)(float *, const float *, const float *, float *, int, int, int, bool)) { if (input->layout == CSINN_LAYOUT_NC1HWC0) { shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp32(input); @@ -73,10 +75,10 @@ int shl_rvv_conv1x1s1_gemm_pack1ton_fp32(struct csinn_tensor *input, struct csin shl_rvv_reorder_input_pack1ton_fp32(input_data, input_ncxhwx, k, out_h, out_w); // reorder(pack) - shl_rvv_reorder_input_z12_pack1ton_fp32(input_ncxhwx, in_ptr, k, 1, n, n); + reorder_input(input_ncxhwx, in_ptr, k, 1, n, n); // 
gemm - shl_rvv_ncxhwx_gemm_12xpack2n_fp32(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k, n, n); + gemm(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k, n, false); input_data += k * n; output_data += m * n; @@ -86,3 +88,12 @@ int shl_rvv_conv1x1s1_gemm_pack1ton_fp32(struct csinn_tensor *input, struct csin shl_mem_free(input_ncxhwx); return CSINN_TRUE; } + +int shl_rvv_conv1x1s1_gemm_pack1ton_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + return shl_rvv_common_conv1x1_gemm_pack1ton_fp32(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_pack1ton_fp32, + shl_rvv_ncxhwx_gemm_12xpack2n_fp32); +} diff --git a/source/thead_rvv/fp32/convolution_1x1_fp32_packn.c b/source/thead_rvv/fp32/convolution_1x1_fp32_packn.c index 49452549..765e9233 100644 --- a/source/thead_rvv/fp32/convolution_1x1_fp32_packn.c +++ b/source/thead_rvv/fp32/convolution_1x1_fp32_packn.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" void shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_fp32(struct csinn_tensor *kernel, struct csinn_conv2d_params *params) @@ -24,9 +24,12 @@ void shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_fp32(struct csinn_tensor *kerne shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp32(kernel, params); } -int shl_rvv_conv1x1s1_gemm_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_tensor *kernel, struct csinn_tensor *bias, - struct csinn_conv2d_params *params) +int shl_rvv_common_conv1x1_gemm_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + void (*reorder_input)(float *, float *, int, int, int), + void (*gemm)(float *, const float *, const float *, + float *, int, int, int, bool)) { if (input->layout == CSINN_LAYOUT_NCHW) { shl_rvv_tensor_ndarray_to_nc1xc0_replace_fp32(input); @@ -64,9 +67,9 @@ int shl_rvv_conv1x1s1_gemm_packn_fp32(struct csinn_tensor *input, struct csinn_t float *bias_ptr = bias_data ? 
(bias_data + g * m) : NULL; // pack - shl_rvv_reorder_input_z12_packn_fp32(input_data, in_ptr, k, n, n); + reorder_input(input_data, in_ptr, k, n, n); // GEMM - shl_rvv_ncxhwx_gemm_12xpack2n_fp32(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k, n, n); + gemm(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k, n, false); input_data += k * n; output_data += m * n; @@ -75,3 +78,12 @@ int shl_rvv_conv1x1s1_gemm_packn_fp32(struct csinn_tensor *input, struct csinn_t shl_mem_free(pb_reorder); return CSINN_TRUE; } + +int shl_rvv_conv1x1s1_gemm_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + return shl_rvv_common_conv1x1_gemm_packn_fp32(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_packn_fp32, + shl_rvv_ncxhwx_gemm_12xpack2n_fp32); +} diff --git a/source/thead_rvv/fp32/convolution_1x1_fp32_packnto1.c b/source/thead_rvv/fp32/convolution_1x1_fp32_packnto1.c index 67b9ca74..785ae35a 100644 --- a/source/thead_rvv/fp32/convolution_1x1_fp32_packnto1.c +++ b/source/thead_rvv/fp32/convolution_1x1_fp32_packnto1.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" void shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_fp32(struct csinn_tensor *kernel, struct csinn_conv2d_params *params) @@ -24,9 +24,11 @@ void shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_fp32(struct csinn_tensor *ke shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp32(kernel, params); } -int shl_rvv_conv1x1s1_gemm_packnto1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_tensor *kernel, struct csinn_tensor *bias, - struct csinn_conv2d_params *params) +int shl_rvv_common_conv1x1_gemm_packnto1_fp32( + struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, + struct csinn_tensor *bias, struct csinn_conv2d_params *params, + void (*reorder_input)(float *, float *, int, int, int), + void (*gemm)(float *, const float *, const float *, float *, int, int, int, bool)) { if (input->layout == CSINN_LAYOUT_NCHW) { shl_rvv_tensor_ndarray_to_nc1xc0_replace_fp32(input); @@ -58,10 +60,9 @@ int shl_rvv_conv1x1s1_gemm_packnto1_fp32(struct csinn_tensor *input, struct csin float *bias_ptr = bias_data ? 
(bias_data + g * m) : NULL; // pack - shl_rvv_reorder_input_z12_packn_fp32(input_data, in_ptr, k, n, n); + reorder_input(input_data, in_ptr, k, n, n); // GEMM - shl_rvv_ncxhwx_gemm_12xpack2n_fp32(output_ncxhwx, kernel_ptr, in_ptr, bias_ptr, m, k, n, - n); + gemm(output_ncxhwx, kernel_ptr, in_ptr, bias_ptr, m, k, n, false); shl_rvv_reorder_input_packnto1_fp32(output_ncxhwx, output_data, m, out_h, out_w); @@ -73,3 +74,12 @@ int shl_rvv_conv1x1s1_gemm_packnto1_fp32(struct csinn_tensor *input, struct csin shl_mem_free(output_ncxhwx); return CSINN_TRUE; } + +int shl_rvv_conv1x1s1_gemm_packnto1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + return shl_rvv_common_conv1x1_gemm_packnto1_fp32(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_packn_fp32, + shl_rvv_ncxhwx_gemm_12xpack2n_fp32); +} diff --git a/source/thead_rvv/fp32/convolution_3x3_fp32_packn.c b/source/thead_rvv/fp32/convolution_3x3_fp32_packn.c index 91382715..d548c2f0 100644 --- a/source/thead_rvv/fp32/convolution_3x3_fp32_packn.c +++ b/source/thead_rvv/fp32/convolution_3x3_fp32_packn.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* note: VLEN = 128/256 ... diff --git a/source/thead_rvv/fp32/convolution_gemm_fp32.c b/source/thead_rvv/fp32/convolution_gemm_fp32.c index 602d687e..787ba022 100644 --- a/source/thead_rvv/fp32/convolution_gemm_fp32.c +++ b/source/thead_rvv/fp32/convolution_gemm_fp32.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************************************* * reorder kernel_data inplace, means the origin kernel_data be destoried. 
@@ -39,9 +39,12 @@ void shl_rvv_conv_im2col_gemm_reorder_kernel_fp32(struct csinn_tensor *kernel, shl_mem_free(pa_reorder); } -int shl_rvv_conv_im2col_gemm_fp32(struct csinn_tensor *input, struct csinn_tensor *output, +int shl_rvv_common_conv_gemm_fp32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, struct csinn_tensor *bias, - struct csinn_conv2d_params *params) + struct csinn_conv2d_params *params, + void (*reorder_input)(float *, float *, int, int, int), + void (*gemm)(float *, const float *, const float *, float *, int, + int, int, int)) { if (input->layout == CSINN_LAYOUT_NC1HWC0) { shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp32(input); @@ -74,6 +77,7 @@ int shl_rvv_conv_im2col_gemm_fp32(struct csinn_tensor *input, struct csinn_tenso float *im2col_data = (float *)shl_mem_alloc(k * n * sizeof(float)); float *pb_reorder = (float *)shl_mem_alloc(k * n * sizeof(float)); + const int vlen = csrr_vlenb() * 8; for (int i = 0; i < batch; i++) { for (int g = 0; g < group; g++) { @@ -113,9 +117,10 @@ int shl_rvv_conv_im2col_gemm_fp32(struct csinn_tensor *input, struct csinn_tenso float *pc = output_data; // pack - shl_rvv_reorder_input_z8_fp32(im2col_data, pb, k, n, n); + reorder_input(im2col_data, pb, k, n, n); // GEMM - shl_rvv_gemm_8x8_fp32(pc, pa, pb, bias_data + g * m, m, k, n, n); + gemm(pc, pa, pb, bias_data + g * m, m, k, n, n); + input_data += in_ch / group * in_height * in_width; output_data += m * n; } @@ -124,3 +129,18 @@ int shl_rvv_conv_im2col_gemm_fp32(struct csinn_tensor *input, struct csinn_tenso shl_mem_free(im2col_data); return CSINN_TRUE; } + +int shl_rvv_conv_im2col_gemm_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + const int vlen = csrr_vlenb() * 8; + if (vlen == 128) { + return shl_rvv_common_conv_gemm_fp32(input, output, kernel, bias, params, + shl_rvv_reorder_input_z8_fp32, 
shl_rvv_gemm_8x8_fp32); + } else if (vlen >= 256) { + return shl_rvv_common_conv_gemm_fp32(input, output, kernel, bias, params, + shl_rvv256_reorder_input_z16_fp32, + shl_rvv256_gemm_8x16_fp32); + } +} diff --git a/source/thead_rvv/fp32/convolution_gemm_fp32_pack1ton.c b/source/thead_rvv/fp32/convolution_gemm_fp32_pack1ton.c index ee407e9d..7ea64341 100644 --- a/source/thead_rvv/fp32/convolution_gemm_fp32_pack1ton.c +++ b/source/thead_rvv/fp32/convolution_gemm_fp32_pack1ton.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * packn = vlenb / sizeof(float) @@ -117,9 +117,11 @@ void shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp32(struct csinn_tensor * shl_mem_free(pa_reorder); } -int shl_rvv_conv_im2col_gemm_pack1ton_fp32(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_tensor *kernel, struct csinn_tensor *bias, - struct csinn_conv2d_params *params) +int shl_rvv_common_conv_gemm_pack1ton_fp32( + struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, + struct csinn_tensor *bias, struct csinn_conv2d_params *params, + void (*reorder_input)(float *, float *, int, int, int, int), + void (*gemm)(float *, const float *, const float *, float *, int, int, int, bool)) { if (input->layout == CSINN_LAYOUT_NC1HWC0) { shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp32(input); @@ -207,14 +209,13 @@ int shl_rvv_conv_im2col_gemm_pack1ton_fp32(struct csinn_tensor *input, struct cs // reorder(pack) float *reorder_buf = (float *)shl_mem_alloc(in_cp * maxk * n * sizeof(float)); - shl_rvv_reorder_input_z12_pack1ton_fp32(im2col_buf, reorder_buf, in_cp, maxk, n, n); + reorder_input(im2col_buf, reorder_buf, in_cp, maxk, n, n); shl_mem_free(im2col_buf); // gemm float *ker_ptr = kernel_data + g * m * maxk * in_cp; float *bias_ptr = bias_data ? 
(bias_data + g * m) : NULL; - shl_rvv_ncxhwx_gemm_12xpack2n_fp32(output_data, ker_ptr, reorder_buf, bias_ptr, m, - in_cp * maxk, n, n); + gemm(output_data, ker_ptr, reorder_buf, bias_ptr, m, in_cp * maxk, n, false); shl_mem_free(reorder_buf); input_data += in_cp * in_h * in_w; @@ -223,3 +224,12 @@ int shl_rvv_conv_im2col_gemm_pack1ton_fp32(struct csinn_tensor *input, struct cs } return CSINN_TRUE; } + +int shl_rvv_conv_im2col_gemm_pack1ton_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + return shl_rvv_common_conv_gemm_pack1ton_fp32(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_pack1ton_fp32, + shl_rvv_ncxhwx_gemm_12xpack2n_fp32); +} diff --git a/source/thead_rvv/fp32/convolution_gemm_fp32_packn.c b/source/thead_rvv/fp32/convolution_gemm_fp32_packn.c index d130503d..3fd4c127 100644 --- a/source/thead_rvv/fp32/convolution_gemm_fp32_packn.c +++ b/source/thead_rvv/fp32/convolution_gemm_fp32_packn.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * packn = vlenb / sizeof(float) @@ -93,9 +93,12 @@ void shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp32(struct csinn_tensor *ker shl_mem_free(pa_reorder); } -int shl_rvv_conv_im2col_gemm_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, +int shl_rvv_common_conv_gemm_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, struct csinn_tensor *bias, - struct csinn_conv2d_params *params) + struct csinn_conv2d_params *params, + void (*reorder_input)(float *, float *, int, int, int), + void (*gemm)(float *, const float *, const float *, float *, + int, int, int, bool)) { if (input->layout == CSINN_LAYOUT_NCHW) { shl_rvv_tensor_ndarray_to_nc1xc0_replace_fp32(input); @@ -177,14 +180,13 @@ int shl_rvv_conv_im2col_gemm_packn_fp32(struct csinn_tensor *input, struct csinn // reorder(pack) float *reorder_buf = (float *)shl_mem_alloc(in_cp * maxk * n * sizeof(float)); - shl_rvv_reorder_input_z12_packn_fp32(im2col_buf, reorder_buf, in_cp * maxk, n, n); + reorder_input(im2col_buf, reorder_buf, in_cp * maxk, n, n); shl_mem_free(im2col_buf); // gemm float *ker_ptr = kernel_data + g * m * maxk * in_cp; float *bias_ptr = bias_data ? 
(bias_data + g * m) : NULL; - shl_rvv_ncxhwx_gemm_12xpack2n_fp32(output_data, ker_ptr, reorder_buf, bias_ptr, m, - in_cp * maxk, n, n); + gemm(output_data, ker_ptr, reorder_buf, bias_ptr, m, in_cp * maxk, n, false); shl_mem_free(reorder_buf); input_data += in_cp * in_h * in_w; @@ -193,3 +195,12 @@ int shl_rvv_conv_im2col_gemm_packn_fp32(struct csinn_tensor *input, struct csinn } return CSINN_TRUE; } + +int shl_rvv_conv_im2col_gemm_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + return shl_rvv_common_conv_gemm_packn_fp32(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_packn_fp32, + shl_rvv_ncxhwx_gemm_12xpack2n_fp32); +} diff --git a/source/thead_rvv/fp32/convolution_gemm_fp32_packnto1.c b/source/thead_rvv/fp32/convolution_gemm_fp32_packnto1.c index ee612a6e..fb3a54f8 100644 --- a/source/thead_rvv/fp32/convolution_gemm_fp32_packnto1.c +++ b/source/thead_rvv/fp32/convolution_gemm_fp32_packnto1.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * packn = vlenb / sizeof(float) @@ -111,9 +111,12 @@ void shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp32(struct csinn_tensor * shl_mem_free(pa_reorder); } -int shl_rvv_conv_im2col_gemm_packnto1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, +int shl_rvv_common_conv_gemm_packnto1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, struct csinn_tensor *bias, - struct csinn_conv2d_params *params) + struct csinn_conv2d_params *params, + void (*reorder_input)(float *, float *, int, int, int), + void (*gemm)(float *, const float *, const float *, + float *, int, int, int, bool)) { if (input->layout == CSINN_LAYOUT_NCHW) { shl_rvv_tensor_ndarray_to_nc1xc0_replace_fp32(input); @@ -189,14 +192,13 @@ int shl_rvv_conv_im2col_gemm_packnto1_fp32(struct csinn_tensor *input, struct cs // reorder(pack) float *reorder_buf = (float *)shl_mem_alloc(in_cp * maxk * n * sizeof(float)); - shl_rvv_reorder_input_z12_packn_fp32(im2col_buf, reorder_buf, in_cp * maxk, n, n); + reorder_input(im2col_buf, reorder_buf, in_cp * maxk, n, n); shl_mem_free(im2col_buf); // gemm float *ker_ptr = kernel_data + g * m * maxk * in_cp; float *bias_ptr = bias_data ? 
(bias_data + g * m) : NULL; - shl_rvv_ncxhwx_gemm_12xpack2n_fp32(output_ncxhwx, ker_ptr, reorder_buf, bias_ptr, m, - in_cp * maxk, n, n); + gemm(output_ncxhwx, ker_ptr, reorder_buf, bias_ptr, m, in_cp * maxk, n, false); shl_rvv_reorder_input_packnto1_fp32(output_ncxhwx, output_data, m, out_h, out_w); shl_mem_free(reorder_buf); @@ -208,3 +210,12 @@ int shl_rvv_conv_im2col_gemm_packnto1_fp32(struct csinn_tensor *input, struct cs shl_mem_free(output_ncxhwx); return CSINN_TRUE; } + +int shl_rvv_conv_im2col_gemm_packnto1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + return shl_rvv_common_conv_gemm_packnto1_fp32(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_packn_fp32, + shl_rvv_ncxhwx_gemm_12xpack2n_fp32); +} diff --git a/source/thead_rvv/fp32/deconvolution.c b/source/thead_rvv/fp32/deconvolution.c new file mode 100644 index 00000000..d940f7b4 --- /dev/null +++ b/source/thead_rvv/fp32/deconvolution.c @@ -0,0 +1,45 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "rvv/rvv.h" +#include "shl_debug.h" + +int shl_rvv_deconv2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + struct csinn_callback *cb = params->base.cb; + + struct csinn_session *sess = params->base.sess; + + if (sess->base_run_mode == CSINN_RM_CPU_GRAPH) { + struct shl_rvv_option *option = shl_rvv_get_graph_option(sess); + } + + bool binary_model_op_init = shl_rvv_get_binary_model_op_init(sess); + + // pack1 + params->conv_extra.conv_mode = CSINN_GEMM; + + if (!binary_model_op_init) { + shl_rvv_deconv2d_gemm_col2im_reorder_kernel_fp32(kernel, params); + } + cb->exec = shl_rvv_deconv2d_gemm_col2im_fp32; + + return CSINN_TRUE; +} diff --git a/source/thead_rvv/fp32/deconvolution_gemm_fp32.c b/source/thead_rvv/fp32/deconvolution_gemm_fp32.c new file mode 100644 index 00000000..738f1c5a --- /dev/null +++ b/source/thead_rvv/fp32/deconvolution_gemm_fp32.c @@ -0,0 +1,179 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "rvv/rvv.h" + +static void transpose_10_fp32(float *src, float *dst, int inner_size, int outer_size) +{ + for (int i = 0; i < outer_size; i++) { + int size = inner_size; + float *d_ptr = dst + i; + while (size > 0) { + int vl = vsetvl_e32m4(size); + vfloat32m4_t _in = vle32_v_f32m4(src, vl); + src += vl; + vsse32_v_f32m4(d_ptr, outer_size * sizeof(float), _in, vl); + d_ptr += vl * outer_size; + size -= vl; + } + } +} + +// Kernel:[IC,OC,KH,KW] --> [OC,KH,KW,IC] +void shl_rvv_deconv2d_gemm_col2im_reorder_kernel_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + float *kernel_data = (float *)kernel->data; + float *data_buf = shl_mem_alloc(kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * + kernel->dim[3] * sizeof(float)); + + int inner_size = kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; + int outer_size = kernel->dim[0]; + + transpose_10_fp32(kernel_data, data_buf, inner_size, outer_size); + + int group = params->group; + + int k = kernel->dim[0]; + int m = kernel->dim[1] * kernel->dim[2] * kernel->dim[3] / group; + for (int g = 0; g < group; g++) { + shl_rvv_reorder_kernel_n8_fp32(data_buf + g * m * k, kernel_data + g * m * k, m, k, k); + } + shl_mem_free(data_buf); +} + +//判断alayout == CSINN_LAYOUT_NC1HWC0) { + shl_debug_info("Data Format: NC1HWC0\n"); + shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp32(input); + } else if (input->layout != CSINN_LAYOUT_NCHW) { + shl_debug_error("Unsupported data format\n"); + return CSINN_FALSE; + } + + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)kernel->data; + float *bias_data = (float *)bias->data; + + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + + int32_t out_c = output->dim[1]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + int32_t kernel_h = kernel->dim[2]; + int32_t kernel_w = kernel->dim[3]; + + 
int32_t group = params->group; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + int32_t pad_t = params->pad_top; + int32_t pad_l = params->pad_left; + int32_t pad_d = params->pad_down; + int32_t pad_r = params->pad_right; + int32_t dilation_h = params->dilation_height; + int32_t dilation_w = params->dilation_width; + int32_t out_pad_h = params->out_pad_height; + int32_t out_pad_w = params->out_pad_width; + + int32_t m = out_c / group * kernel_h * kernel_w; + int32_t k = in_c / group; + int32_t n = in_h * in_w; + + float *reorder_buf = (float *)shl_mem_alloc(k * n * sizeof(float)); + float *output_buf = (float *)shl_mem_alloc(batch * group * m * n * sizeof(float)); + const int vlen = csrr_vlenb() * 8; + + float *output_buf_ptr = output_buf; + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + if (vlen == 128) { + // pack + shl_rvv_reorder_input_z8_fp32(input_data, reorder_buf, k, n, n); + // Gemm + shl_rvv_gemm_8x8_fp32(output_buf_ptr, (kernel_data + g * m * k), reorder_buf, NULL, + m, k, n, n); + } else { + shl_debug_error("The vector length is temporarily not supported."); + } + input_data += k * n; + output_buf_ptr += m * n; + } + } + shl_mem_free(reorder_buf); + + col2im_cpu_ext(output_buf, bias_data, batch, out_c, out_h, out_w, kernel_h, kernel_w, pad_t, + pad_l, stride_h, stride_w, dilation_h, dilation_w, output_data); + + shl_mem_free(output_buf); + + return CSINN_TRUE; +} diff --git a/source/thead_rvv/fp32/depthwise_convolution.c b/source/thead_rvv/fp32/depthwise_convolution.c index e686a78a..4ab34c9b 100644 --- a/source/thead_rvv/fp32/depthwise_convolution.c +++ b/source/thead_rvv/fp32/depthwise_convolution.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" int shl_rvv_depthwise_conv2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, struct csinn_tensor *bias, @@ -47,6 +47,9 @@ int shl_rvv_depthwise_conv2d_init_fp32(struct csinn_tensor *input, struct csinn_ in_elempack = 1; out_elempack = 1; // dwconv2d out_channel pack is same as in_channel } + } else if (sess->base_run_mode == CSINN_RM_LAYER) { + in_elempack = in_c % packn == 0 ? packn : 1; + out_elempack = out_c % packn == 0 ? packn : 1; } bool binary_model_op_init = shl_rvv_get_binary_model_op_init(sess); if (in_elempack % packn == 0 && out_elempack % packn == 0) { diff --git a/source/thead_rvv/fp32/depthwise_convolution_3x3_fp32.c b/source/thead_rvv/fp32/depthwise_convolution_3x3_fp32.c index 2564e6f2..b91e5f48 100644 --- a/source/thead_rvv/fp32/depthwise_convolution_3x3_fp32.c +++ b/source/thead_rvv/fp32/depthwise_convolution_3x3_fp32.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* note: VLEN = 128/256 diff --git a/source/thead_rvv/fp32/depthwise_convolution_3x3_fp32_packn.c b/source/thead_rvv/fp32/depthwise_convolution_3x3_fp32_packn.c index d9a0a3fe..0d1a487d 100644 --- a/source/thead_rvv/fp32/depthwise_convolution_3x3_fp32_packn.c +++ b/source/thead_rvv/fp32/depthwise_convolution_3x3_fp32_packn.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * note: VLEN = 128/256 ... flexible vlen diff --git a/source/thead_rvv/fp32/depthwise_convolution_fp32_nhwc.c b/source/thead_rvv/fp32/depthwise_convolution_fp32_nhwc.c index 1430fb0c..ebfce721 100644 --- a/source/thead_rvv/fp32/depthwise_convolution_fp32_nhwc.c +++ b/source/thead_rvv/fp32/depthwise_convolution_fp32_nhwc.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * note: VLEN = 128/256 ... flexible vlen diff --git a/source/thead_rvv/fp32/depthwise_convolution_fp32_packn.c b/source/thead_rvv/fp32/depthwise_convolution_fp32_packn.c index dac897d9..b7738005 100644 --- a/source/thead_rvv/fp32/depthwise_convolution_fp32_packn.c +++ b/source/thead_rvv/fp32/depthwise_convolution_fp32_packn.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * note: VLEN = 128/256 ... flexible vlen diff --git a/source/thead_rvv/fp32/div.c b/source/thead_rvv/fp32/div.c new file mode 100644 index 00000000..538d6019 --- /dev/null +++ b/source/thead_rvv/fp32/div.c @@ -0,0 +1,72 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "rvv/rvv.h" + +static inline void div_vv_f32m4(float *in0, float *in1, float *out, int32_t size) +{ + while (size > 0) { + int vl = vsetvl_e32m4(size); + vfloat32m4_t _a = vle32_v_f32m4(in0, vl); + vfloat32m4_t _b = vle32_v_f32m4(in1, vl); + vfloat32m4_t _c = vfdiv_vv_f32m4(_a, _b, vl); + vse32_v_f32m4(out, _c, vl); + in0 += vl; + in1 += vl; + out += vl; + size -= vl; + } +} + +static inline void div_vf_f32m4(float *in0, float *in1, float *out, int32_t size) +{ + while (size > 0) { + int vl = vsetvl_e32m4(size); + vfloat32m4_t _a = vle32_v_f32m4(in0, vl); + vfloat32m4_t _c = vfdiv_vf_f32m4(_a, in1[0], vl); + vse32_v_f32m4(out, _c, vl); + in0 += vl; + out += vl; + size -= vl; + } +} + +static inline void div_fv_f32m4(float *in0, float *in1, float *out, int32_t size) +{ + while (size > 0) { + int vl = vsetvl_e32m4(size); + vfloat32m4_t _b = vle32_v_f32m4(in1, vl); + vfloat32m4_t _c = vfrdiv_vf_f32m4(_b, in0[0], vl); + vse32_v_f32m4(out, _c, vl); + in1 += vl; + out += vl; + size -= vl; + } +} + +void *div_cb_fp32[] = { + [CSINN_BROADCAST_VV] = div_vv_f32m4, + [CSINN_BROADCAST_VS] = div_vf_f32m4, + [CSINN_BROADCAST_SV] = div_fv_f32m4, +}; + +int shl_rvv_div_fp32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) +{ + return shl_rvv_binary_op_broadcast_fp32(input0, input1, output, div_cb_fp32); +} diff --git a/source/thead_rvv/fp32/erf.c b/source/thead_rvv/fp32/erf.c new file mode 100644 index 00000000..ecf2d955 --- /dev/null +++ b/source/thead_rvv/fp32/erf.c @@ -0,0 +1,86 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "rvv/rvv.h" + +#define a1 0.0705230784 +#define a2 0.0422820123 +#define a3 0.0092705272 +#define a4 0.0001520143 +#define a5 0.0002765672 +#define a6 0.0000430638 + +static inline vfloat32m4_t vfpow16_v_f32m4(vfloat32m4_t _x, int vl) +{ + vfloat32m4_t _x2 = vfmul_vv_f32m4(_x, _x, vl); + vfloat32m4_t _x4 = vfmul_vv_f32m4(_x2, _x2, vl); + vfloat32m4_t _x8 = vfmul_vv_f32m4(_x4, _x4, vl); + vfloat32m4_t _x16 = vfmul_vv_f32m4(_x8, _x8, vl); + return _x16; +} + +/************************************************************************************* + * erf(x) = 1 - 1 / (1 + a1*x + a2*x^2 + a3*x^3 + a4*x^4 + a5*x^5 + a6*x^6)^16 + **************************************************************************************/ +int shl_rvv_erf_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + int size = csinn_tensor_size(input); + + while (size > 0) { + int vl = vsetvl_e32m4(size); + vfloat32m4_t _x = vle32_v_f32m4(input_data, vl); + input_data += vl; + + vbool8_t _mask = vmflt_vf_f32m4_b8(_x, 0.0f, vl); + _x = vfmul_vf_f32m4_m(_mask, _x, _x, -1.0f, vl); + + vfloat32m4_t _x2 = vfmul_vv_f32m4(_x, _x, vl); + vfloat32m4_t _x3 = vfmul_vv_f32m4(_x2, _x, vl); + vfloat32m4_t _x4 = vfmul_vv_f32m4(_x2, _x2, vl); + vfloat32m4_t _x5 = vfmul_vv_f32m4(_x3, _x2, vl); + vfloat32m4_t _x6 = vfmul_vv_f32m4(_x3, _x3, vl); + _x = vfmul_vf_f32m4(_x, a1, vl); + _x2 = vfmul_vf_f32m4(_x2, a2, vl); + _x3 = vfmul_vf_f32m4(_x3, a3, vl); + _x4 = 
vfmul_vf_f32m4(_x4, a4, vl); + _x5 = vfmul_vf_f32m4(_x5, a5, vl); + _x6 = vfmul_vf_f32m4(_x6, a6, vl); + + vfloat32m4_t _t = vfmv_v_f_f32m4(1.0f, vl); + _t = vfadd_vv_f32m4(_t, _x, vl); + _t = vfadd_vv_f32m4(_t, _x2, vl); + _t = vfadd_vv_f32m4(_t, _x3, vl); + _t = vfadd_vv_f32m4(_t, _x4, vl); + _t = vfadd_vv_f32m4(_t, _x5, vl); + _t = vfadd_vv_f32m4(_t, _x6, vl); + + vfloat32m4_t _pow = vfpow16_v_f32m4(_t, vl); + vfloat32m4_t _y = vfrdiv_vf_f32m4(_pow, -1.0f, vl); + _y = vfadd_vf_f32m4(_y, 1.0f, vl); + _y = vfmul_vf_f32m4_m(_mask, _y, _y, -1.0f, vl); + + vse32_v_f32m4(output_data, _y, vl); + output_data += vl; + size -= vl; + } + + return CSINN_TRUE; +} diff --git a/source/thead_rvv/fp32/fullyconnected.c b/source/thead_rvv/fp32/fullyconnected.c index b164c463..e250ad13 100644 --- a/source/thead_rvv/fp32/fullyconnected.c +++ b/source/thead_rvv/fp32/fullyconnected.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" int shl_rvv_fullyconnected_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *weights, struct csinn_tensor *bias, diff --git a/source/thead_rvv/fp32/fullyconnected_fp32.c b/source/thead_rvv/fp32/fullyconnected_fp32.c index a8761926..826befc9 100644 --- a/source/thead_rvv/fp32/fullyconnected_fp32.c +++ b/source/thead_rvv/fp32/fullyconnected_fp32.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* note: VLEN = 128/256 @@ -57,6 +57,10 @@ int shl_rvv_fullyconnected_packn_fp32(struct csinn_tensor *input, struct csinn_t struct csinn_tensor *weights, struct csinn_tensor *bias, struct csinn_fc_params *params) { + if (input->layout >= CSINN_LAYOUT_NC1C0 && input->layout <= CSINN_LAYOUT_NC1DHWC0) { + shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp32(input); + } + float *input_data = (float *)input->data; float *output_data = (float *)output->data; float *weights_data = (float *)weights->data; diff --git a/source/thead_rvv/fp32/gather.c b/source/thead_rvv/fp32/gather.c new file mode 100644 index 00000000..562ab435 --- /dev/null +++ b/source/thead_rvv/fp32/gather.c @@ -0,0 +1,72 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "rvv/rvv.h" + +int shl_rvv_gather_fp32(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_params *params) +{ + if (input->layout >= CSINN_LAYOUT_NC1C0 && input->layout <= CSINN_LAYOUT_NC1DHWC0) { + shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp32(input); + } + + int input_size = csinn_tensor_size(input); + if (input_size == 0) { + return CSINN_TRUE; + } + + if (input->dtype == CSINN_DTYPE_FLOAT32 && indices->dtype == CSINN_DTYPE_INT64 && + output->dtype == CSINN_DTYPE_FLOAT32) { + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + int64_t *indices_data = (int64_t *)indices->data; + + int inner_size = 1; + for (int i = params->axis + 1; i < input->dim_count; i++) { + inner_size *= input->dim[i]; + } + int outer_size = 1; + for (int i = 0; i < params->axis; i++) { + outer_size *= input->dim[i]; + } + int indices_size = 1; + for (int i = 0; i < indices->dim_count; i++) { + indices_size *= indices->dim[i]; + } + int axis_shape = input->dim[params->axis]; + for (int i = 0; i < outer_size; i++) { + for (int j = 0; j < indices_size; j++) { + if ((indices_data[j] >= 0) && (indices_data[j] < axis_shape)) { + memcpy(output_data, input_data + indices_data[j] * inner_size, + inner_size * sizeof(float)); + } else if ((indices_data[j] < 0) && (indices_data[j] >= -axis_shape)) { + memcpy(output_data, input_data + (indices_data[j] + axis_shape) * inner_size, + inner_size * sizeof(float)); + } else { + memset(output_data, 0, inner_size * sizeof(float)); + } + output_data += inner_size; + } + input_data += inner_size * axis_shape; + } + } else { + return shl_ref_gather_quant(input, indices, output, params); + } + + return CSINN_TRUE; +} diff --git a/source/thead_rvv/fp32/gemm_fp32.c b/source/thead_rvv/fp32/gemm_fp32.c index 6fbee8a9..b1f1afc0 100644 --- a/source/thead_rvv/fp32/gemm_fp32.c +++ b/source/thead_rvv/fp32/gemm_fp32.c @@ -16,7 +16,7 @@ * limitations under the 
License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************************ * input matrix and kernel matrix have been reordered @@ -598,7 +598,7 @@ void shl_rvv256_gemm_8x16_fp32(float *dst, const float *sa, const float *sb, flo _acc0 = vfmacc_vf_f32m1(_acc0, in_ptr0[c], _kernel, vl); kernel_ptr += 8; } - vsse32_v_f32m1(out_ptr0, ldc * sizeof(__fp16), _acc0, vl); + vsse32_v_f32m1(out_ptr0, ldc * sizeof(float), _acc0, vl); } kernel_data += 8 * k; output_data += 8 * ldc; diff --git a/source/thead_rvv/fp32/gemm_fp32_block.c b/source/thead_rvv/fp32/gemm_fp32_block.c index 6f8c4414..6fd13092 100644 --- a/source/thead_rvv/fp32/gemm_fp32_block.c +++ b/source/thead_rvv/fp32/gemm_fp32_block.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * packn = vlenb / sizeof(float) @@ -676,8 +676,8 @@ static inline void gemm_12xpack2n_fp32(float *dst, const float *sa, const float /************************************************************* * packn = vlenb / sizeof(float) - * m_blk: M_BLK, M_BLK/2, M_BLK/4, ..., 12 - * n_blk: N_BLK, N_BLK/2, N_BLK/4, ..., pack2n + * m_blk: M_BLK, M_tail + * n_blk: N_BLK, N_tail * k_blk: K_BLK, K_tail * * dst - output: [m, n] @@ -699,28 +699,17 @@ void shl_rvv_gemm_block_12xpack2n_fp32(float *dst, const float *sa, const float bias = (float *)shl_mem_alloc(m * sizeof(float)); } - const int packn = csrr_vlenb() / sizeof(float); - - const int MIN_M_BLK = 12; - const int MIN_N_BLK = packn * 2; - int m_block = M_BLK; int m_idx = 0; while (m_idx < m) { - while (!(m_idx + m_block - 1 < m)) { - m_block /= 2; - } - if (m_block < MIN_M_BLK) { + if (m - m_idx < m_block) { m_block = m - m_idx; } int n_block = N_BLK; int n_idx = 0; while (n_idx < n) { - while (!(n_idx + n_block - 1 < n)) { - n_block /= 2; - } - if (n_block < MIN_N_BLK) { + if (n - n_idx < n_block) { n_block = n - n_idx; 
} diff --git a/source/thead_rvv/fp32/gemm_fp32_packn.c b/source/thead_rvv/fp32/gemm_fp32_packn.c index a70afa9c..004d6818 100644 --- a/source/thead_rvv/fp32/gemm_fp32_packn.c +++ b/source/thead_rvv/fp32/gemm_fp32_packn.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * note: VLEN = 128/256 ... flexible vlen @@ -30,8 +30,9 @@ * sa - kernel: [m/pack2n, k, pack2n] [m/packn, k, packn] * sb - input: [n/8, k, 8] **************************************************************/ +// XXX: unsupported fuse relu void shl_rvv_ncxhwx_gemm_8xpack2n_fp32(float *dst, const float *sa, const float *sb, float *bias, - int m, int k, int n, int ldc) + int m, int k, int n, bool fuse_relu) { float *kernel_data = (float *)sa; float *input_data = (float *)sb; @@ -399,8 +400,9 @@ void shl_rvv_ncxhwx_gemm_8xpack2n_fp32(float *dst, const float *sa, const float * sa - kernel: [m/pack2n, k, pack2n] [m/packn, k, packn] * sb - input: [n/12, k, 12] **************************************************************/ +// XXX: unsupported fuse relu void shl_rvv_ncxhwx_gemm_12xpack2n_fp32(float *dst, const float *sa, const float *sb, float *bias, - int m, int k, int n, int ldc) + int m, int k, int n, bool fuse_relu) { float *kernel_data = (float *)sa; float *input_data = (float *)sb; diff --git a/source/thead_rvv/fp32/global_avgpool.c b/source/thead_rvv/fp32/global_avgpool.c index 9b58f70b..32f41de6 100644 --- a/source/thead_rvv/fp32/global_avgpool.c +++ b/source/thead_rvv/fp32/global_avgpool.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* note: VLEN = 128/256 diff --git a/source/thead_rvv/fp32/global_avgpool_nhwc.c b/source/thead_rvv/fp32/global_avgpool_nhwc.c index d80e0938..c297a489 100644 --- a/source/thead_rvv/fp32/global_avgpool_nhwc.c +++ b/source/thead_rvv/fp32/global_avgpool_nhwc.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* note: VLEN = 128/256 diff --git a/source/thead_rvv/fp32/global_avgpool_packn.c b/source/thead_rvv/fp32/global_avgpool_packn.c index ae6cd5cf..987288f8 100644 --- a/source/thead_rvv/fp32/global_avgpool_packn.c +++ b/source/thead_rvv/fp32/global_avgpool_packn.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * note: VLEN = 128/256 ... flexible vlen diff --git a/source/thead_rvv/fp32/global_maxpool.c b/source/thead_rvv/fp32/global_maxpool.c index a927e775..dc082e0a 100644 --- a/source/thead_rvv/fp32/global_maxpool.c +++ b/source/thead_rvv/fp32/global_maxpool.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* note: VLEN = 128/256 diff --git a/source/thead_rvv/fp32/global_maxpool_nhwc.c b/source/thead_rvv/fp32/global_maxpool_nhwc.c index 7ea18bfa..e2653250 100644 --- a/source/thead_rvv/fp32/global_maxpool_nhwc.c +++ b/source/thead_rvv/fp32/global_maxpool_nhwc.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* note: VLEN = 128/256 diff --git a/source/thead_rvv/fp32/global_maxpool_packn.c b/source/thead_rvv/fp32/global_maxpool_packn.c index 295505fc..7f5dfdc7 100644 --- a/source/thead_rvv/fp32/global_maxpool_packn.c +++ b/source/thead_rvv/fp32/global_maxpool_packn.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * note: VLEN = 128/256 ... flexible vlen diff --git a/source/thead_rvv/fp32/layer_norm.c b/source/thead_rvv/fp32/layer_norm.c index 0d54138d..6761c179 100644 --- a/source/thead_rvv/fp32/layer_norm.c +++ b/source/thead_rvv/fp32/layer_norm.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* note: support flexible vlen diff --git a/source/thead_rvv/fp32/leaky_relu.c b/source/thead_rvv/fp32/leaky_relu.c index 52194234..e435eab7 100644 --- a/source/thead_rvv/fp32/leaky_relu.c +++ b/source/thead_rvv/fp32/leaky_relu.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* note: VLEN = 128/256 ... diff --git a/source/thead_rvv/fp32/matmul.c b/source/thead_rvv/fp32/matmul.c index deeb1033..d48af9b4 100644 --- a/source/thead_rvv/fp32/matmul.c +++ b/source/thead_rvv/fp32/matmul.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" #define MATMUL_M_BLK 32 #define MATMUL_K_BLK 64 @@ -26,6 +26,13 @@ int shl_rvv_matmul_block_fp32(struct csinn_tensor *mat0, struct csinn_tensor *ma struct csinn_tensor *output, struct csinn_matmul_params *params, const int M_BLK, const int K_BLK, const int N_BLK) { + if (mat0->layout >= CSINN_LAYOUT_NC1C0 && mat0->layout <= CSINN_LAYOUT_NC1DHWC0) { + shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp32(mat0); + } + if (mat1->layout >= CSINN_LAYOUT_NC1C0 && mat1->layout <= CSINN_LAYOUT_NC1DHWC0) { + shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp32(mat1); + } + float *mat0_data = (float *)mat0->data; float *mat1_data = (float *)mat1->data; float *output_data = (float *)output->data; @@ -37,15 +44,17 @@ int shl_rvv_matmul_block_fp32(struct csinn_tensor *mat0, struct csinn_tensor *ma /* compute the outer size */ for (int i = 0; i < dims_count - 2; i++) { batches_a *= mat0->dim[i]; + } + for (int i = 0; i < mat1->dim_count - 2; i++) { batches_b *= mat1->dim[i]; } const int dim_m = mat0->dim[dims_count - (params->trans_a ? 1 : 2)]; const int dim_k = mat0->dim[dims_count - (params->trans_a ? 2 : 1)]; - const int dim_n = mat1->dim[dims_count - (params->trans_b ? 2 : 1)]; + const int dim_n = mat1->dim[mat1->dim_count - (params->trans_b ? 
2 : 1)]; - if (batches_a == batches_b) { - if (!params->trans_a && !params->trans_b) { + if (!params->trans_a && !params->trans_b) { + if (batches_a == batches_b) { float *in0 = (float *)shl_mem_alloc(dim_m * dim_k * sizeof(float)); float *in1; if (!(mat1->is_const)) { @@ -72,11 +81,7 @@ int shl_rvv_matmul_block_fp32(struct csinn_tensor *mat0, struct csinn_tensor *ma if (!(mat1->is_const)) { shl_mem_free(in1); } - } else { - shl_ref_matmul_quant(mat0, mat1, output, params); - } - } else if (batches_a > 1 && batches_b == 1) { - if (!params->trans_a && !params->trans_b) { + } else if (batches_a > 1 && batches_b == 1) { float *in0 = (float *)shl_mem_alloc(dim_m * dim_k * sizeof(float)); float *in1; if (!(mat1->is_const)) { @@ -105,8 +110,7 @@ int shl_rvv_matmul_block_fp32(struct csinn_tensor *mat0, struct csinn_tensor *ma return CSINN_FALSE; } } else { - shl_debug_error("matmul unsupported this broadcast\n"); - return CSINN_FALSE; + return shl_ref_matmul_quant(mat0, mat1, output, params); } return CSINN_TRUE; @@ -116,7 +120,7 @@ int shl_rvv_matmul_block_fp32(struct csinn_tensor *mat0, struct csinn_tensor *ma * packn = vlenb / sizeof(__fp16) * src: [k, n] * dst: [n/n_blk, k/k_blk, n_blk/pack2n, k_blk, pack2n] - * n_blk: N_BLK, N_BLK/2, N_BLK/4, ..., pack2n + * n_blk: N_BLK, N_tail * k_blk: K_BLK, K_tail ************************************************************/ void shl_rvv_matmul_reorder_weight_fp32(struct csinn_tensor *mat1, const int K_BLK, const int N_BLK) @@ -151,18 +155,21 @@ int shl_rvv_matmul_init_fp32(struct csinn_tensor *mat0, struct csinn_tensor *mat struct csinn_tensor *output, struct csinn_matmul_params *params) { struct csinn_callback *cb = params->base.cb; - if (mat0->dtype == CSINN_DTYPE_FLOAT32) { - if (mat1->dtype == CSINN_DTYPE_FLOAT32) { - if (mat1->is_const) { - shl_rvv_matmul_reorder_weight_fp32(mat1, MATMUL_K_BLK, MATMUL_N_BLK); + if (!params->trans_a && !params->trans_b) { + if (mat0->dtype == CSINN_DTYPE_FLOAT32) { + if (mat1->dtype == 
CSINN_DTYPE_FLOAT32) { + if (mat1->is_const) { + shl_rvv_matmul_reorder_weight_fp32(mat1, MATMUL_K_BLK, MATMUL_N_BLK); + } + cb->exec = shl_rvv_matmul_fp32; } - cb->exec = shl_rvv_matmul_fp32; - } else { - shl_debug_error("mat1 unsupported dtype: %d\n", mat1->dtype); - return CSINN_FALSE; } - } else { - shl_debug_error("mat0 unsupported dtype: %d\n", mat0->dtype); + } + if (cb->exec == NULL) { + shl_debug_warning( + "matmul is not optimized to achieve under this condition, call reference func " + "replaced.\n"); + cb->exec = shl_ref_matmul_quant; } return CSINN_TRUE; } diff --git a/source/thead_rvv/fp32/maxpool.c b/source/thead_rvv/fp32/maxpool.c index d100fa44..2af93ed0 100644 --- a/source/thead_rvv/fp32/maxpool.c +++ b/source/thead_rvv/fp32/maxpool.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" int shl_rvv_maxpool2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_pool_params *params) @@ -47,6 +47,8 @@ int shl_rvv_maxpool2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor if (shl_is_first_layer_input(input, sess)) { elempack = 1; } + } else if (sess->base_run_mode == CSINN_RM_LAYER) { + elempack = in_c % packn == 0 ? packn : 1; } // global maxpool2d // TODO: remove diff --git a/source/thead_rvv/fp32/maxpool_2x2_fp32.c b/source/thead_rvv/fp32/maxpool_2x2_fp32.c index 860223b2..81d1a5c3 100644 --- a/source/thead_rvv/fp32/maxpool_2x2_fp32.c +++ b/source/thead_rvv/fp32/maxpool_2x2_fp32.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* note: VLEN = 128/256 diff --git a/source/thead_rvv/fp32/maxpool_2x2_fp32_packn.c b/source/thead_rvv/fp32/maxpool_2x2_fp32_packn.c index b95d0934..9891be21 100644 --- a/source/thead_rvv/fp32/maxpool_2x2_fp32_packn.c +++ b/source/thead_rvv/fp32/maxpool_2x2_fp32_packn.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * note: support flexible vlen diff --git a/source/thead_rvv/fp32/maxpool_3x3_fp32.c b/source/thead_rvv/fp32/maxpool_3x3_fp32.c index af874a71..bb0f8d8a 100644 --- a/source/thead_rvv/fp32/maxpool_3x3_fp32.c +++ b/source/thead_rvv/fp32/maxpool_3x3_fp32.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* note: VLEN = 128/256 diff --git a/source/thead_rvv/fp32/maxpool_3x3_fp32_packn.c b/source/thead_rvv/fp32/maxpool_3x3_fp32_packn.c index 432e23b2..032a2c3b 100644 --- a/source/thead_rvv/fp32/maxpool_3x3_fp32_packn.c +++ b/source/thead_rvv/fp32/maxpool_3x3_fp32_packn.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * note: support flexible vlen diff --git a/source/thead_rvv/fp32/maxpool_fp32_nhwc.c b/source/thead_rvv/fp32/maxpool_fp32_nhwc.c index e4ab6623..2cd8752f 100644 --- a/source/thead_rvv/fp32/maxpool_fp32_nhwc.c +++ b/source/thead_rvv/fp32/maxpool_fp32_nhwc.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * note: support flexible vlen diff --git a/source/thead_rvv/fp32/maxpool_fp32_packn.c b/source/thead_rvv/fp32/maxpool_fp32_packn.c index b4844d72..aad9e9ef 100644 --- a/source/thead_rvv/fp32/maxpool_fp32_packn.c +++ b/source/thead_rvv/fp32/maxpool_fp32_packn.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * note: support flexible vlen diff --git a/source/thead_rvv/fp32/mul.c b/source/thead_rvv/fp32/mul.c index 3493041d..39a1279d 100644 --- a/source/thead_rvv/fp32/mul.c +++ b/source/thead_rvv/fp32/mul.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" static void elementwise_mul_fp32(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output) @@ -58,6 +58,45 @@ static void broadcast_single_1_mul_fp32(struct csinn_tensor *input0, struct csin } } +static inline void mul_vv_f32m4(float *in0, float *in1, float *out, int32_t size) +{ + while (size > 0) { + int vl = vsetvl_e32m4(size); + vfloat32m4_t _a = vle32_v_f32m4(in0, vl); + vfloat32m4_t _b = vle32_v_f32m4(in1, vl); + vfloat32m4_t _c = vfmul_vv_f32m4(_a, _b, vl); + vse32_v_f32m4(out, _c, vl); + in0 += vl; + in1 += vl; + out += vl; + size -= vl; + } +} + +static inline void mul_vf_f32m4(float *in0, float *in1, float *out, int32_t size) +{ + while (size > 0) { + int vl = vsetvl_e32m4(size); + vfloat32m4_t _a = vle32_v_f32m4(in0, vl); + vfloat32m4_t _c = vfmul_vf_f32m4(_a, in1[0], vl); + vse32_v_f32m4(out, _c, vl); + in0 += vl; + out += vl; + size -= vl; + } +} + +static inline void mul_fv_f32m4(float *in0, float *in1, float *out, int32_t size) +{ + mul_vf_f32m4(in1, in0, out, size); +} + +void *mul_cb_fp32[] = { + [CSINN_BROADCAST_VV] = mul_vv_f32m4, + [CSINN_BROADCAST_VS] = mul_vf_f32m4, + [CSINN_BROADCAST_SV] = mul_fv_f32m4, +}; + int shl_rvv_mul_fp32(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, struct csinn_diso_params *params) { @@ -83,8 +122,7 @@ int shl_rvv_mul_fp32(struct csinn_tensor *input0, struct csinn_tensor *input1, } broadcast_single_1_mul_fp32(input0, input1, output); } else { - /* TODO: recursive opt */ - return shl_ref_mul_quant(input0, input1, output, params); + 
return shl_rvv_binary_op_broadcast_fp32(input0, input1, output, mul_cb_fp32); } return CSINN_TRUE; } diff --git a/source/thead_rvv/fp32/pad.c b/source/thead_rvv/fp32/pad.c index d859146f..95acb04d 100644 --- a/source/thead_rvv/fp32/pad.c +++ b/source/thead_rvv/fp32/pad.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * params: diff --git a/source/thead_rvv/fp32/prelu.c b/source/thead_rvv/fp32/prelu.c index c928e3ef..d53b0f6c 100644 --- a/source/thead_rvv/fp32/prelu.c +++ b/source/thead_rvv/fp32/prelu.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" int shl_rvv_prelu_fp32(struct csinn_tensor *input, struct csinn_tensor *alpha, struct csinn_tensor *output, struct csinn_prelu_params *params) diff --git a/source/thead_rvv/fp32/relu.c b/source/thead_rvv/fp32/relu.c index 7619b3d4..5a854275 100644 --- a/source/thead_rvv/fp32/relu.c +++ b/source/thead_rvv/fp32/relu.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* note: VLEN = 128/256 ... diff --git a/source/thead_rvv/fp32/relu6.c b/source/thead_rvv/fp32/relu6.c index 2a44c59d..3f5e5648 100644 --- a/source/thead_rvv/fp32/relu6.c +++ b/source/thead_rvv/fp32/relu6.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* note: VLEN = 128/256 ... diff --git a/source/thead_rvv/fp32/reshape.c b/source/thead_rvv/fp32/reshape.c index e1cd84e6..5d476ece 100644 --- a/source/thead_rvv/fp32/reshape.c +++ b/source/thead_rvv/fp32/reshape.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* note: support flexible vlen @@ -28,7 +28,7 @@ int shl_rvv_reshape_fp32(struct csinn_tensor *input, struct csinn_tensor *output float *output_data = (float *)output->data; shl_gref_reshape_infer_shape(input, output, params); - if (input->layout >= CSINN_LAYOUT_NC1WC0 && input->layout <= CSINN_LAYOUT_NC1DHWC0) { + if (input->layout >= CSINN_LAYOUT_NC1C0 && input->layout <= CSINN_LAYOUT_NC1DHWC0) { const int packn = csrr_vlenb() / sizeof(float); // = input->dim[input->dim_count - 1] const int vl = vsetvl_e32m1(packn); int outer_size = input->dim[0] * input->dim[1]; // batch fuse to outer diff --git a/source/thead_rvv/fp32/sigmoid.c b/source/thead_rvv/fp32/sigmoid.c index 310afb6e..68829ebd 100644 --- a/source/thead_rvv/fp32/sigmoid.c +++ b/source/thead_rvv/fp32/sigmoid.c @@ -16,8 +16,8 @@ * limitations under the License. */ +#include "rvv/rvv.h" #include "rvv_mathfun_fp32.h" -#include "shl_thead_rvv.h" int shl_rvv_sigmoid_fp32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_sigmoid_params *params) diff --git a/source/thead_rvv/fp32/softmax.c b/source/thead_rvv/fp32/softmax.c index e4b1171a..d67c1dce 100644 --- a/source/thead_rvv/fp32/softmax.c +++ b/source/thead_rvv/fp32/softmax.c @@ -16,8 +16,8 @@ * limitations under the License. */ +#include "rvv/rvv.h" #include "rvv_mathfun_fp32.h" -#include "shl_thead_rvv.h" static inline float fast_exp32(float y) { diff --git a/source/thead_rvv/fp32/sub.c b/source/thead_rvv/fp32/sub.c new file mode 100644 index 00000000..63d757dd --- /dev/null +++ b/source/thead_rvv/fp32/sub.c @@ -0,0 +1,72 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "rvv/rvv.h" + +static inline void sub_vv_f32m4(float *in0, float *in1, float *out, int32_t size) +{ + while (size > 0) { + int vl = vsetvl_e32m4(size); + vfloat32m4_t _a = vle32_v_f32m4(in0, vl); + vfloat32m4_t _b = vle32_v_f32m4(in1, vl); + vfloat32m4_t _c = vfsub_vv_f32m4(_a, _b, vl); + vse32_v_f32m4(out, _c, vl); + in0 += vl; + in1 += vl; + out += vl; + size -= vl; + } +} + +static inline void sub_vf_f32m4(float *in0, float *in1, float *out, int32_t size) +{ + while (size > 0) { + int vl = vsetvl_e32m4(size); + vfloat32m4_t _a = vle32_v_f32m4(in0, vl); + vfloat32m4_t _c = vfsub_vf_f32m4(_a, in1[0], vl); + vse32_v_f32m4(out, _c, vl); + in0 += vl; + out += vl; + size -= vl; + } +} + +static inline void sub_fv_f32m4(float *in0, float *in1, float *out, int32_t size) +{ + while (size > 0) { + int vl = vsetvl_e32m4(size); + vfloat32m4_t _b = vle32_v_f32m4(in1, vl); + vfloat32m4_t _c = vfrsub_vf_f32m4(_b, in0[0], vl); + vse32_v_f32m4(out, _c, vl); + in1 += vl; + out += vl; + size -= vl; + } +} + +void *sub_cb_fp32[] = { + [CSINN_BROADCAST_VV] = sub_vv_f32m4, + [CSINN_BROADCAST_VS] = sub_vf_f32m4, + [CSINN_BROADCAST_SV] = sub_fv_f32m4, +}; + +int shl_rvv_sub_fp32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) +{ + return shl_rvv_binary_op_broadcast_fp32(input0, input1, output, sub_cb_fp32); +} diff --git a/source/thead_rvv/fp32/transpose.c b/source/thead_rvv/fp32/transpose.c index 91d74b4a..1a864628 100644 --- a/source/thead_rvv/fp32/transpose.c +++ 
b/source/thead_rvv/fp32/transpose.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" static void transpose_021_fp32(float *src, float *dst, int batch, int inner_size, int outer_size) { @@ -89,9 +89,10 @@ static int transpose_tail_coincide_fp32(struct csinn_tensor *input, struct csinn int shl_rvv_transpose_fp32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_transpose_params *params) { - if (input->layout >= CSINN_LAYOUT_NC1WC0 && input->layout <= CSINN_LAYOUT_NC1DHWC0) { - return shl_ref_transpose_quant(input, output, params); + if (input->layout >= CSINN_LAYOUT_NC1C0 && input->layout <= CSINN_LAYOUT_NC1DHWC0) { + shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp32(input); } + if (params->permute_num == 4 && params->permute[0] == 0 && params->permute[1] == 1 && params->permute[2] == 2 && params->permute[3] == 3) { float *input_data = (float *)input->data; diff --git a/source/thead_rvv/int4/convolution.c b/source/thead_rvv/int4/convolution.c index c1653771..0f518569 100644 --- a/source/thead_rvv/int4/convolution.c +++ b/source/thead_rvv/int4/convolution.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" #ifdef SHL_USE_DOT_INT4 int shl_rvv_conv2d_init_int4(struct csinn_tensor *input, struct csinn_tensor *output, @@ -31,8 +31,8 @@ int shl_rvv_conv2d_init_int4(struct csinn_tensor *input, struct csinn_tensor *ou int32_t kernel_w = kernel->dim[3]; int32_t stride_h = params->stride_height; int32_t stride_w = params->stride_width; - int32_t dalition_h = params->dilation_height; - int32_t dalition_w = params->dilation_width; + int32_t dilation_h = params->dilation_height; + int32_t dilation_w = params->dilation_width; struct csinn_callback *cb = params->base.cb; // xxx: only int4 support nhwc layout now @@ -43,8 +43,8 @@ int shl_rvv_conv2d_init_int4(struct csinn_tensor *input, struct csinn_tensor *ou in_w = input->dim[2]; kernel_h = kernel->dim[1]; kernel_w = kernel->dim[2]; - if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && - dalition_w == 1) { + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && + dilation_w == 1) { params->conv_extra.conv_mode = CSINN_GEMM; if (input->dtype == CSINN_DTYPE_INT4) { params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); diff --git a/source/thead_rvv/int4/convolution_1x1_int4.c b/source/thead_rvv/int4/convolution_1x1_int4.c index 9ed9e074..6d927c41 100644 --- a/source/thead_rvv/int4/convolution_1x1_int4.c +++ b/source/thead_rvv/int4/convolution_1x1_int4.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" #ifdef SHL_USE_DOT_INT4 // kernel_layout: [o, h, w, i] void shl_rvv_conv1x1s1_gemm_reorder_kernel_int4(struct csinn_tensor *kernel, diff --git a/source/thead_rvv/int4/convolution_1x1_int4_packn.c b/source/thead_rvv/int4/convolution_1x1_int4_packn.c index 05579eef..ff34dabf 100644 --- a/source/thead_rvv/int4/convolution_1x1_int4_packn.c +++ b/source/thead_rvv/int4/convolution_1x1_int4_packn.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" #ifdef SHL_USE_DOT_INT4 void shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_int4(struct csinn_tensor *kernel, struct csinn_conv2d_params *params) @@ -75,8 +75,7 @@ int shl_rvv_conv1x1s1_gemm_packn_int4(struct csinn_tensor *input, struct csinn_t shl_rvv_reorder_input_z12_packn_int8_dot(input_ncxhwx, pb_reorder, k, n, n); shl_rvv_ncxhwx_gemm_12xpackn_int8_dot(output_ncxhwx, kernel_ptr, in_ptr, bias_ptr, m, k, - n, n, output->qinfo->zero_point, multiplier, - shift); + n, output->qinfo->zero_point, multiplier, shift); shl_rvv_reorder_input_packnto1_int8(output_ncxhwx, output_data, m, out_h, out_w); diff --git a/source/thead_rvv/int4/convolution_gemm_int4.c b/source/thead_rvv/int4/convolution_gemm_int4.c index 92e1c1a7..3054ded8 100644 --- a/source/thead_rvv/int4/convolution_gemm_int4.c +++ b/source/thead_rvv/int4/convolution_gemm_int4.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" #ifdef SHL_USE_DOT_INT4 void shl_rvv_conv_im2col_gemm_reorder_kernel_int4(struct csinn_tensor *kernel, struct csinn_conv2d_params *params) diff --git a/source/thead_rvv/int4/convolution_gemm_int4_packn.c b/source/thead_rvv/int4/convolution_gemm_int4_packn.c index 95bfb5be..3fbdc360 100644 --- a/source/thead_rvv/int4/convolution_gemm_int4_packn.c +++ b/source/thead_rvv/int4/convolution_gemm_int4_packn.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" #ifdef SHL_USE_DOT_INT4 /************************************************************* * packn = vlenb / sizeof(int8_t) / 2 diff --git a/source/thead_rvv/int4/depthwise_convolution.c b/source/thead_rvv/int4/depthwise_convolution.c index a406e44f..1a40872a 100644 --- a/source/thead_rvv/int4/depthwise_convolution.c +++ b/source/thead_rvv/int4/depthwise_convolution.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" int shl_rvv_depthwise_conv2d_init_int4(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, struct csinn_tensor *bias, diff --git a/source/thead_rvv/int4/depthwise_convolution_3x3_int4.c b/source/thead_rvv/int4/depthwise_convolution_3x3_int4.c index bf567cae..a25e0278 100644 --- a/source/thead_rvv/int4/depthwise_convolution_3x3_int4.c +++ b/source/thead_rvv/int4/depthwise_convolution_3x3_int4.c @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" static vint8m1_t requantize_m4(vint32m4_t _src, int32_t multiplier, int32_t shift, int32_t out_zp, int vl) diff --git a/source/thead_rvv/int4/fullyconnected_int4.c b/source/thead_rvv/int4/fullyconnected_int4.c index b64b5d4d..906b16ca 100644 --- a/source/thead_rvv/int4/fullyconnected_int4.c +++ b/source/thead_rvv/int4/fullyconnected_int4.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" #ifdef SHL_USE_DOT_INT4 /************************************************************* note: VLEN = 128/256 diff --git a/source/thead_rvv/int4/gemm_int4_dot.c b/source/thead_rvv/int4/gemm_int4_dot.c index 0258d293..2c1d8773 100644 --- a/source/thead_rvv/int4/gemm_int4_dot.c +++ b/source/thead_rvv/int4/gemm_int4_dot.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" #ifdef SHL_USE_DOT_INT4 static vint8mf4_t requantize_m2(vint32m2_t _src, int32_t multiplier, int32_t shift, int32_t out_zp, int vl) diff --git a/source/thead_rvv/int4/gemm_int4_dot_packn.c b/source/thead_rvv/int4/gemm_int4_dot_packn.c index 4a3a21ee..fa3d01b9 100644 --- a/source/thead_rvv/int4/gemm_int4_dot_packn.c +++ b/source/thead_rvv/int4/gemm_int4_dot_packn.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" #ifdef SHL_USE_DOT_INT4 /************************************************************* * note: VLEN = 128/256 ... flexible vlen diff --git a/source/thead_rvv/int8/add.c b/source/thead_rvv/int8/add.c index ef19c899..fa0928ec 100644 --- a/source/thead_rvv/int8/add.c +++ b/source/thead_rvv/int8/add.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* note: VLEN = 128/256 @@ -154,6 +154,92 @@ static void broadcast_single_1_add_int8(struct csinn_tensor *input0, struct csin } } +static inline void add_vv_i8_trans_f16(int8_t *in0, int8_t *in1, int8_t *out, int32_t size, + float *scale, int32_t *zero_point) +{ + float s0_s2 = scale[0] / scale[2]; + float s1_s2 = scale[1] / scale[2]; + int32_t z0 = zero_point[0]; + int32_t z1 = zero_point[1]; + int32_t z2 = zero_point[2]; + + while (size > 0) { + int vl = vsetvl_e8m1(size); + vint8m1_t _a = vle8_v_i8m1(in0, vl); + vint8m1_t _b = vle8_v_i8m1(in1, vl); + vint16m2_t _a_w = vwsub_vx_i16m2(_a, z0, vl); + vint16m2_t _b_w = vwsub_vx_i16m2(_b, z1, vl); + vfloat16m2_t _a_f = vfcvt_f_x_v_f16m2(_a_w, vl); + vfloat16m2_t _b_f = vfcvt_f_x_v_f16m2(_b_w, vl); + vfloat16m2_t _tmp0 = vfmul_vf_f16m2(_a_f, s0_s2, vl); // s0/s2(q0-z0) + vfloat16m2_t _tmp1 = vfmul_vf_f16m2(_b_f, s1_s2, vl); // s1/s2(q1-z1) + vfloat16m2_t _sumf = vfadd_vv_f16m2(_tmp0, _tmp1, vl); + vint16m2_t _res = vfcvt_x_f_v_i16m2(_sumf, vl); + _res = vadd_vx_i16m2(_res, z2, vl); + vse8_v_i8m1(out, vnclip_wx_i8m1(_res, 0, vl), vl); + in0 += vl; + in1 += vl; + out += vl; + size -= vl; + } +} + +static inline void add_vx_i8_trans_f16(int8_t *in0, int8_t *in1, int8_t *out, int32_t size, + float *scale, int32_t *zero_point) +{ + float s0_s2 = scale[0] / scale[2]; + int32_t z0 = zero_point[0]; + int32_t z1 = zero_point[1]; + int32_t z2 = zero_point[2]; + float q1_z1 = scale[1] / scale[2] * (in1[0] - z1); // 
s1/s2(q1-z1) + float q1_z1_z2 = q1_z1 + z2; + + while (size > 0) { + int vl = vsetvl_e8m1(size); + vint8m1_t _a = vle8_v_i8m1(in0, vl); + vint16m2_t _a_w = vwsub_vx_i16m2(_a, z0, vl); + vfloat16m2_t _a_f = vfcvt_f_x_v_f16m2(_a_w, vl); + vfloat16m2_t _tmp0 = vfmul_vf_f16m2(_a_f, s0_s2, vl); // s0/s2(q0-z0) + vfloat16m2_t _sumf = vfadd_vf_f16m2(_tmp0, q1_z1_z2, vl); + vint16m2_t _res = vfcvt_x_f_v_i16m2(_sumf, vl); + vse8_v_i8m1(out, vnclip_wx_i8m1(_res, 0, vl), vl); + in0 += vl; + out += vl; + size -= vl; + } +} + +static inline void add_xv_i8_trans_f16(int8_t *in0, int8_t *in1, int8_t *out, int32_t size, + float *scale, int32_t *zero_point) +{ + float s1_s2 = scale[1] / scale[2]; + int32_t z0 = zero_point[0]; + int32_t z1 = zero_point[1]; + int32_t z2 = zero_point[2]; + float q0_z0 = scale[0] / scale[2] * (in0[0] - z0); // s0/s2(q0-z0) + float q0_z0_z2 = q0_z0 + z2; + + while (size > 0) { + int vl = vsetvl_e8m1(size); + vint8m1_t _b = vle8_v_i8m1(in1, vl); + vint16m2_t _b_w = vwsub_vx_i16m2(_b, z1, vl); + vfloat16m2_t _b_f = vfcvt_f_x_v_f16m2(_b_w, vl); + vfloat16m2_t _tmp1 = vfmul_vf_f16m2(_b_f, s1_s2, vl); // s1/s2(q1-z1) + vfloat16m2_t _sumf = vfadd_vf_f16m2(_tmp1, q0_z0_z2, vl); + vint16m2_t _res = vfcvt_x_f_v_i16m2(_sumf, vl); + vse8_v_i8m1(out, vnclip_wx_i8m1(_res, 0, vl), vl); + in1 += vl; + out += vl; + size -= vl; + } +} + +void *add_cb_int8[] = { + [CSINN_BROADCAST_VV] = add_vv_i8_trans_f16, + [CSINN_BROADCAST_VS] = add_vx_i8_trans_f16, + [CSINN_BROADCAST_SV] = add_xv_i8_trans_f16, +}; + int shl_rvv_add_int8(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, struct csinn_diso_params *params) { @@ -185,8 +271,7 @@ int shl_rvv_add_int8(struct csinn_tensor *input0, struct csinn_tensor *input1, } broadcast_single_1_add_int8(input0, input1, output); } else { - /* TODO: recursive opt */ - return shl_ref_add_quant(input0, input1, output, params); + return shl_rvv_binary_op_broadcast_int8(input0, input1, output, add_cb_int8); } 
return CSINN_TRUE; } diff --git a/source/thead_rvv/int8/avgpool.c b/source/thead_rvv/int8/avgpool.c index 311bba62..6dbf2c5a 100644 --- a/source/thead_rvv/int8/avgpool.c +++ b/source/thead_rvv/int8/avgpool.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" int shl_rvv_avgpool2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_pool_params *params) @@ -47,6 +47,8 @@ int shl_rvv_avgpool2d_init_int8(struct csinn_tensor *input, struct csinn_tensor if (shl_is_first_layer_input(input, sess)) { elempack = 1; } + } else if (sess->base_run_mode == CSINN_RM_LAYER) { + elempack = in_c % packn == 0 ? packn : 1; } // global avgpool2d diff --git a/source/thead_rvv/int8/avgpool_2x2_int8_packn.c b/source/thead_rvv/int8/avgpool_2x2_int8_packn.c index 85969387..e3248e98 100644 --- a/source/thead_rvv/int8/avgpool_2x2_int8_packn.c +++ b/source/thead_rvv/int8/avgpool_2x2_int8_packn.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * s2 * (q2 - z2) = avgpool_2x2{ s1 * (q1 - z1) } diff --git a/source/thead_rvv/int8/avgpool_3x3_int8_packn.c b/source/thead_rvv/int8/avgpool_3x3_int8_packn.c index 1dffe264..4ece6c74 100644 --- a/source/thead_rvv/int8/avgpool_3x3_int8_packn.c +++ b/source/thead_rvv/int8/avgpool_3x3_int8_packn.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * s2 * (q2 - z2) = avgpool_3x3{ s1 * (q1 - z1) } diff --git a/source/thead_rvv/int8/avgpool_int8_nhwc.c b/source/thead_rvv/int8/avgpool_int8_nhwc.c index 424e0c25..3d36369c 100644 --- a/source/thead_rvv/int8/avgpool_int8_nhwc.c +++ b/source/thead_rvv/int8/avgpool_int8_nhwc.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * s2 * (q2 - z2) = avgpool_mxn{ s1 * (q1 - z1) } diff --git a/source/thead_rvv/int8/avgpool_int8_packn.c b/source/thead_rvv/int8/avgpool_int8_packn.c index 8cb211ae..959b144c 100644 --- a/source/thead_rvv/int8/avgpool_int8_packn.c +++ b/source/thead_rvv/int8/avgpool_int8_packn.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * s2 * (q2 - z2) = avgpool_mxn{ s1 * (q1 - z1) } diff --git a/source/thead_rvv/int8/clip.c b/source/thead_rvv/int8/clip.c index c00422fa..6c3ee023 100644 --- a/source/thead_rvv/int8/clip.c +++ b/source/thead_rvv/int8/clip.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * note: support flexible vlen diff --git a/source/thead_rvv/int8/concat.c b/source/thead_rvv/int8/concat.c index 3dd6b727..f8198397 100644 --- a/source/thead_rvv/int8/concat.c +++ b/source/thead_rvv/int8/concat.c @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" static int shl_rvv_concat_ndarray_int8(struct csinn_tensor **input, struct csinn_tensor *output, struct csinn_concat_params *params) diff --git a/source/thead_rvv/int8/convolution.c b/source/thead_rvv/int8/convolution.c index 044c5e18..f2a033c2 100644 --- a/source/thead_rvv/int8/convolution.c +++ b/source/thead_rvv/int8/convolution.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" int shl_rvv_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, struct csinn_tensor *bias, @@ -30,8 +30,8 @@ int shl_rvv_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *ou int32_t kernel_w = kernel->dim[3]; int32_t stride_h = params->stride_height; int32_t stride_w = params->stride_width; - int32_t dalition_h = params->dilation_height; - int32_t dalition_w = params->dilation_width; + int32_t dilation_h = params->dilation_height; + int32_t dilation_w = params->dilation_width; struct csinn_callback *cb = params->base.cb; if (params->base.quant_type != CSINN_QUANT_INT8_ASYM_W_SYM) { @@ -53,18 +53,21 @@ int shl_rvv_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *ou if (shl_is_first_layer_input(input, sess)) { in_elempack = 1; } + } else if (sess->base_run_mode == CSINN_RM_LAYER) { + in_elempack = in_c % packn == 0 ? packn : 1; + out_elempack = out_c % packn == 0 ? 
packn : 1; } // packn if (in_elempack % packn == 0 && out_elempack % packn == 0) { - if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && - dalition_w == 1) { + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && + dilation_w == 1) { params->conv_extra.conv_mode = CSINN_GEMM; params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_int8(kernel, params); cb->exec = shl_rvv_conv1x1s1_gemm_packn_int8; } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && - dalition_h == 1 && dalition_w == 1) { + dilation_h == 1 && dilation_w == 1) { if (params->group > 1) { params->conv_extra.conv_mode = CSINN_GEMM; params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); @@ -89,8 +92,8 @@ int shl_rvv_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *ou if (in_elempack % packn != 0 && out_elempack % packn == 0) { params->conv_extra.conv_mode = CSINN_GEMM; params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); - if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && - dalition_w == 1) { + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && + dilation_w == 1) { shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_int8(kernel, params); cb->exec = shl_rvv_conv1x1s1_gemm_pack1ton_int8; } else { @@ -103,8 +106,8 @@ int shl_rvv_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *ou if (in_elempack % packn == 0 && out_elempack % packn != 0) { params->conv_extra.conv_mode = CSINN_GEMM; params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); - if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && - dalition_w == 1) { + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && + dilation_w == 1) { shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_int8(kernel, params); cb->exec = 
shl_rvv_conv1x1s1_gemm_packnto1_int8; } else { @@ -117,8 +120,8 @@ int shl_rvv_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *ou if (in_elempack % packn != 0 && out_elempack % packn != 0) { params->conv_extra.conv_mode = CSINN_GEMM; params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); - if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && - dalition_w == 1) { + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && + dilation_w == 1) { shl_rvv_conv1x1s1_gemm_reorder_kernel_int8(kernel, params); cb->exec = shl_rvv_conv1x1s1_gemm_int8; } else { diff --git a/source/thead_rvv/int8/convolution1d.c b/source/thead_rvv/int8/convolution1d.c index 1594a6b6..34a57a6a 100644 --- a/source/thead_rvv/int8/convolution1d.c +++ b/source/thead_rvv/int8/convolution1d.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" // TODO: support nwc layout int shl_rvv_conv1d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, @@ -27,7 +27,7 @@ int shl_rvv_conv1d_init_int8(struct csinn_tensor *input, struct csinn_tensor *ou int32_t in_c = kernel->dim[1]; int32_t kernel_w = kernel->dim[2]; int32_t stride_w = params->stride_width; - int32_t dalition_w = params->dilation_width; + int32_t dilation_w = params->dilation_width; int32_t group = params->group; struct csinn_callback *cb = params->base.cb; @@ -37,7 +37,7 @@ int shl_rvv_conv1d_init_int8(struct csinn_tensor *input, struct csinn_tensor *ou } if (group == 1) { - if (kernel_w == 1 && stride_w == 1 && dalition_w == 1) { + if (kernel_w == 1 && stride_w == 1 && dilation_w == 1) { // enable fuse zeropoint to bias for gemm if (CSINN_TRUE) { int32_t *bias_data = (int32_t *)bias->data; diff --git a/source/thead_rvv/int8/convolution1d_1_int8.c b/source/thead_rvv/int8/convolution1d_1_int8.c index 249760dd..4d555b22 100644 --- a/source/thead_rvv/int8/convolution1d_1_int8.c +++ 
b/source/thead_rvv/int8/convolution1d_1_int8.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" void shl_rvv_conv1d_gemm_reorder_kernel_int8(struct csinn_tensor *kernel, struct csinn_conv1d_params *params) @@ -40,6 +40,10 @@ int shl_rvv_conv1d_gemm_int8(struct csinn_tensor *input, struct csinn_tensor *ou struct csinn_tensor *kernel, struct csinn_tensor *bias, struct csinn_conv1d_params *params) { + if (input->layout == CSINN_LAYOUT_NC1WC0) { + shl_rvv_tensor_nc1xc0_to_ndarray_replace_int8(input); + } + int8_t *input_data = (int8_t *)input->data; int8_t *output_data = (int8_t *)output->data; int8_t *kernel_data = (int8_t *)kernel->data; diff --git a/source/thead_rvv/int8/convolution_1x1_int8.c b/source/thead_rvv/int8/convolution_1x1_int8.c index 31243828..66142d14 100644 --- a/source/thead_rvv/int8/convolution_1x1_int8.c +++ b/source/thead_rvv/int8/convolution_1x1_int8.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" void shl_rvv_conv1x1s1_gemm_reorder_kernel_int8(struct csinn_tensor *kernel, struct csinn_conv2d_params *params) @@ -50,9 +50,13 @@ void shl_rvv_conv1x1s1_gemm_reorder_kernel_int8(struct csinn_tensor *kernel, // shl_mem_free(pa_reorder); } -int shl_rvv_conv1x1s1_gemm_int8(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_tensor *kernel, struct csinn_tensor *bias, - struct csinn_conv2d_params *params) +int shl_rvv_common_conv1x1_gemm_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + void (*reorder_input)(int8_t *, int8_t *, int, int, int), + void (*gemm)(int8_t *, const int8_t *, const int8_t *, + int32_t *, int, int, int, int, int32_t, int32_t *, + int32_t *)) { if (input->layout == CSINN_LAYOUT_NC1HWC0) { shl_rvv_tensor_nc1xc0_to_ndarray_replace_int8(input); @@ -102,14 +106,14 @@ int 
shl_rvv_conv1x1s1_gemm_int8(struct csinn_tensor *input, struct csinn_tensor #ifdef SHL_USE_DOT_INT8 int8_t *pa = kernel_data + g * m * k4; - shl_rvv_reorder_input_z8_int8_dot(input_data, pb, k, n, n); - shl_rvv_gemm_8x8_int8_dot(pc, pa, pb, bias_data + g * m, m, k4, n, n, - output->qinfo->zero_point, multiplier, shift); + reorder_input(input_data, pb, k, n, n); + gemm(pc, pa, pb, bias_data + g * m, m, k4, n, n, output->qinfo->zero_point, multiplier, + shift); #else int8_t *pa = kernel_data + g * m * k; - shl_rvv_reorder_input_z16_int8_v128(input_data, pb, k, n, n); - shl_rvv_gemm_4x16_int8_v128(pc, pa, pb, bias_data + g * m, m, k, n, n, - output->qinfo->zero_point, multiplier, shift); + reorder_input(input_data, pb, k, n, n); + gemm(pc, pa, pb, bias_data + g * m, m, k, n, n, output->qinfo->zero_point, multiplier, + shift); #endif // SHL_USE_DOT_INT8 input_data += k * n; output_data += m * n; @@ -120,3 +124,18 @@ int shl_rvv_conv1x1s1_gemm_int8(struct csinn_tensor *input, struct csinn_tensor shl_mem_free(shift); return CSINN_TRUE; } + +int shl_rvv_conv1x1s1_gemm_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ +#ifdef SHL_USE_DOT_INT8 + return shl_rvv_common_conv1x1_gemm_int8(input, output, kernel, bias, params, + shl_rvv_reorder_input_z8_int8_dot, + shl_rvv_gemm_8x8_int8_dot); +#else + return shl_rvv_common_conv1x1_gemm_int8(input, output, kernel, bias, params, + shl_rvv_reorder_input_z16_int8_v128, + shl_rvv_gemm_4x16_int8_v128); +#endif // SHL_USE_DOT_INT8 +} diff --git a/source/thead_rvv/int8/convolution_1x1_int8_pack1ton.c b/source/thead_rvv/int8/convolution_1x1_int8_pack1ton.c index 8260efb2..116adbb0 100644 --- a/source/thead_rvv/int8/convolution_1x1_int8_pack1ton.c +++ b/source/thead_rvv/int8/convolution_1x1_int8_pack1ton.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************************************* * reorder kernel_data inplace, means the origin kernel_data be destoried. @@ -54,9 +54,12 @@ static void reorder_input_pack1ton_align4_int8(const int8_t *src, int8_t *dst, i } #endif // SHL_USE_DOT_INT8 -int shl_rvv_conv1x1s1_gemm_pack1ton_int8(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_tensor *kernel, struct csinn_tensor *bias, - struct csinn_conv2d_params *params) +int shl_rvv_common_conv1x1_gemm_pack1ton_int8( + struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, + struct csinn_tensor *bias, struct csinn_conv2d_params *params, + void (*reorder_input)(int8_t *, int8_t *, int, int, int, int), + void (*gemm)(int8_t *, const int8_t *, const int8_t *, int32_t *, int, int, int, int32_t, + int32_t *, int32_t *)) { if (input->layout == CSINN_LAYOUT_NC1HWC0) { shl_rvv_tensor_nc1xc0_to_ndarray_replace_int8(input); @@ -118,15 +121,15 @@ int shl_rvv_conv1x1s1_gemm_pack1ton_int8(struct csinn_tensor *input, struct csin #ifdef SHL_USE_DOT_INT8 int8_t *kernel_ptr = kernel_data + g * m * k4; reorder_input_pack1ton_align4_int8(input_data, input_ncxhwx, k, out_h, out_w); - shl_rvv_reorder_input_z12_pack1ton_int8_dot(input_ncxhwx, in_ptr, k4, 1, n, n); - shl_rvv_ncxhwx_gemm_12xpackn_int8_dot(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k4, n, - n, output->qinfo->zero_point, multiplier, shift); + reorder_input(input_ncxhwx, in_ptr, k4, 1, n, n); + gemm(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k4, n, output->qinfo->zero_point, + multiplier, shift); #else int8_t *kernel_ptr = kernel_data + g * m * k; shl_rvv_reorder_input_pack1ton_int8(input_data, input_ncxhwx, k, out_h, out_w); - shl_rvv_reorder_input_z4_pack1ton_int8(input_ncxhwx, in_ptr, k, 1, n, n); - shl_rvv_ncxhwx_gemm_4xpack2n_int8(output_data, kernel_ptr, in_ptr, bias_ptr, m, k, n, n, - output->qinfo->zero_point, multiplier, 
shift); + reorder_input(input_ncxhwx, in_ptr, k, 1, n, n); + gemm(output_data, kernel_ptr, in_ptr, bias_ptr, m, k, n, output->qinfo->zero_point, + multiplier, shift); #endif // SHL_USE_DOT_INT8 input_data += k * n; output_data += m * n; @@ -138,3 +141,18 @@ int shl_rvv_conv1x1s1_gemm_pack1ton_int8(struct csinn_tensor *input, struct csin shl_mem_free(input_ncxhwx); return CSINN_TRUE; } + +int shl_rvv_conv1x1s1_gemm_pack1ton_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ +#ifdef SHL_USE_DOT_INT8 + return shl_rvv_common_conv1x1_gemm_pack1ton_int8(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_pack1ton_int8_dot, + shl_rvv_ncxhwx_gemm_12xpackn_int8_dot); +#else + return shl_rvv_common_conv1x1_gemm_pack1ton_int8(input, output, kernel, bias, params, + shl_rvv_reorder_input_z4_pack1ton_int8, + shl_rvv_ncxhwx_gemm_4xpack2n_int8); +#endif // SHL_USE_DOT_INT8 +} diff --git a/source/thead_rvv/int8/convolution_1x1_int8_packn.c b/source/thead_rvv/int8/convolution_1x1_int8_packn.c index 4174550c..efb643c2 100644 --- a/source/thead_rvv/int8/convolution_1x1_int8_packn.c +++ b/source/thead_rvv/int8/convolution_1x1_int8_packn.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" void shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_int8(struct csinn_tensor *kernel, struct csinn_conv2d_params *params) @@ -24,9 +24,13 @@ void shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_int8(struct csinn_tensor *kerne shl_rvv_conv_im2col_gemm_reorder_kernel_packn_int8(kernel, params); } -int shl_rvv_conv1x1s1_gemm_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_tensor *kernel, struct csinn_tensor *bias, - struct csinn_conv2d_params *params) +int shl_rvv_common_conv1x1_gemm_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + void (*reorder_input)(int8_t *, int8_t *, int, int, int), + void (*gemm)(int8_t *, const int8_t *, const int8_t *, + int32_t *, int, int, int, int32_t, + int32_t *, int32_t *)) { if (input->layout == CSINN_LAYOUT_NCHW) { shl_rvv_tensor_ndarray_to_nc1xc0_replace_int8(input); @@ -77,15 +81,10 @@ int shl_rvv_conv1x1s1_gemm_packn_int8(struct csinn_tensor *input, struct csinn_t } } -#ifdef SHL_USE_DOT_INT8 - shl_rvv_reorder_input_z12_packn_int8_dot(input_data, pb_reorder, k, n, n); - shl_rvv_ncxhwx_gemm_12xpackn_int8_dot(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k, n, n, - output->qinfo->zero_point, multiplier, shift); -#else - shl_rvv_reorder_input_z4_packn_int8(input_data, pb_reorder, k, n, n); - shl_rvv_ncxhwx_gemm_4xpack2n_int8(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k, n, n, - output->qinfo->zero_point, multiplier, shift); -#endif // SHL_USE_DOT_INT8 + reorder_input(input_data, pb_reorder, k, n, n); + gemm(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k, n, output->qinfo->zero_point, + multiplier, shift); + input_data += k * n; output_data += m * n; } @@ -95,3 +94,18 @@ int shl_rvv_conv1x1s1_gemm_packn_int8(struct csinn_tensor *input, struct csinn_t shl_mem_free(shift); return CSINN_TRUE; } + +int shl_rvv_conv1x1s1_gemm_packn_int8(struct 
csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ +#ifdef SHL_USE_DOT_INT8 + return shl_rvv_common_conv1x1_gemm_packn_int8(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_packn_int8_dot, + shl_rvv_ncxhwx_gemm_12xpackn_int8_dot); +#else + return shl_rvv_common_conv1x1_gemm_packn_int8(input, output, kernel, bias, params, + shl_rvv_reorder_input_z4_packn_int8, + shl_rvv_ncxhwx_gemm_4xpack2n_int8); +#endif // SHL_USE_DOT_INT8 +} diff --git a/source/thead_rvv/int8/convolution_1x1_int8_packnto1.c b/source/thead_rvv/int8/convolution_1x1_int8_packnto1.c index a3d4647a..7880c34a 100644 --- a/source/thead_rvv/int8/convolution_1x1_int8_packnto1.c +++ b/source/thead_rvv/int8/convolution_1x1_int8_packnto1.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" void shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_int8(struct csinn_tensor *kernel, struct csinn_conv2d_params *params) @@ -24,9 +24,12 @@ void shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_int8(struct csinn_tensor *ke shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_int8(kernel, params); } -int shl_rvv_conv1x1s1_gemm_packnto1_int8(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_tensor *kernel, struct csinn_tensor *bias, - struct csinn_conv2d_params *params) +int shl_rvv_common_conv1x1_gemm_packnto1_int8( + struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, + struct csinn_tensor *bias, struct csinn_conv2d_params *params, + void (*reorder_input)(int8_t *, int8_t *, int, int, int), + void (*gemm)(int8_t *, const int8_t *, const int8_t *, int32_t *, int, int, int, int32_t, + int32_t *, int32_t *)) { if (input->layout == CSINN_LAYOUT_NCHW) { shl_rvv_tensor_ndarray_to_nc1xc0_replace_int8(input); @@ -72,16 +75,10 @@ int shl_rvv_conv1x1s1_gemm_packnto1_int8(struct csinn_tensor *input, struct csin } } 
-#ifdef SHL_USE_DOT_INT8 - shl_rvv_reorder_input_z12_packn_int8_dot(input_data, pb_reorder, k, n, n); - shl_rvv_ncxhwx_gemm_12xpackn_int8_dot(output_ncxhwx, kernel_ptr, in_ptr, bias_ptr, m, k, - n, n, output->qinfo->zero_point, multiplier, - shift); -#else - shl_rvv_reorder_input_z4_packn_int8(input_data, pb_reorder, k, n, n); - shl_rvv_ncxhwx_gemm_4xpack2n_int8(output_ncxhwx, kernel_ptr, in_ptr, bias_ptr, m, k, n, - n, output->qinfo->zero_point, multiplier, shift); -#endif // SHL_USE_DOT_INT8 + reorder_input(input_data, pb_reorder, k, n, n); + gemm(output_ncxhwx, kernel_ptr, in_ptr, bias_ptr, m, k, n, output->qinfo->zero_point, + multiplier, shift); + shl_rvv_reorder_input_packnto1_int8(output_ncxhwx, output_data, m, out_h, out_w); input_data += k * n; @@ -94,3 +91,18 @@ int shl_rvv_conv1x1s1_gemm_packnto1_int8(struct csinn_tensor *input, struct csin shl_mem_free(output_ncxhwx); return CSINN_TRUE; } + +int shl_rvv_conv1x1s1_gemm_packnto1_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ +#ifdef SHL_USE_DOT_INT8 + return shl_rvv_common_conv1x1_gemm_packnto1_int8(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_packn_int8_dot, + shl_rvv_ncxhwx_gemm_12xpackn_int8_dot); +#else + return shl_rvv_common_conv1x1_gemm_packnto1_int8(input, output, kernel, bias, params, + shl_rvv_reorder_input_z4_packn_int8, + shl_rvv_ncxhwx_gemm_4xpack2n_int8); +#endif // SHL_USE_DOT_INT8 +} diff --git a/source/thead_rvv/int8/convolution_3x3_int8_packn.c b/source/thead_rvv/int8/convolution_3x3_int8_packn.c index becb0c1d..efa5398a 100644 --- a/source/thead_rvv/int8/convolution_3x3_int8_packn.c +++ b/source/thead_rvv/int8/convolution_3x3_int8_packn.c @@ -14,7 +14,10 @@ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
- */#include "shl_c908.h" + */ + +#include "c908/c908.h" + /************************************************************* note: VLEN = 128 *************************************************************/ diff --git a/source/thead_rvv/int8/convolution_gemm_int8.c b/source/thead_rvv/int8/convolution_gemm_int8.c index 0374da20..24a42659 100644 --- a/source/thead_rvv/int8/convolution_gemm_int8.c +++ b/source/thead_rvv/int8/convolution_gemm_int8.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" void shl_rvv_conv_im2col_gemm_reorder_kernel_int8(struct csinn_tensor *kernel, struct csinn_conv2d_params *params) @@ -50,9 +50,12 @@ void shl_rvv_conv_im2col_gemm_reorder_kernel_int8(struct csinn_tensor *kernel, // shl_mem_free(pa_reorder); } -int shl_rvv_conv_im2col_gemm_int8(struct csinn_tensor *input, struct csinn_tensor *output, +int shl_rvv_common_conv_gemm_int8(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, struct csinn_tensor *bias, - struct csinn_conv2d_params *params) + struct csinn_conv2d_params *params, + void (*reorder_input)(int8_t *, int8_t *, int, int, int), + void (*gemm)(int8_t *, const int8_t *, const int8_t *, int32_t *, + int, int, int, int, int32_t, int32_t *, int32_t *)) { if (input->layout == CSINN_LAYOUT_NC1HWC0) { shl_rvv_tensor_nc1xc0_to_ndarray_replace_int8(input); @@ -144,14 +147,14 @@ int shl_rvv_conv_im2col_gemm_int8(struct csinn_tensor *input, struct csinn_tenso #ifdef SHL_USE_DOT_INT8 int8_t *pa = kernel_data + g * m * k4; - shl_rvv_reorder_input_z8_int8_dot(im2col_data, pb, k, n, n); - shl_rvv_gemm_8x8_int8_dot(pc, pa, pb, bias_data + g * m, m, k4, n, n, - output->qinfo->zero_point, multiplier, shift); + reorder_input(im2col_data, pb, k, n, n); + gemm(pc, pa, pb, bias_data + g * m, m, k4, n, n, output->qinfo->zero_point, multiplier, + shift); #else int8_t *pa = kernel_data + g * m * k; - shl_rvv_reorder_input_z16_int8_v128(im2col_data, pb, k, n, n); - 
shl_rvv_gemm_4x16_int8_v128(pc, pa, pb, bias_data + g * m, m, k, n, n, - output->qinfo->zero_point, multiplier, shift); + reorder_input(im2col_data, pb, k, n, n); + gemm(pc, pa, pb, bias_data + g * m, m, k, n, n, output->qinfo->zero_point, multiplier, + shift); #endif // SHL_USE_DOT_INT8 input_data += in_ch / group * in_height * in_width; output_data += m * n; @@ -163,3 +166,18 @@ int shl_rvv_conv_im2col_gemm_int8(struct csinn_tensor *input, struct csinn_tenso shl_mem_free(shift); return CSINN_TRUE; } + +int shl_rvv_conv_im2col_gemm_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ +#ifdef SHL_USE_DOT_INT8 + return shl_rvv_common_conv_gemm_int8(input, output, kernel, bias, params, + shl_rvv_reorder_input_z8_int8_dot, + shl_rvv_gemm_8x8_int8_dot); +#else + return shl_rvv_common_conv_gemm_int8(input, output, kernel, bias, params, + shl_rvv_reorder_input_z16_int8_v128, + shl_rvv_gemm_4x16_int8_v128); +#endif // SHL_USE_DOT_INT8 +} diff --git a/source/thead_rvv/int8/convolution_gemm_int8_pack1ton.c b/source/thead_rvv/int8/convolution_gemm_int8_pack1ton.c index 54679347..e86e22ea 100644 --- a/source/thead_rvv/int8/convolution_gemm_int8_pack1ton.c +++ b/source/thead_rvv/int8/convolution_gemm_int8_pack1ton.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * packn = vlenb / sizeof(int8_t) / 2 @@ -179,9 +179,13 @@ void shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_int8(struct csinn_tensor * #endif // SHL_USE_DOT_INT8 } -int shl_rvv_conv_im2col_gemm_pack1ton_int8(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_tensor *kernel, struct csinn_tensor *bias, - struct csinn_conv2d_params *params) +int shl_rvv_common_conv_gemm_pack1ton_int8( + struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, + struct csinn_tensor *bias, struct csinn_conv2d_params *params, + void (*reorder_input)(int8_t *, int8_t *, int, int, int, int), + void (*gemm)(int8_t *, const int8_t *, const int8_t *, int32_t *, int, int, int, int32_t, + int32_t *, int32_t *)) + { if (input->layout == CSINN_LAYOUT_NC1HWC0) { shl_rvv_tensor_nc1xc0_to_ndarray_replace_int8(input); @@ -286,14 +290,12 @@ int shl_rvv_conv_im2col_gemm_pack1ton_int8(struct csinn_tensor *input, struct cs shl_mem_free(input_pad_buf); // reorder int8_t *reorder_buf = (int8_t *)shl_mem_alloc(in_cp4 * maxk * n * sizeof(int8_t)); - shl_rvv_reorder_input_z12_pack1ton_int8_dot(im2col_buf, reorder_buf, in_cp4, maxk, n, - n); + reorder_input(im2col_buf, reorder_buf, in_cp4, maxk, n, n); shl_mem_free(im2col_buf); int8_t *ker_ptr = kernel_data + g * m * maxk * in_cp4; // gemm - shl_rvv_ncxhwx_gemm_12xpackn_int8_dot(output_data, ker_ptr, reorder_buf, bias_ptr, m, - in_cp4 * maxk, n, n, output->qinfo->zero_point, - multiplier, shift); + gemm(output_data, ker_ptr, reorder_buf, bias_ptr, m, in_cp4 * maxk, n, + output->qinfo->zero_point, multiplier, shift); #else // im2col // [in_c/packn, maxk, out_h, out_w, packn] + [maxk, out_h, out_w, in_c%packn] @@ -304,6 +306,29 @@ int shl_rvv_conv_im2col_gemm_pack1ton_int8(struct csinn_tensor *input, struct cs int8_t *dst_ptr = im2col_buf; int loop_c = in_cp; +#ifdef RVV_1_0_0 + while (loop_c > 
0) { + vl = vsetvl_e8mf2(loop_c); + for (int a = 0; a < ksize_h; a++) { + for (int b = 0; b < ksize_w; b++) { + const int8_t *img1 = + img0 + a * dilation_h * padded_in_w * vl + b * dilation_w * vl; + + for (int p = 0; p < out_h; p++) { + for (int q = 0; q < out_w; q++) { + vint8mf2_t _tmp = vle8_v_i8mf2(img1, vl); + img1 += stride_w * vl; + vse8_v_i8mf2(dst_ptr, _tmp, vl); + dst_ptr += vl; + } + img1 += tailstep * vl; + } + } + } + img0 += padded_in_hw * vl; + loop_c -= vl; + } +#elif defined RVV_0_7_1 while (loop_c > 0) { vl = vsetvl_e8m1(loop_c > packn ? packn : loop_c); for (int a = 0; a < ksize_h; a++) { @@ -325,17 +350,17 @@ int shl_rvv_conv_im2col_gemm_pack1ton_int8(struct csinn_tensor *input, struct cs img0 += padded_in_hw * vl; loop_c -= vl; } +#endif shl_mem_free(input_pad_buf); // reorder(pack) int8_t *reorder_buf = (int8_t *)shl_mem_alloc(in_cp * maxk * n * sizeof(int8_t)); - shl_rvv_reorder_input_z4_pack1ton_int8(im2col_buf, reorder_buf, in_cp, maxk, n, n); + reorder_input(im2col_buf, reorder_buf, in_cp, maxk, n, n); shl_mem_free(im2col_buf); int8_t *ker_ptr = kernel_data + g * m * maxk * in_cp; // gemm - shl_rvv_ncxhwx_gemm_4xpack2n_int8(output_data, ker_ptr, reorder_buf, bias_ptr, m, - in_cp * maxk, n, n, output->qinfo->zero_point, - multiplier, shift); + gemm(output_data, ker_ptr, reorder_buf, bias_ptr, m, in_cp * maxk, n, + output->qinfo->zero_point, multiplier, shift); #endif // SHL_USE_DOT_INT8 shl_mem_free(reorder_buf); @@ -347,3 +372,18 @@ int shl_rvv_conv_im2col_gemm_pack1ton_int8(struct csinn_tensor *input, struct cs shl_mem_free(shift); return CSINN_TRUE; } + +int shl_rvv_conv_im2col_gemm_pack1ton_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ +#ifdef SHL_USE_DOT_INT8 + return shl_rvv_common_conv_gemm_pack1ton_int8(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_pack1ton_int8_dot, + 
shl_rvv_ncxhwx_gemm_12xpackn_int8_dot); +#else + return shl_rvv_common_conv_gemm_pack1ton_int8(input, output, kernel, bias, params, + shl_rvv_reorder_input_z4_pack1ton_int8, + shl_rvv_ncxhwx_gemm_4xpack2n_int8); +#endif // SHL_USE_DOT_INT8 +} diff --git a/source/thead_rvv/int8/convolution_gemm_int8_packn.c b/source/thead_rvv/int8/convolution_gemm_int8_packn.c index 9fd75c44..fbdba88f 100644 --- a/source/thead_rvv/int8/convolution_gemm_int8_packn.c +++ b/source/thead_rvv/int8/convolution_gemm_int8_packn.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * packn = vlenb / sizeof(int8_t) / 2 @@ -118,9 +118,13 @@ void shl_rvv_conv_im2col_gemm_reorder_kernel_packn_int8(struct csinn_tensor *ker // shl_mem_free(pa_reorder); } -int shl_rvv_conv_im2col_gemm_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, +int shl_rvv_common_conv_gemm_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, struct csinn_tensor *bias, - struct csinn_conv2d_params *params) + struct csinn_conv2d_params *params, + void (*reorder_input)(int8_t *, int8_t *, int, int, int), + void (*gemm)(int8_t *, const int8_t *, const int8_t *, + int32_t *, int, int, int, int32_t, int32_t *, + int32_t *)) { if (input->layout == CSINN_LAYOUT_NCHW) { shl_rvv_tensor_ndarray_to_nc1xc0_replace_int8(input); @@ -174,7 +178,11 @@ int shl_rvv_conv_im2col_gemm_packn_int8(struct csinn_tensor *input, struct csinn // im2col const int packn = csrr_vlenb() / sizeof(int8_t) / 2; +#ifdef RVV_1_0_0 + const int vl = vsetvl_e8mf2(packn); +#elif defined RVV_0_7_1 const int vl = vsetvl_e8m1(packn); +#endif // [in_c/packn, maxk, out_h, out_w, packn] int8_t *im2col_buf = (int8_t *)shl_mem_alloc(in_cp / packn * maxk * out_h * out_w * @@ -192,10 +200,17 @@ int shl_rvv_conv_im2col_gemm_packn_int8(struct csinn_tensor *input, struct csinn for (int p = 0; p < out_h; 
p++) { for (int q = 0; q < out_w; q++) { +#ifdef RVV_1_0_0 + vint8mf2_t _tmp = vle8_v_i8mf2(img1, vl); + img1 += stride_w * packn; + vse8_v_i8mf2(dst_ptr, _tmp, vl); + dst_ptr += packn; +#elif defined RVV_0_7_1 vint8m1_t _tmp = vle8_v_i8m1(img1, vl); img1 += stride_w * packn; vse8_v_i8m1(dst_ptr, _tmp, vl); dst_ptr += packn; +#endif } img1 += tailstep; } @@ -221,19 +236,11 @@ int shl_rvv_conv_im2col_gemm_packn_int8(struct csinn_tensor *input, struct csinn // reorder(pack) int8_t *reorder_buf = (int8_t *)shl_mem_alloc(in_cp * maxk * n * sizeof(int8_t)); -#ifdef SHL_USE_DOT_INT8 - shl_rvv_reorder_input_z12_packn_int8_dot(im2col_buf, reorder_buf, in_cp * maxk, n, n); - shl_mem_free(im2col_buf); - shl_rvv_ncxhwx_gemm_12xpackn_int8_dot(output_data, ker_ptr, reorder_buf, bias_ptr, m, - in_cp * maxk, n, n, output->qinfo->zero_point, - multiplier, shift); -#else - shl_rvv_reorder_input_z4_packn_int8(im2col_buf, reorder_buf, in_cp * maxk, n, n); + + reorder_input(im2col_buf, reorder_buf, in_cp * maxk, n, n); shl_mem_free(im2col_buf); - shl_rvv_ncxhwx_gemm_4xpack2n_int8(output_data, ker_ptr, reorder_buf, bias_ptr, m, - in_cp * maxk, n, n, output->qinfo->zero_point, - multiplier, shift); -#endif // SHL_USE_DOT_INT8 + gemm(output_data, ker_ptr, reorder_buf, bias_ptr, m, in_cp * maxk, n, + output->qinfo->zero_point, multiplier, shift); shl_mem_free(reorder_buf); input_data += in_cp * in_h * in_w; @@ -244,3 +251,18 @@ int shl_rvv_conv_im2col_gemm_packn_int8(struct csinn_tensor *input, struct csinn shl_mem_free(shift); return CSINN_TRUE; } + +int shl_rvv_conv_im2col_gemm_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ +#ifdef SHL_USE_DOT_INT8 + return shl_rvv_common_conv_gemm_packn_int8(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_packn_int8_dot, + shl_rvv_ncxhwx_gemm_12xpackn_int8_dot); +#else + return 
shl_rvv_common_conv_gemm_packn_int8(input, output, kernel, bias, params, + shl_rvv_reorder_input_z4_packn_int8, + shl_rvv_ncxhwx_gemm_4xpack2n_int8); +#endif // SHL_USE_DOT_INT8 +} diff --git a/source/thead_rvv/int8/convolution_gemm_int8_packnto1.c b/source/thead_rvv/int8/convolution_gemm_int8_packnto1.c index 8f15ccd1..cb228d66 100644 --- a/source/thead_rvv/int8/convolution_gemm_int8_packnto1.c +++ b/source/thead_rvv/int8/convolution_gemm_int8_packnto1.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * packn = vlenb / sizeof(int8_t) / 2 @@ -160,9 +160,13 @@ void shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_int8(struct csinn_tensor * // shl_mem_free(pa_reorder); } -int shl_rvv_conv_im2col_gemm_packnto1_int8(struct csinn_tensor *input, struct csinn_tensor *output, +int shl_rvv_common_conv_gemm_packnto1_int8(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, struct csinn_tensor *bias, - struct csinn_conv2d_params *params) + struct csinn_conv2d_params *params, + void (*reorder_input)(int8_t *, int8_t *, int, int, int), + void (*gemm)(int8_t *, const int8_t *, const int8_t *, + int32_t *, int, int, int, int32_t, + int32_t *, int32_t *)) { if (input->layout == CSINN_LAYOUT_NCHW) { shl_rvv_tensor_ndarray_to_nc1xc0_replace_int8(input); @@ -210,7 +214,11 @@ int shl_rvv_conv_im2col_gemm_packnto1_int8(struct csinn_tensor *input, struct cs // im2col const int packn = csrr_vlenb() / sizeof(int8_t) / 2; +#ifdef RVV_1_0_0 + const int vl = vsetvl_e8mf2(packn); +#elif defined RVV_0_7_1 const int vl = vsetvl_e8m1(packn); +#endif // [in_c/packn, maxk, out_h, out_w, packn] int8_t *im2col_buf = (int8_t *)shl_mem_alloc(in_cp / packn * maxk * out_h * out_w * @@ -228,10 +236,17 @@ int shl_rvv_conv_im2col_gemm_packnto1_int8(struct csinn_tensor *input, struct cs for (int p = 0; p < out_h; p++) { for (int q = 0; q < out_w; q++) { 
+#ifdef RVV_1_0_0 + vint8mf2_t _tmp = vle8_v_i8mf2(img1, vl); + img1 += stride_w * packn; + vse8_v_i8mf2(dst_ptr, _tmp, vl); + dst_ptr += packn; +#elif defined RVV_0_7_1 vint8m1_t _tmp = vle8_v_i8m1(img1, vl); img1 += stride_w * packn; vse8_v_i8m1(dst_ptr, _tmp, vl); dst_ptr += packn; +#endif } img1 += tailstep; } @@ -257,19 +272,11 @@ int shl_rvv_conv_im2col_gemm_packnto1_int8(struct csinn_tensor *input, struct cs int8_t *reorder_buf = (int8_t *)shl_mem_alloc(in_cp * maxk * n * sizeof(int8_t)); -#ifdef SHL_USE_DOT_INT8 - shl_rvv_reorder_input_z12_packn_int8_dot(im2col_buf, reorder_buf, in_cp * maxk, n, n); + reorder_input(im2col_buf, reorder_buf, in_cp * maxk, n, n); shl_mem_free(im2col_buf); - shl_rvv_ncxhwx_gemm_12xpackn_int8_dot(output_ncxhwx, ker_ptr, reorder_buf, bias_ptr, m, - in_cp * maxk, n, n, output->qinfo->zero_point, - multiplier, shift); -#else - shl_rvv_reorder_input_z4_packn_int8(im2col_buf, reorder_buf, in_cp * maxk, n, n); - shl_mem_free(im2col_buf); - shl_rvv_ncxhwx_gemm_4xpack2n_int8(output_ncxhwx, ker_ptr, reorder_buf, bias_ptr, m, - in_cp * maxk, n, n, output->qinfo->zero_point, - multiplier, shift); -#endif // SHL_USE_DOT_INT8 + gemm(output_ncxhwx, ker_ptr, reorder_buf, bias_ptr, m, in_cp * maxk, n, + output->qinfo->zero_point, multiplier, shift); + shl_rvv_reorder_input_packnto1_int8(output_ncxhwx, output_data, m, out_h, out_w); shl_mem_free(reorder_buf); @@ -282,3 +289,18 @@ int shl_rvv_conv_im2col_gemm_packnto1_int8(struct csinn_tensor *input, struct cs shl_mem_free(output_ncxhwx); return CSINN_TRUE; } + +int shl_rvv_conv_im2col_gemm_packnto1_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ +#ifdef SHL_USE_DOT_INT8 + return shl_rvv_common_conv_gemm_packnto1_int8(input, output, kernel, bias, params, + shl_rvv_reorder_input_z12_packn_int8_dot, + shl_rvv_ncxhwx_gemm_12xpackn_int8_dot); +#else + return 
shl_rvv_common_conv_gemm_packnto1_int8(input, output, kernel, bias, params, + shl_rvv_reorder_input_z4_packn_int8, + shl_rvv_ncxhwx_gemm_4xpack2n_int8); +#endif // SHL_USE_DOT_INT8 +} diff --git a/source/thead_rvv/int8/depthwise_convolution.c b/source/thead_rvv/int8/depthwise_convolution.c index 4b157ff1..53a6ca30 100644 --- a/source/thead_rvv/int8/depthwise_convolution.c +++ b/source/thead_rvv/int8/depthwise_convolution.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" int shl_rvv_depthwise_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, struct csinn_tensor *bias, @@ -50,6 +50,9 @@ int shl_rvv_depthwise_conv2d_init_int8(struct csinn_tensor *input, struct csinn_ in_elempack = 1; out_elempack = 1; // dwconv2d out_channel pack is same as in_channel } + } else if (sess->base_run_mode == CSINN_RM_LAYER) { + in_elempack = in_c % packn == 0 ? packn : 1; + out_elempack = out_c % packn == 0 ? packn : 1; } // enable fuse zeropoint to bias diff --git a/source/thead_rvv/int8/depthwise_convolution1d_int8.c b/source/thead_rvv/int8/depthwise_convolution1d_int8.c index b58a4317..df6e95b2 100644 --- a/source/thead_rvv/int8/depthwise_convolution1d_int8.c +++ b/source/thead_rvv/int8/depthwise_convolution1d_int8.c @@ -16,12 +16,16 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" int shl_rvv_dwconv1d_int8(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, struct csinn_tensor *bias, struct csinn_conv1d_params *params) { + if (input->layout == CSINN_LAYOUT_NC1WC0) { + shl_rvv_tensor_nc1xc0_to_ndarray_replace_int8(input); + } + int8_t *input_data = (int8_t *)input->data; int8_t *output_data = (int8_t *)output->data; int8_t *kernel_data = (int8_t *)kernel->data; diff --git a/source/thead_rvv/int8/depthwise_convolution_3x3_int8.c b/source/thead_rvv/int8/depthwise_convolution_3x3_int8.c index bc2404b5..dc0fb82c 100644 --- a/source/thead_rvv/int8/depthwise_convolution_3x3_int8.c +++ b/source/thead_rvv/int8/depthwise_convolution_3x3_int8.c @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" static vint8m1_t requantize_m4(vint32m4_t _src, int32_t multiplier, int32_t shift, int32_t out_zp, int vl) diff --git a/source/thead_rvv/int8/depthwise_convolution_3x3_int8_packn.c b/source/thead_rvv/int8/depthwise_convolution_3x3_int8_packn.c index d9deb70f..3f0f16f7 100644 --- a/source/thead_rvv/int8/depthwise_convolution_3x3_int8_packn.c +++ b/source/thead_rvv/int8/depthwise_convolution_3x3_int8_packn.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" #ifdef RVV_1_0_0 static vint8mf2_t requantize_m2_s(vint32m2_t _src, vint32m2_t _multiplier, vint32m2_t _shift, diff --git a/source/thead_rvv/int8/depthwise_convolution_int8_nhwc.c b/source/thead_rvv/int8/depthwise_convolution_int8_nhwc.c index c51bf5b2..ce4713c1 100644 --- a/source/thead_rvv/int8/depthwise_convolution_int8_nhwc.c +++ b/source/thead_rvv/int8/depthwise_convolution_int8_nhwc.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * note: VLEN = 128/256 ... flexible vlen diff --git a/source/thead_rvv/int8/depthwise_convolution_int8_packn.c b/source/thead_rvv/int8/depthwise_convolution_int8_packn.c index 43168feb..cdf5ed05 100644 --- a/source/thead_rvv/int8/depthwise_convolution_int8_packn.c +++ b/source/thead_rvv/int8/depthwise_convolution_int8_packn.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * note: VLEN = 128/256 ... flexible vlen diff --git a/source/thead_rvv/int8/div.c b/source/thead_rvv/int8/div.c new file mode 100644 index 00000000..f24798bd --- /dev/null +++ b/source/thead_rvv/int8/div.c @@ -0,0 +1,109 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "rvv/rvv.h" + +static inline void div_vv_i8_trans_f16(int8_t *in0, int8_t *in1, int8_t *out, int32_t size, + float *scale, int32_t *zero_point) +{ + int32_t z0 = zero_point[0]; + int32_t z1 = zero_point[1]; + int32_t z2 = zero_point[2]; + float real_scale = scale[0] / scale[1] / scale[2]; + + while (size > 0) { + int vl = vsetvl_e8m1(size); + vint8m1_t _a = vle8_v_i8m1(in0, vl); + vint8m1_t _b = vle8_v_i8m1(in1, vl); + vint16m2_t _a_w = vwsub_vx_i16m2(_a, z0, vl); + vint16m2_t _b_w = vwsub_vx_i16m2(_b, z1, vl); + vfloat16m2_t _a_f = vfcvt_f_x_v_f16m2(_a_w, vl); + vfloat16m2_t _b_f = vfcvt_f_x_v_f16m2(_b_w, vl); + vfloat16m2_t _divf = vfdiv_vv_f16m2(_a_f, _b_f, vl); + _divf = vfmul_vf_f16m2(_divf, real_scale, vl); + vint16m2_t _res = vfcvt_x_f_v_i16m2(_divf, vl); + _res = vadd_vx_i16m2(_res, z2, vl); + vse8_v_i8m1(out, vnclip_wx_i8m1(_res, 0, vl), vl); + in0 += vl; + in1 += vl; + out += vl; + size -= vl; + } +} + +static inline void div_vx_i8_trans_f16(int8_t *in0, int8_t *in1, int8_t *out, int32_t size, + float *scale, int32_t *zero_point) +{ + int32_t z0 = zero_point[0]; + int32_t z1 = zero_point[1]; + int32_t z2 = zero_point[2]; + float real_scale = scale[0] / scale[1] / scale[2]; + float b_f = in1[0] - z1; + + while (size > 0) { + int vl = vsetvl_e8m1(size); + vint8m1_t _a = vle8_v_i8m1(in0, vl); + vint16m2_t _a_w = vwsub_vx_i16m2(_a, z0, vl); + vfloat16m2_t _a_f = vfcvt_f_x_v_f16m2(_a_w, vl); + vfloat16m2_t _divf = vfdiv_vf_f16m2(_a_f, b_f, vl); + _divf = vfmul_vf_f16m2(_divf, real_scale, vl); + vint16m2_t _res = vfcvt_x_f_v_i16m2(_divf, vl); + _res = vadd_vx_i16m2(_res, z2, vl); + vse8_v_i8m1(out, vnclip_wx_i8m1(_res, 0, vl), vl); + in0 += vl; + out += vl; + size -= vl; + } +} + +static inline void div_xv_i8_trans_f16(int8_t *in0, int8_t *in1, int8_t *out, int32_t size, + float *scale, int32_t *zero_point) +{ + int32_t z0 = zero_point[0]; + int32_t z1 = zero_point[1]; + int32_t z2 = zero_point[2]; + float real_scale = scale[0] / scale[1] / 
scale[2]; + float a_f = in0[0] - z0; + + while (size > 0) { + int vl = vsetvl_e8m1(size); + vint8m1_t _b = vle8_v_i8m1(in1, vl); + vint16m2_t _b_w = vwsub_vx_i16m2(_b, z1, vl); + vfloat16m2_t _b_f = vfcvt_f_x_v_f16m2(_b_w, vl); + vfloat16m2_t _divf = vfrdiv_vf_f16m2(_b_f, a_f, vl); + _divf = vfmul_vf_f16m2(_divf, real_scale, vl); + vint16m2_t _res = vfcvt_x_f_v_i16m2(_divf, vl); + _res = vadd_vx_i16m2(_res, z2, vl); + vse8_v_i8m1(out, vnclip_wx_i8m1(_res, 0, vl), vl); + in1 += vl; + out += vl; + size -= vl; + } +} + +void *div_cb_int8[] = { + [CSINN_BROADCAST_VV] = div_vv_i8_trans_f16, + [CSINN_BROADCAST_VS] = div_vx_i8_trans_f16, + [CSINN_BROADCAST_SV] = div_xv_i8_trans_f16, +}; + +int shl_rvv_div_int8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) +{ + return shl_rvv_binary_op_broadcast_int8(input0, input1, output, div_cb_int8); +} diff --git a/source/thead_rvv/int8/erf.c b/source/thead_rvv/int8/erf.c new file mode 100644 index 00000000..d8653b99 --- /dev/null +++ b/source/thead_rvv/int8/erf.c @@ -0,0 +1,25 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "rvv/rvv.h" + +int shl_rvv_erf_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) +{ + return shl_rvv_siso_callback_base(input, output, params, shl_rvv_erf_fp32); +} diff --git a/source/thead_rvv/int8/fullyconnected.c b/source/thead_rvv/int8/fullyconnected.c index a02bffd2..927048b6 100644 --- a/source/thead_rvv/int8/fullyconnected.c +++ b/source/thead_rvv/int8/fullyconnected.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" int shl_rvv_fullyconnected_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *weights, struct csinn_tensor *bias, diff --git a/source/thead_rvv/int8/fullyconnected_int8.c b/source/thead_rvv/int8/fullyconnected_int8.c index ec2522c8..f8bf1380 100644 --- a/source/thead_rvv/int8/fullyconnected_int8.c +++ b/source/thead_rvv/int8/fullyconnected_int8.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* note: VLEN = 128/256 @@ -129,6 +129,10 @@ int shl_rvv_fullyconnected_packn_int8(struct csinn_tensor *input, struct csinn_t struct csinn_tensor *weights, struct csinn_tensor *bias, struct csinn_fc_params *params) { + if (input->layout >= CSINN_LAYOUT_NC1C0 && input->layout <= CSINN_LAYOUT_NC1DHWC0) { + shl_rvv_tensor_nc1xc0_to_ndarray_replace_int8(input); + } + int8_t *input_data = (int8_t *)input->data; int8_t *output_data = (int8_t *)output->data; int8_t *weights_data = (int8_t *)weights->data; diff --git a/source/thead_rvv/int8/gather.c b/source/thead_rvv/int8/gather.c index 9f6038ab..217fa521 100644 --- a/source/thead_rvv/int8/gather.c +++ b/source/thead_rvv/int8/gather.c @@ -16,12 +16,16 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" static int shl_rvv_gather_in_i8_out_f16(struct csinn_tensor *input, struct csinn_tensor *indices, struct csinn_tensor *output, struct csinn_gather_params *params) { + if (input->layout >= CSINN_LAYOUT_NC1C0 && input->layout <= CSINN_LAYOUT_NC1DHWC0) { + shl_rvv_tensor_nc1xc0_to_ndarray_replace_int8(input); + } + int input_size = csinn_tensor_size(input); if (input_size == 0) { return CSINN_TRUE; @@ -72,6 +76,9 @@ int shl_rvv_gather_int8(struct csinn_tensor *input, struct csinn_tensor *indices if (input->dtype == CSINN_DTYPE_INT8 && output->dtype == CSINN_DTYPE_FLOAT16) { return shl_rvv_gather_in_i8_out_f16(input, indices, output, params); } else if (input->dtype == CSINN_DTYPE_INT8 && output->dtype == CSINN_DTYPE_INT8) { + if (input->layout >= CSINN_LAYOUT_NC1C0 && input->layout <= CSINN_LAYOUT_NC1DHWC0) { + shl_rvv_tensor_nc1xc0_to_ndarray_replace_int8(input); + } return shl_ref_gather_int8(input, indices, output, params); } else { return shl_ref_gather_quant(input, indices, output, params); diff --git a/source/thead_rvv/int8/gemm_int8.c b/source/thead_rvv/int8/gemm_int8.c index 34dfe70f..47664a78 100644 --- a/source/thead_rvv/int8/gemm_int8.c +++ b/source/thead_rvv/int8/gemm_int8.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" static vint8m1_t requantize_m4(vint32m4_t _src, int32_t multiplier, int32_t shift, int32_t out_zp, int vl) diff --git a/source/thead_rvv/int8/gemm_int8_4xn.c b/source/thead_rvv/int8/gemm_int8_4xn.c index 05ec4fd5..bd396fec 100644 --- a/source/thead_rvv/int8/gemm_int8_4xn.c +++ b/source/thead_rvv/int8/gemm_int8_4xn.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" static vint8m1_t requantize_4xn(vint32m4_t _src, int32_t *mult, int32_t *shift, int32_t out_zp, int vl) { diff --git a/source/thead_rvv/int8/gemm_int8_dot.c b/source/thead_rvv/int8/gemm_int8_dot.c index f02777b8..9c31af9b 100644 --- a/source/thead_rvv/int8/gemm_int8_dot.c +++ b/source/thead_rvv/int8/gemm_int8_dot.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" #ifdef SHL_USE_DOT_INT8 static vint8mf2_t requantize_m2(vint32m2_t _src, int32_t multiplier, int32_t shift, int32_t out_zp, int vl) diff --git a/source/thead_rvv/int8/gemm_int8_dot_packn.c b/source/thead_rvv/int8/gemm_int8_dot_packn.c index f1339804..9d5c745d 100644 --- a/source/thead_rvv/int8/gemm_int8_dot_packn.c +++ b/source/thead_rvv/int8/gemm_int8_dot_packn.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" #ifdef SHL_USE_DOT_INT8 /************************************************************* * note: VLEN = 128/256 ... 
flexible vlen @@ -40,9 +40,10 @@ static vint8mf2_t requantize_m2_s(vint32m2_t _src, vint32m2_t _multiplier, vint3 * sa - kernel: [m/packn, k, packn] * sb - input: [n/12, k, 12] **************************************************************/ +// XXX: unsupported fuse relu void shl_rvv_ncxhwx_gemm_12xpackn_int8_dot(int8_t *dst, const int8_t *sa, const int8_t *sb, - int32_t *bias, int m, int k, int n, int ldc, - int32_t out_zp, int32_t *mult, int32_t *shift) + int32_t *bias, int m, int k, int n, int32_t out_zp, + int32_t *mult, int32_t *shift) { int8_t *kernel_data = (int8_t *)sa; int8_t *input_data = (int8_t *)sb; @@ -422,9 +423,10 @@ void shl_rvv_ncxhwx_gemm_12xpackn_int8_dot(int8_t *dst, const int8_t *sa, const * sa - kernel: [m/packn, k, packn] * sb - input: [n/8, k, 8] **************************************************************/ +// XXX: unsupported fuse relu void shl_rvv_ncxhwx_gemm_8xpackn_int8_dot(int8_t *dst, const int8_t *sa, const int8_t *sb, - int32_t *bias, int m, int k, int n, int ldc, - int32_t out_zp, int32_t *mult, int32_t *shift) + int32_t *bias, int m, int k, int n, int32_t out_zp, + int32_t *mult, int32_t *shift) { int8_t *kernel_data = (int8_t *)sa; int8_t *input_data = (int8_t *)sb; diff --git a/source/thead_rvv/int8/gemm_int8_packn.c b/source/thead_rvv/int8/gemm_int8_packn.c index 2568dfca..169962d0 100644 --- a/source/thead_rvv/int8/gemm_int8_packn.c +++ b/source/thead_rvv/int8/gemm_int8_packn.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * note: VLEN = 128/256 ... 
flexible vlen @@ -40,8 +40,9 @@ static vint8m1_t requantize_m4_s(vint32m4_t _src, vint32m4_t _multiplier, vint32 * sa - kernel: [m/packn, k, packn] * sb - input: [n/4, k, 4] **************************************************************/ +// XXX: unsupported fuse relu void shl_rvv_ncxhwx_gemm_4xpack2n_int8(int8_t *dst, const int8_t *sa, const int8_t *sb, - int32_t *bias, int m, int k, int n, int ldc, int32_t out_zp, + int32_t *bias, int m, int k, int n, int32_t out_zp, int32_t *mult, int32_t *shift) { int8_t *kernel_data = (int8_t *)sa; diff --git a/source/thead_rvv/int8/global_avgpool_nhwc.c b/source/thead_rvv/int8/global_avgpool_nhwc.c index b4060458..695b1490 100644 --- a/source/thead_rvv/int8/global_avgpool_nhwc.c +++ b/source/thead_rvv/int8/global_avgpool_nhwc.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* note: VLEN = 128/256 diff --git a/source/thead_rvv/int8/global_avgpool_packn.c b/source/thead_rvv/int8/global_avgpool_packn.c index 821a0e71..96a4514c 100644 --- a/source/thead_rvv/int8/global_avgpool_packn.c +++ b/source/thead_rvv/int8/global_avgpool_packn.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * note: VLEN = 128/256 ... flexible vlen diff --git a/source/thead_rvv/int8/global_maxpool_nhwc.c b/source/thead_rvv/int8/global_maxpool_nhwc.c index e31fc076..b09ebe86 100644 --- a/source/thead_rvv/int8/global_maxpool_nhwc.c +++ b/source/thead_rvv/int8/global_maxpool_nhwc.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* note: VLEN = 128/256 diff --git a/source/thead_rvv/int8/global_maxpool_packn.c b/source/thead_rvv/int8/global_maxpool_packn.c index 454fa2a9..37fe467b 100644 --- a/source/thead_rvv/int8/global_maxpool_packn.c +++ b/source/thead_rvv/int8/global_maxpool_packn.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * note: VLEN = 128/256 ... flexible vlen diff --git a/source/thead_rvv/int8/layer_norm.c b/source/thead_rvv/int8/layer_norm.c index bce25848..e5c5942a 100644 --- a/source/thead_rvv/int8/layer_norm.c +++ b/source/thead_rvv/int8/layer_norm.c @@ -16,16 +16,16 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* note: support flexible vlen *************************************************************/ // FIXME: precision loss -int shl_rvv_layer_norm_int8(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_tensor *gamma, struct csinn_tensor *beta, - struct csinn_layer_norm_params *params) +int layer_norm_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *gamma, struct csinn_tensor *beta, + struct csinn_layer_norm_params *params) { if (params->center == false || params->scale == false) { shl_debug_error("Layer norm only support center & scale == true\n"); @@ -163,3 +163,53 @@ int shl_rvv_layer_norm_int8(struct csinn_tensor *input, struct csinn_tensor *out return CSINN_TRUE; } + +int shl_rvv_layer_norm_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *gamma, struct csinn_tensor *beta, + struct csinn_layer_norm_params *params) +{ + struct csinn_tensor *float_input = shl_rvv_tensor_transform_f32(input); + struct csinn_tensor *float_output = 
shl_rvv_tensor_transform_f32(output); + struct csinn_tensor *float_gamma = shl_rvv_tensor_transform_f32(gamma); + struct csinn_tensor *float_beta = shl_rvv_tensor_transform_f32(beta); + if (float_input == NULL) { + shl_debug_warning( + "shl_rvv_tensor_transform_f32 is not optimized to achieve under this condition on RVV, " + "call reference func replaced.\n"); + float_input = shl_ref_tensor_transform_f32(input); + } + if (float_output == NULL) { + shl_debug_warning( + "shl_rvv_tensor_transform_f32 is not optimized to achieve under this condition on RVV, " + "call reference func replaced.\n"); + float_output = shl_ref_tensor_transform_f32(output); + } + if (float_gamma == NULL) { + shl_debug_warning( + "shl_rvv_tensor_transform_f32 is not optimized to achieve under this condition on RVV, " + "call reference func replaced.\n"); + float_gamma = shl_ref_tensor_transform_f32(gamma); + } + if (float_beta == NULL) { + shl_debug_warning( + "shl_rvv_tensor_transform_f32 is not optimized to achieve under this condition on RVV, " + "call reference func replaced.\n"); + float_beta = shl_ref_tensor_transform_f32(beta); + } + + int ret = shl_rvv_layer_norm_fp32(float_input, float_output, float_gamma, float_beta, params); + + if (shl_rvv_tensor_data_convert(float_output, output) != CSINN_TRUE) { + shl_debug_warning( + "shl_rvv_tensor_data_convert is not optimized to achieve under this condition on RVV, " + "call reference func replaced.\n"); + csinn_tensor_data_convert(output, float_output); + } + + shl_ref_tensor_transform_free_f32(float_input); + shl_ref_tensor_transform_free_f32(float_output); + shl_ref_tensor_transform_free_f32(float_gamma); + shl_ref_tensor_transform_free_f32(float_beta); + + return ret; +} diff --git a/source/thead_rvv/int8/leaky_relu.c b/source/thead_rvv/int8/leaky_relu.c index f3720404..61239750 100644 --- a/source/thead_rvv/int8/leaky_relu.c +++ b/source/thead_rvv/int8/leaky_relu.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* note: VLEN = 128/256 ... diff --git a/source/thead_rvv/int8/matmul.c b/source/thead_rvv/int8/matmul.c new file mode 100644 index 00000000..0c94d140 --- /dev/null +++ b/source/thead_rvv/int8/matmul.c @@ -0,0 +1,182 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "rvv/rvv.h" + +int shl_rvv_matmul_common_int8(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, struct csinn_matmul_params *params, + void (*reorder_mat0)(int8_t *, int8_t *, int, int, int), + void (*reorder_mat1)(int8_t *, int8_t *, int, int, int), + void (*matmul)(int8_t *, const int8_t *, const int8_t *, int, int, + int, int, int32_t, int32_t, int32_t, int32_t, + int32_t)) +{ + if (mat0->layout >= CSINN_LAYOUT_NC1C0 && mat0->layout <= CSINN_LAYOUT_NC1DHWC0) { + shl_rvv_tensor_nc1xc0_to_ndarray_replace_int8(mat0); + } + if (mat1->layout >= CSINN_LAYOUT_NC1C0 && mat1->layout <= CSINN_LAYOUT_NC1DHWC0) { + shl_rvv_tensor_nc1xc0_to_ndarray_replace_int8(mat1); + } + + int8_t *mat0_data = (int8_t *)mat0->data; + int8_t *mat1_data = (int8_t *)mat1->data; + int8_t *output_data = (int8_t *)output->data; + + const int dims_count = mat0->dim_count; + int batches_a = 1; + int batches_b = 1; + + /* compute the outer size */ + for (int i = 0; i < dims_count - 2; i++) { + batches_a *= mat0->dim[i]; + } + for (int i = 0; i < mat1->dim_count - 2; i++) { + batches_b *= mat1->dim[i]; + } + + const int dim_m = mat0->dim[dims_count - (params->trans_a ? 1 : 2)]; + const int dim_k = mat0->dim[dims_count - (params->trans_a ? 2 : 1)]; + const int dim_n = mat1->dim[mat1->dim_count - (params->trans_b ? 
2 : 1)]; + + int32_t z1 = mat0->qinfo->zero_point; + int32_t z2 = mat1->qinfo->zero_point; + int32_t z3 = output->qinfo->zero_point; + int32_t multiplier; + int32_t shift; + float real_scale = mat0->qinfo->scale * mat1->qinfo->scale / output->qinfo->scale; + shl_quantize_multiplier(real_scale, &multiplier, &shift); + + if (!params->trans_a && !params->trans_b) { + if (batches_a == batches_b) { + int8_t *in0 = (int8_t *)shl_mem_alloc(dim_m * dim_k * sizeof(int8_t)); + int8_t *in1; + if (!(mat1->is_const)) { + in1 = (int8_t *)shl_mem_alloc(dim_k * dim_n * sizeof(int8_t)); + } + + for (int b = 0; b < batches_a; b++) { + reorder_mat0(mat0_data, in0, dim_m, dim_k, dim_k); + if (!(mat1->is_const)) { + reorder_mat1(mat1_data, in1, dim_k, dim_n, dim_n); + } else { + in1 = mat1_data; + } + + matmul(output_data, in0, in1, dim_m, dim_k, dim_n, dim_n, z1, z2, z3, multiplier, + shift); + + mat0_data += dim_m * dim_k; + mat1_data += dim_k * dim_n; + output_data += dim_m * dim_n; + } + shl_mem_free(in0); + if (!(mat1->is_const)) { + shl_mem_free(in1); + } + } else if (batches_a > 1 && batches_b == 1) { + int8_t *in0 = (int8_t *)shl_mem_alloc(dim_m * dim_k * sizeof(int8_t)); + int8_t *in1; + if (!(mat1->is_const)) { + in1 = (int8_t *)shl_mem_alloc(dim_k * dim_n * sizeof(int8_t)); + reorder_mat1(mat1_data, in1, dim_k, dim_n, dim_n); + } else { + in1 = mat1_data; + } + + for (int b = 0; b < batches_a; b++) { + reorder_mat0(mat0_data, in0, dim_m, dim_k, dim_k); + matmul(output_data, in0, in1, dim_m, dim_k, dim_n, dim_n, z1, z2, z3, multiplier, + shift); + + mat0_data += dim_m * dim_k; + output_data += dim_m * dim_n; + } + shl_mem_free(in0); + if (!(mat1->is_const)) { + shl_mem_free(in1); + } + } else { + shl_debug_error("matmul unsupported this broadcast\n"); + return CSINN_FALSE; + } + } else { + return shl_ref_matmul_quant(mat0, mat1, output, params); + } + + return CSINN_TRUE; +} + +void shl_rvv_matmul_reorder_weight_int8(struct csinn_tensor *mat1) +{ + int8_t *mat1_data = (int8_t 
*)mat1->data; + int dims_count = mat1->dim_count; + int batch = 1; + for (int i = 0; i < dims_count - 2; i++) { + batch *= mat1->dim[i]; + } + const int k = mat1->dim[dims_count - 2]; + const int n = mat1->dim[dims_count - 1]; + int8_t *mat_reorder = (int8_t *)shl_mem_alloc(k * n * sizeof(int8_t)); + + for (int b = 0; b < batch; b++) { + int8_t *init_mat = mat1_data + b * k * n; +#ifdef SHL_USE_DOT_INT8 + shl_rvv_matmul_reorder_mat1_zmf2n4_int8_dot(init_mat, mat_reorder, k, n, n); +#else + shl_rvv_matmul_reorder_mat1_zpackn_int8(init_mat, mat_reorder, k, n, n); +#endif // SHL_USE_DOT_INT8 + memcpy(init_mat, mat_reorder, k * n * sizeof(int8_t)); + } + + shl_mem_free(mat_reorder); +} + +int shl_rvv_matmul_int8(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, struct csinn_matmul_params *params) +{ +#ifdef SHL_USE_DOT_INT8 + return shl_rvv_matmul_common_int8( + mat0, mat1, output, params, shl_rvv_matmul_reorder_mat0_n8z4_int8_dot, + shl_rvv_matmul_reorder_mat1_zmf2n4_int8_dot, shl_rvv_matmul_8xmf2_int8_dot); +#else + return shl_rvv_matmul_common_int8( + mat0, mat1, output, params, shl_rvv_matmul_reorder_mat0_n4_int8, + shl_rvv_matmul_reorder_mat1_zpackn_int8, shl_rvv_matmul_4xpackn_int8); +#endif // SHL_USE_DOT_INT8 +} + +int shl_rvv_matmul_init_int8(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, struct csinn_matmul_params *params) +{ + struct csinn_callback *cb = params->base.cb; + if (!params->trans_a && !params->trans_b) { + if (mat0->dtype == CSINN_DTYPE_INT8 && mat1->dtype == CSINN_DTYPE_INT8) { + if (mat1->is_const) { + shl_rvv_matmul_reorder_weight_int8(mat1); + } + cb->exec = shl_rvv_matmul_int8; + } + } + if (cb->exec == NULL) { + shl_debug_warning( + "matmul is not optimized to achieve under this condition, call reference func " + "replaced.\n"); + cb->exec = shl_ref_matmul_quant; + } + return CSINN_TRUE; +} diff --git a/source/thead_rvv/int8/matmul_int8.c 
b/source/thead_rvv/int8/matmul_int8.c index dbc9cfb6..2d83710c 100644 --- a/source/thead_rvv/int8/matmul_int8.c +++ b/source/thead_rvv/int8/matmul_int8.c @@ -16,183 +16,204 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" -/************************************************************************************ - * [m, k] --> [k, m] - ************************************************************************************/ -static inline void transpose_mat(int8_t *mat, int m, int k) +/************************************************************* + * src: [m, k] + * dst: [m/4, k, 4] + ************************************************************/ +void shl_rvv_matmul_reorder_mat0_n4_int8(int8_t *src, int8_t *dst, int m, int k, int lda) { - int8_t *trans = (int8_t *)shl_mem_alloc(k * m * sizeof(int8_t)); - for (int i = 0; i < k; i++) { - int j = 0; - while (j < m) { - int vl = vsetvl_e8m1(m - j); - vint8m1_t _src = vlse8_v_i8m1(mat + j * k + i, k, vl); - vse8_v_i8m1(trans + i * m + j, _src, vl); - j += vl; + int i = 0; + for (; i + 3 < m; i += 4) { + int8_t *s_ptr = src + i * lda; + int8_t *d_ptr = dst + i * k; + int stride = 4 * sizeof(int8_t); + int c = 0; + while (c < k) { + int vl = vsetvl_e8m4(k - c); + vint8m4_t _s0 = vle8_v_i8m4(s_ptr, vl); + vint8m4_t _s1 = vle8_v_i8m4(s_ptr + lda, vl); + vint8m4_t _s2 = vle8_v_i8m4(s_ptr + lda * 2, vl); + vint8m4_t _s3 = vle8_v_i8m4(s_ptr + lda * 3, vl); + vsse8_v_i8m4(d_ptr, stride, _s0, vl); + vsse8_v_i8m4(d_ptr + 1, stride, _s1, vl); + vsse8_v_i8m4(d_ptr + 2, stride, _s2, vl); + vsse8_v_i8m4(d_ptr + 3, stride, _s3, vl); + s_ptr += vl; + d_ptr += vl * 4; + c += vl; + } + } + for (; i + 1 < m; i += 2) { + int8_t *s_ptr = src + i * lda; + int8_t *d_ptr = dst + i * k; + int stride = 2 * sizeof(int8_t); + int c = 0; + while (c < k) { + int vl = vsetvl_e8m4(k - c); + vint8m4_t _s0 = vle8_v_i8m4(s_ptr, vl); + vint8m4_t _s1 = vle8_v_i8m4(s_ptr + lda, vl); + vsse8_v_i8m4(d_ptr, stride, _s0, vl); + 
vsse8_v_i8m4(d_ptr + 1, stride, _s1, vl); + s_ptr += vl; + d_ptr += vl * 2; + c += vl; + } + } + for (; i < m; i++) { + int8_t *s_ptr = src + i * lda; + int8_t *d_ptr = dst + i * k; + int c = 0; + while (c < k) { + int vl = vsetvl_e8m4(k - c); + vint8m4_t _src = vle8_v_i8m4(s_ptr, vl); + vse8_v_i8m4(d_ptr, _src, vl); + s_ptr += vl; + d_ptr += vl; + c += vl; } } - memcpy(mat, trans, m * k * sizeof(int8_t)); - shl_mem_free(trans); } -/************************************************************************************ - * trans_a = 0 - * trans_b = 0 - * mat0: [dim_i, dim_k] - * mat1: [dim_k, dim_j] - * output: [dim_i, dim_j] - ************************************************************************************/ -static void matmul_int8_axb(int8_t *output, const int8_t *mat0, const int8_t *mat1, int dim_i, - int dim_k, int dim_j, int32_t z1, int32_t z2, int32_t z3, int32_t mult, - int32_t shift) +/************************************************************* + * src: [k, n] + * dst: [n/packn, k, packn] + ************************************************************/ +void shl_rvv_matmul_reorder_mat1_zpackn_int8(int8_t *src, int8_t *dst, int k, int n, int ldb) { - for (int i = 0; i < dim_i; i++) { - int j = 0; - while (j < dim_j) { - int vl = vsetvl_e8m1(dim_j - j); - const int8_t *m0_ptr = mat0; - const int8_t *m1_ptr = mat1 + j; - vint32m4_t _acc = vmv_v_x_i32m4(0, vl); - - for (int k = 0; k < dim_k; k++) { - vint8m1_t _m1 = vle8_v_i8m1(m1_ptr, vl); - vint16m2_t _m1_w = vwsub_vx_i16m2(_m1, z2, vl); - int16_t m0_w = m0_ptr[0] - z1; - vint32m4_t _mul = vwmul_vx_i32m4(_m1_w, m0_w, vl); - _acc = vadd_vv_i32m4(_acc, _mul, vl); - m0_ptr += 1; - m1_ptr += dim_j; - } - - vint32m4_t _mulh = vmulh_vx_i32m4(_acc, mult, vl); - if (shift < 0) { - _mulh = vssra_vx_i32m4(_mulh, -shift - 1, vl); - } else { - _mulh = vsll_vx_i32m4(_mulh, shift + 1, vl); - } - vint32m4_t _res0 = vadd_vx_i32m4(_mulh, z3, vl); - vint16m2_t _res1 = vnclip_wx_i16m2(_res0, 0, vl); - vint8m1_t _res2 = 
vnclip_wx_i8m1(_res1, 0, vl); - vse8_v_i8m1(output, _res2, vl); - output += vl; - j += vl; + int j = 0; + while (j < n) { + int vl = vsetvl_e8m1(n - j); + int8_t *s_ptr = src + j; + for (int c = 0; c < k; c++) { + vint8m1_t _src = vle8_v_i8m1(s_ptr, vl); + vse8_v_i8m1(dst, _src, vl); + s_ptr += ldb; + dst += vl; } - mat0 += dim_k; + j += vl; } } -/************************************************************************************ - * trans_a = 0 - * trans_b = 1 - * mat0: [dim_i, dim_k] - * mat1: [dim_j, dim_k] - * output: [dim_i, dim_j] - ************************************************************************************/ -static void matmul_int8_axtb(int8_t *output, const int8_t *mat0, const int8_t *mat1, int dim_i, - int dim_k, int dim_j, int32_t z1, int32_t z2, int32_t z3, int32_t mult, - int32_t shift) +static vint8m1_t requantize_m4(vint32m4_t _src, int32_t multiplier, int32_t shift, int32_t out_zp, + int vl) +{ + vint32m4_t _mulh = vmulh_vx_i32m4(_src, multiplier, vl); + _mulh = vssra_vx_i32m4(_mulh, -shift - 1, vl); + _mulh = vadd_vx_i32m4(_mulh, out_zp, vl); + vint16m2_t _tmp1 = vnclip_wx_i16m2(_mulh, 0, vl); + vint8m1_t _tmp2 = vnclip_wx_i8m1(_tmp1, 0, vl); + return _tmp2; +} + +/************************************************************* + * packn = vlenb / sizeof(int8_t) + * dst - output: [m, n] + * sa - mat0: [m/4, k, 4] + * sb - mat1: [n/packn, k, packn] + ************************************************************/ +void shl_rvv_matmul_4xpackn_int8(int8_t *dst, const int8_t *sa, const int8_t *sb, int m, int k, + int n, int ldc, int32_t z1, int32_t z2, int32_t z3, int32_t mult, + int32_t shift) { - for (int i = 0; i < dim_i; i++) { - const int8_t *m1_ptr = mat1; + const int8_t *kernel_data = sa; + const int8_t *input_data = sb; + int8_t *output_data = dst; + + const int packn = csrr_vlenb() / sizeof(int8_t); + int vl = vsetvl_e8m1(packn); + + int i = 0; + for (; i + 3 < m; i += 4) { + const int8_t *kernel_ptr = kernel_data + i * k; int j = 0; - 
while (j < dim_j) { - int vl = vsetvl_e8m1(dim_j - j); - const int8_t *m0_ptr = mat0; - vint32m4_t _acc = vmv_v_x_i32m4(0, vl); - - for (int k = 0; k < dim_k; k++) { - vint8m1_t _m1 = vlse8_v_i8m1(m1_ptr + j * dim_k + k, dim_k, vl); - vint16m2_t _m1_w = vwsub_vx_i16m2(_m1, z2, vl); - int16_t m0_w = m0_ptr[0] - z1; - vint32m4_t _mul = vwmul_vx_i32m4(_m1_w, m0_w, vl); - _acc = vadd_vv_i32m4(_acc, _mul, vl); - m0_ptr += 1; - } + while (j < n) { + vl = vsetvl_e8m1(n - j); + const int8_t *k_ptr = kernel_ptr; + const int8_t *in_ptr = input_data + j * k; + int8_t *out_ptr = output_data + i * ldc + j; + + vint32m4_t _acc0 = vmv_v_x_i32m4(0, vl); + vint32m4_t _acc1 = vmv_v_x_i32m4(0, vl); + vint32m4_t _acc2 = vmv_v_x_i32m4(0, vl); + vint32m4_t _acc3 = vmv_v_x_i32m4(0, vl); - vint32m4_t _mulh = vmulh_vx_i32m4(_acc, mult, vl); - if (shift < 0) { - _mulh = vssra_vx_i32m4(_mulh, -shift - 1, vl); - } else { - _mulh = vsll_vx_i32m4(_mulh, shift + 1, vl); + for (int c = 0; c < k; c++) { + vint8m1_t _in = vle8_v_i8m1(in_ptr, vl); + vint16m2_t _in_w = vwsub_vx_i16m2(_in, z2, vl); + in_ptr += vl; + + _acc0 = vwmacc_vx_i32m4(_acc0, k_ptr[0] - z1, _in_w, vl); + _acc1 = vwmacc_vx_i32m4(_acc1, k_ptr[1] - z1, _in_w, vl); + _acc2 = vwmacc_vx_i32m4(_acc2, k_ptr[2] - z1, _in_w, vl); + _acc3 = vwmacc_vx_i32m4(_acc3, k_ptr[3] - z1, _in_w, vl); + k_ptr += 4; } - vint32m4_t _res0 = vadd_vx_i32m4(_mulh, z3, vl); - vint16m2_t _res1 = vnclip_wx_i16m2(_res0, 0, vl); - vint8m1_t _res2 = vnclip_wx_i8m1(_res1, 0, vl); - vse8_v_i8m1(output, _res2, vl); - output += vl; + + vint8m1_t _res0 = requantize_m4(_acc0, mult, shift, z3, vl); + vint8m1_t _res1 = requantize_m4(_acc1, mult, shift, z3, vl); + vint8m1_t _res2 = requantize_m4(_acc2, mult, shift, z3, vl); + vint8m1_t _res3 = requantize_m4(_acc3, mult, shift, z3, vl); + vse8_v_i8m1(out_ptr, _res0, vl); + vse8_v_i8m1(out_ptr + ldc, _res1, vl); + vse8_v_i8m1(out_ptr + ldc * 2, _res2, vl); + vse8_v_i8m1(out_ptr + ldc * 3, _res3, vl); j += vl; } - mat0 += 
dim_k; } -} + for (; i + 1 < m; i += 2) { + const int8_t *kernel_ptr = kernel_data + i * k; + int j = 0; + while (j < n) { + vl = vsetvl_e8m1(n - j); + const int8_t *k_ptr = kernel_ptr; + const int8_t *in_ptr = input_data + j * k; + int8_t *out_ptr = output_data + i * ldc + j; -int shl_rvv_matmul_int8(struct csinn_tensor *mat0, struct csinn_tensor *mat1, - struct csinn_tensor *output, struct csinn_matmul_params *params) -{ - int8_t *mat0_data = mat0->data; - int8_t *mat1_data = mat1->data; - int8_t *output_data = output->data; - const int dims_count = mat0->dim_count; - int batches_a = 1; - int batches_b = 1; - - /* compute the outer size */ - for (int i = 0; i < dims_count - 2; i++) { - batches_a *= mat0->dim[i]; - batches_b *= mat1->dim[i]; - } + vint32m4_t _acc0 = vmv_v_x_i32m4(0, vl); + vint32m4_t _acc1 = vmv_v_x_i32m4(0, vl); + + for (int c = 0; c < k; c++) { + vint8m1_t _in = vle8_v_i8m1(in_ptr, vl); + vint16m2_t _in_w = vwsub_vx_i16m2(_in, z2, vl); + in_ptr += vl; - const int dim_i = mat0->dim[dims_count - (params->trans_a ? 1 : 2)]; - const int dim_k = mat0->dim[dims_count - (params->trans_a ? 2 : 1)]; - const int dim_j = mat1->dim[dims_count - (params->trans_b ? 
2 : 1)]; - - int32_t z1 = mat0->qinfo->zero_point; - int32_t z2 = mat1->qinfo->zero_point; - int32_t z3 = output->qinfo->zero_point; - int32_t multiplier; - int32_t shift; - float real_scale = mat0->qinfo->scale * mat1->qinfo->scale / output->qinfo->scale; - shl_quantize_multiplier(real_scale, &multiplier, &shift); - - if (batches_a == batches_b) { - for (int b = 0; b < batches_a; b++) { - if (!params->trans_a && !params->trans_b) { - matmul_int8_axb(output_data, mat0_data, mat1_data, dim_i, dim_k, dim_j, z1, z2, z3, - multiplier, shift); - } else if (!params->trans_a && params->trans_b) { - matmul_int8_axtb(output_data, mat0_data, mat1_data, dim_i, dim_k, dim_j, z1, z2, z3, - multiplier, shift); - } else if (params->trans_a && !params->trans_b) { - transpose_mat(mat0_data, dim_k, dim_i); - matmul_int8_axb(output_data, mat0_data, mat1_data, dim_i, dim_k, dim_j, z1, z2, z3, - multiplier, shift); - } else { - matmul_int8_axb(output_data, mat1_data, mat0_data, dim_j, dim_k, dim_i, z2, z1, z3, - multiplier, shift); - transpose_mat(output_data, dim_j, dim_i); + _acc0 = vwmacc_vx_i32m4(_acc0, k_ptr[0] - z1, _in_w, vl); + _acc1 = vwmacc_vx_i32m4(_acc1, k_ptr[1] - z1, _in_w, vl); + k_ptr += 2; } - mat0_data += dim_i * dim_k; - mat1_data += dim_k * dim_j; - output_data += dim_i * dim_j; + + vint8m1_t _res0 = requantize_m4(_acc0, mult, shift, z3, vl); + vint8m1_t _res1 = requantize_m4(_acc1, mult, shift, z3, vl); + vse8_v_i8m1(out_ptr, _res0, vl); + vse8_v_i8m1(out_ptr + ldc, _res1, vl); + j += vl; } - } else if (batches_a > 1 && batches_b == 1) { - for (int b = 0; b < batches_a; b++) { - if (!params->trans_a && !params->trans_b) { - matmul_int8_axb(output_data, mat0_data, mat1_data, dim_i, dim_k, dim_j, z1, z2, z3, - multiplier, shift); - } else { - shl_debug_error("matmul unsupport this broadcast\n"); - return CSINN_FALSE; + } + for (; i < m; i++) { + const int8_t *kernel_ptr = kernel_data + i * k; + int j = 0; + while (j < n) { + vl = vsetvl_e8m1(n - j); + const int8_t 
*k_ptr = kernel_ptr; + const int8_t *in_ptr = input_data + j * k; + int8_t *out_ptr = output_data + i * ldc + j; + + vint32m4_t _acc0 = vmv_v_x_i32m4(0, vl); + + for (int c = 0; c < k; c++) { + vint8m1_t _in = vle8_v_i8m1(in_ptr, vl); + vint16m2_t _in_w = vwsub_vx_i16m2(_in, z2, vl); + in_ptr += vl; + + _acc0 = vwmacc_vx_i32m4(_acc0, k_ptr[0] - z1, _in_w, vl); + k_ptr += 1; } - mat0_data += dim_i * dim_k; - output_data += dim_i * dim_j; + + vint8m1_t _res0 = requantize_m4(_acc0, mult, shift, z3, vl); + vse8_v_i8m1(out_ptr, _res0, vl); + j += vl; } - } else { - shl_debug_error("matmul unsupport this broadcast\n"); - return CSINN_FALSE; } - - return CSINN_TRUE; } diff --git a/source/thead_rvv/int8/matmul_int8_dot.c b/source/thead_rvv/int8/matmul_int8_dot.c new file mode 100644 index 00000000..6fe83c89 --- /dev/null +++ b/source/thead_rvv/int8/matmul_int8_dot.c @@ -0,0 +1,466 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "rvv/rvv.h" + +#ifdef SHL_USE_DOT_INT8 +/************************************************************* + * src: [m, k] + * dst: + * k % 4 == 0: [m/8, k/4, 8, 4] + * k_tail : [m/8, k_tail, 8] + ************************************************************/ +void shl_rvv_matmul_reorder_mat0_n8z4_int8_dot(int8_t *src, int8_t *dst, int m, int k, int lda) +{ + int i = 0; + for (; i + 7 < m; i += 8) { + int j = 0; + for (; j + 3 < k; j += 4) { + int8_t *s_ptr = src + j; + for (int c = 0; c < 8; c++) { + vint8m1_t _src = vle8_v_i8m1(s_ptr, 4); + s_ptr += lda; + vse8_v_i8m1(dst, _src, 4); + dst += 4; + } + } + // k_tail + if (j < k) { + int8_t *s_ptr = src + j; + int stride = 8 * sizeof(int8_t); + int vl = vsetvl_e8m1(k - j); + vint8m1_t _s0 = vle8_v_i8m1(s_ptr, vl); + vint8m1_t _s1 = vle8_v_i8m1(s_ptr + lda, vl); + vint8m1_t _s2 = vle8_v_i8m1(s_ptr + lda * 2, vl); + vint8m1_t _s3 = vle8_v_i8m1(s_ptr + lda * 3, vl); + vint8m1_t _s4 = vle8_v_i8m1(s_ptr + lda * 4, vl); + vint8m1_t _s5 = vle8_v_i8m1(s_ptr + lda * 5, vl); + vint8m1_t _s6 = vle8_v_i8m1(s_ptr + lda * 6, vl); + vint8m1_t _s7 = vle8_v_i8m1(s_ptr + lda * 7, vl); + vsse8_v_i8m1(dst, stride, _s0, vl); + vsse8_v_i8m1(dst + 1, stride, _s1, vl); + vsse8_v_i8m1(dst + 2, stride, _s2, vl); + vsse8_v_i8m1(dst + 3, stride, _s3, vl); + vsse8_v_i8m1(dst + 4, stride, _s4, vl); + vsse8_v_i8m1(dst + 5, stride, _s5, vl); + vsse8_v_i8m1(dst + 6, stride, _s6, vl); + vsse8_v_i8m1(dst + 7, stride, _s7, vl); + s_ptr += vl; + dst += vl * 8; + j += vl; + } + src += 8 * k; + } + for (; i + 3 < m; i += 4) { + int j = 0; + for (; j + 3 < k; j += 4) { + int8_t *s_ptr = src + j; + for (int c = 0; c < 4; c++) { + vint8m1_t _src = vle8_v_i8m1(s_ptr, 4); + s_ptr += lda; + vse8_v_i8m1(dst, _src, 4); + dst += 4; + } + } + if (j < k) { + int8_t *s_ptr = src + j; + int stride = 4 * sizeof(int8_t); + int vl = vsetvl_e8m1(k - j); + vint8m1_t _s0 = vle8_v_i8m1(s_ptr, vl); + vint8m1_t _s1 = vle8_v_i8m1(s_ptr + lda, vl); + vint8m1_t _s2 = 
vle8_v_i8m1(s_ptr + lda * 2, vl); + vint8m1_t _s3 = vle8_v_i8m1(s_ptr + lda * 3, vl); + vsse8_v_i8m1(dst, stride, _s0, vl); + vsse8_v_i8m1(dst + 1, stride, _s1, vl); + vsse8_v_i8m1(dst + 2, stride, _s2, vl); + vsse8_v_i8m1(dst + 3, stride, _s3, vl); + s_ptr += vl; + dst += vl * 4; + j += vl; + } + src += 4 * k; + } + for (; i + 1 < m; i += 2) { + int j = 0; + for (; j + 3 < k; j += 4) { + int8_t *s_ptr = src + j; + for (int c = 0; c < 2; c++) { + vint8m1_t _src = vle8_v_i8m1(s_ptr, 4); + s_ptr += lda; + vse8_v_i8m1(dst, _src, 4); + dst += 4; + } + } + if (j < k) { + int8_t *s_ptr = src + j; + int stride = 2 * sizeof(int8_t); + int vl = vsetvl_e8m1(k - j); + vint8m1_t _s0 = vle8_v_i8m1(s_ptr, vl); + vint8m1_t _s1 = vle8_v_i8m1(s_ptr + lda, vl); + vsse8_v_i8m1(dst, stride, _s0, vl); + vsse8_v_i8m1(dst + 1, stride, _s1, vl); + s_ptr += vl; + dst += vl * 2; + j += vl; + } + src += 2 * k; + } + for (; i < m; i++) { + memcpy(dst, src, k * sizeof(int8_t)); + } +} + +/************************************************************* + * mf2 = vlenb / sizeof(int8_t) / 2 + * src: [k, n] + * dst: + * k % 4 == 0: [n/mf2, k/4, mf2, 4] + * k_tail : [n/mf2, k_tail, mf2] + ************************************************************/ +void shl_rvv_matmul_reorder_mat1_zmf2n4_int8_dot(int8_t *src, int8_t *dst, int k, int n, int ldb) +{ + int j = 0; + while (j < n) { + int vl = vsetvl_e8mf2(n - j); + int8_t *s_ptr = src + j; + int c = 0; + for (; c + 3 < k; c += 4) { + for (int i = 0; i < 4; i++) { + vint8mf2_t _src = vle8_v_i8mf2(s_ptr, vl); + s_ptr += ldb; + vsse8_v_i8mf2(dst + i, 4 * sizeof(int8_t), _src, vl); + } + dst += 4 * vl; + } + // k_tail + for (; c < k; c++) { + vint8m1_t _src = vle8_v_i8m1(s_ptr, vl); + vse8_v_i8m1(dst, _src, vl); + s_ptr += ldb; + dst += vl; + } + j += vl; + } +} + +static vint8mf2_t requantize_m2(vint32m2_t _src, int32_t multiplier, int32_t shift, int32_t out_zp, + int vl) +{ + vint32m2_t _mulh = vmulh_vx_i32m2(_src, multiplier, vl); + _mulh = 
vssra_vx_i32m2(_mulh, -shift - 1, vl); + _mulh = vadd_vx_i32m2(_mulh, out_zp, vl); + vint16m1_t _tmp1 = vnclip_wx_i16m1(_mulh, 0, vl); + vint8mf2_t _tmp2 = vnclip_wx_i8mf2(_tmp1, 0, vl); + return _tmp2; +} + +/************************************************************* + * mf2 = vlenb / sizeof(int8_t) / 2 + * dst - output: [m, n] + * sa - mat0 + * k % 4 == 0: [m/8, k/4, 8, 4] + * k_tail : [m/8, k_tail, 8] + * sb - mat1 + * k % 4 == 0: [n/mf2, k/4, mf2, 4] + * k_tail : [n/mf2, k_tail, mf2] + ************************************************************/ +void shl_rvv_matmul_8xmf2_int8_dot(int8_t *dst, const int8_t *sa, const int8_t *sb, int m, int k, + int n, int ldc, int32_t z1, int32_t z2, int32_t z3, int32_t mult, + int32_t shift) +{ + const int8_t *kernel_data = sa; + const int8_t *input_data = sb; + int8_t *output_data = dst; + + const int m2 = csrr_vlenb() / sizeof(int8_t) * 2; + int8_t z1_i8 = (int8_t)-z1; + int8_t z1_i8_4[4] = {z1_i8, z1_i8, z1_i8, z1_i8}; + int32_t *z1_i32 = (int32_t *)z1_i8_4; + vint8m2_t _z2_i8 = vmv_v_x_i8m2((int8_t)-z2, m2); + int32_t z1z2 = z1 * z2; + + int i = 0; + for (; i + 7 < m; i += 8) { + const int8_t *kernel_ptr = kernel_data + i * k; + int j = 0; + while (j < n) { + int vl = vsetvl_e8mf2(n - j); + const int32_t *k32_ptr = (int32_t *)kernel_ptr; + const int8_t *in_ptr = input_data + j * k; + int8_t *out_ptr = output_data + i * ldc + j; + + vint32m2_t _acc0 = vmv_v_x_i32m2(0, vl); + vint32m2_t _acc1 = vmv_v_x_i32m2(0, vl); + vint32m2_t _acc2 = vmv_v_x_i32m2(0, vl); + vint32m2_t _acc3 = vmv_v_x_i32m2(0, vl); + vint32m2_t _acc4 = vmv_v_x_i32m2(0, vl); + vint32m2_t _acc5 = vmv_v_x_i32m2(0, vl); + vint32m2_t _acc6 = vmv_v_x_i32m2(0, vl); + vint32m2_t _acc7 = vmv_v_x_i32m2(0, vl); + + int c = 0; + for (; c + 3 < k; c += 4) { + vint8m2_t _in = vle8_v_i8m2(in_ptr, vl * 4); + in_ptr += vl * 4; + + // q1 * q2 + _acc0 = vmaqa_vx_i32m2(_acc0, k32_ptr[0], _in, vl); + _acc1 = vmaqa_vx_i32m2(_acc1, k32_ptr[1], _in, vl); + _acc2 = 
vmaqa_vx_i32m2(_acc2, k32_ptr[2], _in, vl); + _acc3 = vmaqa_vx_i32m2(_acc3, k32_ptr[3], _in, vl); + _acc4 = vmaqa_vx_i32m2(_acc4, k32_ptr[4], _in, vl); + _acc5 = vmaqa_vx_i32m2(_acc5, k32_ptr[5], _in, vl); + _acc6 = vmaqa_vx_i32m2(_acc6, k32_ptr[6], _in, vl); + _acc7 = vmaqa_vx_i32m2(_acc7, k32_ptr[7], _in, vl); + // - z1 * q2 + _acc0 = vmaqa_vx_i32m2(_acc0, z1_i32[0], _in, vl); + _acc1 = vmaqa_vx_i32m2(_acc1, z1_i32[0], _in, vl); + _acc2 = vmaqa_vx_i32m2(_acc2, z1_i32[0], _in, vl); + _acc3 = vmaqa_vx_i32m2(_acc3, z1_i32[0], _in, vl); + _acc4 = vmaqa_vx_i32m2(_acc4, z1_i32[0], _in, vl); + _acc5 = vmaqa_vx_i32m2(_acc5, z1_i32[0], _in, vl); + _acc6 = vmaqa_vx_i32m2(_acc6, z1_i32[0], _in, vl); + _acc7 = vmaqa_vx_i32m2(_acc7, z1_i32[0], _in, vl); + // - z2 * q1 + _acc0 = vmaqa_vx_i32m2(_acc0, k32_ptr[0], _z2_i8, vl); + _acc1 = vmaqa_vx_i32m2(_acc1, k32_ptr[1], _z2_i8, vl); + _acc2 = vmaqa_vx_i32m2(_acc2, k32_ptr[2], _z2_i8, vl); + _acc3 = vmaqa_vx_i32m2(_acc3, k32_ptr[3], _z2_i8, vl); + _acc4 = vmaqa_vx_i32m2(_acc4, k32_ptr[4], _z2_i8, vl); + _acc5 = vmaqa_vx_i32m2(_acc5, k32_ptr[5], _z2_i8, vl); + _acc6 = vmaqa_vx_i32m2(_acc6, k32_ptr[6], _z2_i8, vl); + _acc7 = vmaqa_vx_i32m2(_acc7, k32_ptr[7], _z2_i8, vl); + k32_ptr += 8; + } + // + z1 * z2 + int32_t acc_z1z2 = c * z1z2; + _acc0 = vadd_vx_i32m2(_acc0, acc_z1z2, vl); + _acc1 = vadd_vx_i32m2(_acc1, acc_z1z2, vl); + _acc2 = vadd_vx_i32m2(_acc2, acc_z1z2, vl); + _acc3 = vadd_vx_i32m2(_acc3, acc_z1z2, vl); + _acc4 = vadd_vx_i32m2(_acc4, acc_z1z2, vl); + _acc5 = vadd_vx_i32m2(_acc5, acc_z1z2, vl); + _acc6 = vadd_vx_i32m2(_acc6, acc_z1z2, vl); + _acc7 = vadd_vx_i32m2(_acc7, acc_z1z2, vl); + + const int8_t *k_ptr = kernel_ptr + 8 * c; + for (; c < k; c++) { + vint8mf2_t _in = vle8_v_i8mf2(in_ptr, vl); + vint16m1_t _in_w = vwsub_vx_i16m1(_in, z2, vl); + in_ptr += vl; + + _acc0 = vwmacc_vx_i32m2(_acc0, k_ptr[0] - z1, _in_w, vl); + _acc1 = vwmacc_vx_i32m2(_acc1, k_ptr[1] - z1, _in_w, vl); + _acc2 = vwmacc_vx_i32m2(_acc2, 
k_ptr[2] - z1, _in_w, vl); + _acc3 = vwmacc_vx_i32m2(_acc3, k_ptr[3] - z1, _in_w, vl); + _acc4 = vwmacc_vx_i32m2(_acc4, k_ptr[4] - z1, _in_w, vl); + _acc5 = vwmacc_vx_i32m2(_acc5, k_ptr[5] - z1, _in_w, vl); + _acc6 = vwmacc_vx_i32m2(_acc6, k_ptr[6] - z1, _in_w, vl); + _acc7 = vwmacc_vx_i32m2(_acc7, k_ptr[7] - z1, _in_w, vl); + k_ptr += 8; + } + + vint8mf2_t _res0 = requantize_m2(_acc0, mult, shift, z3, vl); + vint8mf2_t _res1 = requantize_m2(_acc1, mult, shift, z3, vl); + vint8mf2_t _res2 = requantize_m2(_acc2, mult, shift, z3, vl); + vint8mf2_t _res3 = requantize_m2(_acc3, mult, shift, z3, vl); + vint8mf2_t _res4 = requantize_m2(_acc4, mult, shift, z3, vl); + vint8mf2_t _res5 = requantize_m2(_acc5, mult, shift, z3, vl); + vint8mf2_t _res6 = requantize_m2(_acc6, mult, shift, z3, vl); + vint8mf2_t _res7 = requantize_m2(_acc7, mult, shift, z3, vl); + + vse8_v_i8mf2(out_ptr, _res0, vl); + vse8_v_i8mf2(out_ptr + ldc, _res1, vl); + vse8_v_i8mf2(out_ptr + ldc * 2, _res2, vl); + vse8_v_i8mf2(out_ptr + ldc * 3, _res3, vl); + vse8_v_i8mf2(out_ptr + ldc * 4, _res4, vl); + vse8_v_i8mf2(out_ptr + ldc * 5, _res5, vl); + vse8_v_i8mf2(out_ptr + ldc * 6, _res6, vl); + vse8_v_i8mf2(out_ptr + ldc * 7, _res7, vl); + j += vl; + } + } + for (; i + 3 < m; i += 4) { + const int8_t *kernel_ptr = kernel_data + i * k; + int j = 0; + while (j < n) { + int vl = vsetvl_e8mf2(n - j); + const int32_t *k32_ptr = (int32_t *)kernel_ptr; + const int8_t *in_ptr = input_data + j * k; + int8_t *out_ptr = output_data + i * ldc + j; + + vint32m2_t _acc0 = vmv_v_x_i32m2(0, vl); + vint32m2_t _acc1 = vmv_v_x_i32m2(0, vl); + vint32m2_t _acc2 = vmv_v_x_i32m2(0, vl); + vint32m2_t _acc3 = vmv_v_x_i32m2(0, vl); + + int c = 0; + for (; c + 3 < k; c += 4) { + vint8m2_t _in = vle8_v_i8m2(in_ptr, vl * 4); + in_ptr += vl * 4; + + // q1 * q2 + _acc0 = vmaqa_vx_i32m2(_acc0, k32_ptr[0], _in, vl); + _acc1 = vmaqa_vx_i32m2(_acc1, k32_ptr[1], _in, vl); + _acc2 = vmaqa_vx_i32m2(_acc2, k32_ptr[2], _in, vl); + _acc3 = 
vmaqa_vx_i32m2(_acc3, k32_ptr[3], _in, vl); + // - z1 * q2 + _acc0 = vmaqa_vx_i32m2(_acc0, z1_i32[0], _in, vl); + _acc1 = vmaqa_vx_i32m2(_acc1, z1_i32[0], _in, vl); + _acc2 = vmaqa_vx_i32m2(_acc2, z1_i32[0], _in, vl); + _acc3 = vmaqa_vx_i32m2(_acc3, z1_i32[0], _in, vl); + // - z2 * q1 + _acc0 = vmaqa_vx_i32m2(_acc0, k32_ptr[0], _z2_i8, vl); + _acc1 = vmaqa_vx_i32m2(_acc1, k32_ptr[1], _z2_i8, vl); + _acc2 = vmaqa_vx_i32m2(_acc2, k32_ptr[2], _z2_i8, vl); + _acc3 = vmaqa_vx_i32m2(_acc3, k32_ptr[3], _z2_i8, vl); + k32_ptr += 4; + } + // + z1 * z2 + int32_t acc_z1z2 = c * z1z2; + _acc0 = vadd_vx_i32m2(_acc0, acc_z1z2, vl); + _acc1 = vadd_vx_i32m2(_acc1, acc_z1z2, vl); + _acc2 = vadd_vx_i32m2(_acc2, acc_z1z2, vl); + _acc3 = vadd_vx_i32m2(_acc3, acc_z1z2, vl); + + const int8_t *k_ptr = kernel_ptr + 4 * c; + for (; c < k; c++) { + vint8mf2_t _in = vle8_v_i8mf2(in_ptr, vl); + vint16m1_t _in_w = vwsub_vx_i16m1(_in, z2, vl); + in_ptr += vl; + + _acc0 = vwmacc_vx_i32m2(_acc0, k_ptr[0] - z1, _in_w, vl); + _acc1 = vwmacc_vx_i32m2(_acc1, k_ptr[1] - z1, _in_w, vl); + _acc2 = vwmacc_vx_i32m2(_acc2, k_ptr[2] - z1, _in_w, vl); + _acc3 = vwmacc_vx_i32m2(_acc3, k_ptr[3] - z1, _in_w, vl); + k_ptr += 4; + } + + vint8mf2_t _res0 = requantize_m2(_acc0, mult, shift, z3, vl); + vint8mf2_t _res1 = requantize_m2(_acc1, mult, shift, z3, vl); + vint8mf2_t _res2 = requantize_m2(_acc2, mult, shift, z3, vl); + vint8mf2_t _res3 = requantize_m2(_acc3, mult, shift, z3, vl); + + vse8_v_i8mf2(out_ptr, _res0, vl); + vse8_v_i8mf2(out_ptr + ldc, _res1, vl); + vse8_v_i8mf2(out_ptr + ldc * 2, _res2, vl); + vse8_v_i8mf2(out_ptr + ldc * 3, _res3, vl); + j += vl; + } + } + for (; i + 1 < m; i += 2) { + const int8_t *kernel_ptr = kernel_data + i * k; + int j = 0; + while (j < n) { + int vl = vsetvl_e8mf2(n - j); + const int32_t *k32_ptr = (int32_t *)kernel_ptr; + const int8_t *in_ptr = input_data + j * k; + int8_t *out_ptr = output_data + i * ldc + j; + + vint32m2_t _acc0 = vmv_v_x_i32m2(0, vl); + vint32m2_t 
_acc1 = vmv_v_x_i32m2(0, vl); + + int c = 0; + for (; c + 3 < k; c += 4) { + vint8m2_t _in = vle8_v_i8m2(in_ptr, vl * 4); + in_ptr += vl * 4; + + // q1 * q2 + _acc0 = vmaqa_vx_i32m2(_acc0, k32_ptr[0], _in, vl); + _acc1 = vmaqa_vx_i32m2(_acc1, k32_ptr[1], _in, vl); + // - z1 * q2 + _acc0 = vmaqa_vx_i32m2(_acc0, z1_i32[0], _in, vl); + _acc1 = vmaqa_vx_i32m2(_acc1, z1_i32[0], _in, vl); + // z1_i8_4[3]); + // - z2 * q1 + _acc0 = vmaqa_vx_i32m2(_acc0, k32_ptr[0], _z2_i8, vl); + _acc1 = vmaqa_vx_i32m2(_acc1, k32_ptr[1], _z2_i8, vl); + k32_ptr += 2; + } + // + z1 * z2 + int32_t acc_z1z2 = c * z1z2; + _acc0 = vadd_vx_i32m2(_acc0, acc_z1z2, vl); + _acc1 = vadd_vx_i32m2(_acc1, acc_z1z2, vl); + + const int8_t *k_ptr = kernel_ptr + 2 * c; + for (; c < k; c++) { + vint8mf2_t _in = vle8_v_i8mf2(in_ptr, vl); + vint16m1_t _in_w = vwsub_vx_i16m1(_in, z2, vl); + in_ptr += vl; + + _acc0 = vwmacc_vx_i32m2(_acc0, k_ptr[0] - z1, _in_w, vl); + _acc1 = vwmacc_vx_i32m2(_acc1, k_ptr[1] - z1, _in_w, vl); + k_ptr += 2; + } + + vint8mf2_t _res0 = requantize_m2(_acc0, mult, shift, z3, vl); + vint8mf2_t _res1 = requantize_m2(_acc1, mult, shift, z3, vl); + + vse8_v_i8mf2(out_ptr, _res0, vl); + vse8_v_i8mf2(out_ptr + ldc, _res1, vl); + j += vl; + } + } + for (; i < m; i++) { + const int8_t *kernel_ptr = kernel_data + i * k; + int j = 0; + while (j < n) { + int vl = vsetvl_e8mf2(n - j); + const int32_t *k32_ptr = (int32_t *)kernel_ptr; + const int8_t *in_ptr = input_data + j * k; + int8_t *out_ptr = output_data + i * ldc + j; + + vint32m2_t _acc0 = vmv_v_x_i32m2(0, vl); + + int c = 0; + for (; c + 3 < k; c += 4) { + vint8m2_t _in = vle8_v_i8m2(in_ptr, vl * 4); + in_ptr += vl * 4; + + int8_t *tmp_k_ptr = (int8_t *)k32_ptr; + + // q1 * q2 + _acc0 = vmaqa_vx_i32m2(_acc0, k32_ptr[0], _in, vl); + // - z1 * q2 + _acc0 = vmaqa_vx_i32m2(_acc0, z1_i32[0], _in, vl); + // - z2 * q1 + _acc0 = vmaqa_vx_i32m2(_acc0, k32_ptr[0], _z2_i8, vl); + k32_ptr += 1; + } + // + z1 * z2 + int32_t acc_z1z2 = c * z1z2; + 
_acc0 = vadd_vx_i32m2(_acc0, acc_z1z2, vl); + + const int8_t *k_ptr = kernel_ptr + 1 * c; + for (; c < k; c++) { + vint8mf2_t _in = vle8_v_i8mf2(in_ptr, vl); + vint16m1_t _in_w = vwsub_vx_i16m1(_in, z2, vl); + in_ptr += vl; + + _acc0 = vwmacc_vx_i32m2(_acc0, k_ptr[0] - z1, _in_w, vl); + k_ptr += 1; + } + + vint8mf2_t _res0 = requantize_m2(_acc0, mult, shift, z3, vl); + + vse8_v_i8mf2(out_ptr, _res0, vl); + j += vl; + } + } +} +#endif diff --git a/source/thead_rvv/int8/maxpool.c b/source/thead_rvv/int8/maxpool.c index cb761efe..71d71625 100644 --- a/source/thead_rvv/int8/maxpool.c +++ b/source/thead_rvv/int8/maxpool.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" int shl_rvv_maxpool2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_pool_params *params) @@ -48,6 +48,8 @@ int shl_rvv_maxpool2d_init_int8(struct csinn_tensor *input, struct csinn_tensor if (shl_is_first_layer_input(input, sess)) { elempack = 1; } + } else if (sess->base_run_mode == CSINN_RM_LAYER) { + elempack = in_c % packn == 0 ? packn : 1; } // global maxpool2d // TODO: remove diff --git a/source/thead_rvv/int8/maxpool_2x2_int8.c b/source/thead_rvv/int8/maxpool_2x2_int8.c index e5ee48df..8b7a1a41 100644 --- a/source/thead_rvv/int8/maxpool_2x2_int8.c +++ b/source/thead_rvv/int8/maxpool_2x2_int8.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /**************************************************************************** * note: VLEN = 128/256 ... diff --git a/source/thead_rvv/int8/maxpool_2x2_int8_packn.c b/source/thead_rvv/int8/maxpool_2x2_int8_packn.c index 86049e67..9f0b6379 100644 --- a/source/thead_rvv/int8/maxpool_2x2_int8_packn.c +++ b/source/thead_rvv/int8/maxpool_2x2_int8_packn.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * note: support flexible vlen diff --git a/source/thead_rvv/int8/maxpool_3x3_int8.c b/source/thead_rvv/int8/maxpool_3x3_int8.c index c0191158..39b818b0 100644 --- a/source/thead_rvv/int8/maxpool_3x3_int8.c +++ b/source/thead_rvv/int8/maxpool_3x3_int8.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /**************************************************************************** * note: VLEN = 128/256 ... diff --git a/source/thead_rvv/int8/maxpool_3x3_int8_packn.c b/source/thead_rvv/int8/maxpool_3x3_int8_packn.c index 2e4c63b1..e6734a05 100644 --- a/source/thead_rvv/int8/maxpool_3x3_int8_packn.c +++ b/source/thead_rvv/int8/maxpool_3x3_int8_packn.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * note: support flexible vlen diff --git a/source/thead_rvv/int8/maxpool_int8_nhwc.c b/source/thead_rvv/int8/maxpool_int8_nhwc.c index 11712e57..a6589111 100644 --- a/source/thead_rvv/int8/maxpool_int8_nhwc.c +++ b/source/thead_rvv/int8/maxpool_int8_nhwc.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * note: support flexible vlen diff --git a/source/thead_rvv/int8/maxpool_int8_packn.c b/source/thead_rvv/int8/maxpool_int8_packn.c index 61205055..eec93734 100644 --- a/source/thead_rvv/int8/maxpool_int8_packn.c +++ b/source/thead_rvv/int8/maxpool_int8_packn.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * note: support flexible vlen diff --git a/source/thead_rvv/int8/mul.c b/source/thead_rvv/int8/mul.c index d67c9454..7f0bdb65 100644 --- a/source/thead_rvv/int8/mul.c +++ b/source/thead_rvv/int8/mul.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************************************ * (1) s2*(q2-z2) = s0*(q0-z0) * s1*(q1-z1) @@ -104,7 +104,7 @@ static void elementwise_mul_int8_trans_fp16(struct csinn_tensor *input0, /************************************************************************************ * (1) q2 = [ (q0-z0) * (q1-z1) * (s0*s1/s2) ] + z2 * (2) q2 = (q0-z0) + z2 - * (3) ps: (q1-z1) * (s0*s1/s2) = 1 + * (3) ps: (q1-z1) * (s0*s1/s2) = 1(z1<0) or -1(z1>0) ***********************************************************************************/ static void broadcast_single_1_mul_int8(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output) @@ -113,6 +113,7 @@ static void broadcast_single_1_mul_int8(struct csinn_tensor *input0, struct csin int8_t *input1_data = (int8_t *)input1->data; int8_t *output_data = (int8_t *)output->data; int32_t zero_point0 = input0->qinfo->zero_point; + int32_t zero_point1 = input1->qinfo->zero_point; int32_t zero_point2 = output->qinfo->zero_point; int64_t size = csinn_tensor_size(output); @@ -120,6 +121,9 @@ static void broadcast_single_1_mul_int8(struct csinn_tensor *input0, struct csin int vl = vsetvl_e8m1(size); vint8m1_t _in0 = vle8_v_i8m1(input0_data, vl); vint16m2_t _q1_z1 = vwsub_vx_i16m2(_in0, zero_point0, vl); + if (zero_point1 > 0) { + _q1_z1 = vneg_v_i16m2(_q1_z1, vl); + } vint16m2_t _res0 = vadd_vx_i16m2(_q1_z1, zero_point2, vl); vint8m1_t _res1 = vnclip_wx_i8m1(_res0, 0, vl); vse8_v_i8m1(output_data, _res1, vl); @@ -129,6 +133,90 @@ static void 
broadcast_single_1_mul_int8(struct csinn_tensor *input0, struct csin } } +static inline void mul_vv_i8_trans_f16(int8_t *in0, int8_t *in1, int8_t *out, int32_t size, + float *scale, int32_t *zero_point) +{ + int32_t z0 = zero_point[0]; + int32_t z1 = zero_point[1]; + int32_t z2 = zero_point[2]; + float real_scale = scale[0] * scale[1] / scale[2]; + + while (size > 0) { + int vl = vsetvl_e8m1(size); + vint8m1_t _a = vle8_v_i8m1(in0, vl); + vint8m1_t _b = vle8_v_i8m1(in1, vl); + vint16m2_t _a_w = vwsub_vx_i16m2(_a, z0, vl); + vint16m2_t _b_w = vwsub_vx_i16m2(_b, z1, vl); + vfloat16m2_t _a_f = vfcvt_f_x_v_f16m2(_a_w, vl); + vfloat16m2_t _b_f = vfcvt_f_x_v_f16m2(_b_w, vl); + vfloat16m2_t _mulf = vfmul_vv_f16m2(_a_f, _b_f, vl); + _mulf = vfmul_vf_f16m2(_mulf, real_scale, vl); + vint16m2_t _res = vfcvt_x_f_v_i16m2(_mulf, vl); + _res = vadd_vx_i16m2(_res, z2, vl); + vse8_v_i8m1(out, vnclip_wx_i8m1(_res, 0, vl), vl); + in0 += vl; + in1 += vl; + out += vl; + size -= vl; + } +} + +static inline void mul_vx_i8_trans_f16(int8_t *in0, int8_t *in1, int8_t *out, int32_t size, + float *scale, int32_t *zero_point) +{ + int32_t z0 = zero_point[0]; + int32_t z1 = zero_point[1]; + int32_t z2 = zero_point[2]; + float real_scale = scale[0] * scale[1] / scale[2]; + float b_f = in1[0] - z1; + + while (size > 0) { + int vl = vsetvl_e8m1(size); + vint8m1_t _a = vle8_v_i8m1(in0, vl); + vint16m2_t _a_w = vwsub_vx_i16m2(_a, z0, vl); + vfloat16m2_t _a_f = vfcvt_f_x_v_f16m2(_a_w, vl); + vfloat16m2_t _mulf = vfmul_vf_f16m2(_a_f, b_f, vl); + _mulf = vfmul_vf_f16m2(_mulf, real_scale, vl); + vint16m2_t _res = vfcvt_x_f_v_i16m2(_mulf, vl); + _res = vadd_vx_i16m2(_res, z2, vl); + vse8_v_i8m1(out, vnclip_wx_i8m1(_res, 0, vl), vl); + in0 += vl; + out += vl; + size -= vl; + } +} + +static inline void mul_xv_i8_trans_f16(int8_t *in0, int8_t *in1, int8_t *out, int32_t size, + float *scale, int32_t *zero_point) +{ + int32_t z0 = zero_point[0]; + int32_t z1 = zero_point[1]; + int32_t z2 = zero_point[2]; + 
float real_scale = scale[0] * scale[1] / scale[2]; + float a_f = in0[0] - z0; + + while (size > 0) { + int vl = vsetvl_e8m1(size); + vint8m1_t _b = vle8_v_i8m1(in1, vl); + vint16m2_t _b_w = vwsub_vx_i16m2(_b, z1, vl); + vfloat16m2_t _b_f = vfcvt_f_x_v_f16m2(_b_w, vl); + vfloat16m2_t _mulf = vfmul_vf_f16m2(_b_f, a_f, vl); + _mulf = vfmul_vf_f16m2(_mulf, real_scale, vl); + vint16m2_t _res = vfcvt_x_f_v_i16m2(_mulf, vl); + _res = vadd_vx_i16m2(_res, z2, vl); + vse8_v_i8m1(out, vnclip_wx_i8m1(_res, 0, vl), vl); + in1 += vl; + out += vl; + size -= vl; + } +} + +void *mul_cb_int8[] = { + [CSINN_BROADCAST_VV] = mul_vv_i8_trans_f16, + [CSINN_BROADCAST_VS] = mul_vx_i8_trans_f16, + [CSINN_BROADCAST_SV] = mul_xv_i8_trans_f16, +}; + int shl_rvv_mul_int8(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, struct csinn_diso_params *params) { @@ -158,8 +246,7 @@ int shl_rvv_mul_int8(struct csinn_tensor *input0, struct csinn_tensor *input1, } broadcast_single_1_mul_int8(input0, input1, output); } else { - /* TODO: recursive opt */ - return shl_ref_mul_quant(input0, input1, output, params); + return shl_rvv_binary_op_broadcast_int8(input0, input1, output, mul_cb_int8); } return CSINN_TRUE; } diff --git a/source/thead_rvv/int8/pad.c b/source/thead_rvv/int8/pad.c index fc8d3490..615a761c 100644 --- a/source/thead_rvv/int8/pad.c +++ b/source/thead_rvv/int8/pad.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* * params: diff --git a/source/thead_rvv/int8/prelu.c b/source/thead_rvv/int8/prelu.c index d9841044..e0f0b918 100644 --- a/source/thead_rvv/int8/prelu.c +++ b/source/thead_rvv/int8/prelu.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /********************************************************************* * s3 * (q3 - z3) = prelu{ s1 * (q1 - z1), s2 * (q2 - z2) } diff --git a/source/thead_rvv/int8/reduce_sum.c b/source/thead_rvv/int8/reduce_sum.c index 3aba172c..671ccdcf 100644 --- a/source/thead_rvv/int8/reduce_sum.c +++ b/source/thead_rvv/int8/reduce_sum.c @@ -16,11 +16,15 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" int shl_rvv_reduce_sum_int8(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_reduce_params *params) { + if (input->layout >= CSINN_LAYOUT_NC1C0 && input->layout <= CSINN_LAYOUT_NC1DHWC0) { + shl_rvv_tensor_nc1xc0_to_ndarray_replace_int8(input); + } + int8_t *input_data = (int8_t *)input->data; int8_t *output_data = (int8_t *)output->data; diff --git a/source/thead_rvv/int8/relu.c b/source/thead_rvv/int8/relu.c index 2615bf66..01d7bf83 100644 --- a/source/thead_rvv/int8/relu.c +++ b/source/thead_rvv/int8/relu.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************************************ * s2(q2 - z2) = relu{ s1(q1 - z1) } diff --git a/source/thead_rvv/int8/relu6.c b/source/thead_rvv/int8/relu6.c new file mode 100644 index 00000000..2f365ea0 --- /dev/null +++ b/source/thead_rvv/int8/relu6.c @@ -0,0 +1,25 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "rvv/rvv.h" + +int shl_rvv_relu6_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) +{ + return shl_rvv_siso_callback_base(input, output, params, shl_rvv_relu6_fp32); +} diff --git a/source/thead_rvv/int8/reshape.c b/source/thead_rvv/int8/reshape.c index c291dd5b..3e73c1f9 100644 --- a/source/thead_rvv/int8/reshape.c +++ b/source/thead_rvv/int8/reshape.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************* note: support flexible vlen @@ -29,7 +29,7 @@ int shl_rvv_reshape_int8(struct csinn_tensor *input, struct csinn_tensor *output int8_t *output_data = (int8_t *)output->data; shl_gref_reshape_infer_shape(input, output, params); - if (input->layout >= CSINN_LAYOUT_NC1WC0 && input->layout <= CSINN_LAYOUT_NC1DHWC0) { + if (input->layout >= CSINN_LAYOUT_NC1C0 && input->layout <= CSINN_LAYOUT_NC1DHWC0) { const int packn = csrr_vlenb() / sizeof(int8_t) / 2; const int vl = vsetvl_e8m1(packn); int outer_size = input->dim[0] * input->dim[1]; // batch fuse to outer diff --git a/source/thead_rvv/int8/sigmoid.c b/source/thead_rvv/int8/sigmoid.c new file mode 100644 index 00000000..ad19e3ea --- /dev/null +++ b/source/thead_rvv/int8/sigmoid.c @@ -0,0 +1,25 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "rvv/rvv.h" + +int shl_rvv_sigmoid_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params) +{ + return shl_rvv_siso_callback_base(input, output, params, shl_rvv_sigmoid_fp32); +} diff --git a/source/thead_rvv/int8/softmax.c b/source/thead_rvv/int8/softmax.c new file mode 100644 index 00000000..810b0078 --- /dev/null +++ b/source/thead_rvv/int8/softmax.c @@ -0,0 +1,25 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "rvv/rvv.h" + +int shl_rvv_softmax_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params) +{ + return shl_rvv_siso_callback_base(input, output, params, shl_rvv_softmax_fp32); +} diff --git a/source/thead_rvv/int8/sub.c b/source/thead_rvv/int8/sub.c new file mode 100644 index 00000000..287e861a --- /dev/null +++ b/source/thead_rvv/int8/sub.c @@ -0,0 +1,111 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "rvv/rvv.h" + +static inline void sub_vv_i8_trans_f16(int8_t *in0, int8_t *in1, int8_t *out, int32_t size, + float *scale, int32_t *zero_point) +{ + float s0_s2 = scale[0] / scale[2]; + float s1_s2 = scale[1] / scale[2]; + int32_t z0 = zero_point[0]; + int32_t z1 = zero_point[1]; + int32_t z2 = zero_point[2]; + + while (size > 0) { + int vl = vsetvl_e8m1(size); + vint8m1_t _a = vle8_v_i8m1(in0, vl); + vint8m1_t _b = vle8_v_i8m1(in1, vl); + vint16m2_t _a_w = vwsub_vx_i16m2(_a, z0, vl); + vint16m2_t _b_w = vwsub_vx_i16m2(_b, z1, vl); + vfloat16m2_t _a_f = vfcvt_f_x_v_f16m2(_a_w, vl); + vfloat16m2_t _b_f = vfcvt_f_x_v_f16m2(_b_w, vl); + vfloat16m2_t _tmp0 = vfmul_vf_f16m2(_a_f, s0_s2, vl); // s0/s2(q0-z0) + vfloat16m2_t _tmp1 = vfmul_vf_f16m2(_b_f, s1_s2, vl); // s1/s2(q1-z1) + vfloat16m2_t _subf = vfsub_vv_f16m2(_tmp0, _tmp1, vl); + vint16m2_t _res = vfcvt_x_f_v_i16m2(_subf, vl); + _res = vadd_vx_i16m2(_res, z2, vl); + vse8_v_i8m1(out, vnclip_wx_i8m1(_res, 0, vl), vl); + in0 += vl; + in1 += vl; + out += vl; + size -= vl; + } +} + +static inline void sub_vx_i8_trans_f16(int8_t *in0, int8_t *in1, int8_t *out, int32_t size, + float *scale, int32_t *zero_point) +{ + float s0_s2 = scale[0] / scale[2]; + int32_t z0 = zero_point[0]; + int32_t z1 = zero_point[1]; + int32_t z2 = zero_point[2]; + float q1_z1 = scale[1] / scale[2] * (in1[0] - z1); // s1/s2(q1-z1) + float q1_z1_z2 = q1_z1 - z2; + + while (size > 0) { + int vl = vsetvl_e8m1(size); + vint8m1_t _a = vle8_v_i8m1(in0, vl); + vint16m2_t _a_w = vwsub_vx_i16m2(_a, z0, vl); + vfloat16m2_t _a_f = vfcvt_f_x_v_f16m2(_a_w, vl); + vfloat16m2_t _tmp0 = vfmul_vf_f16m2(_a_f, s0_s2, vl); // s0/s2(q0-z0) + vfloat16m2_t _subf = vfsub_vf_f16m2(_tmp0, q1_z1_z2, vl); + vint16m2_t _res = vfcvt_x_f_v_i16m2(_subf, vl); + vse8_v_i8m1(out, vnclip_wx_i8m1(_res, 0, vl), vl); + in0 += vl; + out += vl; + size -= vl; + } +} + +static inline void sub_xv_i8_trans_f16(int8_t *in0, int8_t *in1, int8_t *out, int32_t size, + float 
*scale, int32_t *zero_point) +{ + float s1_s2 = scale[1] / scale[2]; + int32_t z0 = zero_point[0]; + int32_t z1 = zero_point[1]; + int32_t z2 = zero_point[2]; + float q0_z0 = scale[0] / scale[2] * (in0[0] - z0); // s0/s2(q0-z0) + float q0_z0_z2 = q0_z0 + z2; + + while (size > 0) { + int vl = vsetvl_e8m1(size); + vint8m1_t _b = vle8_v_i8m1(in1, vl); + vint16m2_t _b_w = vwsub_vx_i16m2(_b, z1, vl); + vfloat16m2_t _b_f = vfcvt_f_x_v_f16m2(_b_w, vl); + vfloat16m2_t _tmp1 = vfmul_vf_f16m2(_b_f, s1_s2, vl); // s1/s2(q1-z1) + vfloat16m2_t _subf = vfrsub_vf_f16m2(_tmp1, q0_z0_z2, vl); + vint16m2_t _res = vfcvt_x_f_v_i16m2(_subf, vl); + vse8_v_i8m1(out, vnclip_wx_i8m1(_res, 0, vl), vl); + in1 += vl; + out += vl; + size -= vl; + } +} + +void *sub_cb_int8[] = { + [CSINN_BROADCAST_VV] = sub_vv_i8_trans_f16, + [CSINN_BROADCAST_VS] = sub_vx_i8_trans_f16, + [CSINN_BROADCAST_SV] = sub_xv_i8_trans_f16, +}; + +int shl_rvv_sub_int8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) +{ + return shl_rvv_binary_op_broadcast_int8(input0, input1, output, sub_cb_int8); +} diff --git a/source/thead_rvv/int8/transpose.c b/source/thead_rvv/int8/transpose.c index 1266c5e5..7f4c19bd 100644 --- a/source/thead_rvv/int8/transpose.c +++ b/source/thead_rvv/int8/transpose.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" static void transpose_021_int8(int8_t *src, int8_t *dst, int batch, int inner_size, int outer_size) { @@ -89,9 +89,10 @@ static int transpose_tail_coincide_int8(struct csinn_tensor *input, struct csinn int shl_rvv_transpose_int8(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_transpose_params *params) { - if (input->layout >= CSINN_LAYOUT_NC1WC0 && input->layout <= CSINN_LAYOUT_NC1DHWC0) { - return shl_ref_transpose_quant(input, output, params); + if (input->layout >= CSINN_LAYOUT_NC1C0 && input->layout <= CSINN_LAYOUT_NC1DHWC0) { + shl_rvv_tensor_nc1xc0_to_ndarray_replace_int8(input); } + if (params->permute_num == 4 && params->permute[0] == 0 && params->permute[1] == 1 && params->permute[2] == 2 && params->permute[3] == 3) { int8_t *input_data = (int8_t *)input->data; diff --git a/source/thead_rvv/reorder.c b/source/thead_rvv/reorder.c index 1c00bcc2..633913a8 100644 --- a/source/thead_rvv/reorder.c +++ b/source/thead_rvv/reorder.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" /************************************************************************ * pack1ton: change input(activation) layout from nchw to nc1hwc0 @@ -2338,21 +2338,16 @@ static inline void reorder_kernel_12xk_fp32(float *src, float *dst, int M_BLOCK, /************************************************************* * src: [m, k] * dst: [m/m_blk, k/k_blk, m_blk/12, 12, k_blk] - * m_blk: M_BLK, M_BLK/2, M_BLK/4, ..., 12 + * m_blk: M_BLK, M_tail * k_blk: K_BLK, K_tail ************************************************************/ void shl_rvv_reorder_kernel_block_12xk_fp32(float *src, float *dst, int m, int k, const int M_BLK, const int K_BLK) { - const int MIN_M_BLK = 12; - int m_block = M_BLK; int m_idx = 0; while (m_idx < m) { - while (!(m_idx + m_block - 1 < m)) { - m_block /= 2; - } - if (m_block < MIN_M_BLK) { + if (m - m_idx < m_block) { m_block = m - m_idx; } int k_block = K_BLK; @@ -2497,21 +2492,16 @@ static inline void reorder_kernel_12xk_fp16(__fp16 *src, __fp16 *dst, int M_BLOC /************************************************************* * src: [m, k] * dst: [m/m_blk, k/k_blk, m_blk/12, 12, k_blk] - * m_blk: M_BLK, M_BLK/2, M_BLK/4, ..., 12 + * m_blk: M_BLK, M_tail * k_blk: K_BLK, K_tail ************************************************************/ void shl_rvv_reorder_kernel_block_12xk_fp16(__fp16 *src, __fp16 *dst, int m, int k, const int M_BLK, const int K_BLK) { - const int MIN_M_BLK = 12; - int m_block = M_BLK; int m_idx = 0; while (m_idx < m) { - while (!(m_idx + m_block - 1 < m)) { - m_block /= 2; - } - if (m_block < MIN_M_BLK) { + if (m - m_idx < m_block) { m_block = m - m_idx; } int k_block = K_BLK; @@ -2569,15 +2559,12 @@ static inline void reorder_input_pack2nxk_fp32(float *src, float *dst, int N_BLO * packn = vlenb / sizeof(float) * src: [k, n] * dst: [n/n_blk, k/k_blk, n_blk/pack2n, k_blk, pack2n] - * n_blk: N_BLK, N_BLK/2, N_BLK/4, ..., pack2n + * n_blk: N_BLK, N_tail * k_blk: K_BLK, K_tail 
************************************************************/ void shl_rvv_reorder_input_block_pack2nxk_fp32(float *src, float *dst, int k, int n, const int K_BLK, const int N_BLK) { - const int packn = csrr_vlenb() / sizeof(float); - const int MIN_N_BLK = packn * 2; - int k_block = K_BLK; int k_idx = 0; while (k_idx < k) { @@ -2587,10 +2574,7 @@ void shl_rvv_reorder_input_block_pack2nxk_fp32(float *src, float *dst, int k, in int n_block = N_BLK; int n_idx = 0; while (n_idx < n) { - while (!(n_idx + n_block - 1 < n)) { - n_block /= 2; - } - if (n_block < MIN_N_BLK) { + if (n - n_idx < n_block) { n_block = n - n_idx; } float *s_ptr = src + k_idx * n + n_idx; @@ -2642,15 +2626,12 @@ static inline void reorder_input_pack2nxk_fp16(__fp16 *src, __fp16 *dst, int N_B * packn = vlenb / sizeof(__fp16) * src: [k, n] * dst: [n/n_blk, k/k_blk, n_blk/pack2n, k_blk, pack2n] - * n_blk: N_BLK, N_BLK/2, N_BLK/4, ..., pack2n + * n_blk: N_BLK, N_tail * k_blk: K_BLK, K_tail ************************************************************/ void shl_rvv_reorder_input_block_pack2nxk_fp16(__fp16 *src, __fp16 *dst, int k, int n, const int K_BLK, const int N_BLK) { - const int packn = csrr_vlenb() / sizeof(__fp16); - const int MIN_N_BLK = packn * 2; - int k_block = K_BLK; int k_idx = 0; while (k_idx < k) { @@ -2660,10 +2641,7 @@ void shl_rvv_reorder_input_block_pack2nxk_fp16(__fp16 *src, __fp16 *dst, int k, int n_block = N_BLK; int n_idx = 0; while (n_idx < n) { - while (!(n_idx + n_block - 1 < n)) { - n_block /= 2; - } - if (n_block < MIN_N_BLK) { + if (n - n_idx < n_block) { n_block = n - n_idx; } __fp16 *s_ptr = src + k_idx * n + n_idx; diff --git a/source/thead_rvv/setup.c b/source/thead_rvv/setup.c index 58a7b01e..005d28ba 100644 --- a/source/thead_rvv/setup.c +++ b/source/thead_rvv/setup.c @@ -16,10 +16,10 @@ * limitations under the License. 
*/ -#include "shl_thead_rvv.h" -#include "shl_thead_rvv_cap.h" +#include "rvv/cap.h" +#include "rvv/rvv.h" -#define RVV_OP_PATTERN_MAX 80 +#define RVV_OP_PATTERN_MAX 100 static struct shl_cb_table shl_rvv_cb_table[RVV_OP_PATTERN_MAX]; void shl_rvv_reg_op(enum csinn_dtype_enum dtype, enum csinn_op_enum op_name, void *init, void *exec, @@ -94,6 +94,18 @@ void __attribute__((weak)) shl_target_init_rvv() shl_rvv_depthwise_conv2d_init_int8, NULL, shl_gref_depthwise_conv2d_relu6, shl_rvv_depthwise_conv2d_cap); #endif +#ifndef CONFIG_THEAD_RVV_DECONVOLUTION_FP32_DISABLED + shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_DECONV2D, shl_rvv_deconv2d_init_fp32, NULL, + shl_gref_deconv2d, shl_rvv_deconv2d_cap); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_GROUP_DECONV2D, shl_rvv_deconv2d_init_fp32, NULL, + shl_gref_group_deconv2d, shl_rvv_deconv2d_cap); +#endif +#ifndef CONFIG_THEAD_RVV_DECONVOLUTION_FP16_DISABLED + shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_DECONV2D, shl_rvv_deconv2d_init_fp16, NULL, + shl_gref_deconv2d, shl_rvv_deconv2d_cap); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_GROUP_DECONV2D, shl_rvv_deconv2d_init_fp16, NULL, + shl_gref_group_deconv2d, shl_rvv_deconv2d_cap); +#endif #ifndef CONFIG_THEAD_RVV_MAXPOOL_FP32_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_MAXPOOL2D, shl_rvv_maxpool2d_init_fp32, NULL, shl_gref_maxpool2d, shl_rvv_maxpool2d_cap); @@ -142,6 +154,18 @@ void __attribute__((weak)) shl_target_init_rvv() shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_ADD, NULL, shl_rvv_add_int8, shl_gref_add, shl_rvv_add_cap); #endif +#ifndef CONFIG_THEAD_RVV_SUB_FP32_DISABLED + shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SUB, NULL, shl_rvv_sub_fp32, shl_gref_sub, + shl_rvv_sub_cap); +#endif +#ifndef CONFIG_THEAD_RVV_SUB_FP16_DISABLED + shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SUB, NULL, shl_rvv_sub_fp16, shl_gref_sub, + shl_rvv_sub_cap); +#endif +#ifndef CONFIG_THEAD_RVV_SUB_INT8_DISABLED + shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_SUB, NULL, 
shl_rvv_sub_int8, shl_gref_sub, + shl_rvv_sub_cap); +#endif #ifndef CONFIG_THEAD_RVV_MUL_FP32_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_MUL, NULL, shl_rvv_mul_fp32, shl_gref_mul, shl_rvv_mul_cap); @@ -154,6 +178,18 @@ void __attribute__((weak)) shl_target_init_rvv() shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_MUL, NULL, shl_rvv_mul_int8, shl_gref_mul, shl_rvv_mul_cap); #endif +#ifndef CONFIG_THEAD_RVV_DIV_FP32_DISABLED + shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_DIV, NULL, shl_rvv_div_fp32, shl_gref_div, + shl_rvv_div_cap); +#endif +#ifndef CONFIG_THEAD_RVV_DIV_FP16_DISABLED + shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_DIV, NULL, shl_rvv_div_fp16, shl_gref_div, + shl_rvv_div_cap); +#endif +#ifndef CONFIG_THEAD_RVV_DIV_INT8_DISABLED + shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_DIV, NULL, shl_rvv_div_int8, shl_gref_div, + shl_rvv_div_cap); +#endif #ifndef CONFIG_THEAD_RVV_CONCAT_FP32_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CONCAT, NULL, shl_rvv_concat_fp32, shl_gref_concat, shl_rvv_concat_cap); @@ -198,6 +234,10 @@ void __attribute__((weak)) shl_target_init_rvv() shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_RELU6, NULL, shl_rvv_relu6_fp16, shl_gref_relu6, shl_rvv_relu6_cap); #endif +#ifndef CONFIG_THEAD_RVV_RELU6_INT8_DISABLED + shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_RELU6, NULL, shl_rvv_relu6_int8, shl_gref_relu6, + shl_rvv_relu6_cap); +#endif #ifndef CONFIG_THEAD_RVV_GLOBAL_AVERAGEPOOL_FP32_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_GLOBAL_AVGPOOL2D, shl_rvv_global_avgpool2d_init_fp32, NULL, shl_gref_global_avgpool2d, @@ -246,6 +286,10 @@ void __attribute__((weak)) shl_target_init_rvv() shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SIGMOID, NULL, shl_rvv_sigmoid_fp16, shl_gref_sigmoid, shl_rvv_sigmoid_cap); #endif +#ifndef CONFIG_THEAD_RVV_SIGMOID_INT8_DISABLED + shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_SIGMOID, NULL, shl_rvv_sigmoid_int8, shl_gref_sigmoid, + shl_rvv_sigmoid_cap); +#endif #ifndef 
CONFIG_THEAD_RVV_SOFTMAX_FP32_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SOFTMAX, NULL, shl_rvv_softmax_fp32, shl_gref_softmax, shl_rvv_softmax_cap); @@ -254,6 +298,10 @@ void __attribute__((weak)) shl_target_init_rvv() shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SOFTMAX, NULL, shl_rvv_softmax_fp16, shl_gref_softmax, shl_rvv_softmax_cap); #endif +#ifndef CONFIG_THEAD_RVV_SOFTMAX_INT8_DISABLED + shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_SOFTMAX, NULL, shl_rvv_softmax_int8, shl_gref_softmax, + shl_rvv_softmax_cap); +#endif #ifndef CONFIG_THEAD_RVV_REDUCE_SUM_INT8_DISABLED shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_REDUCE_SUM, NULL, shl_rvv_reduce_sum_int8, shl_gref_reduce_sum, shl_rvv_reduce_sum_cap); @@ -279,8 +327,8 @@ void __attribute__((weak)) shl_target_init_rvv() shl_gref_layer_norm, shl_rvv_layer_norm_cap); #endif #ifndef CONFIG_THEAD_RVV_LAYER_NORM_INT8_DISABLED - // shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_LAYER_NORM, NULL, shl_rvv_layer_norm_int8, - // shl_gref_layer_norm, shl_rvv_layer_norm_cap); + shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_LAYER_NORM, NULL, shl_rvv_layer_norm_int8, + shl_gref_layer_norm, shl_rvv_layer_norm_cap); #endif #ifndef CONFIG_THEAD_RVV_CLIP_FP32_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CLIP, NULL, shl_rvv_clip_fp32, shl_gref_clip, @@ -294,10 +342,23 @@ void __attribute__((weak)) shl_target_init_rvv() shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_CLIP, NULL, shl_rvv_clip_int8, shl_gref_clip, shl_rvv_clip_cap); #endif + +#ifndef CONFIG_THEAD_RVV_CONVOLUTION1D_FP32_DISABLED + shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CONV1D, shl_rvv_conv1d_init_fp32, NULL, + shl_gref_conv1d, shl_rvv_conv1d_cap); +#endif +#ifndef CONFIG_THEAD_RVV_CONVOLUTION1D_FP16_DISABLED + shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CONV1D, shl_rvv_conv1d_init_fp16, NULL, + shl_gref_conv1d, shl_rvv_conv1d_cap); +#endif #ifndef CONFIG_THEAD_RVV_CONVOLUTION1D_INT8_DISABLED shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_CONV1D, 
shl_rvv_conv1d_init_int8, NULL, shl_gref_conv1d, shl_rvv_conv1d_cap); #endif +#ifndef CONFIG_THEAD_RVV_DEPTHWISE_CONVOLUTION1D_INT8_DISABLED + shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_DEPTHWISE_CONV1D, shl_rvv_conv1d_init_int8, NULL, + shl_gref_depthwise_conv1d, shl_rvv_conv1d_cap); +#endif #ifndef CONFIG_THEAD_RVV_CONVOLUTION_INT8_DISABLED shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_CONV2D, shl_rvv_conv2d_init_int8, NULL, shl_gref_conv2d, shl_rvv_conv2d_cap); @@ -331,8 +392,12 @@ void __attribute__((weak)) shl_target_init_rvv() shl_gref_matmul, shl_rvv_matmul_cap); #endif #ifndef CONFIG_THEAD_RVV_MATMUL_INT8_DISABLED - shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_MATMUL, NULL, shl_rvv_matmul_int8, shl_gref_matmul, - shl_rvv_matmul_cap); + shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_MATMUL, shl_rvv_matmul_init_int8, NULL, + shl_gref_matmul, shl_rvv_matmul_cap); +#endif +#ifndef CONFIG_THEAD_RVV_GATHER_FP32_DISABLED + shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_GATHER, NULL, shl_rvv_gather_fp32, shl_gref_gather, + shl_rvv_gather_cap); #endif #ifndef CONFIG_THEAD_RVV_GATHER_FP16_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_GATHER, NULL, shl_rvv_gather_fp16, shl_gref_gather, @@ -346,6 +411,18 @@ void __attribute__((weak)) shl_target_init_rvv() shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_STRIDED_SLICE, NULL, shl_rvv_strided_slice_fp16, shl_gref_strided_slice, NULL); #endif +#ifndef CONFIG_THEAD_RVV_ERF_FP32_DISABLED + shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_ERF, NULL, shl_rvv_erf_fp32, shl_gref_erf, + shl_rvv_erf_cap); +#endif +#ifndef CONFIG_THEAD_RVV_ERF_FP16_DISABLED + shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_ERF, NULL, shl_rvv_erf_fp16, shl_gref_erf, + shl_rvv_erf_cap); +#endif +#ifndef CONFIG_THEAD_RVV_ERF_INT8_DISABLED + shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_ERF, NULL, shl_rvv_erf_int8, shl_gref_erf, + shl_rvv_erf_cap); +#endif #ifdef SHL_USE_DOT_INT4 #ifndef CONFIG_THEAD_RVV_CONVOLUTION_INT4_DISABLED diff --git a/source/thead_rvv/utils.c 
b/source/thead_rvv/utils.c index e345db5c..ff43827f 100644 --- a/source/thead_rvv/utils.c +++ b/source/thead_rvv/utils.c @@ -16,7 +16,7 @@ * limitations under the License. */ -#include "shl_thead_rvv.h" +#include "rvv/rvv.h" int csrr_vl() { @@ -387,20 +387,52 @@ void shl_rvv_tensor_nc1xc0_to_ndarray_inplace_int8(struct csinn_tensor *t) } /********************* for fp16 quantization *********************/ +// for requantization, different scales are also suitable for quantization +void shl_rvv_requantize_fp16(__fp16 *src, __fp16 scale, int size) +{ + while (size > 0) { + int vl = vsetvl_e16m4(size); + vfloat16m4_t _val = vle16_v_f16m4(src, vl); + _val = vfmul_vf_f16m4(_val, scale, vl); + vse16_v_f16m4(src, _val, vl); + src += vl; + size -= vl; + } +} void shl_rvv_sidcso_op_requantize_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel) { + float s1 = input->qinfo->scale; + float s2 = kernel->qinfo->scale; + float s3 = output->qinfo->scale; + + if (fabs(s1 - 1) > FLT_EPSILON || fabs(s2 - 1) > FLT_EPSILON || fabs(s3 - 1) > FLT_EPSILON) { + shl_debug_info("fp16 quantization of sidcso op\n"); + shl_rvv_requantize_fp16(output->data, s1 * s2 / s3, csinn_tensor_size(output)); + } } /* linear calculations ops, such as relu, leaky_relu, prelu, etc. 
*/ void shl_rvv_siso_op_requantize_fp16(struct csinn_tensor *input, struct csinn_tensor *output) { + float s1 = input->qinfo->scale; + float s2 = output->qinfo->scale; + if (fabs(s1 - s2) > FLT_EPSILON) { + shl_debug_info("fp16 quantization of siso op\n"); + shl_rvv_requantize_fp16(output->data, s1 / s2, csinn_tensor_size(output)); + } } void shl_rvv_diso_op_requantize_fp16(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output) { + float s1 = input0->qinfo->scale; + float s2 = input1->qinfo->scale; + float s3 = output->qinfo->scale; + if (fabs(s1 - 1) > FLT_EPSILON || fabs(s2 - 1) > FLT_EPSILON || fabs(s3 - 1) > FLT_EPSILON) { + shl_debug_error("unsupport fp16 quantization of diso op\n"); + } } /********************* for int8 quantization *********************/ @@ -462,6 +494,14 @@ void shl_rvv_dequantize_i8_to_f16(int8_t *src, __fp16 *dst, int size, int32_t zp } } +vfloat16m2_t shl_rvv_vdeq_vv_f16m2(vint8m1_t _i8, vint8m1_t _z, vfloat16m2_t _s, int vl) +{ + vint16m2_t _i16 = vwsub_vv_i16m2(_i8, _z, vl); + vfloat16m2_t _f16 = vfcvt_f_x_v_f16m2(_i16, vl); + _f16 = vfmul_vv_f16m2(_f16, _s, vl); + return _f16; +} + /********************* int4 easter eggs *********************/ void shl_rvv_pad_input_int4_trans_int8(const int8_t *input, int8_t *input_padded, int inc, int inh, int inw, int padded_h, int padded_w, int pad_top, @@ -674,19 +714,380 @@ int shl_rvv_avgpool_get_window_size(struct csinn_pool_params *params, int idx_h_ return window_size; } +/********************************************************************* + * (q16 - z2) * s2 = (q8 - z1) * s1 + * q16 = s1/s2 * (q8 - z1) + z2 + ********************************************************************/ +void shl_rvv_u8_to_i16(const uint8_t *input, int16_t *output, int32_t z1, float *s1, int32_t z2, + float *s2, uint32_t length) +{ +#ifdef RVV_1_0_0 + asm volatile( + "beqz %6, 2f\n\t" + "flw ft0, (%4)\n\t" + "flw ft1, (%5)\n\t" + "fdiv.s ft0, ft0, ft1\n\t" // s1/s2 + "fcvt.h.s fa0, 
ft0\n\t" + + "1:\n\t" + "vsetvli t0, %6, e8, m2\n\t" + "slli t1, t0, 1\n\t" + "vle8.v v0, (%0)\n\t" + "add %0, %0, t0\n\t" + + "vwaddu.vx v4, v0, zero\n\t" // u8 -> u16 + "vsetvli t0, %6, e16, m4\n\t" + "vsub.vx v4, v4, %2\n\t" // -= z1 + "vfcvt.f.x.v v8, v4\n\t" // i16 -> f16 + "vfmul.vf v8, v8, fa0\n\t" // *= s1/s2 + "vfcvt.x.f.v v4, v8\n\t" // f16 -> i16 + "vadd.vx v4, v4, %3\n\t" // += z2 + + "vse16.v v4, (%1)\n\t" + "add %1, %1, t1\n\t" + "sub %6, %6, t0\n\t" + "bgtz %6, 1b\n\t" + + "2:\n\t" + + : "=r"(input), // %0 + "=r"(output), // %1 + "=r"(z1), // %2 + "=r"(z2), // %3 + "=r"(s1), // %4 + "=r"(s2), // %5 + "=r"(length) // %6 + : "0"(input), "1"(output), "2"(z1), "3"(z2), "4"(s1), "5"(s2), "6"(length) + : "v0", "v1", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "t0", "t1", "t2", "ft0", + "ft1", "fa0"); +#elif defined RVV_0_7_1 + asm volatile( + "beqz %6, 2f\n\t" + "flw ft0, (%4)\n\t" + "flw ft1, (%5)\n\t" + "fdiv.s ft0, ft0, ft1\n\t" // s1/s2 + "fcvt.h.s fa0, ft0\n\t" + + "1:\n\t" + "vsetvli t0, %6, e8, m2\n\t" + "slli t1, t0, 1\n\t" + "vle.v v0, (%0)\n\t" + "add %0, %0, t0\n\t" + + "vwaddu.vx v4, v0, zero\n\t" // u8->u16 + "vsetvli t0, %6, e16, m4\n\t" + "vsub.vx v4, v4, %2\n\t" // -= z1 + "vfcvt.f.x.v v8, v4\n\t" // i16 -> f16 + "vfmul.vf v8, v8, fa0\n\t" // *= s1/s2 + "vfcvt.x.f.v v4, v8\n\t" // f16 -> i16 + "vadd.vx v4, v4, %3\n\t" // += z2 + + "vse.v v4, (%1)\n\t" + "add %1, %1, t1\n\t" + "sub %6, %6, t0\n\t" + "bgtz %6, 1b\n\t" + + "2:\n\t" + + : "=r"(input), // %0 + "=r"(output), // %1 + "=r"(z1), // %2 + "=r"(z2), // %3 + "=r"(s1), // %4 + "=r"(s2), // %5 + "=r"(length) // %6 + : "0"(input), "1"(output), "2"(z1), "3"(z2), "4"(s1), "5"(s2), "6"(length) + : "v0", "v1", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "t0", "t1", "t2", "ft0", + "ft1", "fa0"); +#endif +} + +/********************************************************************* + * (q8 - z2) * s2 = (q16 - z1) * s1 + * q(8) = s1/s2 * (q16 - z1) + z2 + 
********************************************************************/ +void shl_rvv_i16_to_u8(const int16_t *input, uint8_t *output, int32_t z1, float *s1, int32_t z2, + float *s2, uint32_t length) +{ +#ifdef RVV_1_0_0 + asm volatile( + "beqz %6, 2f\n\t" + "flw ft0, (%4)\n\t" + "flw ft1, (%5)\n\t" + "fdiv.s ft0, ft0, ft1\n\t" // s1/s2 + "fcvt.h.s fa0, ft0\n\t" + + "1:\n\t" + "vsetvli t0, %6, e16, m4\n\t" + "slli t1, t0, 1\n\t" + "vle16.v v4, (%0)\n\t" + "add %0, %0, t1\n\t" + + "vsub.vx v4, v4, %2\n\t" // -= z1 + "vfcvt.f.x.v v8, v4\n\t" // i16 -> f16 + "vfmul.vf v8, v8, fa0\n\t" // *= s1/s2 + "vfcvt.x.f.v v4, v8\n\t" // f16 -> i16 + "vadd.vx v4, v4, %3\n\t" // += z2 + "vmax.vx v4, v4, zero\n\t" + "vsetvli t0, %6, e8, m2\n\t" + "vnclipu.wi v0, v4, 0\n\t" // i16 -> u8 + + "vse8.v v0, (%1)\n\t" + "add %1, %1, t0\n\t" + "sub %6, %6, t0\n\t" + "bgtz %6, 1b\n\t" + + "2:\n\t" + + : "=r"(input), // %0 + "=r"(output), // %1 + "=r"(z1), // %2 + "=r"(z2), // %3 + "=r"(s1), // %4 + "=r"(s2), // %5 + "=r"(length) // %6 + : "0"(input), "1"(output), "2"(z1), "3"(z2), "4"(s1), "5"(s2), "6"(length) + : "v0", "v1", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "t0", "t1", "t2", "ft0", + "ft1", "fa0"); +#elif defined RVV_0_7_1 + asm volatile( + "beqz %6, 2f\n\t" + "flw ft0, (%4)\n\t" + "flw ft1, (%5)\n\t" + "fdiv.s ft0, ft0, ft1\n\t" // s1/s2 + "fcvt.h.s fa0, ft0\n\t" + + "1:\n\t" + "vsetvli t0, %6, e16, m4\n\t" + "slli t1, t0, 1\n\t" + "vle.v v4, (%0)\n\t" + "add %0, %0, t1\n\t" + + "vsub.vx v4, v4, %2\n\t" // -= z1 + "vfcvt.f.x.v v8, v4\n\t" // i16 -> f16 + "vfmul.vf v8, v8, fa0\n\t" // *= s1/s2 + "vfcvt.x.f.v v4, v8\n\t" // f16 -> i16 + "vadd.vx v4, v4, %3\n\t" // += z2 + "vmax.vx v4, v4, zero\n\t" + "vsetvli t0, %6, e8, m2\n\t" + "vnclipu.vi v0, v4, 0\n\t" // u16(i16)->u8 + + "vse.v v0, (%1)\n\t" + "add %1, %1, t0\n\t" + "sub %6, %6, t0\n\t" + "bgtz %6, 1b\n\t" + + "2:\n\t" + + : "=r"(input), // %0 + "=r"(output), // %1 + "=r"(z1), // %2 + "=r"(z2), // %3 + "=r"(s1), // %4 + 
"=r"(s2), // %5 + "=r"(length) // %6 + : "0"(input), "1"(output), "2"(z1), "3"(z2), "4"(s1), "5"(s2), "6"(length) + : "v0", "v1", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "t0", "t1", "t2", "ft0", + "ft1", "fa0"); +#endif +} + +/********************************************************************* + * r = (q - z) * s + ********************************************************************/ +void shl_rvv_u8_to_f32(const uint8_t *input, float *output, int32_t offset, float *scale, + uint32_t length) +{ +#ifdef RVV_1_0_0 + asm volatile( + "beqz %4, 2f\n\t" + "flw fa0, (%3)\n\t" + + "1:\n\t" + "vsetvli t0, %4, e8, m1\n\t" + "slli t1, t0, 2\n\t" + "vle8.v v0, (%0)\n\t" + "add %0, %0, t0\n\t" + + "vwaddu.vx v2, v0, zero\n\t" // u8 -> u16 + "vsetvli t0, %4, e16, m2\n\t" + "vwsub.vx v4, v2, %2\n\t" // i16(u16) - z -> i32 + "vsetvli t0, %4, e32, m4\n\t" + "vfcvt.f.x.v v8, v4\n\t" // i32 -> f32 + "vfmul.vf v4, v8, fa0\n\t" // *= scale + "vse32.v v4, (%1)\n\t" + "add %1, %1, t1\n\t" + + "sub %4, %4, t0\n\t" + "bgtz %4, 1b\n\t" + + "2:\n\t" + + : "=r"(input), // %0 + "=r"(output), // %1 + "=r"(offset), // %2 + "=r"(scale), // %3 + "=r"(length) // %4 + : "0"(input), "1"(output), "2"(offset), "3"(scale), "4"(length) + : "v0", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "fa0", "t0", "t1"); +#elif defined RVV_0_7_1 + asm volatile( + "beqz %4, 2f\n\t" + "flw fa0, (%3)\n\t" + + "1:\n\t" + "vsetvli t0, %4, e8, m1\n\t" + "slli t1, t0, 2\n\t" + "vle.v v0, (%0)\n\t" + "add %0, %0, t0\n\t" + + "vwaddu.vx v2, v0, zero\n\t" // u8 -> u16 + "vsetvli t0, %4, e16, m2\n\t" + "vwsub.vx v4, v2, %2\n\t" // i16(u16) - z -> i32 + "vsetvli t0, %4, e32, m4\n\t" + "vfcvt.f.x.v v8, v4\n\t" // i32 -> f32 + "vfmul.vf v4, v8, fa0\n\t" // *= scale + "vse.v v4, (%1)\n\t" + "add %1, %1, t1\n\t" + + "sub %4, %4, t0\n\t" + "bgtz %4, 1b\n\t" + + "2:\n\t" + + : "=r"(input), // %0 + "=r"(output), // %1 + "=r"(offset), // %2 + "=r"(scale), // %3 + "=r"(length) // %4 + : "0"(input), 
"1"(output), "2"(offset), "3"(scale), "4"(length) + : "v0", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "fa0", "t0", "t1"); +#endif +} + +/********************************************************************* + * q = nearbyint(r/s) + z + ********************************************************************/ +void shl_rvv_f32_to_u8(const float *input, uint8_t *output, int32_t offset, float *scale, + uint32_t length) +{ +#ifdef RVV_1_0_0 + asm volatile( + "beqz %4, 2f\n\t" + "flw fa0, (%3)\n\t" + + "1:\n\t" + "vsetvli t0, %4, e32, m4\n\t" + "slli t1, t0, 2\n\t" + "vle32.v v0, (%0)\n\t" + "add %0, %0, t1\n\t" + + "vfdiv.vf v4, v0, fa0\n\t" // /= scale + "vfcvt.x.f.v v8, v4\n\t" // f32 -> i32 + "vadd.vx v8, v8, %2\n\t" // += z + "vmax.vx v8, v8, zero\n\t" + "vsetvli t0, %4, e16, m2\n\t" + "vnclipu.wi v2, v8, 0\n\t" // u32(i32) -> u16 + "vsetvli t0, %4, e8, m1\n\t" + "vnclipu.wi v0, v2, 0\n\t" // u16 -> u8 + + "vse8.v v0, (%1)\n\t" + "add %1, %1, t1\n\t" + "sub %4, %4, t0\n\t" + "bgtz %4, 1b\n\t" + + "2:\n\t" + + : "=r"(input), // %0 + "=r"(output), // %1 + "=r"(offset), // %2 + "=r"(scale), // %3 + "=r"(length) // %4 + : "0"(input), "1"(output), "2"(offset), "3"(scale), "4"(length) + : "v0", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "fa0", "t0", "t1"); +#elif defined RVV_0_7_1 + asm volatile( + "beqz %4, 2f\n\t" + "flw fa0, (%3)\n\t" + + "1:\n\t" + "vsetvli t0, %4, e32, m4\n\t" + "slli t1, t0, 2\n\t" + "vle.v v0, (%0)\n\t" + "add %0, %0, t1\n\t" + + "vfdiv.vf v4, v0, fa0\n\t" // /= scale + "vfcvt.x.f.v v8, v4\n\t" // f32 -> i32 + "vadd.vx v8, v8, %2\n\t" // += z + "vmax.vx v8, v8, zero\n\t" + "vsetvli t0, %4, e16, m2\n\t" + "vnclipu.vi v2, v8, 0\n\t" // u32(i32) -> u16 + "vsetvli t0, %4, e8, m1\n\t" + "vnclipu.vi v0, v2, 0\n\t" // u16 -> u8 + + "vse.v v0, (%1)\n\t" + "add %1, %1, t1\n\t" + "sub %4, %4, t0\n\t" + "bgtz %4, 1b\n\t" + + "2:\n\t" + + : "=r"(input), // %0 + "=r"(output), // %1 + "=r"(offset), // %2 + "=r"(scale), // %3 + 
"=r"(length) // %4 + : "0"(input), "1"(output), "2"(offset), "3"(scale), "4"(length) + : "v0", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "fa0", "t0", "t1"); +#endif +} + +void shl_rvv_i8_to_f32(const int8_t *input, float *output, int32_t offset, float *scale, + uint32_t length) +{ + while (length > 0) { + int vl = vsetvl_e8m1(length); + vint8m1_t _i8 = vle8_v_i8m1(input, vl); + input += vl; + vint16m2_t _i16 = vwadd_vx_i16m2(_i8, 0, vl); + vint32m4_t _i32 = vwsub_vx_i32m4(_i16, offset, vl); + vfloat32m4_t _f32 = vfcvt_f_x_v_f32m4(_i32, vl); + _f32 = vfmul_vf_f32m4(_f32, *scale, vl); + vse32_v_f32m4(output, _f32, vl); + output += vl; + length -= vl; + } +} + +void shl_rvv_f32_to_i8(const float *input, int8_t *output, int32_t offset, float *scale, + uint32_t length) +{ + float _1_s = 1 / *scale; + while (length > 0) { + int vl = vsetvl_e32m4(length); + vfloat32m4_t _in = vle32_v_f32m4(input, vl); + input += vl; + vfloat32m4_t _f32 = vfmul_vf_f32m4(_in, _1_s, vl); + vint32m4_t _i32 = vfcvt_x_f_v_i32m4(_f32, vl); + _i32 = vadd_vx_i32m4(_i32, offset, vl); + vint16m2_t _i16 = vnclip_wx_i16m2(_i32, 0, vl); + vint8m1_t _i8 = vnclip_wx_i8m1(_i16, 0, vl); + vse8_v_i8m1(output, _i8, vl); + output += vl; + length -= vl; + } +} + void shl_rvv_i16_to_f32(const int16_t *input, float *output, int32_t offset, float *scale, uint32_t length) { - int vl = vsetvl_e32m4(length); - vint16m2_t _z = vmv_v_x_i16m2(offset, vl); - vfloat32m4_t _s = vfmv_v_f_f32m4(*scale, vl); while (length > 0) { - vl = vsetvl_e16m2(length); + int vl = vsetvl_e16m2(length); vint16m2_t _in = vle16_v_i16m2(input, vl); input += vl; - vint32m4_t _i32 = vwsub_vv_i32m4(_in, _z, vl); + vint32m4_t _i32 = vwsub_vx_i32m4(_in, offset, vl); vfloat32m4_t _f32 = vfcvt_f_x_v_f32m4(_i32, vl); - _f32 = vfmul_vv_f32m4(_f32, _s, vl); + _f32 = vfmul_vf_f32m4(_f32, *scale, vl); vse32_v_f32m4(output, _f32, vl); output += vl; length -= vl; @@ -696,16 +1097,14 @@ void shl_rvv_i16_to_f32(const int16_t *input, 
float *output, int32_t offset, flo void shl_rvv_f32_to_i16(const float *input, int16_t *output, int32_t offset, float *scale, uint32_t length) { - int vl = vsetvl_e32m4(length); - vint32m4_t _z = vmv_v_x_i32m4(offset, vl); - vfloat32m4_t _1_s = vfmv_v_f_f32m4(1 / *scale, vl); + float _1_s = 1 / *scale; while (length > 0) { - vl = vsetvl_e16m2(length); + int vl = vsetvl_e16m2(length); vfloat32m4_t _in = vle32_v_f32m4(input, vl); input += vl; - vfloat32m4_t _f32 = vfmul_vv_f32m4(_in, _1_s, vl); + vfloat32m4_t _f32 = vfmul_vf_f32m4(_in, _1_s, vl); vint32m4_t _i32 = vfcvt_x_f_v_i32m4(_f32, vl); - _i32 = vadd_vv_i32m4(_i32, _z, vl); + _i32 = vadd_vx_i32m4(_i32, offset, vl); vint16m2_t _i16 = vnclip_wx_i16m2(_i32, 0, vl); vse16_v_i16m2(output, _i16, vl); output += vl; @@ -774,23 +1173,21 @@ void shl_rvv_f32_to_i64(const float *input, int64_t *output, uint32_t length) void shl_rvv_f16_to_f32(const __fp16 *input, float *output, float *scale, uint32_t length) { - int vl = vsetvl_e32m4(length); if (fabs(*scale - 1) > FLT_EPSILON) { - vfloat32m4_t _s = vfmv_v_f_f32m4(*scale, vl); while (length > 0) { - vl = vsetvl_e16m2(length); + int vl = vsetvl_e16m2(length); vfloat16m2_t _f16 = vle16_v_f16m2(input, vl); input += vl; vfloat32m4_t _f32 = vfwcvt_f_f_v_f32m4(_f16, vl); // dequantize - _f32 = vfmul_vv_f32m4(_f32, _s, vl); + _f32 = vfmul_vf_f32m4(_f32, *scale, vl); vse32_v_f32m4(output, _f32, vl); output += vl; length -= vl; } } else { while (length > 0) { - vl = vsetvl_e16m2(length); + int vl = vsetvl_e16m2(length); vfloat16m2_t _f16 = vle16_v_f16m2(input, vl); input += vl; vfloat32m4_t _f32 = vfwcvt_f_f_v_f32m4(_f16, vl); @@ -803,15 +1200,14 @@ void shl_rvv_f16_to_f32(const __fp16 *input, float *output, float *scale, uint32 void shl_rvv_f32_to_f16(const float *input, __fp16 *output, float *scale, uint32_t length) { - int vl = vsetvl_e32m4(length); if (fabs(*scale - 1) > FLT_EPSILON) { - vfloat32m4_t _1_s = vfmv_v_f_f32m4(1 / *scale, vl); + float _1_s = 1 / *scale; while 
(length > 0) { - vl = vsetvl_e32m4(length); + int vl = vsetvl_e32m4(length); vfloat32m4_t _f32 = vle32_v_f32m4(input, vl); input += vl; // quantize - _f32 = vfmul_vv_f32m4(_f32, _1_s, vl); + _f32 = vfmul_vf_f32m4(_f32, _1_s, vl); vfloat16m2_t _f16 = vfncvt_f_f_w_f16m2(_f32, vl); vse16_v_f16m2(output, _f16, vl); output += vl; @@ -819,7 +1215,7 @@ void shl_rvv_f32_to_f16(const float *input, __fp16 *output, float *scale, uint32 } } else { while (length > 0) { - vl = vsetvl_e32m4(length); + int vl = vsetvl_e32m4(length); vfloat32m4_t _f32 = vle32_v_f32m4(input, vl); input += vl; vfloat16m2_t _f16 = vfncvt_f_f_w_f16m2(_f32, vl); @@ -916,3 +1312,572 @@ int shl_rvv_transpose_get_out_index(int32_t *dim, int32_t *idx, int32_t *permute } return res; } + +static int rvv_tensor_dtype_convert(struct csinn_tensor *src, struct csinn_tensor *dst) +{ + if (dst->quant_channel > 1 || src->quant_channel > 1) { + shl_debug_error("Unsupported channel quantization!\n"); + return CSINN_FALSE; + } + + if (src->dtype == CSINN_DTYPE_FLOAT32 && dst->dtype == CSINN_DTYPE_UINT8) { + memcpy(dst->data, src->data, csinn_tensor_byte_size(dst)); + return CSINN_TRUE; + } + + uint32_t size = csinn_tensor_size(dst); + if (dst->dtype == CSINN_DTYPE_FLOAT32) { + float scale = src->qinfo->scale; + int32_t zero_point = src->qinfo->zero_point; + if (src->dtype == CSINN_DTYPE_UINT8) { + shl_rvv_u8_to_f32(src->data, dst->data, zero_point, &scale, size); + } else if (src->dtype == CSINN_DTYPE_INT8) { + shl_rvv_i8_to_f32(src->data, dst->data, zero_point, &scale, size); + } else if (src->dtype == CSINN_DTYPE_INT16) { + shl_rvv_i16_to_f32(src->data, dst->data, zero_point, &scale, size); + } else if (src->dtype == CSINN_DTYPE_INT32) { + shl_rvv_i32_to_f32(src->data, dst->data, zero_point, &scale, size); + } else if (src->dtype == CSINN_DTYPE_INT64) { + shl_rvv_i64_to_f32(src->data, dst->data, size); + } else if (src->dtype == CSINN_DTYPE_FLOAT16) { + shl_rvv_f16_to_f32(src->data, dst->data, &scale, size); + } else 
{ + shl_debug_error("Unsupported convert dtype from %d to %d\n", src->dtype, dst->dtype); + return CSINN_UNSUPPORT_DTYPE; + } + } else if (src->dtype == CSINN_DTYPE_FLOAT32) { + float scale = dst->qinfo->scale; + int32_t zero_point = dst->qinfo->zero_point; + if (dst->dtype == CSINN_DTYPE_UINT8) { + shl_rvv_f32_to_u8(src->data, dst->data, zero_point, &scale, size); + } else if (dst->dtype == CSINN_DTYPE_INT8) { + shl_rvv_f32_to_i8(src->data, dst->data, zero_point, &scale, size); + } else if (dst->dtype == CSINN_DTYPE_INT16) { + shl_rvv_f32_to_i16(src->data, dst->data, zero_point, &scale, size); + } else if (dst->dtype == CSINN_DTYPE_INT32) { + shl_rvv_f32_to_i32(src->data, dst->data, zero_point, &scale, size); + } else if (dst->dtype == CSINN_DTYPE_INT64) { + shl_rvv_f32_to_i64(src->data, dst->data, size); + } else if (dst->dtype == CSINN_DTYPE_FLOAT16) { + shl_rvv_f32_to_f16(src->data, dst->data, &scale, size); + } else { + shl_debug_error("Unsupported convert dtype from %d to %d\n", src->dtype, dst->dtype); + return CSINN_UNSUPPORT_DTYPE; + } + } else if (src->dtype == CSINN_DTYPE_UINT8 && dst->dtype == CSINN_DTYPE_INT16) { + shl_rvv_u8_to_i16(src->data, dst->data, src->qinfo->zero_point, &src->qinfo->scale, + dst->qinfo->zero_point, &dst->qinfo->scale, size); + } else if (src->dtype == CSINN_DTYPE_INT16 && dst->dtype == CSINN_DTYPE_UINT8) { + shl_rvv_i16_to_u8(src->data, dst->data, src->qinfo->zero_point, &src->qinfo->scale, + dst->qinfo->zero_point, &dst->qinfo->scale, size); + } else { + shl_debug_error("Unsupported convert dtype from %d to %d\n", src->dtype, dst->dtype); + return CSINN_UNSUPPORT_DTYPE; + } + + return CSINN_TRUE; +} + +static void rvv_ncx_to_nc1xc0_fp32(struct csinn_tensor *src, struct csinn_tensor *dst) +{ + int batch = src->dim[0]; + int in_c = src->dim[1]; + int inner_size = 1; + for (int i = 2; i < src->dim_count; i++) { + inner_size *= src->dim[i]; + } + + float *src_data = src->data; + float *dst_data = dst->data; + + const int packn = 
csrr_vlenb() / sizeof(float); + int vl = vsetvl_e32m1(packn); + int batch_size = in_c * inner_size; + + float *out_ptr = dst_data; + for (int b = 0; b < batch; b++) { + for (int c = 0; c + packn - 1 < in_c; c += packn) { + float *in_ptr = src_data + b * batch_size + c * inner_size; + for (int i = 0; i < inner_size; i++) { + vfloat32m1_t _tmp = vlse32_v_f32m1(in_ptr, inner_size * sizeof(float), vl); + in_ptr++; + vse32_v_f32m1(out_ptr, _tmp, vl); + out_ptr += vl; + } + } + } +} + +static void rvv_ncx_to_nc1xc0_fp16(struct csinn_tensor *src, struct csinn_tensor *dst) +{ + int batch = src->dim[0]; + int in_c = src->dim[1]; + int inner_size = 1; + for (int i = 2; i < src->dim_count; i++) { + inner_size *= src->dim[i]; + } + + __fp16 *src_data = src->data; + __fp16 *dst_data = dst->data; + + const int packn = csrr_vlenb() / sizeof(__fp16); + int vl = vsetvl_e16m1(packn); + int batch_size = in_c * inner_size; + + __fp16 *out_ptr = dst_data; + for (int b = 0; b < batch; b++) { + for (int c = 0; c + packn - 1 < in_c; c += packn) { + __fp16 *in_ptr = src_data + b * batch_size + c * inner_size; + for (int i = 0; i < inner_size; i++) { + vfloat16m1_t _tmp = vlse16_v_f16m1(in_ptr, inner_size * sizeof(__fp16), vl); + in_ptr++; + vse16_v_f16m1(out_ptr, _tmp, vl); + out_ptr += vl; + } + } + } +} + +static void rvv_ncx_to_nc1xc0_int8(struct csinn_tensor *src, struct csinn_tensor *dst) +{ + int batch = src->dim[0]; + int in_c = src->dim[1]; + int inner_size = 1; + for (int i = 2; i < src->dim_count; i++) { + inner_size *= src->dim[i]; + } + + int8_t *src_data = src->data; + int8_t *dst_data = dst->data; + + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + int vl = vsetvl_e8m1(packn); + int batch_size = in_c * inner_size; + + int8_t *out_ptr = dst_data; + for (int b = 0; b < batch; b++) { + for (int c = 0; c + packn - 1 < in_c; c += packn) { + int8_t *in_ptr = src_data + b * batch_size + c * inner_size; + for (int i = 0; i < inner_size; i++) { + vint8m1_t _tmp = 
vlse8_v_i8m1(in_ptr, inner_size * sizeof(int8_t), vl); + in_ptr++; + vse8_v_i8m1(out_ptr, _tmp, vl); + out_ptr += vl; + } + } + } +} + +static void rvv_nc1xc0_to_ncx_fp32(struct csinn_tensor *src, struct csinn_tensor *dst) +{ + int batch = src->dim[0]; + int in_c1 = src->dim[1]; + int inner_size = 1; + for (int i = 2; i < src->dim_count - 1; i++) { + inner_size *= src->dim[i]; + } + int in_elempack = src->dim[src->dim_count - 1]; + + float *src_data = src->data; + float *dst_data = dst->data; + + const int packn = csrr_vlenb() / sizeof(float); + int vl = vsetvl_e32m1(packn); + int batch_size = in_c1 * inner_size * in_elempack; + + for (int b = 0; b < batch; b++) { + for (int c = 0; c < in_c1; c++) { + float *out_ptr = dst_data + b * batch_size + c * inner_size * in_elempack; + for (int i = 0; i < inner_size; i++) { + vfloat32m1_t _tmp = vle32_v_f32m1(src_data, vl); + src_data += vl; + vsse32_v_f32m1(out_ptr, inner_size * sizeof(float), _tmp, vl); + out_ptr++; + } + } + } +} + +static void rvv_nc1xc0_to_ncx_fp16(struct csinn_tensor *src, struct csinn_tensor *dst) +{ + int batch = src->dim[0]; + int in_c1 = src->dim[1]; + int inner_size = 1; + for (int i = 2; i < src->dim_count - 1; i++) { + inner_size *= src->dim[i]; + } + int in_elempack = src->dim[src->dim_count - 1]; + + __fp16 *src_data = src->data; + __fp16 *dst_data = dst->data; + + const int packn = csrr_vlenb() / sizeof(__fp16); + int vl = vsetvl_e16m1(packn); + int batch_size = in_c1 * inner_size * in_elempack; + + for (int b = 0; b < batch; b++) { + for (int c = 0; c < in_c1; c++) { + __fp16 *out_ptr = dst_data + b * batch_size + c * inner_size * in_elempack; + for (int i = 0; i < inner_size; i++) { + vfloat16m1_t _tmp = vle16_v_f16m1(src_data, vl); + src_data += vl; + vsse16_v_f16m1(out_ptr, inner_size * sizeof(__fp16), _tmp, vl); + out_ptr++; + } + } + } +} + +static void rvv_nc1xc0_to_ncx_int8(struct csinn_tensor *src, struct csinn_tensor *dst) +{ + int batch = src->dim[0]; + int in_c1 = src->dim[1]; + 
int inner_size = 1; + for (int i = 2; i < src->dim_count - 1; i++) { + inner_size *= src->dim[i]; + } + int in_elempack = src->dim[src->dim_count - 1]; + + int8_t *src_data = src->data; + int8_t *dst_data = dst->data; + + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + int vl = vsetvl_e8m1(packn); + int batch_size = in_c1 * inner_size * in_elempack; + + for (int b = 0; b < batch; b++) { + for (int c = 0; c < in_c1; c++) { + int8_t *out_ptr = dst_data + b * batch_size + c * inner_size * in_elempack; + for (int i = 0; i < inner_size; i++) { + vint8m1_t _tmp = vle8_v_i8m1(src_data, vl); + src_data += vl; + vsse8_v_i8m1(out_ptr, inner_size * sizeof(int8_t), _tmp, vl); + out_ptr++; + } + } + } +} + +static void rvv_ncx_to_nxc_fp32(struct csinn_tensor *src, struct csinn_tensor *dst) +{ + int batch = src->dim[0]; + int outer_size; + int inner_size; + if ((src->layout == CSINN_LAYOUT_NCDHW && dst->layout == CSINN_LAYOUT_NDHWC) || + (src->layout == CSINN_LAYOUT_NCHW && dst->layout == CSINN_LAYOUT_NHWC) || + (src->layout == CSINN_LAYOUT_NCW && dst->layout == CSINN_LAYOUT_NWC)) { + for (int i = 2; i < src->dim_count - 1; i++) { + inner_size *= src->dim[i]; + } + outer_size = src->dim[src->dim_count - 1]; + } else if ((src->layout == CSINN_LAYOUT_NDHWC && dst->layout == CSINN_LAYOUT_NCDHW) || + (src->layout == CSINN_LAYOUT_NHWC && dst->layout == CSINN_LAYOUT_NCHW) || + (src->layout == CSINN_LAYOUT_NWC && dst->layout == CSINN_LAYOUT_NCW)) { + for (int i = 2; i < src->dim_count - 1; i++) { + outer_size *= src->dim[i]; + } + inner_size = src->dim[src->dim_count - 1]; + } + + float *src_data = src->data; + float *dst_data = dst->data; + + for (int b = 0; b < batch; b++) { + for (int i = 0; i < outer_size; i++) { + int size = inner_size; + float *d_ptr = dst_data + i; + while (size > 0) { + int vl = vsetvl_e32m4(size); + vfloat32m4_t _in = vle32_v_f32m4(src_data, vl); + src_data += vl; + vsse32_v_f32m4(d_ptr, outer_size * sizeof(float), _in, vl); + d_ptr += vl * outer_size; 
+ size -= vl; + } + } + dst_data += inner_size * outer_size; + } +} + +static void rvv_ncx_to_nxc_fp16(struct csinn_tensor *src, struct csinn_tensor *dst) +{ + int batch = src->dim[0]; + int outer_size; + int inner_size; + if ((src->layout == CSINN_LAYOUT_NCDHW && dst->layout == CSINN_LAYOUT_NDHWC) || + (src->layout == CSINN_LAYOUT_NCHW && dst->layout == CSINN_LAYOUT_NHWC) || + (src->layout == CSINN_LAYOUT_NCW && dst->layout == CSINN_LAYOUT_NWC)) { + for (int i = 2; i < src->dim_count - 1; i++) { + inner_size *= src->dim[i]; + } + outer_size = src->dim[src->dim_count - 1]; + } else if ((src->layout == CSINN_LAYOUT_NDHWC && dst->layout == CSINN_LAYOUT_NCDHW) || + (src->layout == CSINN_LAYOUT_NHWC && dst->layout == CSINN_LAYOUT_NCHW) || + (src->layout == CSINN_LAYOUT_NWC && dst->layout == CSINN_LAYOUT_NCW)) { + for (int i = 2; i < src->dim_count - 1; i++) { + outer_size *= src->dim[i]; + } + inner_size = src->dim[src->dim_count - 1]; + } + + __fp16 *src_data = src->data; + __fp16 *dst_data = dst->data; + + for (int b = 0; b < batch; b++) { + for (int i = 0; i < outer_size; i++) { + int size = inner_size; + __fp16 *d_ptr = dst_data + i; + while (size > 0) { + int vl = vsetvl_e16m4(size); + vfloat16m4_t _in = vle16_v_f16m4(src_data, vl); + src_data += vl; + vsse16_v_f16m4(d_ptr, outer_size * sizeof(__fp16), _in, vl); + d_ptr += vl * outer_size; + size -= vl; + } + } + dst_data += inner_size * outer_size; + } +} + +static void rvv_ncx_to_nxc_int8(struct csinn_tensor *src, struct csinn_tensor *dst) +{ + int batch = src->dim[0]; + int outer_size; + int inner_size; + if ((src->layout == CSINN_LAYOUT_NCDHW && dst->layout == CSINN_LAYOUT_NDHWC) || + (src->layout == CSINN_LAYOUT_NCHW && dst->layout == CSINN_LAYOUT_NHWC) || + (src->layout == CSINN_LAYOUT_NCW && dst->layout == CSINN_LAYOUT_NWC)) { + for (int i = 2; i < src->dim_count - 1; i++) { + inner_size *= src->dim[i]; + } + outer_size = src->dim[src->dim_count - 1]; + } else if ((src->layout == CSINN_LAYOUT_NDHWC && 
dst->layout == CSINN_LAYOUT_NCDHW) || + (src->layout == CSINN_LAYOUT_NHWC && dst->layout == CSINN_LAYOUT_NCHW) || + (src->layout == CSINN_LAYOUT_NWC && dst->layout == CSINN_LAYOUT_NCW)) { + for (int i = 2; i < src->dim_count - 1; i++) { + outer_size *= src->dim[i]; + } + inner_size = src->dim[src->dim_count - 1]; + } + + int8_t *src_data = src->data; + int8_t *dst_data = dst->data; + + for (int b = 0; b < batch; b++) { + for (int i = 0; i < outer_size; i++) { + int size = inner_size; + int8_t *d_ptr = dst_data + i; + while (size > 0) { + int vl = vsetvl_e8m4(size); + vint8m4_t _in = vle8_v_i8m4(src_data, vl); + src_data += vl; + vsse8_v_i8m4(d_ptr, outer_size * sizeof(int8_t), _in, vl); + d_ptr += vl * outer_size; + size -= vl; + } + } + dst_data += inner_size * outer_size; + } +} + +static int rvv_tensor_layout_convert(struct csinn_tensor *src, struct csinn_tensor *dst) +{ + if ((src->layout == CSINN_LAYOUT_NC1DHWC0 && dst->layout == CSINN_LAYOUT_NCDHW) || + (src->layout == CSINN_LAYOUT_NC1HWC0 && dst->layout == CSINN_LAYOUT_NCHW) || + (src->layout == CSINN_LAYOUT_NC1WC0 && dst->layout == CSINN_LAYOUT_NCW) || + (src->layout == CSINN_LAYOUT_NC1C0 && dst->layout == CSINN_LAYOUT_NC)) { + if (src->dtype == CSINN_DTYPE_FLOAT32) { + rvv_nc1xc0_to_ncx_fp32(src, dst); + } else if (src->dtype == CSINN_DTYPE_FLOAT16) { + rvv_nc1xc0_to_ncx_fp16(src, dst); + } else if (src->dtype == CSINN_DTYPE_INT8) { + rvv_nc1xc0_to_ncx_int8(src, dst); + } else { + shl_debug_error("Unsupported dtype from %d to %d during ndarray_to_nc1xc0 conversion\n", + src->dtype, dst->dtype); + return CSINN_UNSUPPORT_DTYPE; + } + } else if ((src->layout == CSINN_LAYOUT_NCDHW && dst->layout == CSINN_LAYOUT_NC1DHWC0) || + (src->layout == CSINN_LAYOUT_NCHW && dst->layout == CSINN_LAYOUT_NC1HWC0) || + (src->layout == CSINN_LAYOUT_NCW && dst->layout == CSINN_LAYOUT_NC1WC0) || + (src->layout == CSINN_LAYOUT_NC && dst->layout == CSINN_LAYOUT_NC1C0)) { + if (dst->dtype == CSINN_DTYPE_FLOAT32) { + 
rvv_ncx_to_nc1xc0_fp32(src, dst); + } else if (dst->dtype == CSINN_DTYPE_FLOAT16) { + rvv_ncx_to_nc1xc0_fp16(src, dst); + } else if (dst->dtype == CSINN_DTYPE_INT8) { + rvv_ncx_to_nc1xc0_int8(src, dst); + } else { + shl_debug_error("Unsupported dtype from %d to %d during nc1xc0_to_ndarray conversion\n", + src->dtype, dst->dtype); + return CSINN_UNSUPPORT_DTYPE; + } + } else if ((src->layout == CSINN_LAYOUT_NCDHW && dst->layout == CSINN_LAYOUT_NDHWC) || + (src->layout == CSINN_LAYOUT_NCHW && dst->layout == CSINN_LAYOUT_NHWC) || + (src->layout == CSINN_LAYOUT_NCW && dst->layout == CSINN_LAYOUT_NWC) || + (src->layout == CSINN_LAYOUT_NDHWC && dst->layout == CSINN_LAYOUT_NCDHW) || + (src->layout == CSINN_LAYOUT_NHWC && dst->layout == CSINN_LAYOUT_NCHW) || + (src->layout == CSINN_LAYOUT_NWC && dst->layout == CSINN_LAYOUT_NCW)) { + if (dst->dtype == CSINN_DTYPE_FLOAT32) { + rvv_ncx_to_nxc_fp32(src, dst); + } else if (dst->dtype == CSINN_DTYPE_FLOAT16) { + rvv_ncx_to_nxc_fp16(src, dst); + } else if (dst->dtype == CSINN_DTYPE_INT8) { + rvv_ncx_to_nxc_int8(src, dst); + } else { + shl_debug_error("Unsupported dtype from %d to %d during layout conversion\n", + src->dtype, dst->dtype); + return CSINN_UNSUPPORT_DTYPE; + } + } else { + shl_debug_error("Unsupported convert layout from %d to %d\n", src->layout, dst->layout); + return CSINN_UNSUPPORT_LAYOUT; + } + return CSINN_TRUE; +} + +static int rvv_tensor_layout_dtype_convert(struct csinn_tensor *src, struct csinn_tensor *dst) +{ + struct csinn_tensor *tmp = csinn_alloc_tensor(NULL); + csinn_tensor_copy(tmp, src); + tmp->data = shl_mem_alloc(csinn_tensor_byte_size(src)); + + if ((src->layout == CSINN_LAYOUT_NC1DHWC0 && dst->layout == CSINN_LAYOUT_NCDHW) || + (src->layout == CSINN_LAYOUT_NC1HWC0 && dst->layout == CSINN_LAYOUT_NCHW) || + (src->layout == CSINN_LAYOUT_NC1WC0 && dst->layout == CSINN_LAYOUT_NCW) || + (src->layout == CSINN_LAYOUT_NC1C0 && dst->layout == CSINN_LAYOUT_NC)) { + tmp->layout = dst->layout; + tmp->dtype = 
src->dtype; + int ret1 = rvv_tensor_layout_convert(src, tmp); + int ret2 = rvv_tensor_dtype_convert(tmp, dst); + return (ret1 == CSINN_TRUE && ret2 == CSINN_TRUE) ? CSINN_TRUE : CSINN_FALSE; + } else if ((src->layout == CSINN_LAYOUT_NCDHW && dst->layout == CSINN_LAYOUT_NC1DHWC0) || + (src->layout == CSINN_LAYOUT_NCHW && dst->layout == CSINN_LAYOUT_NC1HWC0) || + (src->layout == CSINN_LAYOUT_NCW && dst->layout == CSINN_LAYOUT_NC1WC0) || + (src->layout == CSINN_LAYOUT_NC && dst->layout == CSINN_LAYOUT_NC1C0)) { + tmp->dtype = dst->dtype; + tmp->layout = src->layout; + int ret1 = rvv_tensor_dtype_convert(src, tmp); + int ret2 = rvv_tensor_layout_convert(tmp, dst); + return (ret1 == CSINN_TRUE && ret2 == CSINN_TRUE) ? CSINN_TRUE : CSINN_FALSE; + } else if ((src->layout == CSINN_LAYOUT_NCDHW && dst->layout == CSINN_LAYOUT_NDHWC) || + (src->layout == CSINN_LAYOUT_NCHW && dst->layout == CSINN_LAYOUT_NHWC) || + (src->layout == CSINN_LAYOUT_NCW && dst->layout == CSINN_LAYOUT_NWC) || + (src->layout == CSINN_LAYOUT_NDHWC && dst->layout == CSINN_LAYOUT_NCDHW) || + (src->layout == CSINN_LAYOUT_NHWC && dst->layout == CSINN_LAYOUT_NCHW) || + (src->layout == CSINN_LAYOUT_NWC && dst->layout == CSINN_LAYOUT_NCW)) { + tmp->dtype = dst->dtype; + tmp->layout = src->layout; + int ret1 = rvv_tensor_dtype_convert(src, tmp); + int ret2 = rvv_tensor_layout_convert(tmp, dst); + } else { + shl_debug_error("Unsupported convert layout from %d to %d, dtype from %d to %d\n", + src->layout, dst->layout, src->dtype, dst->dtype); + return CSINN_FALSE; + } + + shl_mem_free(tmp->data); + csinn_free_tensor(tmp); + return CSINN_TRUE; +} + +int shl_rvv_tensor_data_convert(struct csinn_tensor *src, struct csinn_tensor *dst) +{ + if (dst->layout == src->layout && dst->dtype == src->dtype) { + memcpy(dst->data, src->data, csinn_tensor_byte_size(dst)); + return CSINN_TRUE; + } else if (dst->layout == src->layout && dst->dtype != src->dtype) { + return rvv_tensor_dtype_convert(src, dst); + } else if 
(dst->layout != src->layout && dst->dtype == src->dtype) { + return rvv_tensor_layout_convert(src, dst); + } else { + // dst->layout != src->layout && dst->dtype != src->dtype + return rvv_tensor_layout_dtype_convert(src, dst); + } +} + +struct csinn_tensor *shl_rvv_tensor_transform_f32(struct csinn_tensor *input) +{ + struct csinn_tensor *ret = csinn_alloc_tensor(NULL); + csinn_tensor_copy(ret, input); + if (ret->qinfo != NULL) { + shl_mem_free(ret->qinfo); + ret->qinfo = NULL; + } + ret->quant_channel = 0; + ret->dtype = CSINN_DTYPE_FLOAT32; + switch (input->layout) { + case CSINN_LAYOUT_NC1DHWC0: + ret->layout = CSINN_LAYOUT_NCDHW; + ret->dim[1] *= input->dim[5]; + ret->dim[5] = 0; + ret->dim_count = 5; + break; + case CSINN_LAYOUT_NC1HWC0: + ret->layout = CSINN_LAYOUT_NCHW; + ret->dim[1] *= input->dim[4]; + ret->dim[4] = 0; + ret->dim_count = 4; + break; + case CSINN_LAYOUT_NC1WC0: + ret->layout = CSINN_LAYOUT_NCW; + ret->dim[1] *= input->dim[3]; + ret->dim[3] = 0; + ret->dim_count = 3; + break; + case CSINN_LAYOUT_NC1C0: + ret->layout = CSINN_LAYOUT_NC; + ret->dim[1] *= input->dim[2]; + ret->dim[2] = 0; + ret->dim_count = 2; + break; + default: + break; + } + if (ret->dim_count == 0) { + return ret; + } + int input_size = csinn_tensor_size(input); + if (input_size == 0) { + return ret; + } + ret->data = shl_mem_alloc(input_size * sizeof(float)); + if (shl_rvv_tensor_data_convert(input, ret) == CSINN_TRUE) { + return ret; + } else { + shl_mem_free(ret->data); + csinn_free_tensor(ret); + return NULL; + } +} + +int shl_rvv_siso_callback_base(struct csinn_tensor *input, struct csinn_tensor *output, + void *params, void *cb) +{ + int (*callback)() = cb; + struct csinn_tensor *finput = shl_rvv_tensor_transform_f32(input); + struct csinn_tensor *foutput = shl_rvv_tensor_transform_f32(output); + if (finput == NULL) { + shl_debug_warning( + "shl_rvv_tensor_transform_f32 is not optimized to achieve under this condition on RVV, " + "call reference func replaced.\n"); + 
finput = shl_ref_tensor_transform_f32(input); + } + if (foutput == NULL) { + shl_debug_warning( + "shl_rvv_tensor_transform_f32 is not optimized to achieve under this condition on RVV, " + "call reference func replaced.\n"); + foutput = shl_ref_tensor_transform_f32(output); + } + int ret = callback(finput, foutput, params); + if (shl_rvv_tensor_data_convert(foutput, output) != CSINN_TRUE) { + shl_debug_warning( + "shl_rvv_tensor_data_convert is not optimized to achieve under this condition on RVV, " + "call reference func replaced.\n"); + csinn_tensor_data_convert(output, foutput); + } + shl_ref_tensor_transform_free_f32(finput); + shl_ref_tensor_transform_free_f32(foutput); + return ret; +} diff --git a/source/tvm_gen/setup.c b/source/tvm_gen/setup.c index 89db4675..bf9b6634 100644 --- a/source/tvm_gen/setup.c +++ b/source/tvm_gen/setup.c @@ -17,8 +17,8 @@ */ #include "csi_nn.h" -#include "shl_tvmgen.h" #include "shl_utils.h" +#include "tvmgen/shl_tvmgen.h" struct shl_tvmgen_name_func_map { int size; diff --git a/source/tvm_gen/utils.c b/source/tvm_gen/utils.c index 0d2c49a3..1c67dcab 100644 --- a/source/tvm_gen/utils.c +++ b/source/tvm_gen/utils.c @@ -17,7 +17,7 @@ */ #include "dlpack/dlpack.h" -#include "shl_tvmgen.h" +#include "tvmgen/shl_tvmgen.h" static DLTensor *tensor_to_dltensor(struct csinn_tensor *tensor) { diff --git a/source/utils/debug.c b/source/utils/debug.c index 9f32b6b9..bcb2f92a 100644 --- a/source/utils/debug.c +++ b/source/utils/debug.c @@ -23,8 +23,8 @@ #include #include +#include "reference/ref.h" #include "shl_debug.h" -#include "shl_ref.h" int shl_debug_level = SHL_DEBUG_LEVEL_WARNING; @@ -897,8 +897,8 @@ int shl_where_debug_info(struct csinn_tensor *condition, struct csinn_tensor *x, } int shl_where_softmax_debug_info(struct csinn_tensor *condition, struct csinn_tensor *y, - struct csinn_tensor *output, struct csinn_where_softmax_params *params, - const char *name) + struct csinn_tensor *output, + struct csinn_where_softmax_params 
*params, const char *name) { shl_debug_print_diso_base(condition, y, output, &(params->base), name); shl_debug_info("axis=%d", params->axis); @@ -983,6 +983,7 @@ char *op_strings[] = { [CSINN_OP_WHERE_SOFTMAX] = "where_softmax", [CSINN_OP_ERF] = "erf", [CSINN_OP_CAST] = "cast", + [CSINN_OP_DECONV2D] = "deconv2d", }; // #define FREQ 50 // FPGA: 50MHz @@ -1019,6 +1020,10 @@ int shl_benchmark_layer(struct shl_node *n, uint64_t start_time, uint64_t end_ti k_h = in1->dim[2]; k_w = in1->dim[3]; in_c = in1->dim[1]; + } else if (in1->layout == CSINN_LAYOUT_IOHW) { + k_h = in1->dim[2]; + k_w = in1->dim[3]; + in_c = in1->dim[0]; } else if (in1->layout == CSINN_LAYOUT_OHWI) { k_h = in1->dim[1]; k_w = in1->dim[2]; diff --git a/source/utils/export.c b/source/utils/export.c new file mode 100644 index 00000000..34150ac3 --- /dev/null +++ b/source/utils/export.c @@ -0,0 +1,30 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifdef SHL_EXPORT_MODEL + +#include "csi_nn.h" +#include "export_json_wrapper.h" + +int shl_export_model_json(struct csinn_session *sess, char *path) +{ + int ret = shl_export_json_internal(sess, path); + return ret; +} + +#endif \ No newline at end of file diff --git a/source/utils/export_json_wrapper.cpp b/source/utils/export_json_wrapper.cpp new file mode 100644 index 00000000..292f5283 --- /dev/null +++ b/source/utils/export_json_wrapper.cpp @@ -0,0 +1,688 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifdef SHL_EXPORT_MODEL + +#include "export_json_wrapper.h" +extern "C" { +#include "csi_nn.h" +#include "shl_gref.h" +} +#include +#include +#include +#include +#include + +#include "../../module/json/json.hpp" +using json = nlohmann::ordered_json; + +NLOHMANN_JSON_SERIALIZE_ENUM(csinn_mem_type_enum, + { + {CSINN_MEM_TYPE_CPU_NOT_ALIGNED, "CSINN_MEM_TYPE_CPU_NOT_ALIGNED"}, + {CSINN_MEM_TYPE_CPU_ALIGNED, "CSINN_MEM_TYPE_CPU_ALIGNED"}, + {CSINN_MEM_TYPE_DMABUF, "CSINN_MEM_TYPE_DMABUF"}, + {CSINN_MEM_TYPE_ASP42, "CSINN_MEM_TYPE_ASP42"}, + {CSINN_MEM_TYPE_ASP41, "CSINN_MEM_TYPE_ASP41"}, + {CSINN_MEM_TYPE_CPU_ACC, "CSINN_MEM_TYPE_CPU_ACC"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM( + csinn_dtype_enum, + { + {CSINN_DTYPE_BOOL, "CSINN_DTYPE_BOOL"}, /**< Boolean */ + {CSINN_DTYPE_INT4, "CSINN_DTYPE_INT4"}, /**< Signed 4 bit fixed-point */ + {CSINN_DTYPE_UINT8, "CSINN_DTYPE_UINT8"}, /**< Unsigned 8 bit fixed-point */ + {CSINN_DTYPE_INT8, "CSINN_DTYPE_INT8"}, /**< Signed 8 bit fixed-point */ + {CSINN_DTYPE_UINT16, "CSINN_DTYPE_UINT16"}, /**< Unsigned 16 bit fixed-point */ + {CSINN_DTYPE_INT16, "CSINN_DTYPE_INT16"}, /**< Signed 16 bit fixed-point */ + {CSINN_DTYPE_UINT32, "CSINN_DTYPE_UINT32"}, /**< Unsigned 32 bit fixed-point */ + {CSINN_DTYPE_INT32, "CSINN_DTYPE_INT32"}, /**< Signed 32 bit fixed-point */ + {CSINN_DTYPE_FLOAT16, "CSINN_DTYPE_FLOAT16"}, /**< Half-precision floating-point */ + {CSINN_DTYPE_BFLOAT16, "CSINN_DTYPE_BFLOAT16"}, /**< Brain floating-point */ + {CSINN_DTYPE_FLOAT32, "CSINN_DTYPE_FLOAT32"}, /**< Single-precision floating-point */ + {CSINN_DTYPE_FLOAT64, "CSINN_DTYPE_FLOAT64"}, /**< Double-precision floating-point */ + {CSINN_DTYPE_INT64, "CSINN_DTYPE_INT64"}, /**< Signed 64 bit fixed-point */ + }) + +NLOHMANN_JSON_SERIALIZE_ENUM( + csinn_quant_enum, + { + {CSINN_QUANT_UNSET, "CSINN_QUANT_UNSET"}, /**< The quantization type is not set */ + {CSINN_QUANT_INT4_SYM, + "CSINN_QUANT_INT4_SYM"}, /**< Symmetric signed 4-bit fixed-point quantization */ + 
{CSINN_QUANT_UINT8_ASYM, + "CSINN_QUANT_UINT8_ASYM"}, /**< Asymmetric unsigned 8-bit fixed-point quantization */ + {CSINN_QUANT_UINT8_SYM, + "CSINN_QUANT_UINT8_SYM"}, /**< Symmetric unsigned 8-bit fixed-point quantization */ + {CSINN_QUANT_INT8_ASYM, + "CSINN_QUANT_INT8_ASYM"}, /**< Asymmetric signed 8-bit fixed-point quantization */ + {CSINN_QUANT_INT8_SYM, + "CSINN_QUANT_INT8_SYM"}, /**< Symmetric signed 8-bit fixed-point quantization */ + {CSINN_QUANT_INT16_SYM, + "CSINN_QUANT_INT16_SYM"}, /**< Symmetric signed 16-bit fixed-point quantization */ + {CSINN_QUANT_FLOAT16, "CSINN_QUANT_FLOAT16"}, /**< 16-bit floating-point quantization */ + {CSINN_QUANT_BFLOAT16, "CSINN_QUANT_BFLOAT16"}, /**< bf16 floating-point quantization */ + {CSINN_QUANT_FLOAT32, "CSINN_QUANT_FLOAT32"}, /**< 32-bit floating-point not quantized */ + {CSINN_QUANT_INT4_ASYM_W_SYM, + "CSINN_QUANT_INT4_ASYM_W_SYM"}, /**< Signed 4-bit Asymmetric activation and Symmetric + weight */ + {CSINN_QUANT_INT8_ASYM_W_SYM, + "CSINN_QUANT_INT8_ASYM_W_SYM"}, /**< Signed 8-bit Asymmetric activation and Symmetric + weight */ + {CSINN_QUANT_FLOAT16_W_INT8, + "CSINN_QUANT_FLOAT16_W_INT8"}, /**< 16-bit floating-point and 8-bit symmetric weight */ + }) + +NLOHMANN_JSON_SERIALIZE_ENUM( + csinn_api_enum, { + {CSINN_REF, "CSINN_REF"}, /**< Reference c */ + {CSINN_GREF, "CSINN_GREF"}, /**< reference graph */ + {CSINN_C860, "CSINN_C860"}, /**< C860 CPU platform */ + {CSINN_C906, "CSINN_C906"}, /**< C906 CPU platform */ + {CSINN_C920, "CSINN_C920"}, /**< C920 CPU platform */ + {CSINN_ANOLE, "CSINN_ANOLE"}, /**< anole NPU platform */ + {CSINN_CH8601, "CSINN_CH8601"}, /**< ch8601 NPU platform */ + {CSINN_TH1520, "CSINN_TH1520"}, /**< th1520 NPU platform */ + {CSINN_DP1K, "CSINN_DP1K"}, /**< dp1000 NPU platform */ + {CSINN_I805, "CSINN_I805"}, /**< I805 CPU platform */ + {CSINN_E804, "CSINN_E804"}, /**< E804 CPU platform */ + {CSINN_REF_I805, "CSINN_REF_I805"}, /**< I805 CPU platform */ + {CSINN_C908, "CSINN_C908"}, /**< C908 
CPU platform */ + {CSINN_TVMGEN, "CSINN_TVMGEN"}, /**< TVM generate platform */ + {CSINN_ASP, "CSINN_ASP"}, /**< ASP platform */ + {CSINN_RVV, "CSINN_RVV"}, /**< RISC-V V extension general platform */ + {CSINN_RVM, "CSINN_RVM"}, /**< RISC-V Matrix extension general platform */ + {CSINN_E907, "CSINN_E907"}, /**< E907 CPU platform */ + }) + +NLOHMANN_JSON_SERIALIZE_ENUM( + csinn_layout_enum, + { + {CSINN_LAYOUT_NULL, "CSINN_LAYOUT_NULL"}, /**< Not set */ + // NCHW + // ACTIVITION + {CSINN_LAYOUT_N, "CSINN_LAYOUT_N"}, /**< NCHW input and output, 1 dimension */ + {CSINN_LAYOUT_NC, "CSINN_LAYOUT_NC"}, /**< NCHW input and output, 2 dimensions */ + {CSINN_LAYOUT_NCW, "CSINN_LAYOUT_NCW"}, /**< NCHW input and output, 3 dimensions */ + {CSINN_LAYOUT_NCHW, "CSINN_LAYOUT_NCHW"}, /**< NCHW input and output, 4 dimensions */ + {CSINN_LAYOUT_NCDHW, "CSINN_LAYOUT_NCDHW"}, /**< NCHW input and output, 5 dimensions */ + // WEIGHT + {CSINN_LAYOUT_O, "CSINN_LAYOUT_O"}, /**< NCHW constant, 1 dimension */ + {CSINN_LAYOUT_OI, "CSINN_LAYOUT_OI"}, /**< NCHW constant, 2 dimensions */ + {CSINN_LAYOUT_O16I16, "CSINN_LAYOUT_O16I16"}, /**< 16 bytes in parallel for ASP platform */ + {CSINN_LAYOUT_O32I32, "CSINN_LAYOUT_O32I32"}, /**< 32 bytes in parallel for ASP platform */ + {CSINN_LAYOUT_OIW, "CSINN_LAYOUT_OIW"}, /**< NCHW constant, 3 dimension */ + {CSINN_LAYOUT_OIHW, "CSINN_LAYOUT_OIHW"}, /**< NCHW constant, 4 dimension */ + {CSINN_LAYOUT_IOHW, "CSINN_LAYOUT_IOHW"}, /**< NCHW constant, 4 dimension */ + {CSINN_LAYOUT_OIDHW, "CSINN_LAYOUT_OIDHW"}, /**< NCHW constant, 5 dimension */ + {CSINN_LAYOUT_O1HW, "CSINN_LAYOUT_O1HW"}, /**< NCHW constant, depthwise convolution only */ + + // NHWC + // ACTIVITION + {CSINN_LAYOUT_NWC, "CSINN_LAYOUT_NWC"}, /**< NHWC input and output, 3 dimensions */ + {CSINN_LAYOUT_NHWC, "CSINN_LAYOUT_NHWC"}, /**< NHWC input and output, 4 dimensions */ + {CSINN_LAYOUT_NDHWC, "CSINN_LAYOUT_NDHWC"}, /**< NHWC input and output, 5 dimensions */ + // WEIGHT + {CSINN_LAYOUT_OWI, 
"CSINN_LAYOUT_OWI"}, /**< NHWC constant, 3 dimensions */ + {CSINN_LAYOUT_OHWI, "CSINN_LAYOUT_OHWI"}, /**< NHWC constant, 4 dimensions */ + {CSINN_LAYOUT_O16HWI16, + "CSINN_LAYOUT_O16HWI16"}, /**< 16 bytes in parallel for ASP platform */ + {CSINN_LAYOUT_O32HWI32, + "CSINN_LAYOUT_O32HWI32"}, /**< 32 bytes in parallel for ASP platform */ + {CSINN_LAYOUT_ODHWI, "CSINN_LAYOUT_ODHWI"}, /**< NHWC constant, 5 dimensions */ + {CSINN_LAYOUT_1HWO, "CSINN_LAYOUT_1HWO"}, /**< NHWC constant, depthwise convolution only */ + {CSINN_LAYOUT_1HW16O16, + "CSINN_LAYOUT_1HW16O16"}, /**< 16 bytes in parallel for ASP platform */ + {CSINN_LAYOUT_1HW32O32, + "CSINN_LAYOUT_1HW32O32"}, /**< 32 bytes in parallel for ASP platform */ + + // NC1HWC0 + // ACTIVITION + // RVV optimization format: c0=4/8/8 for fp32/fp16/int8 when vlen=128 + {CSINN_LAYOUT_NC1C0, "CSINN_LAYOUT_NC1C0"}, /**< NC1HWC0 input and output, 2 dimension */ + {CSINN_LAYOUT_NC1WC0, "CSINN_LAYOUT_NC1WC0"}, /**< NC1HWC0 input and output, 3 dimension */ + {CSINN_LAYOUT_NC1HWC0, + "CSINN_LAYOUT_NC1HWC0"}, /**< NC1HWC0 input and output, 4 dimension */ + {CSINN_LAYOUT_NC1DHWC0, + "CSINN_LAYOUT_NC1DHWC0"}, /**< NC1HWC0 input and output, 5 dimension */ + + // for 6D shape + {CSINN_LAYOUT_NLCDHW, "CSINN_LAYOUT_NLCDHW"}, /**< NCHW input and output, 6 dimensions */ + }) + +NLOHMANN_JSON_SERIALIZE_ENUM( + csinn_lrn_enum, + { + {CSINN_LRN_ACROSS_CHANNELS, + "CSINN_LRN_ACROSS_CHANNELS"}, /**< local response normalization across channels/channels */ + {CSINN_LRN_WITHIN_CHANNEL, + "CSINN_LRN_WITHIN_CHANNEL"}, /**< local response normalization within the same channel */ + }) + +static json shl_export_json_tensor(struct csinn_tensor *tensor) +{ + nlohmann::ordered_map tensor_j; + tensor_j["name"] = tensor->name; + tensor_j["dtype"] = tensor->dtype; + tensor_j["mtype"] = tensor->mtype; + std::vector dim(tensor->dim, tensor->dim + tensor->dim_count); + tensor_j["dim"] = dim; + tensor_j["is_const"] = tensor->is_const; + tensor_j["layout"] = (enum 
csinn_layout_enum)tensor->layout; + + if (tensor->dtype != CSINN_DTYPE_FLOAT32 && tensor->dtype != CSINN_DTYPE_FLOAT64 && + tensor->dtype != CSINN_DTYPE_INT64) { + tensor_j["quant_channel"] = tensor->quant_channel; + tensor_j["quant_info"] = {}; + for (int i = 0; tensor->quant_channel; i++) { + json quant_info; + quant_info["scale"] = tensor->qinfo[i].scale; + quant_info["zero_point"] = tensor->qinfo[i].zero_point; + quant_info["multiplier"] = tensor->qinfo[i].multiplier; + quant_info["shift"] = tensor->qinfo[i].shift; + quant_info["min"] = tensor->qinfo[i].min; + quant_info["max"] = tensor->qinfo[i].max; + + tensor_j["quant_info"].push_back(quant_info); + } + } + + return tensor_j; +} + +static void shl_export_json_input_tensor(std::vector in_tensor, json &jobj) +{ + jobj["inputs"] = {}; + for (auto t : in_tensor) { + jobj["inputs"].push_back(shl_export_json_tensor(t)); + } +} + +static void shl_export_json_output_tensor(std::vector out_tensor, json &jobj) +{ + jobj["outputs"] = {}; + for (auto t : out_tensor) { + jobj["outputs"].push_back(shl_export_json_tensor(t)); + } +} + +static void shl_export_json_params_base(struct csinn_params_base base, json &jobj) +{ + jobj["name"] = base.name; + jobj["quant_type"] = base.quant_type; + jobj["api"] = (enum csinn_api_enum)base.api; +} + +static int shl_export_json_conv2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, std::string op_type, + json &jobj) +{ + json conv_data; + conv_data["op_type"] = op_type; + shl_export_json_params_base(params->base, conv_data); + + json attrs; + attrs["group"] = params->group; + attrs["stride_height"] = params->stride_height; + attrs["stride_width"] = params->stride_width; + attrs["pad_top"] = params->pad_top; + attrs["pad_left"] = params->pad_left; + attrs["pad_down"] = params->pad_down; + attrs["pad_right"] = params->pad_right; + attrs["dilation_height"] = params->dilation_height; + 
attrs["dilation_width"] = params->dilation_width; + attrs["out_pad_height"] = params->out_pad_height; + attrs["out_pad_width"] = params->out_pad_width; + + conv_data["attrs"] = attrs; + + // insert input info + std::vector in_tensors = {input, kernel, bias}; + shl_export_json_input_tensor(in_tensors, conv_data); + + // insert output info + std::vector out_tensors = {output}; + shl_export_json_output_tensor(out_tensors, conv_data); + + jobj["layers"].push_back(conv_data); + + return CSINN_TRUE; +} + +static int shl_export_json_siso(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_params_base *params, std::string op_type, json &jobj) +{ + json siso_data; + siso_data["op_type"] = op_type; + shl_export_json_params_base(*params, siso_data); + + // generate input info + std::vector in_tensors = {input}; + shl_export_json_input_tensor(in_tensors, siso_data); + + // generate output info + std::vector out_tensors = {output}; + shl_export_json_output_tensor(out_tensors, siso_data); + + jobj["layers"].push_back(siso_data); + + return CSINN_TRUE; +} + +static int shl_export_json_softmax(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params, std::string op_type, + json &jobj) +{ + json softmax_data; + softmax_data["op_type"] = op_type; + shl_export_json_params_base(params->base, softmax_data); + + // generate attrs + json attrs; + attrs["axis"] = params->axis; + /* attach attrs to the layer record; previously it was built but
+ * never assigned, so "axis" was silently dropped from the exported
+ * JSON (every other op exporter attaches its attrs). */ + softmax_data["attrs"] = attrs; + + // generate input info + shl_export_json_input_tensor({input}, softmax_data); + + // generate output_info + shl_export_json_output_tensor({output}, softmax_data); + + jobj["layers"].push_back(softmax_data); + + return CSINN_TRUE; +} + +static int shl_export_json_diso(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + std::string op_type, json &jobj) +{ + json diso_data; + diso_data["op_type"] = op_type; + shl_export_json_params_base(params->base, diso_data); + + // generate input 
info + std::vector in_tensors = {input0, input1}; + shl_export_json_input_tensor(in_tensors, diso_data); + + // generate output info + std::vector out_tensors = {output}; + shl_export_json_output_tensor(out_tensors, diso_data); + + jobj["layers"].push_back(diso_data); + + return CSINN_TRUE; +} + +static int shl_export_json_pool2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, std::string op_type, json &jobj) +{ + json pool_data; + pool_data["op_type"] = op_type; + shl_export_json_params_base(params->base, pool_data); + + // generate attrs info + json attrs; + attrs["pool_type"] = params->pool_type; + attrs["filter_height"] = params->filter_height; + attrs["filter_width"] = params->filter_width; + attrs["filter_depth"] = params->filter_depth; + attrs["stride_height"] = params->stride_height; + attrs["stride_width"] = params->stride_width; + attrs["stride_depth"] = params->stride_depth; + attrs["pad_top"] = params->pad_top; + attrs["pad_left"] = params->pad_left; + attrs["pad_down"] = params->pad_down; + attrs["pad_right"] = params->pad_right; + attrs["pad_front"] = params->pad_front; + attrs["pad_back"] = params->pad_back; + attrs["ceil_mode"] = params->ceil_mode; + attrs["count_include_pad"] = params->count_include_pad; + pool_data["attrs"] = attrs; + + // generate input info + std::vector in_tensors = {input}; + shl_export_json_input_tensor(in_tensors, pool_data); + + // generate output info + std::vector out_tensors = {output}; + shl_export_json_output_tensor(out_tensors, pool_data); + + jobj["layers"].push_back(pool_data); + + return CSINN_TRUE; +} + +static int shl_export_json_reshape(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params, std::string op_type, + json &jobj) +{ + json reshape_data; + reshape_data["op_type"] = op_type; + shl_export_json_params_base(params->base, reshape_data); + + // generate attrs + json attrs; + std::vector shape(params->shape, params->shape + 
params->shape_num); + attrs["reshape"] = shape; + reshape_data["attrs"] = attrs; + + // generate input info + shl_export_json_input_tensor({input}, reshape_data); + + // generate output_info + shl_export_json_output_tensor({output}, reshape_data); + + jobj["layers"].push_back(reshape_data); + + return CSINN_TRUE; +} + +static int shl_export_json_fullyconnected(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_fc_params *params, std::string op_type, + json &jobj) +{ + json fcl_data; + fcl_data["op_type"] = op_type; + shl_export_json_params_base(params->base, fcl_data); + + json attrs; + attrs["units"] = params->units; + fcl_data["attrs"] = attrs; + + // insert input info + std::vector in_tensors = {input, kernel, bias}; + shl_export_json_input_tensor(in_tensors, fcl_data); + + // insert output info + std::vector out_tensors = {output}; + shl_export_json_output_tensor(out_tensors, fcl_data); + + jobj["layers"].push_back(fcl_data); + + return CSINN_TRUE; +} + +static int shl_export_json_lrn(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_lrn_params *params, std::string op_type, json &jobj) +{ + json lrn_data; + lrn_data["op_type"] = op_type; + shl_export_json_params_base(params->base, lrn_data); + + // generate attrs + json attrs; + attrs["range"] = params->range; + attrs["bias"] = params->bias; + attrs["alpha"] = params->alpha; + attrs["beta"] = params->beta; + attrs["norm_region"] = params->norm_region; + lrn_data["attrs"] = attrs; + + // generate input info + shl_export_json_input_tensor({input}, lrn_data); + + // generate output_info + shl_export_json_output_tensor({output}, lrn_data); + + jobj["layers"].push_back(lrn_data); + + return CSINN_TRUE; +} + +static int shl_export_json_concat(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params, std::string op_type, + json &jobj) +{ + json concat_data; + concat_data["op_type"] 
= op_type; + shl_export_json_params_base(params->base, concat_data); + + // generate attrs + json attrs; + attrs["inputs_count"] = params->inputs_count; + attrs["axis"] = params->axis; + concat_data["attrs"] = attrs; + + // generate input info + shl_export_json_input_tensor( + std::vector(input, input + params->inputs_count), concat_data); + + // generate output_info + shl_export_json_output_tensor({output}, concat_data); + + jobj["layers"].push_back(concat_data); + + return CSINN_TRUE; +} + +static int shl_export_json_prelu(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_prelu_params *params, + std::string op_type, json &jobj) +{ + json prelu_data; + prelu_data["op_type"] = op_type; + shl_export_json_params_base(params->base, prelu_data); + + // generate attrs + json attrs; + attrs["axis"] = params->axis; + prelu_data["attrs"] = attrs; + + // generate input info + std::vector in_tensors = {input0, input1}; + shl_export_json_input_tensor(in_tensors, prelu_data); + + // generate output info + std::vector out_tensors = {output}; + shl_export_json_output_tensor(out_tensors, prelu_data); + + jobj["layers"].push_back(prelu_data); + + return CSINN_TRUE; +} + +int shl_export_json_internal(struct csinn_session *sess, char *path) +{ + json jobj; + struct shl_ref_graph *g = shl_gref_get_graph(sess); + + // generate input names + jobj["input_names"] = {}; + for (int i = 0; i < g->input_num; i++) { + jobj["input_names"].push_back(g->input[i]->name); + } + + // generate output names + jobj["output_names"] = {}; + for (int i = 0; i < g->output_num; i++) { + jobj["output_names"].push_back(g->output[i]->name); + } + + // generate layers + jobj["layers"] = {}; + for (int i = 0; i < g->layer_index; i++) { + struct shl_node *node = g->layer[i]; + if (node->type == CSINN_SUBGRAPH) { + shl_debug_info("There is a subgrah that is ignored temporarily(TODO)\n"); + } else if (node->type >= 0 && node->type < CSINN_OP_SIZE) { + struct 
csinn_params_base *params = (struct csinn_params_base *)node->data; + switch (node->type) { + case CSINN_OP_CONV2D: { + int ret = shl_export_json_conv2d((struct csinn_tensor *)node->in[0]->data, + (struct csinn_tensor *)node->out[0]->data, + (struct csinn_tensor *)node->in[1]->data, + (struct csinn_tensor *)node->in[2]->data, + (struct csinn_conv2d_params *)params, + "CSINN_OP_CONV2D", jobj); + break; + } + case CSINN_OP_DEPTHWISE_CONV2D: { + int ret = shl_export_json_conv2d((struct csinn_tensor *)node->in[0]->data, + (struct csinn_tensor *)node->out[0]->data, + (struct csinn_tensor *)node->in[1]->data, + (struct csinn_tensor *)node->in[2]->data, + (struct csinn_conv2d_params *)params, + "CSINN_OP_DEPTHWISE_CONV2D", jobj); + break; + } + case CSINN_OP_GROUP_CONV2D: { + int ret = shl_export_json_conv2d((struct csinn_tensor *)node->in[0]->data, + (struct csinn_tensor *)node->out[0]->data, + (struct csinn_tensor *)node->in[1]->data, + (struct csinn_tensor *)node->in[2]->data, + (struct csinn_conv2d_params *)params, + "CSINN_OP_GROUP_CONV2D", jobj); + break; + } + case CSINN_OP_DECONV2D: { + int ret = shl_export_json_conv2d((struct csinn_tensor *)node->in[0]->data, + (struct csinn_tensor *)node->out[0]->data, + (struct csinn_tensor *)node->in[1]->data, + (struct csinn_tensor *)node->in[2]->data, + (struct csinn_conv2d_params *)params, + "CSINN_OP_DECONV2D", jobj); + break; + } + case CSINN_OP_DEPTHWISE_DECONV2D: { + int ret = shl_export_json_conv2d((struct csinn_tensor *)node->in[0]->data, + (struct csinn_tensor *)node->out[0]->data, + (struct csinn_tensor *)node->in[1]->data, + (struct csinn_tensor *)node->in[2]->data, + (struct csinn_conv2d_params *)params, + "CSINN_OP_DEPTHWISE_DECONV2D", jobj); + break; + } + case CSINN_OP_GROUP_DECONV2D: { + int ret = shl_export_json_conv2d((struct csinn_tensor *)node->in[0]->data, + (struct csinn_tensor *)node->out[0]->data, + (struct csinn_tensor *)node->in[1]->data, + (struct csinn_tensor *)node->in[2]->data, + (struct 
csinn_conv2d_params *)params, + "CSINN_OP_GROUP_DECONV2D", jobj); + break; + } + case CSINN_OP_RELU: { + int ret = shl_export_json_siso((struct csinn_tensor *)node->in[0]->data, + (struct csinn_tensor *)node->out[0]->data, + params, "CSINN_OP_RELU", jobj); + break; + } + case CSINN_OP_GLOBAL_AVGPOOL2D: { + int ret = shl_export_json_siso((struct csinn_tensor *)node->in[0]->data, + (struct csinn_tensor *)node->out[0]->data, + params, "CSINN_OP_GLOBAL_AVGPOOL2D", jobj); + break; + } + case CSINN_OP_SOFTMAX: { + int ret = shl_export_json_softmax((struct csinn_tensor *)node->in[0]->data, + (struct csinn_tensor *)node->out[0]->data, + (struct csinn_softmax_params *)params, + "CSINN_OP_SOFTMAX", jobj); + break; + } + case CSINN_OP_ADD: { + int ret = shl_export_json_diso((struct csinn_tensor *)node->in[0]->data, + (struct csinn_tensor *)node->in[1]->data, + (struct csinn_tensor *)node->out[0]->data, + (struct csinn_diso_params *)params, + "CSINN_OP_ADD", jobj); + break; + } + case CSINN_OP_MUL: { + int ret = shl_export_json_diso((struct csinn_tensor *)node->in[0]->data, + (struct csinn_tensor *)node->in[1]->data, + (struct csinn_tensor *)node->out[0]->data, + (struct csinn_diso_params *)params, + "CSINN_OP_MUL", jobj); + break; + } + case CSINN_OP_MAXPOOL2D: { + int ret = shl_export_json_pool2d((struct csinn_tensor *)node->in[0]->data, + (struct csinn_tensor *)node->out[0]->data, + (struct csinn_pool_params *)params, + "CSINN_OP_MAXPOOL2D", jobj); + break; + } + case CSINN_OP_AVGPOOL2D: { + int ret = shl_export_json_pool2d((struct csinn_tensor *)node->in[0]->data, + (struct csinn_tensor *)node->out[0]->data, + (struct csinn_pool_params *)params, + "CSINN_OP_AVGPOOL2D", jobj); + break; + } + case CSINN_OP_RESHAPE: { + int ret = shl_export_json_reshape((struct csinn_tensor *)node->in[0]->data, + (struct csinn_tensor *)node->out[0]->data, + (struct csinn_reshape_params *)params, + "CSINN_OP_RESHAPE", jobj); + break; + } + case CSINN_OP_FULLYCONNECTED: { + int ret = 
shl_export_json_fullyconnected( + (struct csinn_tensor *)node->in[0]->data, + (struct csinn_tensor *)node->out[0]->data, + (struct csinn_tensor *)node->in[1]->data, + (struct csinn_tensor *)node->in[2]->data, (struct csinn_fc_params *)params, + "CSINN_OP_FULLYCONNECTED", jobj); + break; + } + case CSINN_OP_LRN: { + int ret = shl_export_json_lrn((struct csinn_tensor *)node->in[0]->data, + (struct csinn_tensor *)node->out[0]->data, + (struct csinn_lrn_params *)params, "CSINN_OP_LRN", + jobj); + break; + } + case CSINN_OP_CONCAT: { + struct csinn_tensor **inputs = (struct csinn_tensor **)shl_mem_alloc( + sizeof(struct csinn_tensor *) * + ((struct csinn_concat_params *)params)->inputs_count); + for (int i = 0; i < ((struct csinn_concat_params *)params)->inputs_count; i++) { + inputs[i] = (struct csinn_tensor *)node->in[i]->data; + } + int ret = shl_export_json_concat( + inputs, (struct csinn_tensor *)node->out[0]->data, + (struct csinn_concat_params *)params, "CSINN_OP_CONCAT", jobj); + shl_mem_free(inputs); + break; + } + case CSINN_OP_PRELU: { + int ret = shl_export_json_prelu((struct csinn_tensor *)node->in[0]->data, + (struct csinn_tensor *)node->in[1]->data, + (struct csinn_tensor *)node->out[0]->data, + (struct csinn_prelu_params *)params, + "CSINN_OP_PRELU", jobj); + break; + } + default: { + shl_debug_error("unknown op: %d\n", node->type); + } + } + } + } + + std::ofstream out_file(path); + out_file << std::setw(2) << jobj << std::endl; + + return CSINN_TRUE; +} + +#endif \ No newline at end of file diff --git a/source/utils/export_json_wrapper.h b/source/utils/export_json_wrapper.h new file mode 100644 index 00000000..117f901b --- /dev/null +++ b/source/utils/export_json_wrapper.h @@ -0,0 +1,36 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef INCLUDE_EXPORT_JSON_WRAPPER_H_ +#define INCLUDE_EXPORT_JSON_WRAPPER_H_ + +#ifdef SHL_EXPORT_MODEL + +#ifdef __cplusplus +extern "C" { +#endif + +int shl_export_json_internal(struct csinn_session* sess, char* path); + +#ifdef __cplusplus +} +#endif + +#endif + +#endif // INCLUDE_EXPORT_JSON_WRAPPER_H_ \ No newline at end of file diff --git a/source/utils/memory.c b/source/utils/memory.c index 92cbaeac..0a75cab2 100644 --- a/source/utils/memory.c +++ b/source/utils/memory.c @@ -17,7 +17,7 @@ */ #include -#include "csi_nn.h" +#include "shl_utils.h" // #define SHL_MEM_DEBUG // #define SHL_MEM_DEBUG_VALID_WRITE diff --git a/version b/version index 197c4d5c..e70b4523 100644 --- a/version +++ b/version @@ -1 +1 @@ -2.4.0 +2.6.0