Skip to content

Commit

Permalink
Integrate kleidiAI release v0.1.0 into MNN 2.9.3
Browse files Browse the repository at this point in the history
The KleidiAI files are placed in the folder source/backend/cpu/arm/kleidiAI/kai; they were downloaded from Arm's GitLab and are kept unchanged. These files may later be removed from the repository and downloaded at build time instead.

MNNKleidiAI.cpp is the interface between MNN and KleidiAI.

Rewrote functions of class DenseConvInt8TiledExecutor, in ConvInt8TiledExecutor.cpp, to call KleidiAI functions. A dedicated new execution may be implemented later.

Changes to GeometryConvUtils.cpp and ShapeTensorConvert.cpp make the input and output of DenseConvInt8TiledExecutor NCHW rather than NC4HW4, avoiding redundant pack/unpack and yielding better performance.
  • Loading branch information
xhzheng1895 committed Aug 16, 2024
1 parent b11b703 commit 19b399b
Show file tree
Hide file tree
Showing 18 changed files with 2,129 additions and 2 deletions.
7 changes: 7 additions & 0 deletions source/backend/cpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,10 @@ IF(MNN_ARM82)
list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNN_Arm82>)
ENDIF()

# Kleidi AI
# When MNN_KLEIDIAI is set, build the MNN_KleidiAI glue target and link its
# object files into the main library, mirroring the MNN_Arm82 pattern above.
# The macro is added at directory scope deliberately: CPUBackend.hpp tests
# MNN_KLEIDIAI_ENABLED to decide whether to include MNNKleidiAI.h, and it is
# compiled into other targets configured under this directory.
IF(MNN_KLEIDIAI)
add_definitions(-DMNN_KLEIDIAI_ENABLED=1)
include(${CMAKE_CURRENT_LIST_DIR}/arm/kleidiAI/CMakeLists.txt)
list(APPEND MNN_TARGETS MNN_KleidiAI)
list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNN_KleidiAI>)
ENDIF()
4 changes: 4 additions & 0 deletions source/backend/cpu/CPUBackend.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@
#include "core/BufferAllocator.hpp"
#include "MNN_generated.h"

#ifdef MNN_KLEIDIAI_ENABLED
#include "arm/kleidiAI/MNNKleidiAI.h"
#endif

namespace MNN {
class CPURuntime : public Runtime {
public:
Expand Down
34 changes: 34 additions & 0 deletions source/backend/cpu/arm/kleidiAI/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Glue layer between MNN and KleidiAI.
list(APPEND MNN_KleidiAI_SOURCES ${CMAKE_CURRENT_LIST_DIR}/MNNKleidiAI.cpp)
list(APPEND MNN_KleidiAI_HEADERS ${CMAKE_CURRENT_LIST_DIR}/MNNKleidiAI.h)

# Directory-scoped on purpose: this file is pulled in via include() from
# source/backend/cpu/CMakeLists.txt, and other targets in that directory
# compile CPUBackend.hpp -> MNNKleidiAI.h, which includes the kai headers
# by bare name, so those targets need these search paths too.
include_directories(
${CMAKE_CURRENT_LIST_DIR}/
${CMAKE_CURRENT_LIST_DIR}/kai/ukernels/
${CMAKE_CURRENT_LIST_DIR}/kai/ukernels/matmul/
${CMAKE_CURRENT_LIST_DIR}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/
${CMAKE_CURRENT_LIST_DIR}/kai/ukernels/matmul/pack/)

# KleidiAI micro-kernel C sources, unchanged from the upstream release:
# LHS/RHS packing plus the dotprod (GEMV) and i8mm (GEMM) matmul kernels.
list(APPEND MNN_KleidiAI_SOURCES ${CMAKE_CURRENT_LIST_DIR}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qai8dxp_f32.c)
list(APPEND MNN_KleidiAI_SOURCES ${CMAKE_CURRENT_LIST_DIR}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0.c)
list(APPEND MNN_KleidiAI_SOURCES ${CMAKE_CURRENT_LIST_DIR}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod.c)
list(APPEND MNN_KleidiAI_SOURCES ${CMAKE_CURRENT_LIST_DIR}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm.c)


# NOTE(review): the parent consumes this target via
# $<TARGET_OBJECTS:MNN_KleidiAI>; an OBJECT library (matching the other MNN
# sub-targets) may be a better fit than SHARED unless the standalone .so is
# actually wanted -- confirm.
add_library(
MNN_KleidiAI
SHARED
${MNN_KleidiAI_SOURCES} ${MNN_KleidiAI_HEADERS}
)

# Enable ARMv8.6-A features
# NOTE(review): these ACLE feature macros are normally predefined by the
# compiler once -march=armv8.6-a (below) is in effect; defining them by hand
# looks redundant and could mask a toolchain that lacks the features --
# confirm before relying on it.
target_compile_definitions(MNN_KleidiAI PRIVATE
__ARM_FEATURE_MATMUL_INT8
__ARM_FEATURE_BF16_VECTOR_ARITHMETIC
__ARM_FEATURE_BF16_SCALAR_ARITHMETIC
__ARM_BF16_FORMAT_ALTERNATIVE
__ARM_FEATURE_DOTPROD
)

# Hardcoded baseline: the bundled kernels need dotprod and i8mm, which
# armv8.6-a mandates.
target_compile_options(MNN_KleidiAI
PRIVATE -march=armv8.6-a
)
215 changes: 215 additions & 0 deletions source/backend/cpu/arm/kleidiAI/MNNKleidiAI.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
#if defined(__aarch64__)

#include "MNNKleidiAI.h"

using namespace MNN;

// Storage for the lazily-created singleton (see KleidiAI::getInstance()).
// nullptr instead of NULL: this is a C++ translation unit.
KleidiAI *KleidiAI::instance = nullptr;

// Round k up for packing. A float and an int32 are appended to each packed
// row, so k must land on a 4-element boundary for memory alignment; the
// effective alignment unit is kr*sr, itself rounded up to a multiple of 4.
inline static size_t kai_k_roundedup(size_t k, size_t kr, size_t sr) {
    const size_t alignment_unit = kai_roundup(kr * sr, 4);
    return kai_roundup(k, alignment_unit);
}

// Pack a non-transposed (n x k) signed-int8 RHS matrix into the qsi4cxp
// layout consumed by the KleidiAI matmul micro-kernels. For each group of nr
// output rows the packed stream holds: the quantized values as 4-bit nibbles,
// then nr int32 per-row reduction sums, then nr float scales, then nr float
// biases.
//
// This is a local variant of kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0: it
// reads one int8 value per element instead of pre-packed int4 pairs (see
// KleidiAI::runRhsPack for how the two are selected).
//
// num_groups   must be 1.
// n, k         RHS dimensions: n rows, depth k.
// nr, kr, sr   block-packing parameters of the target micro-kernel.
// rhs          int8 source matrix, row stride = roundup(k, 2) bytes.
// bias         optional n floats; when NULL, zeros are written instead.
// scale        n per-row dequantization scales (required).
// rhs_packed   output buffer, sized via the kai packed-size query.
// extra_bytes  must be 0.
// params       must carry lhs_zero_point == 1 and rhs_zero_point == 8.
static void packQsi4cxpQsi8cxs1s0(size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, const uint8_t* rhs, const float* bias,
                                  const float* scale, void* rhs_packed, size_t extra_bytes,
                                  const struct kai_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0_params* params) {
    KAI_ASSERT(num_groups == 1);
    KAI_ASSERT(extra_bytes == 0);
    KAI_ASSERT((kr % sr) == 0);
    KAI_ASSERT(rhs != NULL);
    KAI_ASSERT(scale != NULL);
    KAI_ASSERT(rhs_packed != NULL);
    KAI_ASSERT(params != NULL);
    KAI_ASSERT(params->rhs_zero_point == 8);
    KAI_ASSERT(params->lhs_zero_point == 1);

    const size_t rhs_zero_point = params->rhs_zero_point;
    // Stride (bytes) between consecutive nr-row packed groups.
    const size_t rhs_packed_stride = kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0(k, nr, kr, sr);
    const size_t k_internal = kai_k_roundedup(k, kr, sr);
    const size_t dst_num_rows = kai_roundup(n, nr) / nr;                        // number of nr-row groups
    const size_t dst_num_bytes_per_row = nr * (kai_k_roundedup(k, kr, sr) / 2); // two int4 values per byte
    const size_t block_length_in_bytes = kr / sr;
    const size_t k_interleaved_v = 16U;          // the two nibbles of a byte come from columns 16 apart
    const size_t rhs_stride = kai_roundup(k, 2); // source row stride in bytes

    for (size_t dst_row_idx = 0; dst_row_idx < dst_num_rows; ++dst_row_idx) {
        uint8_t* dst_row = (uint8_t*)rhs_packed + dst_row_idx * rhs_packed_stride;

        // Reduction sums live immediately after this group's nibble data.
        int32_t* sums = (int32_t*)(dst_row + nr * (k_internal / 2));

        // Initialize to zero the RHS reduction sums
        memset(sums, 0, nr * sizeof(int32_t));

        for (size_t dst_byte_idx = 0; dst_byte_idx < dst_num_bytes_per_row; ++dst_byte_idx) {
            // Decompose the flat destination byte index into interleaved
            // (row-within-group, column-pair) coordinates.
            const size_t block_idx = dst_byte_idx / block_length_in_bytes;
            const size_t block_byte_idx = dst_byte_idx % block_length_in_bytes;
            const size_t super_block_idx = block_idx / nr;
            const size_t nr_idx = block_idx % nr;

            const size_t k_adjustment =
                ((block_byte_idx + super_block_idx * block_length_in_bytes) / k_interleaved_v) * k_interleaved_v;
            // Each packed byte holds source columns k0 and k0 + 16.
            const size_t k0_idx = block_byte_idx + super_block_idx * block_length_in_bytes + k_adjustment;
            const size_t k1_idx = k0_idx + k_interleaved_v;
            const size_t n0_idx = dst_row_idx * nr + nr_idx;

            // Clamp the index to avoid out-of-bound reads
            const size_t n0_valid_idx = KAI_MIN(n0_idx, n - 1);

            const size_t src_addr_byte0 = k0_idx + n0_valid_idx * rhs_stride;
            const size_t src_addr_byte1 = k1_idx + n0_valid_idx * rhs_stride;

            // Padding columns (index >= k) contribute zero.
            int8_t byte0 = 0;
            int8_t byte1 = 0;

            if (k0_idx < k) {
                byte0 = rhs[src_addr_byte0];
            }

            if (k1_idx < k) {
                byte1 = rhs[src_addr_byte1];
            }

            sums[nr_idx] += (int32_t)byte0 + (int32_t)byte1;

            // Shift each value by the zero point ([-8,7] -> [0,15]) and pair
            // the two columns into one byte; XOR 0x88 then flips bit 3 of
            // each nibble, i.e. turns the offset-binary nibble back into its
            // two's-complement bit pattern.
            const uint8_t dst_qs0 = (byte0 + rhs_zero_point) | ((byte1 + rhs_zero_point) << 4);

            *dst_row = dst_qs0 ^ 0x88;
            dst_row += sizeof(uint8_t);
        }

        // Adjust the reduction sums
        // NOTE(review): the x16 here pairs with the x0.0625 (1/16) applied to
        // the scales below -- presumably the micro-kernel consumes the
        // nibbles shifted left by 4 bits. Keep the two factors in sync.
        for (size_t i = 0; i < nr; ++i) {
            sums[i] = sums[i] * 16;
            dst_row += sizeof(int32_t);
        }

        // Adjust the scales
        for (size_t i = 0; i < nr; ++i) {
            // Clamp the row index to avoid out-of-bound reads
            const size_t src_row_idx = KAI_MIN(dst_row_idx * nr + i, n - 1);
            *((float*)(dst_row)) = scale[src_row_idx] * 0.0625F;
            dst_row += sizeof(float);
        }

        // Set the bias
        if (bias == NULL) {
            memset(dst_row, 0, nr * sizeof(float));
        } else {
            for (size_t i = 0; i < nr; ++i) {
                // Clamp the row index to avoid out-of-bound reads
                const size_t src_row_idx = KAI_MIN(dst_row_idx * nr + i, n - 1);
                ((float*)dst_row)[i] = bias[src_row_idx];
            }
        }
    }
}

// In-place re-layout of `data` from NCHW rows to NC4HW4 blocks:
// rowNum rows of rowSize floats become, for each group of 4 columns, that
// 4-float slice of every row laid out contiguously.
// Fix: the original allocated a scratch buffer with new[] and never freed it
// (memory leak on every call); std::vector now owns the copy.
void KleidiAI::packNCHWToNC4HW4(float* data, size_t rowNum, size_t rowSize) {
    if(rowNum == 1) {
        return; // single row: both layouts are identical
    }

    // Snapshot the source; the transform then overwrites `data` in place.
    std::vector<float> tmpBuffer(data, data + rowNum * rowSize);
    const float *src = tmpBuffer.data();
    float *dst = data;

    // NOTE(review): assumes rowSize is a multiple of 4; any remainder
    // columns are left untouched -- confirm callers guarantee this.
    const size_t blockNum = rowSize / 4;
    const size_t blockSize = 4 * sizeof(float);

    for(size_t blockIndex = 0; blockIndex < blockNum; blockIndex++) {
        const float *rowSrc = src + blockIndex * 4;
        for(size_t rowIndex = 0; rowIndex < rowNum; rowIndex++) {
            memcpy(dst, rowSrc, blockSize);
            dst += 4;
            rowSrc += rowSize;
        }
    }
}

// In-place re-layout of `data` from NC4HW4 blocks back to NCHW rows —
// the inverse of packNCHWToNC4HW4.
// Fix: the original allocated a scratch buffer with new[] and never freed it
// (memory leak on every call); std::vector now owns the copy.
void KleidiAI::packNC4HW4ToNCHW(float* data, size_t rowNum, size_t rowSize) {
    if(rowNum == 1) {
        return; // single row: both layouts are identical
    }

    // Snapshot the source; the transform then overwrites `data` in place.
    std::vector<float> tmpBuffer(data, data + rowNum * rowSize);
    const float *src = tmpBuffer.data();
    float *dst = data;

    // NOTE(review): assumes rowSize is a multiple of 4; any remainder
    // columns are left untouched -- confirm callers guarantee this.
    const size_t blockNum = rowSize / 4;
    const size_t blockSize = 4 * sizeof(float);

    for(size_t blockIndex = 0; blockIndex < blockNum; blockIndex++) {
        const float *rowSrc = src + blockIndex * 4 * rowNum;
        float *block_dst = dst + blockIndex * 4;
        for(size_t rowIndex = 0; rowIndex < rowNum; rowIndex++) {
            memcpy(block_dst, rowSrc, blockSize);
            block_dst += rowSize;
            rowSrc += 4;
        }
    }
}

//Lhs
// Byte size of the quantize+pack output buffer for an m x k f32 LHS.
size_t KleidiAI::getLhsQuantedPackedSize(size_t m, size_t k) {
    return kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32(m, k, getMr(m), getKr(), getSr());
}

// Byte offset of packed row mIdx inside the quantized+packed LHS buffer.
size_t KleidiAI::getLhsQuantedPackedOffset(size_t m, size_t mIdx, size_t k) {
    return mIdx == 0 ? 0 : kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32(mIdx, k, getMr(m), getKr(), getSr());
}

// Quantize and pack the f32 LHS into the qai8dxp layout.
// Source row stride is k * sizeof(float); the m-index start argument is 0.
void KleidiAI::runLhsQuantPack(size_t m, size_t k, const void* lhs, void* lhsQuantedPacked) {
    kai_run_lhs_quant_pack_qai8dxp_f32(m, k, getMr(m), getKr(), getSr(), 0, (const float *)lhs, k * sizeof(float), lhsQuantedPacked);
}

//Rhs
// Byte size of the packed RHS buffer for an n x k weight matrix.
size_t KleidiAI::getRhsPackedSize(size_t n, size_t k) {
    return kai_get_rhs_packed_size_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0(n, k, getNr(), getKr(), getSr());
}

// Byte offset of the packed group that starts at row nIdx.
size_t KleidiAI::getRhsPackedOffset(size_t nIdx, size_t k) {
    return kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0(nIdx, k, getNr(), getKr(), getSr());
}

// Pack the quantized RHS (weights) together with per-row scales and an
// optional bias.
// packedInt4 == false: `rhs` holds one signed int8 per element and is packed
//                      by the local packQsi4cxpQsi8cxs1s0 helper.
// packedInt4 == true:  `rhs` is already int4-packed (two values per byte) and
//                      is handled by the stock KleidiAI routine.
void KleidiAI::runRhsPack(size_t n, size_t k, const void* rhs, const void* scale, const void *bias, void* rhsPacked, bool packedInt4) {
    struct kai_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0_params params;
    params.lhs_zero_point = 1; // values asserted by both packing routines
    params.rhs_zero_point = 8;
    if(!packedInt4) {
        packQsi4cxpQsi8cxs1s0(1, n, k, getNr(), getKr(), getSr(),
                              (const uint8_t *)rhs,
                              (const float *)bias, (const float *)scale,
                              rhsPacked,
                              0, &params);
    } else {
        kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0(1, n, k, getNr(), getKr(), getSr(),
                                                (const uint8_t *)rhs,
                                                (const float *)bias, (const float *)scale,
                                                rhsPacked,
                                                0, &params);
    }
}

//Matmul
// Quantized matmul: dst (m x n, f32) from the packed LHS and RHS, with clamp
// bounds [-FLT_MAX, FLT_MAX] (effectively unclamped). dst_stride is the
// destination row stride in bytes; sizeof(float) is the column stride.
// m == 1 runs the dotprod GEMV kernel, anything larger the i8mm GEMM kernel.
// NOTE(review): the i8mm path is taken for any m > 1; callers are expected
// to have gated on canAccelerate() (which requires both sdot and i8mm)
// before reaching here -- confirm.
void KleidiAI::runMatmul(size_t m, size_t n, size_t k, const void* lhsPacked, const void* rhsPacked, size_t dst_stride, void* dst) {
    if(m == 1) { //dotprod
        kai_run_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod(m, n, k,
            (const void *)lhsPacked, (const void *)rhsPacked, (float *)dst,
            dst_stride, sizeof(float), -FLT_MAX, FLT_MAX);
    } else { //i8mm
        kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm(m, n, k,
            (const void *)lhsPacked, (const void *)rhsPacked, (float *)dst,
            dst_stride, sizeof(float), -FLT_MAX, FLT_MAX);
    }
}

#endif // defined(__aarch64__)
108 changes: 108 additions & 0 deletions source/backend/cpu/arm/kleidiAI/MNNKleidiAI.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
#pragma once

#include <MNN/ErrorCode.hpp>
#include <core/Backend.hpp>
#include <core/Execution.hpp>
#include <core/TensorUtils.hpp>
#include <backend/cpu/CPUBackend.hpp>
#include <backend/cpu/CPURuntime.hpp>

#include <arm_neon.h>
#include <assert.h>
#include <cfloat>
#include <stdint.h>
#include <string.h>
#include <vector>

#include "kai_lhs_quant_pack_qai8dxp_f32.h"
#include "kai_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0.h"
#include "kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod.h"
#include "kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm.h"

#include "./kai/kai_common.h"

namespace MNN {
// Facade between MNN and the KleidiAI micro-kernel library: exposes the
// kernel blocking parameters (mr/nr/kr/sr), LHS/RHS packing, and the
// quantized matmul, as a lazily-created singleton.
class KleidiAI {
public:
    // Return the process-wide instance, creating it on first call with the
    // given quantization symmetry.
    // NOTE(review): bAsymmetric is silently ignored when the instance
    // already exists, and the lazy init is not synchronized -- confirm
    // single-threaded creation and that all callers agree on the flag.
    static KleidiAI &getInstance(bool bAsymmetric) {
        if(!instance) {
            instance = new KleidiAI(bAsymmetric);
        }
        return *instance;
    }

    // As above, defaulting to a symmetric quantized model.
    static KleidiAI &getInstance() {
        if(!instance) {
            instance = new KleidiAI;
        }
        return *instance;
    }

    // The singleton is never deleted; it lives for the whole process.
    ~KleidiAI() {}

    // Feature/configuration flags gathered at construction and via setters.
    struct KAIInfo {
        bool kaiEnable = false;
        bool asymmetric = false; //Asymmetric quantized model.
        bool dot = false; //CPU support sdot.
        bool i8mm = false; //CPU support i8mm.
    };

    //Kai util
    // In-place layout conversion between NCHW rows and NC4HW4 blocks
    // (rowNum rows of rowSize floats each; rowSize is treated as a multiple
    // of 4 -- remainder columns are not moved).
    void packNCHWToNC4HW4(float* data, size_t rowNum, size_t rowSize);
    void packNC4HW4ToNCHW(float* data, size_t rowNum, size_t rowSize);

    //Set info
    void setEnable(bool enable) { mKAIInfo.kaiEnable = enable; }
    void setModelAsymmetric(bool bAsymmetric) { mKAIInfo.asymmetric = bAsymmetric; }

    //Check
    // Acceleration requires: enabled + sdot + i8mm CPU support + a
    // symmetric quantized model.
    bool canAccelerate() { return (mKAIInfo.kaiEnable && mKAIInfo.dot && mKAIInfo.i8mm && !mKAIInfo.asymmetric); }

    //Get info
    // m == 1 selects the dotprod (GEMV) kernel parameters, otherwise i8mm.
    size_t getMr(size_t m = 1) { return (m == 1) ? mKaiMrDotprod : mKaiMrI8mm; }
    size_t getNr() { return mKaiNr; }
    size_t getKr() { return mKaiKr; }
    size_t getSr() { return mKaiSr; }
    size_t getMStep(size_t m = 1) { return (m == 1) ? mKaiMstepDotprod : mKaiMstepI8mm; }
    size_t getNStep() { return mKaiNStep; }
    // Per-thread vector count, rounded up to a multiple of minStep.
    // NOTE(review): totalVec is integer-divided before rounding, so the
    // per-thread shares may not cover totalVec when it is not an exact
    // multiple -- confirm callers handle the tail.
    size_t getVecNumPerThread(size_t totalVec, size_t totalThread, size_t minStep) { return kai_roundup(totalVec / totalThread, minStep); }

    //Lhs
    size_t getLhsQuantedPackedSize(size_t m, size_t k);
    size_t getLhsQuantedPackedOffset(size_t m, size_t mIdx, size_t k);
    void runLhsQuantPack(size_t m, size_t k, const void* lhs, void* lhsQuantedPacked);

    //Rhs
    size_t getRhsPackedSize(size_t n, size_t k);
    size_t getRhsPackedOffset(size_t nIdx, size_t k);
    void runRhsPack(size_t n, size_t k, const void* rhs, const void* scale, const void *bias, void* rhsPacked, bool packedInt4 = false);

    //Dst
    // Byte offset of element (mIdx, nIdx) in the row-major m x n f32 output.
    size_t getDstOffset(size_t mIdx, size_t nIdx, size_t n) { return (nIdx * sizeof(float)) + mIdx * (n * sizeof(float)); }

    //Matmul
    void runMatmul(size_t m, size_t n, size_t k, const void* lhsPacked, const void* rhsPacked, size_t dst_stride, void* dst);

private:
    // Queries CPU features (sdot/i8mm) once at construction.
    KleidiAI(bool bAsymmetric = false) {
        const MNNCPUInfo& gCPUInfo = *MNNGetCPUInfo();
        mKAIInfo.dot = gCPUInfo.dot;
        mKAIInfo.i8mm = gCPUInfo.i8mm;
        mKAIInfo.kaiEnable = true;
        mKAIInfo.asymmetric = bAsymmetric;
    }

    static KleidiAI *instance; // defined in MNNKleidiAI.cpp
    KAIInfo mKAIInfo;

    // Kernel blocking parameters: dotprod GEMV (m == 1) vs i8mm GEMM.
    const size_t mKaiMstepDotprod = 1;
    const size_t mKaiMstepI8mm = 8;
    const size_t mKaiNStep = 4;

    const size_t mKaiMrDotprod = 1;
    const size_t mKaiMrI8mm = 4;
    const size_t mKaiNr = 4;
    const size_t mKaiKr = 16;
    const size_t mKaiSr = 2;
};
}
Loading

0 comments on commit 19b399b

Please sign in to comment.