Skip to content

Commit

Permalink
Merge branch 'ggerganov:master' into master
Browse files Browse the repository at this point in the history
  • Loading branch information
sealad886 authored Jun 11, 2024
2 parents 1a4974b + 4bfe50f commit 563064f
Show file tree
Hide file tree
Showing 4 changed files with 367 additions and 12 deletions.
9 changes: 5 additions & 4 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ on:
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
pull_request:
types: [opened, synchronize, reopened]
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m']

concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
Expand Down Expand Up @@ -684,7 +684,7 @@ jobs:
cmake --build build --config ${{ matrix.build }} -j $(nproc)
windows-latest-cmake:
runs-on: windows-latest
runs-on: windows-2019

env:
OPENBLAS_VERSION: 0.3.23
Expand Down Expand Up @@ -829,7 +829,7 @@ jobs:
name: llama-bin-win-${{ matrix.build }}.zip

windows-latest-cmake-cuda:
runs-on: windows-latest
runs-on: windows-2019

strategy:
matrix:
Expand All @@ -843,8 +843,9 @@ jobs:
with:
fetch-depth: 0

- uses: Jimver/cuda-toolkit@v0.2.11
- name: Install CUDA toolkit
id: cuda-toolkit
uses: Jimver/cuda-toolkit@v0.2.15
with:
cuda: ${{ matrix.cuda }}
method: 'network'
Expand Down
66 changes: 66 additions & 0 deletions ggml-cuda/mma.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,27 @@
#include "common.cuh"

// Per-thread fragment of the A operand with extents I = 16 and K = 4.
// Each thread keeps ne = 2 ints in x; get_i()/get_k() translate a local element
// index l (together with threadIdx.x) into coordinates inside the I x K fragment.
// NOTE(review): presumably every int packs four int8 values, since the mma
// instructions that consume this type take s8 operands -- confirm against callers.
struct mma_int_A_I16K4 {
    static constexpr int I  = 16;
    static constexpr int K  = 4;
    static constexpr int ne = 2;

    int x[ne] = {0};

    // Coordinate along I of local element l: odd l selects the upper half (offset I/2),
    // threadIdx.x / K selects the position inside that half.
    static __device__ __forceinline__ int get_i(const int l) {
        const int i = threadIdx.x / K + (l%2) * (I/2);
        GGML_CUDA_ASSUME(i >= 0);
        GGML_CUDA_ASSUME(i <  I);
        return i;
    }

    // Coordinate along K; identical for every local element of a thread.
    static __device__ __forceinline__ int get_k(const int /* l */) {
        const int k = threadIdx.x % K;
        GGML_CUDA_ASSUME(k >= 0);
        GGML_CUDA_ASSUME(k <  K);
        return k;
    }
};

struct mma_int_A_I16K8 {
static constexpr int I = 16;
static constexpr int K = 8;
Expand All @@ -22,6 +44,28 @@ struct mma_int_A_I16K8 {
}
};

// Per-thread fragment of the B operand with extents J = 8 and K = 4.
// Each thread keeps a single int (ne = 1) in x; get_j()/get_k() derive that
// element's coordinates inside the J x K fragment from threadIdx.x alone.
// NOTE(review): presumably the int packs four int8 values, since the mma
// instructions that consume this type take s8 operands -- confirm against callers.
struct mma_int_B_J8K4 {
    static constexpr int J  = 8;
    static constexpr int K  = 4;
    static constexpr int ne = 1;

    int x[ne] = {0};

    // Coordinate along J; the local element index is ignored because ne == 1.
    static __device__ __forceinline__ int get_j(const int /* l */) {
        const int j = threadIdx.x / K;
        GGML_CUDA_ASSUME(j >= 0);
        GGML_CUDA_ASSUME(j <  J);
        return j;
    }

    // Coordinate along K; the local element index is ignored because ne == 1.
    static __device__ __forceinline__ int get_k(const int /* l */) {
        const int k = threadIdx.x % K;
        GGML_CUDA_ASSUME(k >= 0);
        GGML_CUDA_ASSUME(k <  K);
        return k;
    }
};

struct mma_int_B_J8K8 {
static constexpr int J = 8;
static constexpr int K = 8;
Expand Down Expand Up @@ -65,6 +109,28 @@ struct mma_int_C_I16J8 {
return ret;
}

// Multiply-accumulate: adds the product of an A fragment (I16K4) and a B fragment (J8K4)
// to this fragment's accumulators x[0..3] in place, using mma.sync PTX instructions with
// s8 inputs and s32 accumulation.
//
// mma_A: A operand; both of its ints (x[0], x[1]) are consumed.
// mma_B: B operand; its single int (x[0]) is consumed.
//
// Real code is only emitted when INT8_MMA_AVAILABLE is defined; otherwise the arguments
// are marked unused and NO_DEVICE_CODE is executed.
// NOTE(review): mma.sync.aligned is a warp-collective instruction -- presumably all
// threads of the warp must reach this call together; confirm at the call sites.
__device__ __forceinline__ void mma_K4(const mma_int_A_I16K4 & mma_A, const mma_int_B_J8K4 & mma_B) {
#ifdef INT8_MMA_AVAILABLE
#if __CUDA_ARCH__ >= CC_AMPERE
// Single m16n8k16 instruction: x[0..3] are read-write ("+r") and serve as both the
// accumulator input and the result; A supplies two registers, B supplies one.
asm("mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32 {%0, %1, %2, %3}, {%4, %5}, {%6}, {%0, %1, %2, %3};"
: "+r"(x[0]), "+r"(x[1]), "+r"(x[2]), "+r"(x[3])
: "r"(mma_A.x[0]), "r"(mma_A.x[1]), "r"(mma_B.x[0]));
#else
// On Turing m16n8k16 mma is not available, use 2x m8n8k16 mma instead:
// First half: A register x[0] with B updates accumulators x[0], x[1].
asm("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 {%0, %1}, {%2}, {%3}, {%0, %1};"
: "+r"(x[0]), "+r"(x[1])
: "r"(mma_A.x[0]), "r"(mma_B.x[0]));
// Second half: A register x[1] with the same B updates accumulators x[2], x[3].
asm("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 {%0, %1}, {%2}, {%3}, {%0, %1};"
: "+r"(x[2]), "+r"(x[3])
: "r"(mma_A.x[1]), "r"(mma_B.x[0]));
#endif // __CUDA_ARCH__ >= CC_AMPERE
#else
// No int8 mma support for this build target: silence unused-parameter warnings and
// fall through to the project's NO_DEVICE_CODE handler.
GGML_UNUSED(mma_A);
GGML_UNUSED(mma_B);
NO_DEVICE_CODE;
#endif // INT8_MMA_AVAILABLE
}

__device__ __forceinline__ void mma_K8(const mma_int_A_I16K8 & mma_A, const mma_int_B_J8K8 & mma_B) {
#ifdef INT8_MMA_AVAILABLE
#if __CUDA_ARCH__ >= CC_AMPERE
Expand Down
Loading

0 comments on commit 563064f

Please sign in to comment.