
Commit

fix: merge issues
JoanFM committed Jun 6, 2024
2 parents a8a64fd + f5d7b26 commit 4dc0fe9
Showing 124 changed files with 1,806 additions and 1,782 deletions.
2 changes: 1 addition & 1 deletion .devops/full-cuda.Dockerfile
@@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build
 ARG CUDA_DOCKER_ARCH=all
 
 RUN apt-get update && \
-    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev
+    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
 
 COPY requirements.txt requirements.txt
 COPY requirements requirements
2 changes: 1 addition & 1 deletion .devops/full.Dockerfile
@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
 FROM ubuntu:$UBUNTU_VERSION as build
 
 RUN apt-get update && \
-    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev
+    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
 
 COPY requirements.txt requirements.txt
 COPY requirements requirements
5 changes: 4 additions & 1 deletion .devops/main-cuda.Dockerfile
@@ -23,10 +23,13 @@ ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
 # Enable CUDA
 ENV LLAMA_CUDA=1
 
-RUN make -j$(nproc)
+RUN make -j$(nproc) main
 
 FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
 
+RUN apt-get update && \
+    apt-get install -y libgomp1
+
 COPY --from=build /app/main /main
 
 ENTRYPOINT [ "/main" ]
2 changes: 1 addition & 1 deletion .devops/main-rocm.Dockerfile
@@ -40,6 +40,6 @@ ENV LLAMA_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++
 
-RUN make -j$(nproc)
+RUN make -j$(nproc) main
 
 ENTRYPOINT [ "/app/main" ]
2 changes: 1 addition & 1 deletion .devops/main-vulkan.Dockerfile
@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=jammy
 FROM ubuntu:$UBUNTU_VERSION as build
 
 # Install build tools
-RUN apt update && apt install -y git build-essential cmake wget
+RUN apt update && apt install -y git build-essential cmake wget libgomp1
 
 # Install Vulkan SDK
 RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
5 changes: 4 additions & 1 deletion .devops/main.Dockerfile
@@ -9,10 +9,13 @@ WORKDIR /app

 COPY . .
 
-RUN make -j$(nproc)
+RUN make -j$(nproc) main
 
 FROM ubuntu:$UBUNTU_VERSION as runtime
 
+RUN apt-get update && \
+    apt-get install -y libgomp1
+
 COPY --from=build /app/main /main
 
 ENV LC_ALL=C.utf8
4 changes: 2 additions & 2 deletions .devops/server-cuda.Dockerfile
@@ -25,12 +25,12 @@ ENV LLAMA_CUDA=1
 # Enable cURL
 ENV LLAMA_CURL=1
 
-RUN make -j$(nproc)
+RUN make -j$(nproc) server
 
 FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
 
 RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev
+    apt-get install -y libcurl4-openssl-dev libgomp1
 
 COPY --from=build /app/server /server

4 changes: 2 additions & 2 deletions .devops/server.Dockerfile
@@ -11,12 +11,12 @@ COPY . .

 ENV LLAMA_CURL=1
 
-RUN make -j$(nproc)
+RUN make -j$(nproc) server
 
 FROM ubuntu:$UBUNTU_VERSION as runtime
 
 RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev
+    apt-get install -y libcurl4-openssl-dev libgomp1
 
 COPY --from=build /app/server /server

4 changes: 4 additions & 0 deletions CMakeLists.txt
@@ -416,6 +416,8 @@ if (LLAMA_CUDA)
     list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")
     file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu")
     list(APPEND GGML_SOURCES_CUDA ${SRCS})
+    file(GLOB SRCS "ggml-cuda/template-instances/mmq*.cu")
+    list(APPEND GGML_SOURCES_CUDA ${SRCS})
 
     add_compile_definitions(GGML_USE_CUDA)
     add_compile_definitions(GGML_CUDA_USE_GRAPHS)
@@ -588,6 +590,8 @@ if (LLAMA_HIPBLAS)
     list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu")
     file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu")
     list(APPEND GGML_SOURCES_ROCM ${SRCS})
+    file(GLOB SRCS "ggml-cuda/template-instances/mmq*.cu")
+    list(APPEND GGML_SOURCES_ROCM ${SRCS})
 
     add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA)

1 change: 1 addition & 0 deletions Makefile
@@ -444,6 +444,7 @@ ifdef LLAMA_CUBLAS
 endif
 
 OBJS_CUDA_TEMP_INST = $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-wmma*.cu))
+OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/mmq*.cu))
 ifdef LLAMA_CUDA_FA_ALL_QUANTS
 OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*.cu))
 else
3 changes: 2 additions & 1 deletion convert-hf-to-gguf-update.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+# -*- coding: utf-8 -*-
 
 # This script downloads the tokenizer models of the specified models from Huggingface and
 # generates the get_vocab_base_pre() function for convert-hf-to-gguf.py
@@ -82,7 +83,7 @@ class TOKENIZER_TYPE(IntEnum):
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
{"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
{"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
{"name": "jina-v2-zh", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-zh", },
]

2 changes: 2 additions & 0 deletions convert-hf-to-gguf.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+# -*- coding: utf-8 -*-
 
 from __future__ import annotations

@@ -480,6 +481,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "c7699093ba4255a91e702aa38a596aa81669f3525dae06c2953267dde580f448":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-zh
             res = "jina-v2-zh"
+
         if res is None:
             logger.warning("\n")
             logger.warning("**************************************************************************************")
6 changes: 6 additions & 0 deletions ggml-common.h
@@ -123,12 +123,18 @@ typedef sycl::half2 ggml_half2;
 #define QI1_S (QK_K / (4*QR1_S))
 #define QR1_S 8
 
+#define QI1_M (QK_K / (4*QR1_M))
+#define QR1_M 8
+
 #define QI4_NL (QK4_NL / (4*QR4_NL))
 #define QR4_NL 2
 
 #define QI4_XS (QK_K / (4*QR4_XS))
 #define QR4_XS 8
 
+#define QI3_S (QK_K / (4*QR3_S))
+#define QR3_S 8
+
 #endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
 
 #define QK4_0 32
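A quick worked example of the arithmetic behind the two macro pairs added above (a standalone sketch, not part of the patch, assuming the usual K-quant super-block size QK_K of 256):

// Standalone sketch: mirror the added macros with QK_K assumed to be 256.
#define QK_K  256
#define QR1_M 8
#define QI1_M (QK_K / (4*QR1_M))   // 256 / 32 == 8
#define QR3_S 8
#define QI3_S (QK_K / (4*QR3_S))   // 256 / 32 == 8

static_assert(QI1_M == 8 && QI3_S == 8, "QI1_M and QI3_S both evaluate to 8");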
84 changes: 9 additions & 75 deletions ggml-cuda.cu
@@ -633,88 +633,22 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {

 // cuda split buffer
 
-static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
-    int64_t min_compute_capability = INT_MAX;
-    int64_t max_compute_capability = INT_MIN;
+static int64_t get_row_rounding(const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
+    int64_t row_rounding = 0;
     for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
-        if (tensor_split[id] < (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) {
-            if (min_compute_capability > ggml_cuda_info().devices[id].cc) {
-                min_compute_capability = ggml_cuda_info().devices[id].cc;
-            }
-            if (max_compute_capability < ggml_cuda_info().devices[id].cc) {
-                max_compute_capability = ggml_cuda_info().devices[id].cc;
-            }
+        if (tensor_split[id] >= (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) {
+            continue;
         }
-    }
 
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    switch(type) {
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-            return max_compute_capability >= CC_RDNA2 ? 128 : 64;
-        case GGML_TYPE_F16:
-        case GGML_TYPE_F32:
-            return 1;
-        case GGML_TYPE_Q2_K:
-            return max_compute_capability >= CC_RDNA2 ? 128 : 32;
-        case GGML_TYPE_Q3_K:
-            return min_compute_capability < CC_RDNA2 ? 128 : 64;
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-        case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_IQ3_XXS:
-        case GGML_TYPE_IQ1_S:
-        case GGML_TYPE_IQ1_M:
-        case GGML_TYPE_IQ4_NL:
-        case GGML_TYPE_IQ4_XS:
-        case GGML_TYPE_IQ3_S:
-            return max_compute_capability >= CC_RDNA2 ? 128 : 64;
-        default:
-            GGML_ASSERT(false);
-    }
-#else
-    switch(type) {
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-            return max_compute_capability >= CC_VOLTA ? 128 : 64;
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-            return 64;
-        case GGML_TYPE_F16:
-        case GGML_TYPE_F32:
-            return 1;
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-        case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_IQ3_XXS:
-        case GGML_TYPE_IQ1_S:
-        case GGML_TYPE_IQ1_M:
-        case GGML_TYPE_IQ4_NL:
-        case GGML_TYPE_IQ4_XS:
-        case GGML_TYPE_IQ3_S:
-            return max_compute_capability >= CC_VOLTA ? 128 : 64;
-        case GGML_TYPE_Q6_K:
-            return 64;
-        default:
-            GGML_ASSERT(false);
+        const int cc = ggml_cuda_info().devices[id].cc;
+        row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc, get_mmq_x_max_host(cc)));
     }
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    return row_rounding;
 }
 
 static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split, int id) {
     const int64_t nrows = ggml_nrows(tensor);
-    const int64_t rounding = get_row_rounding(tensor->type, tensor_split);
+    const int64_t rounding = get_row_rounding(tensor_split);
 
     *row_low = id == 0 ? 0 : nrows*tensor_split[id];
     *row_low -= *row_low % rounding;
@@ -1499,7 +1433,7 @@ static void ggml_cuda_op_mul_mat(
         // for multi GPU, get the row boundaries from tensor split
         // and round to mul_mat_q tile sizes
         if (split) {
-            const int64_t rounding = get_row_rounding(src0->type, tensor_split);
+            const int64_t rounding = get_row_rounding(tensor_split);
 
             if (id != 0) {
                 dev[id].row_low = ne01*tensor_split[id];
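To make the ggml-cuda.cu change above easier to follow: the old per-quantization-type switch is gone, and the row rounding for multi-GPU tensor splits is now simply the largest MMQ tile height across the devices that receive rows. Below is a minimal, self-contained sketch of that logic; the device query and the MMQ helpers (get_mmq_y_host / get_mmq_x_max_host in the real code) are replaced by illustrative stand-ins with made-up values.

#include <algorithm>
#include <array>
#include <cstdint>

constexpr int GGML_CUDA_MAX_DEVICES = 16;

// Illustrative stand-ins for ggml_cuda_info() and the MMQ tile helpers;
// the values are placeholders, not what llama.cpp actually reports.
static int device_count()               { return 2; }
static int device_cc(int /*id*/)        { return 700; }
static int mmq_x_max(int cc)            { return cc >= 700 ? 128 : 64; }
static int mmq_y(int /*cc*/, int mmq_x) { return mmq_x; }

// Same shape as the new get_row_rounding(): the rounding is the largest MMQ
// tile height over all devices that receive a non-empty slice of the split.
static int64_t get_row_rounding(const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
    int64_t row_rounding = 0;
    for (int id = 0; id < device_count(); ++id) {
        // Device id owns the rows between tensor_split[id] and the next fraction;
        // skip it if that range is empty.
        if (tensor_split[id] >= (id + 1 < device_count() ? tensor_split[id + 1] : 1.0f)) {
            continue;
        }
        const int cc = device_cc(id);
        row_rounding = std::max<int64_t>(row_rounding, mmq_y(cc, mmq_x_max(cc)));
    }
    return row_rounding;
}

get_row_split() then rounds each device's row boundary down to a multiple of this value (*row_low -= *row_low % rounding), so every GPU's slice starts on a mul_mat_q tile boundary regardless of the tensor's quantization type.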