From ec2aef52e940286f9327e3efadf854c25966dc20 Mon Sep 17 00:00:00 2001 From: Marko Tasic Date: Mon, 29 Jul 2024 17:49:55 +0200 Subject: [PATCH] Added: - Support for default CPU tinyBLAS (llamafile, sgemm) builds - Support for CPU OpenBLAS (GGML_OPENBLAS) builds --- CHANGELOG.md | 4 ++ pyproject.toml | 8 +-- scripts/build.py | 146 +++++++++++++++++++++++++++-------------------- 3 files changed, 93 insertions(+), 65 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ffa0193..860c107 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,10 @@ ## v0.1.9 +Added: + - Support for default CPU tinyBLAS (llamafile, sgemm) builds + - Support for CPU OpenBLAS (GGML_OPENBLAS) builds + Changed: - Build scripts now have separate step/function `cuda_12_5_1_setup` which setups CUDA 12.5.1 env for build-time. diff --git a/pyproject.toml b/pyproject.toml index f5e1fcd..0dfd971 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,15 +51,15 @@ pip install poetry if [[ $AUDITWHEEL_PLAT == manylinux* ]]; then dnf update -y dnf install -y epel-release - dnf install -y unzip p7zip p7zip-plugins + dnf install -y unzip p7zip p7zip-plugins openblas openblas-devel else - apk -U add upx unzip 7zip + apk -U add upx unzip 7zip openblas-dev apk add --repository=https://dl-cdn.alpinelinux.org/alpine/v3.16/main/ libexecinfo-dev fi """ # skip = ["cp36-*", "cp37-*", "cp38-*", "cp39-*", "pp37-*", "pp38-*", "pp39-*", "*i686"] -skip = ["cp36-*", "cp37-*", "cp38-*", "cp39-*", "pp37-*", "pp38-*", "pp39-*", "*i686", "*manylinux*"] -# skip = ["cp36-*", "cp37-*", "cp38-*", "cp39-*", "cp310-*", "cp311-*", "pp37-*", "pp38-*", "pp39-*", "pp310-*", "*i686"] +# skip = ["cp36-*", "cp37-*", "cp38-*", "cp39-*", "pp37-*", "pp38-*", "pp39-*", "*i686", "*manylinux*"] +skip = ["cp36-*", "cp37-*", "cp38-*", "cp39-*", "cp310-*", "cp311-*", "pp37-*", "pp38-*", "pp39-*", "pp310-*", "*i686"] manylinux-x86_64-image = "quay.io/pypa/manylinux_2_28_x86_64:latest" manylinux-aarch64-image = 
"quay.io/pypa/manylinux_2_28_aarch64:latest" manylinux-pypy_x86_64-image = "quay.io/pypa/manylinux_2_28_x86_64:latest" diff --git a/scripts/build.py b/scripts/build.py index 37caf8b..73765c8 100644 --- a/scripts/build.py +++ b/scripts/build.py @@ -9,6 +9,11 @@ from clean import clean_llama, clean_llama_cpp, clean +# if 'PYODIDE' in env and env['PYODIDE'] == '1': +# env['CXXFLAGS'] += ' -msimd128 -fno-rtti -DNDEBUG -flto=full -s INITIAL_MEMORY=2GB -s MAXIMUM_MEMORY=4GB -s ALLOW_MEMORY_GROWTH ' +# env['UNAME_M'] = 'wasm' + + def clone_llama_cpp(): subprocess.run(['git', 'clone', 'https://github.com/ggerganov/llama.cpp.git'], check=True) subprocess.run(['patch', 'llama.cpp/examples/main/main.cpp', 'main_3.patch'], check=True) @@ -71,11 +76,7 @@ def build_cpu(*args, **kwargs): # build static and shared library env = os.environ.copy() env['CXXFLAGS'] = '-O3' - - # if 'PYODIDE' in env and env['PYODIDE'] == '1': - # env['CXXFLAGS'] += ' -msimd128 -fno-rtti -DNDEBUG -flto=full -s INITIAL_MEMORY=2GB -s MAXIMUM_MEMORY=4GB -s ALLOW_MEMORY_GROWTH ' - # env['UNAME_M'] = 'wasm' - + print('build_cpu:') pprint(env) # @@ -88,7 +89,6 @@ def build_cpu(*args, **kwargs): '-j', 'llama-cli-static', 'GGML_NO_OPENMP=1', - 'GGML_NO_LLAMAFILE=1', ], check=True, env=env) # @@ -132,21 +132,74 @@ def build_cpu(*args, **kwargs): shutil.move(file, 'llama/') -def build_linux_cuda_12_5(*args, **kwargs): +def build_cpu_openblas(*args, **kwargs): # build static and shared library env = os.environ.copy() + env['CXXFLAGS'] = '-O3' + print('build_cpu_openblas:') + pprint(env) - # if 'PYODIDE' in env and env['PYODIDE'] == '1': - # env['CXXFLAGS'] += ' -msimd128 -fno-rtti -DNDEBUG -flto=full -s INITIAL_MEMORY=2GB -s MAXIMUM_MEMORY=4GB -s ALLOW_MEMORY_GROWTH ' - # env['UNAME_M'] = 'wasm' + # + # build llama.cpp + # + subprocess.run([ + 'make', + '-C', + 'llama.cpp', + '-j', + 'llama-cli-static', + 'GGML_NO_OPENMP=1', + 'GGML_OPENBLAS=1', + ], check=True, env=env) + + # + # cffi + # + ffibuilder = FFI() + + 
ffibuilder.cdef(''' + typedef void (*_llama_yield_token_t)(const char * token); + typedef int (*_llama_should_stop_t)(void); + int _llama_cli_main(int argc, char ** argv, _llama_yield_token_t _llama_yield_token, _llama_should_stop_t _llama_should_stop, int stop_on_bos_eos_eot); + ''') + + ffibuilder.set_source( + '_llama_cli_cpu_openblas', + ''' + #include + + typedef void (*_llama_yield_token_t)(const char * token); + typedef int (*_llama_should_stop_t)(void); + int _llama_cli_main(int argc, char ** argv, _llama_yield_token_t _llama_yield_token, _llama_should_stop_t _llama_should_stop, int stop_on_bos_eos_eot); + ''', + libraries=['stdc++'], + extra_objects=['../llama.cpp/llama_cli.a'], + extra_compile_args=['-O3'], + extra_link_args=['-O3', '-flto'], + ) + + ffibuilder.compile(tmpdir='build', verbose=True) + + # + # copy compiled modules + # + for file in glob.glob('build/*.so') + glob.glob('llama.cpp/*.so'): + shutil.move(file, 'llama/') + + for file in glob.glob('build/*.dll') + glob.glob('llama.cpp/*.dll'): + shutil.move(file, 'llama/') + + for file in glob.glob('build/*.dylib') + glob.glob('llama.cpp/*.dylib'): + shutil.move(file, 'llama/') + + +def build_linux_cuda_12_5(*args, **kwargs): + # build static and shared library + env = os.environ.copy() # # cuda env # - # cuda_file = 'cuda_12.5.1_555.42.06_linux.run' - # cuda_url = f'https://developer.download.nvidia.com/compute/cuda/12.5.1/local_installers/{cuda_file}' - # cuda_output_dir = os.path.abspath('./cuda-12.5.1') - # cuda_file_path = os.path.join(cuda_output_dir, cuda_file) cuda_output_dir = cuda_12_5_1_setup() env['PATH'] = f'{cuda_output_dir}/dist/bin:{env["PATH"]}' @@ -161,48 +214,9 @@ def build_linux_cuda_12_5(*args, **kwargs): -gencode arch=compute_89,code=sm_89 \ -gencode arch=compute_90,code=sm_90' + print('build_linux_cuda_12_5:') pprint(env) - # # download cuda file - # if not os.path.exists(cuda_file_path): - # cmd = ['mkdir', '-p', f'{cuda_output_dir}'] - # - # subprocess.run(cmd, 
check=True) - # subprocess.run(['curl', '-o', cuda_file_path, cuda_url], check=True) - - # # extract cuda file - # cmd = ['chmod', '+x', f'{cuda_output_dir}/{cuda_file}'] - # subprocess.run(cmd, check=True) - # - # cmd = [ - # f'{cuda_output_dir}/{cuda_file}', - # '--tar', - # 'mxf', - # '--wildcards', - # './builds/cuda_cccl/*', - # './builds/cuda_cudart/*', - # './builds/cuda_nvcc/*', - # './builds/libcublas/*', - # '-C', - # cuda_output_dir, - # ] - # subprocess.run(cmd, cwd=cuda_output_dir, check=True) - # - # cmd = ['mkdir', '-p', f'{cuda_output_dir}/dist'] - # subprocess.run(cmd, check=True) - # - # cmd = f'cp -r {cuda_output_dir}/builds/cuda_cccl/* {cuda_output_dir}/dist' - # subprocess.run(cmd, shell=True, check=True) - # - # cmd = f'cp -r {cuda_output_dir}/builds/cuda_cudart/* {cuda_output_dir}/dist' - # subprocess.run(cmd, shell=True, check=True) - # - # cmd = f'cp -r {cuda_output_dir}/builds/cuda_nvcc/* {cuda_output_dir}/dist' - # subprocess.run(cmd, shell=True, check=True) - # - # cmd = f'cp -r {cuda_output_dir}/builds/libcublas/* {cuda_output_dir}/dist' - # subprocess.run(cmd, shell=True, check=True) - # # build llama.cpp # @@ -213,7 +227,6 @@ def build_linux_cuda_12_5(*args, **kwargs): '-j', 'llama-cli-static', 'GGML_NO_OPENMP=1', - 'GGML_NO_LLAMAFILE=1', 'GGML_CUDA=1', ], check=True, env=env) @@ -271,19 +284,30 @@ def build_linux_cuda_12_5(*args, **kwargs): def build(*args, **kwargs): + env = os.environ.copy() + env['GGML_CPU'] = env.get('GGML_CPU', '1') + env['GGML_OPENBLAS'] = env.get('GGML_OPENBLAS', '0') + env['GGML_CUDA'] = env.get('GGML_CUDA', '0') + # clean, clone clean() clone_llama_cpp() - # cuda 12.5 - if os.environ.get('AUDITWHEEL_POLICY') in ('manylinux2014', 'manylinux_2_28', None) and os.environ.get('AUDITWHEEL_ARCH') in ('x86_64', None): + # cpu + if env.get('GGML_CPU', '1') != '0': clean_llama_cpp() - build_linux_cuda_12_5(*args, **kwargs) + build_cpu(*args, **kwargs) - # cpu - clean_llama_cpp() - build_cpu(*args, **kwargs) + # openblas + if env.get('GGML_OPENBLAS', '1') != '0': + clean_llama_cpp() + 
build_cpu_openblas(*args, **kwargs) + # cuda 12.5 + if env.get('GGML_CUDA', '1') != '0': + if env.get('AUDITWHEEL_POLICY') in ('manylinux2014', 'manylinux_2_28', None) and env.get('AUDITWHEEL_ARCH') in ('x86_64', None): + clean_llama_cpp() + build_linux_cuda_12_5(*args, **kwargs) if __name__ == '__main__': build()