diff --git a/.gitignore b/.gitignore index 63fc1fd..cc43593 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +# Build +build + # Test input tests/add_* tests/sub_* diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..e8c4c2d --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,50 @@ +cmake_minimum_required(VERSION 3.8) +project(cuda-fixnum CXX CUDA) + +set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CUDA_STANDARD 11) +set(EXECUTABLE_OUTPUT_PATH "${PROJECT_BINARY_DIR}/bin") + +# CUDA +find_package(CUDA REQUIRED) +set(CUDA_ARCH_LIST Auto CACHE LIST + "List of CUDA architectures (e.g. Pascal, Volta, etc) or \ + compute capability versions (6.1, 7.0, etc) to generate code for. \ + Set to Auto for automatic detection (default)." +) +cuda_select_nvcc_arch_flags(CUDA_ARCH_FLAGS ${CUDA_ARCH_LIST}) +string(REPLACE ";" " " CUDA_ARCH_FLAGS_SPACES "${CUDA_ARCH_FLAGS}") + +string(APPEND CMAKE_CUDA_FLAGS " ${CUDA_ARCH_FLAGS_SPACES}") +string(APPEND CMAKE_CUDA_FLAGS " -Xcompiler -Wall,-Wextra") +set(CMAKE_CUDA_FLAGS_DEBUG "-g -G") + +# source +include_directories(src) + +# bench +add_executable(bench bench/bench.cu) +target_compile_definitions(bench PUBLIC -DNDEBUG) +target_link_libraries(bench -lstdc++) + +# test-suite +find_package(GTest) +if(GTEST_FOUND) + add_executable(check tests/test-suite.cu) + target_link_libraries(check -lstdc++ -L${GTEST_INCLUDE_DIR} -lgtest) + target_compile_options(check PUBLIC -lineinfo) +else() + message(WARNING "GTest libraries not found - not building test-suite") +endif() + +# FIXME: Add test runs +# enable_testing() +# add_test(NAME mytest +# COMMAND mytest) + +# DEBUG CMAKE Variables +#get_cmake_property(_variableNames VARIABLES) +#list (SORT _variableNames) +#foreach (_variableName ${_variableNames}) +# message(STATUS "${_variableName}=${${_variableName}}") +#endforeach() diff --git a/Makefile b/Makefile deleted file mode 100644 index 6136203..0000000 --- a/Makefile +++ /dev/null @@ -1,28 +0,0 @@ -CXX ?= g++ -GENCODES ?= 50 
- -INCLUDE_DIRS = -I./src -NVCC_FLAGS = -ccbin $(CXX) -std=c++11 -Xcompiler -Wall,-Wextra -NVCC_OPT_FLAGS = -DNDEBUG -NVCC_TEST_FLAGS = -lineinfo -NVCC_DBG_FLAGS = -g -G -NVCC_LIBS = -lstdc++ -NVCC_TEST_LIBS = -lgtest - -all: - @echo "Please run 'make check' or 'make bench'." - -tests/test-suite: tests/test-suite.cu - nvcc $(NVCC_TEST_FLAGS) $(NVCC_FLAGS) $(GENCODES:%=--gpu-architecture=compute_%) $(GENCODES:%=--gpu-code=sm_%) $(INCLUDE_DIRS) $(NVCC_LIBS) $(NVCC_TEST_LIBS) -o $@ $< - -check: tests/test-suite - @./tests/test-suite - -bench/bench: bench/bench.cu - nvcc $(NVCC_OPT_FLAGS) $(NVCC_FLAGS) $(GENCODES:%=--gpu-architecture=compute_%) $(GENCODES:%=--gpu-code=sm_%) $(INCLUDE_DIRS) $(NVCC_LIBS) -o $@ $< - -bench: bench/bench - -.PHONY: clean -clean: - $(RM) tests/test-suite bench/bench diff --git a/README.md b/README.md index b304b17..fe77955 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ The primary use case for fast arithmetic of numbers in the range covered by `cuda-fixnum` is in cryptography and computational number theory; in particular it can form an integral part in accelerating homomorphic encryption primitives as used in privacy-preserving machine learning. As such, special attention is given to support modular arithmetic; this is used in an example implementation of the Paillier additively homomorphic encryption scheme and of elliptic curve scalar multiplication. Future releases will provide additional support for operations useful to implementing Ring-LWE-based somewhat homomorphic encryption schemes. -Finally, the library is designed to be _fast_. Through exploitation of warp-synchronous programming, vote functions, and deferred carry handling, the primitives of the library are currently competitive with the state-of-the-art in the literature for modular multiplication and modular exponentiation on GPUs. 
The design of the library allows transparent substitution of the underlying arithmetic, allowing the user to select whichever performs best on the available hardware. Moreover, several algorithms, both novel and from the literature, will be incorporated shortly that will improve performance by a further 25-50%. +Finally, the library is designed to be _fast_. Through exploitation of warp-synchronous programming, vote functions, and deferred carry handling, the primitives of the library are currently competitive with the state-of-the-art in the literature for modular multiplication and modular exponentiation on GPUs. The design of the library allows transparent substitution of the underlying arithmetic, allowing the user to select whichever performs best on the available hardware. Moreover, several algorithms, both novel and from the literature, will be incorporated shortly that will improve performance by a further 25-50%. The library is currently at the _alpha_ stage of development. It has many rough edges, but most features are present and it is performant enough to be competitive. Comments, questions and contributions are welcome! @@ -126,14 +126,25 @@ void host_function() { ## Building -The build system for cuda-fixnum is currently, shall we say, _primitive_. Basically you can run `make bench` to build the benchmarking program, or `make check` to build and run the test suite. The test suite requires the [Google Test framework](https://github.com/google/googletest) to be installed. The Makefile will read in the variables `CXX` and `GENCODES` from the environment as a convenient way to specify the C++ compiler to use and the Cuda compute capability codes that you want to compile with. The defaults are `CXX = g++` and `GENCODES = 50`. +``` +mkdir build +cd build +cmake .. +make bench +``` + +The build system for cuda-fixnum uses CMake. Create a working directory to build in and run cmake from there. 
+Then you can run `make bench` to build the benchmarking program, or `make check` to build the test suite. +The test suite requires the [Google Test framework](https://github.com/google/googletest) to be installed. + +CMake attempts to detect and set the proper flags for your GPU architecture, but you can override them or cross-compile if needed. ## Benchmarks Here is the output from a recent run of the benchmark with a GTX Titan X (Maxwell, 1GHz clock, 3072 cores): ``` -$ bench/bench 5000000 +$ bin/bench 5000000 Function: mul_lo, #elts: 5000e3 fixnum digit total data time Kops/s bits bits (MiB) (seconds)